From da87d648b3dae56635008ef8f86a9fa41518cbbc Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Tue, 17 Aug 2021 16:51:34 -0700 Subject: [PATCH 001/530] `F.avg_pool3` CUDA backward: gpuAtomicAddNoReturn -> fastAtomicAdd (#63387) Summary: Rel: https://github.com/pytorch/pytorch/issues/62695 In the following two tables, I set `kernel_size` to 3 and `stride` to 2. In benchmark, input tensors have the shape of (N, C, n_features, n_features, n_features). Tested on RTX3080 w/ CUDA11.4 Update 1. ## This PR | N | C | n_features | dtype | time | |----:|----:|-------------:|:--------------|------------:| | 32 | 3 | 8 | torch.float16 | 7.46846e-05 | | 32 | 3 | 8 | torch.float32 | 8.18968e-05 | | 32 | 3 | 32 | torch.float16 | 0.000156748 | | 32 | 3 | 32 | torch.float32 | 0.000165236 | | 32 | 3 | 128 | torch.float16 | 0.00549854 | | 32 | 3 | 128 | torch.float32 | 0.008926 | ## master (6acd87f) | N | C | n_features | dtype | time | |----:|----:|-------------:|:--------------|------------:| | 32 | 3 | 8 | torch.float16 | 7.60436e-05 | | 32 | 3 | 8 | torch.float32 | 7.55072e-05 | | 32 | 3 | 32 | torch.float16 | 0.000189292 | | 32 | 3 | 32 | torch.float32 | 0.000168645 | | 32 | 3 | 128 | torch.float16 | 0.00699538 | | 32 | 3 | 128 | torch.float32 | 0.00890226 | master's time divided by PR's time is as follows: | N | C | n_features | master / PR | |---:|---:|---------------:|----------------:| | 32 | 3 | 8 | 1.018 | | 32 | 3 | 32 | 1.208 | | 32 | 3 | 128 | 1.272| cc: xwang233 ptrblck ngimel Pull Request resolved: https://github.com/pytorch/pytorch/pull/63387 Reviewed By: mruberry Differential Revision: D30381434 Pulled By: ngimel fbshipit-source-id: 3b97aee4b0d457a0277a0d31ac56d4151134c099 --- aten/src/ATen/native/cuda/AveragePool3d.cu | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cuda/AveragePool3d.cu b/aten/src/ATen/native/cuda/AveragePool3d.cu index 671b354734db0..6c712af93cc68 100644 --- a/aten/src/ATen/native/cuda/AveragePool3d.cu +++ b/aten/src/ATen/native/cuda/AveragePool3d.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -210,7 +211,7 @@ __global__ void avg_pool3d_cuda_update_grad_input_atomic( int dT, int dH, int dW, int padT, int padH, int padW, bool count_include_pad, - int offsetZ, int divisor_override) + int offsetZ, int divisor_override, const int gradInput_numel) { int oCol = blockIdx.x * blockDim.x + threadIdx.x; int oRow = blockIdx.y * blockDim.y + threadIdx.y; @@ -253,7 +254,8 @@ __global__ void avg_pool3d_cuda_update_grad_input_atomic( { for (int iCol = wstart; iCol < wend; ++iCol) { - gpuAtomicAddNoReturn(&gradInput[slice][iFrame][iRow][iCol], val); + const int index = slice * gradInput.stride(0) + iFrame * gradInput.stride(1) + iRow * gradInput.stride(2) + iCol * gradInput.stride(3); + fastAtomicAdd(gradInput.data(), index, gradInput_numel, val, true); } } } @@ -568,7 +570,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cuda) ( dT, dH, dW, padT, padH, padW, count_include_pad, - offsetZ, divisor); + offsetZ, divisor, work_grad_input.numel()); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { From 975542c3146df9ad5c4fc74fc4651a22e194954e Mon Sep 17 00:00:00 2001 From: zhouzhuojie Date: Tue, 17 Aug 2021 16:53:08 -0700 Subject: [PATCH 002/530] Add more ciflow labels for more workflows (#63410) Summary: - Add more ciflow labels and enable it for more workflows. - Only the 'ciflow/default' workflows are run by default on pull_request time - Other labels can be manually triggered by (adding the labels + unassign pytorchbot), OR wait for pytorchbot's comment opt-in rollout - The label design is a logical operator `OR`, i.e. adding ('ciflow/cuda' + 'ciflow/win') will trigger the union of them. (design feedback is needed here) Typical default workflows for normal PRs.
Generated label rules ![image](https://user-images.githubusercontent.com/658840/129779905-eb5e56dd-a696-4040-9eb6-71ecb6487dc1.png) ``` { "label_rules": { "ciflow/all": [ "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", "linux-bionic-cuda10.2-py3.9-gcc7", "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-cuda10.2-py3.6-gcc7", "linux-xenial-cuda11.1-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", "periodic-linux-xenial-cuda11.3-py3.6-gcc7", "periodic-win-vs2019-cuda11.3-py3", "win-vs2019-cpu-py3", "win-vs2019-cuda10.1-py3", "win-vs2019-cuda11.1-py3" ], "ciflow/bazel": [ "linux-xenial-py3.6-gcc7-bazel-test" ], "ciflow/coverage": [ "linux-bionic-py3.8-gcc9-coverage" ], "ciflow/cpu": [ "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", "win-vs2019-cpu-py3" ], "ciflow/cuda": [ "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", "linux-bionic-cuda10.2-py3.9-gcc7", "linux-xenial-cuda10.2-py3.6-gcc7", "linux-xenial-cuda11.1-py3.6-gcc7", "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", "periodic-linux-xenial-cuda11.3-py3.6-gcc7", "periodic-win-vs2019-cuda11.3-py3", "win-vs2019-cuda10.1-py3", "win-vs2019-cuda11.1-py3" ], "ciflow/default": [ "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-cuda11.1-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", "win-vs2019-cpu-py3", "win-vs2019-cuda10.1-py3" ], "ciflow/libtorch": [ "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7" ], "ciflow/linux": [ "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", "linux-bionic-cuda10.2-py3.9-gcc7", "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-cuda10.2-py3.6-gcc7", "linux-xenial-cuda11.1-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", "periodic-linux-xenial-cuda11.3-py3.6-gcc7" ], "ciflow/scheduled": [ "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", "periodic-linux-xenial-cuda11.3-py3.6-gcc7", "periodic-win-vs2019-cuda11.3-py3" ], "ciflow/slow": [ "linux-bionic-cuda10.2-py3.9-gcc7", "linux-xenial-cuda10.2-py3.6-gcc7" ], "ciflow/win": [ "periodic-win-vs2019-cuda11.3-py3", "win-vs2019-cpu-py3", "win-vs2019-cuda10.1-py3", "win-vs2019-cuda11.1-py3" ] }, "version": "v1" } ```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63410 Reviewed By: ngimel Differential Revision: D30378553 Pulled By: zhouzhuojie fbshipit-source-id: 4e0953740793e5e72b95018f8ab2ce4a6a364c38 --- .github/generated-ciflow-ruleset.json | 65 +++++++++++ .github/scripts/generate_ci_workflows.py | 102 +++++++++++++----- ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml | 12 ++- ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml | 12 ++- ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 17 ++- ...rated-linux-bionic-py3.8-gcc9-coverage.yml | 2 +- ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 2 +- ...rated-linux-xenial-cuda11.1-py3.6-gcc7.yml | 17 ++- .../generated-linux-xenial-py3.6-gcc5.4.yml | 17 ++- ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml | 2 +- ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml | 2 +- ...iodic-linux-xenial-cuda11.3-py3.6-gcc7.yml | 2 +- ...rated-periodic-win-vs2019-cuda11.3-py3.yml | 2 +- .../generated-win-vs2019-cpu-py3.yml | 13 ++- .../generated-win-vs2019-cuda10.1-py3.yml | 13 ++- .../generated-win-vs2019-cuda11.1-py3.yml | 14 ++- 16 files changed, 240 insertions(+), 54 deletions(-) diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index 70d5fd45298e8..80b2cabfff788 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -1,19 +1,84 @@ { "label_rules": { + "ciflow/all": [ + "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", + "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + "linux-bionic-cuda10.2-py3.9-gcc7", + "linux-bionic-py3.8-gcc9-coverage", + "linux-xenial-cuda10.2-py3.6-gcc7", + "linux-xenial-cuda11.1-py3.6-gcc7", + "linux-xenial-py3.6-gcc5.4", + "linux-xenial-py3.6-gcc7-bazel-test", + "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", + "periodic-linux-xenial-cuda11.3-py3.6-gcc7", + "periodic-win-vs2019-cuda11.3-py3", + "win-vs2019-cpu-py3", + "win-vs2019-cuda10.1-py3", + "win-vs2019-cuda11.1-py3" + ], + "ciflow/bazel": [ + "linux-xenial-py3.6-gcc7-bazel-test" + ], + "ciflow/coverage": [ + "linux-bionic-py3.8-gcc9-coverage" + ], + "ciflow/cpu": [ + "linux-bionic-py3.8-gcc9-coverage", + "linux-xenial-py3.6-gcc5.4", + "linux-xenial-py3.6-gcc7-bazel-test", + "win-vs2019-cpu-py3" + ], + "ciflow/cuda": [ + "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", + "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + "linux-bionic-cuda10.2-py3.9-gcc7", + "linux-xenial-cuda10.2-py3.6-gcc7", + "linux-xenial-cuda11.1-py3.6-gcc7", + "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", + "periodic-linux-xenial-cuda11.3-py3.6-gcc7", + "periodic-win-vs2019-cuda11.3-py3", + "win-vs2019-cuda10.1-py3", + "win-vs2019-cuda11.1-py3" + ], "ciflow/default": [ "linux-bionic-py3.8-gcc9-coverage", + "linux-xenial-cuda11.1-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", "win-vs2019-cpu-py3", "win-vs2019-cuda10.1-py3" ], + "ciflow/libtorch": [ + "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", + "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7" + ], + "ciflow/linux": [ + "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", + "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + "linux-bionic-cuda10.2-py3.9-gcc7", + "linux-bionic-py3.8-gcc9-coverage", + "linux-xenial-cuda10.2-py3.6-gcc7", + "linux-xenial-cuda11.1-py3.6-gcc7", + "linux-xenial-py3.6-gcc5.4", + "linux-xenial-py3.6-gcc7-bazel-test", + "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", + "periodic-linux-xenial-cuda11.3-py3.6-gcc7" + ], "ciflow/scheduled": [ "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", "periodic-linux-xenial-cuda11.3-py3.6-gcc7", "periodic-win-vs2019-cuda11.3-py3" ], "ciflow/slow": [ + "linux-bionic-cuda10.2-py3.9-gcc7", "linux-xenial-cuda10.2-py3.6-gcc7" + ], + "ciflow/win": [ + "periodic-win-vs2019-cuda11.3-py3", + "win-vs2019-cpu-py3", + "win-vs2019-cuda10.1-py3", + "win-vs2019-cuda11.1-py3" ] }, "version": "v1" diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 0d6844bf8dadc..fce50ac7811e5 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -29,24 +29,16 @@ LINUX_CUDA_TEST_RUNNER, } +CUDA_RUNNERS = { + WINDOWS_CUDA_TEST_RUNNER, + LINUX_CUDA_TEST_RUNNER, +} +CPU_RUNNERS = { + WINDOWS_CPU_TEST_RUNNER, + LINUX_CPU_TEST_RUNNER, +} + -# TODO: ------------- Remove the comment once fully rollout ------------------- -# Rollout Strategy: -# 1. Manual Phase -# step 1. Add 'ciflow/default' label to the PR -# step 2. Once there's an [unassigned] event from PR, it should rerun -# step 3. Remove 'ciflow/default' label -# step 4. Trigger the [unassigned] event again, it should not rerun -# 2. Probot Phase 1 (manual on 1 workflow) -# step 1. Probot automatically add labels based on the context -# step 2. Manually let probot trigger [unassigned] event -# 3. Probot Phase 2 (auto on 1 workflows) -# step 1. Modify the workflows so that they only listen on [unassigned] events -# step 2. Probot automatically adds labels automatically based on the context -# step 3. Probot automatically triggers [unassigned] event -# 4. Probot Phase 3 (auto on many workflows) -# step 1. Enable it for all workflows -# ----------------------------------------------------------------------- @dataclass class CIFlowConfig: enabled: bool = False @@ -67,11 +59,11 @@ def gen_root_job_condition(self) -> None: # Once fully rollout, we can have strict constraints # e.g. ADD env.GITHUB_ACTOR == '{self.trigger_actor} # REMOVE github.event.action !='{self.trigger_action}' - label_conditions = [f"github.event.action == '{self.trigger_action}'"] + \ - [f"contains(github.event.pull_request.labels.*.name, '{label}')" for label in self.labels] + label_conditions = [ + f"contains(github.event.pull_request.labels.*.name, '{label}')" for label in sorted(self.labels)] self.root_job_condition = f"(github.event_name != 'pull_request') || " \ f"(github.event.action !='{self.trigger_action}') || " \ - f"({' && '.join(label_conditions)})" + f"({' || '.join(label_conditions)})" def reset_root_job(self) -> None: self.root_job_name = '' @@ -156,6 +148,9 @@ def __post_init__(self) -> None: else: self.num_test_shards_on_pull_request = self.num_test_shards + # Add ciflow/all to labels + self.ciflow_config.labels.add('ciflow/all') + self.assert_valid() def assert_valid(self) -> None: @@ -165,6 +160,20 @@ def assert_valid(self) -> None: if self.arch == 'windows': assert self.test_runner_type in WINDOWS_RUNNERS, err_message + if self.ciflow_config.enabled: + # make sure if ciflow/default is set, we then need to set trigger_action_only to False + assert self.ciflow_config.trigger_action_only != ('ciflow/default' in self.ciflow_config.labels) + assert self.on_pull_request + assert 'ciflow/all' in self.ciflow_config.labels + if self.arch == 'linux': + assert 'ciflow/linux' in self.ciflow_config.labels + if self.arch == 'windows': + assert 'ciflow/win' in self.ciflow_config.labels + if self.test_runner_type in CUDA_RUNNERS: + assert 'ciflow/cuda' in self.ciflow_config.labels + if self.test_runner_type in CPU_RUNNERS: + assert 'ciflow/cpu' in self.ciflow_config.labels + def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: output_file_path = GITHUB_DIR / f"workflows/generated-{self.build_environment}.yml" with open(output_file_path, "w") as output_file: @@ -183,6 +192,10 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: test_runner_type=WINDOWS_CPU_TEST_RUNNER, on_pull_request=True, num_test_shards=2, + ciflow_config=CIFlowConfig( + enabled=True, + labels={'ciflow/default', 'ciflow/cpu', 'ciflow/win'} + ), ), CIWorkflow( arch="windows", @@ -192,6 +205,10 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: on_pull_request=True, only_run_smoke_tests_on_pull_request=True, num_test_shards=2, + ciflow_config=CIFlowConfig( + enabled=True, + labels={'ciflow/default', 'ciflow/cuda', 'ciflow/win'} + ), ), CIWorkflow( arch="windows", @@ -199,6 +216,12 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: cuda_version="11.1", test_runner_type=WINDOWS_CUDA_TEST_RUNNER, num_test_shards=2, + on_pull_request=True, + ciflow_config=CIFlowConfig( + enabled=True, + trigger_action_only=True, + labels={'ciflow/cuda', 'ciflow/win'} + ), ), CIWorkflow( arch="windows", @@ -211,7 +234,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels={'ciflow/scheduled'} + labels={'ciflow/scheduled', 'ciflow/win', 'ciflow/cuda'} ), ), ] @@ -225,6 +248,10 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: on_pull_request=True, enable_doc_jobs=True, num_test_shards=2, + ciflow_config=CIFlowConfig( + enabled=True, + labels={'ciflow/default', 'ciflow/linux', 'ciflow/cpu'} + ), ), # CIWorkflow( # arch="linux", @@ -268,6 +295,12 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, num_test_shards=2, + on_pull_request=True, + ciflow_config=CIFlowConfig( + enabled=True, + trigger_action_only=True, + labels={'ciflow/slow', 'ciflow/linux', 'ciflow/cuda'} + ), ), CIWorkflow( arch="linux", @@ -284,7 +317,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels=set(['ciflow/slow']), + labels=set(['ciflow/slow', 'ciflow/linux', 'ciflow/cuda']), ), ), CIWorkflow( @@ -293,6 +326,12 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, is_libtorch=True, + on_pull_request=True, + ciflow_config=CIFlowConfig( + enabled=True, + trigger_action_only=True, + labels=set(['ciflow/libtorch', 'ciflow/linux', 'ciflow/cuda']), + ), ), CIWorkflow( arch="linux", @@ -300,6 +339,11 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, num_test_shards=2, + on_pull_request=True, + ciflow_config=CIFlowConfig( + enabled=True, + labels=set(['ciflow/default', 'ciflow/linux', 'ciflow/cuda']), + ), ), CIWorkflow( arch="linux", @@ -307,6 +351,12 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, is_libtorch=True, + on_pull_request=True, + ciflow_config=CIFlowConfig( + enabled=True, + trigger_action_only=True, + labels=set(['ciflow/libtorch', 'ciflow/linux', 'ciflow/cuda']), + ), ), CIWorkflow( arch="linux", @@ -319,7 +369,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels={'ciflow/scheduled'} + labels={'ciflow/scheduled', 'ciflow/linux', 'ciflow/cuda'} ), ), CIWorkflow( @@ -333,7 +383,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels={'ciflow/scheduled'}, + labels={'ciflow/scheduled', 'ciflow/linux', 'ciflow/libtorch', 'ciflow/cuda'}, ), ), # CIWorkflow( @@ -364,7 +414,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: num_test_shards=2, ciflow_config=CIFlowConfig( enabled=True, - labels=set(['ciflow/default']), + labels={'ciflow/default', 'ciflow/coverage', 'ciflow/linux', 'ciflow/cpu'}, ), ), # CIWorkflow( @@ -433,7 +483,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: on_pull_request=True, ciflow_config=CIFlowConfig( enabled=True, - labels=set(['ciflow/default']), + labels={'ciflow/default', 'ciflow/bazel', 'ciflow/cpu', 'ciflow/linux'}, ), ), ] diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml index a783b9b1886ec..780de8e1919e9 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -4,7 +4,8 @@ name: libtorch-linux-xenial-cuda10.2-py3.6-gcc7 on: - # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers + pull_request: + types: [unassigned] push: branches: - master @@ -28,9 +29,16 @@ concurrency: cancel-in-progress: true jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + steps: + - name: noop + run: echo running ciflow_should_run calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge + needs: [ciflow_should_run] env: DOCKER_BUILDKIT: 1 timeout-minutes: 90 @@ -104,7 +112,7 @@ jobs: build: runs-on: linux.2xlarge - needs: [calculate-docker-image, ] + needs: [calculate-docker-image, ciflow_should_run] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: libtorch-linux-xenial-cuda10.2-py3.6-gcc7-build diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml index da2bbc1400388..3d586ae322e8e 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -4,7 +4,8 @@ name: libtorch-linux-xenial-cuda11.1-py3.6-gcc7 on: - # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers + pull_request: + types: [unassigned] push: branches: - master @@ -28,9 +29,16 @@ concurrency: cancel-in-progress: true jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + steps: + - name: noop + run: echo running ciflow_should_run calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge + needs: [ciflow_should_run] env: DOCKER_BUILDKIT: 1 timeout-minutes: 90 @@ -104,7 +112,7 @@ jobs: build: runs-on: linux.2xlarge - needs: [calculate-docker-image, ] + needs: [calculate-docker-image, ciflow_should_run] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: libtorch-linux-xenial-cuda11.1-py3.6-gcc7-build diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index a9011b7047832..7aa572d83321b 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -4,7 +4,8 @@ name: linux-bionic-cuda10.2-py3.9-gcc7 on: - # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers + pull_request: + types: [unassigned] push: branches: - master @@ -28,9 +29,16 @@ concurrency: cancel-in-progress: true jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }} + steps: + - name: noop + run: echo running ciflow_should_run calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge + needs: [ciflow_should_run] env: DOCKER_BUILDKIT: 1 timeout-minutes: 90 @@ -104,7 +112,7 @@ jobs: build: runs-on: linux.2xlarge - needs: [calculate-docker-image, ] + needs: [calculate-docker-image, ciflow_should_run] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-build @@ -215,6 +223,7 @@ jobs: generate-test-matrix: if: ${{ github.repository_owner == 'pytorch' }} runs-on: ubuntu-18.04 + needs: [ciflow_should_run] env: TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu ENABLE_JIT_LEGACY_TEST: '' @@ -242,7 +251,7 @@ jobs: run: .github/scripts/generate_pytorch_test_matrix.py test: - needs: [calculate-docker-image, build, generate-test-matrix, ] + needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} fail-fast: false @@ -400,7 +409,7 @@ jobs: # logs (like test); we can always move it back to the other one, but it # doesn't create the best experience render_test_results: - needs: [generate-test-matrix, test, ] + needs: [generate-test-matrix, test, ciflow_should_run] if: ${{ needs.test.result != 'skipped' || failure() }} runs-on: linux.2xlarge strategy: diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index 3663c591ab806..eda7568a809dd 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -31,7 +31,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/default')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/coverage') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index 38fe8593fc3eb..c50cac76a2c3e 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -31,7 +31,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml index a5f0488644596..cf2395e9ca829 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -4,7 +4,8 @@ name: linux-xenial-cuda11.1-py3.6-gcc7 on: - # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers + pull_request: + types: [opened, synchronize, reopened, unassigned] push: branches: - master @@ -28,9 +29,16 @@ concurrency: cancel-in-progress: true jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + steps: + - name: noop + run: echo running ciflow_should_run calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge + needs: [ciflow_should_run] env: DOCKER_BUILDKIT: 1 timeout-minutes: 90 @@ -104,7 +112,7 @@ jobs: build: runs-on: linux.2xlarge - needs: [calculate-docker-image, ] + needs: [calculate-docker-image, ciflow_should_run] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-cuda11.1-py3.6-gcc7-build @@ -215,6 +223,7 @@ jobs: generate-test-matrix: if: ${{ github.repository_owner == 'pytorch' }} runs-on: ubuntu-18.04 + needs: [ciflow_should_run] env: TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu ENABLE_JIT_LEGACY_TEST: '' @@ -242,7 +251,7 @@ jobs: run: .github/scripts/generate_pytorch_test_matrix.py test: - needs: [calculate-docker-image, build, generate-test-matrix, ] + needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} fail-fast: false @@ -400,7 +409,7 @@ jobs: # logs (like test); we can always move it back to the other one, but it # doesn't create the best experience render_test_results: - needs: [generate-test-matrix, test, ] + needs: [generate-test-matrix, test, ciflow_should_run] if: ${{ needs.test.result != 'skipped' || failure() }} runs-on: linux.2xlarge strategy: diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index 14e22d85edc26..dd3cb50cfc903 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -5,6 +5,7 @@ name: linux-xenial-py3.6-gcc5.4 on: pull_request: + types: [opened, synchronize, reopened, unassigned] push: branches: - master @@ -28,9 +29,16 @@ concurrency: cancel-in-progress: true jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + steps: + - name: noop + run: echo running ciflow_should_run calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge + needs: [ciflow_should_run] env: DOCKER_BUILDKIT: 1 timeout-minutes: 90 @@ -104,7 +112,7 @@ jobs: build: runs-on: linux.2xlarge - needs: [calculate-docker-image, ] + needs: [calculate-docker-image, ciflow_should_run] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-py3.6-gcc5.4-build @@ -215,6 +223,7 @@ jobs: generate-test-matrix: if: ${{ github.repository_owner == 'pytorch' }} runs-on: ubuntu-18.04 + needs: [ciflow_should_run] env: TEST_RUNNER_TYPE: linux.2xlarge ENABLE_JIT_LEGACY_TEST: '' @@ -242,7 +251,7 @@ jobs: run: .github/scripts/generate_pytorch_test_matrix.py test: - needs: [calculate-docker-image, build, generate-test-matrix, ] + needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} fail-fast: false @@ -400,7 +409,7 @@ jobs: # logs (like test); we can always move it back to the other one, but it # doesn't create the best experience render_test_results: - needs: [generate-test-matrix, test, ] + needs: [generate-test-matrix, test, ciflow_should_run] if: ${{ needs.test.result != 'skipped' || failure() }} runs-on: linux.2xlarge strategy: @@ -460,7 +469,7 @@ jobs: pytorch_python_doc_build: runs-on: linux.2xlarge - needs: [calculate-docker-image, build, ] + needs: [calculate-docker-image, build, ciflow_should_run] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index 87c6df024b6e4..dbfba5f1fa74d 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -31,7 +31,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/default')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index 4aa29abb09d6d..8352b229f4fae 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -29,7 +29,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml index 5ec1ddb8516eb..dcbd19d661eb1 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -29,7 +29,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml index 78c536c0bbd11..9487ea5a8fdb3 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml @@ -34,7 +34,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 2769f7c498eef..fb2a097c64452 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -5,6 +5,7 @@ name: win-vs2019-cpu-py3 on: pull_request: + types: [opened, synchronize, reopened, unassigned] push: branches: - master @@ -31,12 +32,19 @@ concurrency: cancel-in-progress: true jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} + steps: + - name: noop + run: echo running ciflow_should_run build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: "windows.4xlarge" defaults: run: working-directory: pytorch-${{ github.run_id }} + needs: [ciflow_should_run] env: JOB_BASE_NAME: win-vs2019-cpu-py3-build http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" @@ -90,6 +98,7 @@ jobs: generate-test-matrix: if: ${{ github.repository_owner == 'pytorch' }} + needs: [ciflow_should_run] runs-on: ubuntu-18.04 env: TEST_RUNNER_TYPE: windows.4xlarge @@ -121,7 +130,7 @@ jobs: https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" RUN_SMOKE_TESTS_ONLY_ON_PR: False PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix, ] + needs: [build, generate-test-matrix, ciflow_should_run] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} fail-fast: false @@ -198,7 +207,7 @@ jobs: # logs (like test); we can always move it back to the other one, but it # doesn't create the best experience render_test_results: - needs: [generate-test-matrix, test, ] + needs: [generate-test-matrix, test, ciflow_should_run] if: ${{ needs.test.result != 'skipped' || failure() }} runs-on: linux.2xlarge strategy: diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index d94ba7850ee32..2fbc8650f43e1 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -5,6 +5,7 @@ name: win-vs2019-cuda10.1-py3 on: pull_request: + types: [opened, synchronize, reopened, unassigned] push: branches: - master @@ -33,12 +34,19 @@ concurrency: cancel-in-progress: true jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} + steps: + - name: noop + run: echo running ciflow_should_run build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: "windows.4xlarge" defaults: run: working-directory: pytorch-${{ github.run_id }} + needs: [ciflow_should_run] env: JOB_BASE_NAME: win-vs2019-cuda10.1-py3-build http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" @@ -100,6 +108,7 @@ jobs: generate-test-matrix: if: ${{ github.repository_owner == 'pytorch' }} + needs: [ciflow_should_run] runs-on: ubuntu-18.04 env: TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu @@ -131,7 +140,7 @@ jobs: https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" RUN_SMOKE_TESTS_ONLY_ON_PR: True PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix, ] + needs: [build, generate-test-matrix, ciflow_should_run] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} fail-fast: false @@ -216,7 +225,7 @@ jobs: # logs (like test); we can always move it back to the other one, but it # doesn't create the best experience render_test_results: - needs: [generate-test-matrix, test, ] + needs: [generate-test-matrix, test, ciflow_should_run] if: ${{ needs.test.result != 'skipped' || failure() }} runs-on: linux.2xlarge strategy: diff --git a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml index 9c9b733aef445..8b52a07055608 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml @@ -4,6 +4,8 @@ name: win-vs2019-cuda11.1-py3 on: + pull_request: + types: [unassigned] push: branches: - master @@ -32,12 +34,19 @@ concurrency: cancel-in-progress: true jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} + steps: + - name: noop + run: echo running ciflow_should_run build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: "windows.4xlarge" defaults: run: working-directory: pytorch-${{ github.run_id }} + needs: [ciflow_should_run] env: JOB_BASE_NAME: win-vs2019-cuda11.1-py3-build http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" @@ -99,6 +108,7 @@ jobs: generate-test-matrix: if: ${{ github.repository_owner == 'pytorch' }} + needs: [ciflow_should_run] runs-on: ubuntu-18.04 env: TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu @@ -130,7 +140,7 @@ jobs: https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" RUN_SMOKE_TESTS_ONLY_ON_PR: False PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix, ] + needs: [build, generate-test-matrix, ciflow_should_run] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} fail-fast: false @@ -215,7 +225,7 @@ jobs: # logs (like test); we can always move it back to the other one, but it # doesn't create the best experience render_test_results: - needs: [generate-test-matrix, test, ] + needs: [generate-test-matrix, test, ciflow_should_run] if: ${{ needs.test.result != 'skipped' || failure() }} runs-on: linux.2xlarge strategy: From cd5e9dcc1dd2d240aa034b1f02b9886b44b1efa6 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 17 Aug 2021 16:54:09 -0700 Subject: [PATCH 003/530] [quant][graphmode][fx][fix] Fix quantization for tuple arguments (#63376) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63376 Previously when tuple is an argument for a quantizable op it would be transformed to a list by mistake, this PR fixes that. Test Plan: python test/test_quantization.py TestQuantizeFx.test_preserve_tuple Imported from OSS Reviewed By: raghuramank100 Differential Revision: D30357642 fbshipit-source-id: 82d10805d9c00c003cc99983dca68b6455ff7b2e --- test/quantization/fx/test_quantize_fx.py | 22 ++++++++++++++++++++++ torch/quantization/fx/prepare.py | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 7940eb73114c6..2f5f7c4a27f6e 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -2807,6 +2807,28 @@ def forward(self, x): m = convert_fx(m, is_reference=True) m(torch.rand(2, 1, 5, 5)) + def test_preserve_tuple(self): + """ Test tuple input type is preserved + """ + from typing import List + + class LSTM(nn.Module): + def __init__(self): + super().__init__() + self.lstm = nn.LSTM(50, 50, 1) + + def forward(self, inputs: torch.Tensor, state: List[torch.Tensor]): + h = state[0] + c = state[1] + return self.lstm(inputs, (h, c)) + + m = LSTM().eval() + m = prepare_fx(m, {"": default_qconfig}) + # make sure the arg[1] of lstm module is a tuple + for n in m.graph.nodes: + if n.target == "lstm": + self.assertEqual(type(n.args[1]), tuple) + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py index ab137487b3cc8..873d11acaa82e 100644 --- a/torch/quantization/fx/prepare.py +++ b/torch/quantization/fx/prepare.py @@ -324,7 +324,7 @@ def maybe_insert_input_observer_for_arg_or_kwarg( graph, node_name_to_target_dtype, qhandler, prepare_custom_config_dict) new_arg_to_return.append(new_inner_arg) - return new_arg_to_return + return type(arg)(new_arg_to_return) if not isinstance(arg, Node): return arg From 5b8862abf14add4e280b92c415d840f02582eba4 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 17 Aug 2021 17:12:32 -0700 Subject: [PATCH 004/530] [DDP] Support step_param for AdamW (#63382) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63382 Per title ghstack-source-id: 135966156 Test Plan: CI Reviewed By: SciPioneer Differential Revision: D30255446 fbshipit-source-id: e6ffbf339db0bc5b4702d02b74a462309df07c75 --- test/distributed/test_c10d_nccl.py | 18 ++++++- test/test_functional_optim.py | 13 ++++- torch/distributed/optim/functional_adamw.py | 49 +++++++++++++++++++ .../_internal/distributed/distributed_test.py | 30 +++++++++++- 4 files changed, 106 insertions(+), 4 deletions(-) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 2e5045153b149..285053d6d1c7e 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -53,9 +53,11 @@ if not IS_WINDOWS: from torch.distributed.optim.functional_sgd import _FunctionalSGD from torch.distributed.optim.functional_adam import _FunctionalAdam + from torch.distributed.optim.functional_adamw import _FunctionalAdamW _SUPPORTED_OPTIM_MAPPING = { _FunctionalSGD: torch.optim.SGD, - _FunctionalAdam: torch.optim.Adam + _FunctionalAdam: torch.optim.Adam, + _FunctionalAdamW: torch.optim.AdamW, } if TEST_WITH_TSAN: @@ -1737,6 +1739,20 @@ def test_hook_then_sgd_nccl_grad_as_bucket_view(self): gradient_as_bucket_view=True ) + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_hook_then_adamw_nccl(self): + adamw_lr = 1e-2 + adamw_betas = (0.9, 0.99) + adamw_eps = 1e-6 + self._test_hook_then_optimizer( + _FunctionalAdamW, + adamw_lr, + betas=adamw_betas, + eps=adamw_eps, + gradient_as_bucket_view=True + ) + @requires_nccl() @skip_if_lt_x_gpu(2) def test_hook_then_adam_nccl(self): diff --git a/test/test_functional_optim.py b/test/test_functional_optim.py index c37823427fc1d..59af691faf36c 100644 --- a/test/test_functional_optim.py +++ b/test/test_functional_optim.py @@ -3,15 +3,17 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.optim import SGD, Adam +from torch.optim import SGD, Adam, AdamW from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS if not IS_WINDOWS: from torch.distributed.optim.functional_sgd import _FunctionalSGD from torch.distributed.optim.functional_adam import _FunctionalAdam + from torch.distributed.optim.functional_adamw import _FunctionalAdamW _SUPPORTED_OPTIM_MAPPING = { SGD: _FunctionalSGD, - Adam: _FunctionalAdam + Adam: _FunctionalAdam, + AdamW: _FunctionalAdamW, } @@ -102,6 +104,13 @@ def test_functional_optim_parity_sgd(self): def test_functional_optim_parity_adam(self): self._test_functional_optim_parity(Adam, 1e-2, betas=(0.9, 0.999), eps=1e-6) + @unittest.skipIf( + IS_WINDOWS, + "Functional optimizer not support on windows, see https://github.com/pytorch/pytorch/issues/62137", + ) + def test_functional_optim_parity_adam_w(self): + self._test_functional_optim_parity(AdamW, 1e-2, betas=(0.9, 0.999), eps=1e-6) + if __name__ == "__main__": run_tests() diff --git a/torch/distributed/optim/functional_adamw.py b/torch/distributed/optim/functional_adamw.py index 5623a0b8d6841..0159aa35a5539 100644 --- a/torch/distributed/optim/functional_adamw.py +++ b/torch/distributed/optim/functional_adamw.py @@ -53,6 +53,55 @@ def __init__( # param group as it's not a common use case. self.param_group = {"params": params} + def step_param(self, param: Tensor, grad: Optional[Tensor]): + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + max_exp_avg_sqs = [] + state_steps: List[int] = [] + if grad is not None: + params_with_grad.append(param) + grads.append(grad) + # Lazy state initialization + if param not in self.state: + self.state[param] = {} + state = self.state[param] + state['step'] = torch.tensor(0.0) + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format) + if self.amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format) + + state = self.state[param] + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + + if self.amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step'].item()) + with torch.no_grad(): + F.adamw(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=self.amsgrad, + beta1=self.defaults['beta1'], + beta2=self.defaults['beta2'], + lr=self.defaults['lr'], + weight_decay=self.defaults['weight_decay'], + eps=self.defaults['eps']) + def step(self, gradients: List[Optional[Tensor]]): params = self.param_group['params'] params_with_grad = [] diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 54a22b01bd667..6ef94c99aa739 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -70,9 +70,11 @@ import torch.distributed.optim.post_localSGD_optimizer as post_localSGD_optimizer from torch.distributed.optim.functional_sgd import _FunctionalSGD from torch.distributed.optim.functional_adam import _FunctionalAdam + from torch.distributed.optim.functional_adamw import _FunctionalAdamW _SUPPORTED_OPTIM_MAPPING = { _FunctionalSGD: torch.optim.SGD, - _FunctionalAdam: torch.optim.Adam + _FunctionalAdam: torch.optim.Adam, + _FunctionalAdamW: torch.optim.AdamW, } from torch.utils.data.distributed import DistributedSampler @@ -3999,6 +4001,32 @@ def _test_ddp_hook_with_optimizer_parity( ) dist.barrier() + @sandcastle_skip_if( + BACKEND != "nccl" and BACKEND != "gloo", + "Only Nccl & Gloo backend support DistributedDataParallel", + ) + @sandcastle_skip_if( + IS_WINDOWS, + "FunctionalAdam not yet supported with Windows, see https://github.com/pytorch/pytorch/issues/62137" + ) + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_hook_with_optimizer_parity_adamw(self): + for grad_as_bucket_view, static_graph in itertools.product( + [True, False], [True, False] + ): + adamw_lr = 1e-2 + adamw_betas = (0.9, 0.99) + adamw_eps = 1e-6 + self._test_ddp_hook_with_optimizer_parity( + grad_as_bucket_view, + static_graph, + _FunctionalAdamW, + adamw_lr, + betas=adamw_betas, + eps=adamw_eps, + ) + @sandcastle_skip_if( BACKEND != "nccl" and BACKEND != "gloo", "Only Nccl & Gloo backend support DistributedDataParallel", From dcf90b797c42ab237ca68d0e816c0d76122ee931 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 17 Aug 2021 17:12:32 -0700 Subject: [PATCH 005/530] [BE] remove _SUPPORTED_OPTIM_MAP from tests (#63383) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63383 Per title ghstack-source-id: 135966157 Test Plan: CI Reviewed By: SciPioneer Differential Revision: D30358921 fbshipit-source-id: 965e054e525194b1ee55980340df275bab355c9b --- test/distributed/test_c10d_nccl.py | 9 +++------ test/test_functional_optim.py | 14 ++------------ .../_internal/distributed/distributed_test.py | 10 ++++------ 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 285053d6d1c7e..f7f6681b43a76 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -49,16 +49,12 @@ sandcastle_skip_if, ) from torch.utils.checkpoint import checkpoint +from torch.distributed.optim import functional_optim_map if not IS_WINDOWS: from torch.distributed.optim.functional_sgd import _FunctionalSGD from torch.distributed.optim.functional_adam import _FunctionalAdam from torch.distributed.optim.functional_adamw import _FunctionalAdamW - _SUPPORTED_OPTIM_MAPPING = { - _FunctionalSGD: torch.optim.SGD, - _FunctionalAdam: torch.optim.Adam, - _FunctionalAdamW: torch.optim.AdamW, - } if TEST_WITH_TSAN: print( @@ -1639,7 +1635,8 @@ def _test_hook_then_optimizer( gpu_model_allreduce = self._gpu_model_with_ddp_comm_hook( process_group, default.allreduce_hook, gradient_as_bucket_view, hook_state ) - sgd = _SUPPORTED_OPTIM_MAPPING.get(functional_optim_cls)( + mapping = {v: k for k, v in functional_optim_map.items()} + sgd = mapping.get(functional_optim_cls)( gpu_model_allreduce.parameters(), *functional_optim_args, **functional_optim_kwargs, diff --git a/test/test_functional_optim.py b/test/test_functional_optim.py index 59af691faf36c..98a3f06805dba 100644 --- a/test/test_functional_optim.py +++ b/test/test_functional_optim.py @@ -5,17 +5,7 @@ import torch.nn.functional as F from torch.optim import SGD, Adam, AdamW from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS - -if not IS_WINDOWS: - from torch.distributed.optim.functional_sgd import _FunctionalSGD - from torch.distributed.optim.functional_adam import _FunctionalAdam - from torch.distributed.optim.functional_adamw import _FunctionalAdamW - _SUPPORTED_OPTIM_MAPPING = { - SGD: _FunctionalSGD, - Adam: _FunctionalAdam, - AdamW: _FunctionalAdamW, - } - +from torch.distributed.optim import functional_optim_map class MyModule(torch.nn.Module): def __init__(self): @@ -39,7 +29,7 @@ def _test_functional_optim_parity(self, optim_cls, *args, **kwargs): optim_params = module_optim.parameters() functional_params = module_functional.parameters() optim = optim_cls(optim_params, *args, **kwargs) - functional_optim_cls = _SUPPORTED_OPTIM_MAPPING.get(optim_cls, None) + functional_optim_cls = functional_optim_map.get(optim_cls, None) if not functional_optim_cls: raise ValueError(f"Functional optimizer not implemented for {optim_cls}") optim_functional = functional_optim_cls( diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 6ef94c99aa739..2a126ab894a06 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -66,16 +66,13 @@ sandcastle_skip_if, ) +from torch.distributed.optim import functional_optim_map + if not IS_WINDOWS: import torch.distributed.optim.post_localSGD_optimizer as post_localSGD_optimizer from torch.distributed.optim.functional_sgd import _FunctionalSGD from torch.distributed.optim.functional_adam import _FunctionalAdam from torch.distributed.optim.functional_adamw import _FunctionalAdamW - _SUPPORTED_OPTIM_MAPPING = { - _FunctionalSGD: torch.optim.SGD, - _FunctionalAdam: torch.optim.Adam, - _FunctionalAdamW: torch.optim.AdamW, - } from torch.utils.data.distributed import DistributedSampler @@ -3949,7 +3946,8 @@ def _test_ddp_hook_with_optimizer_parity( if static_graph: ddp_model_with_no_hook._set_static_graph() - optimizer_no_hook = _SUPPORTED_OPTIM_MAPPING.get(functional_optim_cls)( + mapping = {v: k for k, v in functional_optim_map.items()} + optimizer_no_hook = mapping.get(functional_optim_cls)( ddp_model_with_no_hook.parameters(), *functional_optim_args, **functional_optim_kwargs, From f12f667e12d8c6f3356267ec87ae15e7fdd0ed76 Mon Sep 17 00:00:00 2001 From: Aliaksandr Ivanou Date: Tue, 17 Aug 2021 19:54:30 -0700 Subject: [PATCH 006/530] [torch] Set default log level for torch elastic (#63214) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63214 The default log level in fb and oss is different: in oss we use WARNING and in fb we use INFO. Test Plan: unittests, f291441502 Reviewed By: cbalioglu Differential Revision: D30296298 fbshipit-source-id: 89067352be767255fbc66e790ec333582de64c6c --- torch/distributed/elastic/utils/log_level.py | 14 ++++++++++++++ torch/distributed/elastic/utils/logging.py | 4 +++- 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 torch/distributed/elastic/utils/log_level.py diff --git a/torch/distributed/elastic/utils/log_level.py b/torch/distributed/elastic/utils/log_level.py new file mode 100644 index 0000000000000..87ea0f7d64182 --- /dev/null +++ b/torch/distributed/elastic/utils/log_level.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +def get_log_level() -> str: + """ + Return default log level for pytorch. + """ + return "WARNING" diff --git a/torch/distributed/elastic/utils/logging.py b/torch/distributed/elastic/utils/logging.py index 19c68c03cf552..e4f1345e4c339 100644 --- a/torch/distributed/elastic/utils/logging.py +++ b/torch/distributed/elastic/utils/logging.py @@ -12,6 +12,8 @@ import warnings from typing import Optional +from torch.distributed.elastic.utils.log_level import get_log_level + def get_logger(name: Optional[str] = None): """ @@ -32,7 +34,7 @@ def get_logger(name: Optional[str] = None): def _setup_logger(name: Optional[str] = None): log = logging.getLogger(name) - log.setLevel(os.environ.get("LOGLEVEL", "WARNING")) + log.setLevel(os.environ.get("LOGLEVEL", get_log_level())) return log From 3fd8e09102c32d47a5af15c86be979eb75008d49 Mon Sep 17 00:00:00 2001 From: Shen Li Date: Tue, 17 Aug 2021 20:12:51 -0700 Subject: [PATCH 007/530] Fix RPC Python User Function Error Handling (#63406) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63406 The `RemoteException` will be thrown on the caller side when converting the response message to IValue. Since it is a Python error, the error message needs to be extracted explicitly and clear the `PyErr`. Test Plan: Imported from OSS Reviewed By: rohan-varma, ngimel Differential Revision: D30372741 Pulled By: mrshenli fbshipit-source-id: 1f72a7ee0c39cc2ef070f99884c142f7b3e0543d --- .../csrc/distributed/rpc/python_functions.cpp | 20 +++++++++++++++++++ .../_internal/distributed/rpc/rpc_test.py | 16 ++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/torch/csrc/distributed/rpc/python_functions.cpp b/torch/csrc/distributed/rpc/python_functions.cpp index 272377166fc5f..2d6533d797175 100644 --- a/torch/csrc/distributed/rpc/python_functions.cpp +++ b/torch/csrc/distributed/rpc/python_functions.cpp @@ -152,6 +152,26 @@ c10::intrusive_ptr toPyJitFuture( IValue ivalue; try { ivalue = toPyIValue(message); + } catch (py::error_already_set& e) { + py::gil_scoped_acquire acquire; + // FIXME: this is a temporary solution to add a special-case for + // ValueError and TypeError, as those are already used in our tests. + // We should have a more comprehensive coverage for other types of + // exceptions as well. + if (e.matches(PyExc_ValueError)) { + child->setErrorIfNeeded( + std::make_exception_ptr(pybind11::value_error(e.what()))); + } else if (e.matches(PyExc_TypeError)) { + child->setErrorIfNeeded( + std::make_exception_ptr(pybind11::type_error(e.what()))); + } else { + // py::error_already_set requires GIL to destruct, take special care. + child->setErrorIfNeeded( + std::make_exception_ptr(std::runtime_error(e.what()))); + } + e.restore(); + PyErr_Clear(); + return; } catch (std::exception& e) { child->setErrorIfNeeded(std::current_exception()); return; diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index c95b7216c4a67..ae57ea5f40f8c 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -1039,6 +1039,20 @@ def test_multi_rpc(self): ) self.assertEqual(ret, torch.ones(n, n) * 2) + @dist_init + def test_future_wait_twice(self): + dst = worker_name((self.rank + 1) % self.world_size) + futs = [] + for i in range(20): + futs.append(rpc.rpc_async(dst, raise_func)) + + with self.assertRaisesRegex(ValueError, "Expected error"): + torch.futures.wait_all(futs) + + for fut in futs: + with self.assertRaisesRegex(ValueError, "Expected error"): + fut.wait() + def _run_uneven_workload(self, num_repeat=30): # worker0 drives and waits for worker1 and worker2 # throughout the test. @@ -3210,7 +3224,7 @@ def test_function_not_on_callee(self): # Ensure that we have the attribute on this module. Otherwise, the test could fail due to a caller-side pickling error. self.assertTrue(hasattr(this_module, "foo_add")) with self.assertRaisesRegex( - AttributeError, "RPC pickler does not serialize" + RuntimeError, "RPC pickler does not serialize" ): rpc.rpc_sync(callee_worker, foo_add, args=()) From 15144ade25bfd5528cad982e82e47f2b57b65af6 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 17 Aug 2021 21:35:55 -0700 Subject: [PATCH 008/530] [fx2trt] Add quantize_per_tensor support (#63447) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63447 Only available in TRT 8.0 and above Test Plan: buck run mode/opt caffe2/torch/fb/fx2trt:test_quantize_per_tensor Reviewed By: 842974287 Differential Revision: D30322844 fbshipit-source-id: dfd925e3432de128f2925b1aa55d6125e63359af --- .../fx2trt/converters/acc_ops_converters.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index 506bf2cdbec93..0bca6e28c83b6 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -1106,3 +1106,35 @@ def acc_ops_permute(network, target, args, kwargs, name): layer.second_transpose = tuple(permutation) layer.name = name return layer.get_output(0) + +@tensorrt_converter(acc_ops.quantize_per_tensor) +def acc_ops_quantize_per_tensor(network, target, args, kwargs, name): + input_val = kwargs["input"] + + if not isinstance(input_val, trt.tensorrt.ITensor): + raise RuntimeError(f"{name} received input {input_val} that is not part " + "of the TensorRT region!") + + q_scale = acc_utils.get_field_from_acc_out_ty(kwargs["acc_out_ty"], "q_scale") + q_zero_point = acc_utils.get_field_from_acc_out_ty(kwargs["acc_out_ty"], "q_zero_point") + dtype = acc_utils.get_field_from_acc_out_ty(kwargs["acc_out_ty"], "dtype") + if dtype not in (torch.quint8, torch.qint8, torch.qint32): + raise RuntimeError("Only support (torch.quint8, torch.qint8, torch.qint32) " + f"quantized type in quantize_per_tensor, get {dtype}.") + + if q_zero_point != 0: + raise RuntimeError(f"Only support zero_point == 0, get {q_zero_point}") + + # temporarily set q_scale to 1 to make sure the q_scale is different + # for quantize and dequantize to avoid the error + # TODO: follow up with nvidia TensorRT team to repro and fix the problem + q_scale = 1 + scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([float(q_scale)], dtype=np.float32))) + scale_layer.name = input_val.name + ".quant.scale" + scale = scale_layer.get_output(0) + assert trt.__version__ > "8.0", "Explicit quantize op is only supported in " + "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ + layer = network.add_quantize(input=input_val, scale=scale) + layer.axis = 0 + layer.name = input_val.name + ".quant" + return layer.get_output(0) From 2fd14735d677d1c2cf87e4c76aea2111bc30c17b Mon Sep 17 00:00:00 2001 From: Pavithran Ramachandran Date: Tue, 17 Aug 2021 22:26:22 -0700 Subject: [PATCH 009/530] [easy][PyTorchEdge] print error message when failing to load model file (#63404) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63404 # Context Loading a model file using `fopen` might error out for multiple reasons. Repro'ing the error on devices takes some time and efforts. Logging the error no# will help in debugging and fixing the error quickly. # Mitigation Printout the error message of the `fopen` to help users debug the issue. Test Plan: ``` (base) [pavithran@devvm1803.vll0 /data/users/pavithran/fbsource] buck run xplat/caffe2/fb/lite_predictor:lite_predictor -- --model=/home/pavithran/models/prod/GAaNhAoTIV6cIvgJAHn30m8NR1QgbmQwAAAA.ptl --use_bundled_input=0 Building: finished in 0.5 sec (100%) 354/354 jobs, 0/354 updated Total time: 0.6 sec Run with 24 threads Run with 24 threads Loading model... terminate called after throwing an instance of 'c10::Error' what(): open file failed because of errno 2 on fopen: No such file or directory, file path: /home/pavithran/models/prod/GAaNhAoTIV6cIvgJAHn30m8NR1QgbmQwAAAA.ptl Exception raised from RAIIFile at xplat/caffe2/caffe2/serialize/file_adapter.cc:15 (most recent call first): (no backtrace available) ``` Reviewed By: dhruvbird Differential Revision: D30372308 fbshipit-source-id: 5346e828f53f6bc5d871b403586566a3332a389a --- caffe2/serialize/file_adapter.cc | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/caffe2/serialize/file_adapter.cc b/caffe2/serialize/file_adapter.cc index 701270b566145..1fddce970a84f 100644 --- a/caffe2/serialize/file_adapter.cc +++ b/caffe2/serialize/file_adapter.cc @@ -1,7 +1,8 @@ #include "caffe2/serialize/file_adapter.h" #include +#include #include - +#include #include "caffe2/core/common.h" namespace caffe2 { @@ -10,7 +11,20 @@ namespace serialize { FileAdapter::RAIIFile::RAIIFile(const std::string& file_name) { fp_ = fopen(file_name.c_str(), "rb"); if (fp_ == nullptr) { - AT_ERROR("open file failed, file path: ", file_name); + char buf[1024]; + buf[0] = '\0'; +#if defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER)) + strerror_s(buf, sizeof(buf), errno); +#else + strerror_r(errno, buf, sizeof(buf)); +#endif + AT_ERROR( + "open file failed because of errno ", + errno, + " on fopen: ", + buf, + ", file path: ", + file_name); } } From d431c77d76b8c92dfddfdfec2ce29b2e46c441c2 Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Tue, 17 Aug 2021 23:10:48 -0700 Subject: [PATCH 010/530] [sharded_tensor] fix typing issue for placement (#63426) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63426 placement should either be a string or a _remote_device, this fixes the type to match the behaviors ghstack-source-id: 136041125 Reviewed By: pritamdamania87 Differential Revision: D30379702 fbshipit-source-id: 34e226494240923b433e3a39cc08c84d42cdad6b --- torch/distributed/_sharding_spec/_internals.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torch/distributed/_sharding_spec/_internals.py b/torch/distributed/_sharding_spec/_internals.py index a519a9a3e2b7b..3f2ab2f1a4ea8 100644 --- a/torch/distributed/_sharding_spec/_internals.py +++ b/torch/distributed/_sharding_spec/_internals.py @@ -1,5 +1,6 @@ -from typing import List +from typing import List, Union from dataclasses import dataclass +from torch.distributed.remote_device import _remote_device import torch @@ -24,7 +25,7 @@ class ShardMetadata(object): shard_offsets: List[int] shard_lengths: List[int] - placement: torch.distributed._remote_device + placement: Union[str, _remote_device] def __post_init__(self): if isinstance(self.placement, str): From f8a84a80cdd15aaea12147d7cb35d199aa302dc5 Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Wed, 18 Aug 2021 01:58:05 -0700 Subject: [PATCH 011/530] [5/N] Run opt-asan with detect_leaks=0 (#63361) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63361 Python multiprocessing doesn't support LSAN and causes false positives instead. As a result, disabling LSAN for these tests so that we can still run with opt-asan ghstack-source-id: 135962489 Test Plan: waitforbuildbot Reviewed By: rohan-varma Differential Revision: D30352269 fbshipit-source-id: f6ab5abce7bdef00cd5e1f5977424d2b151174af --- torch/testing/_internal/common_distributed.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index fdad0ad0222fa..74ed9a069604c 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -526,10 +526,6 @@ def _run(cls, rank: int, test_name: str, file_name: str, parent_pipe) -> None: self.file_name = file_name self.run_test(test_name, parent_pipe, signal_send_pipe, event_listener_thread) - # exit to avoid run teardown() for fork processes - # Use os._exit() as it is the recommended way for child processes. - os._exit(0) - def run_test( self, test_name: str, parent_pipe, signal_pipe=None, event_listener_thread=None ) -> None: From 0f2f6a79cbc74ef2776c3555eb05b70aa80212bf Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Wed, 18 Aug 2021 03:59:51 -0700 Subject: [PATCH 012/530] clarify the documentation of `torch.meshgrid` (#62977) Summary: Also warn about the behavior differences from `numpy.meshgrid`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/62977 Reviewed By: mruberry, ngimel Differential Revision: D30220930 Pulled By: dagitses fbshipit-source-id: ae6587b41792721cae2135376c58121b4634e296 --- docs/source/_static/img/meshgrid.png | Bin 0 -> 117332 bytes torch/functional.py | 65 ++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 6 deletions(-) create mode 100644 docs/source/_static/img/meshgrid.png diff --git a/docs/source/_static/img/meshgrid.png b/docs/source/_static/img/meshgrid.png new file mode 100644 index 0000000000000000000000000000000000000000..97ad0661fc218c9c8a6a9e68a02dbccc2d8f686b GIT binary patch literal 117332 zcmeFZXEm4Z|;lx?#_PBbM`rB@3mLkYp=8RI`h;}Uz3*VG8F&-puMYg z#~1(rk*Yupl$>;$BiEKnA=70ADmA6#(!u zfb5?-0H6)z{BPYDDDe+a3DPsm7-sH9;BD{ajP@aMhW7%l-9rywpW9ciTnzNz-@owmaYp^mNbcVMCW}NM{Nf2*LQEX~ z|J02{?0@n5KY9h&q5swCAFBQ}n8F1pIRh_e2NJp$NGVDv`~&{~xc5KEDUhJcsd+ir z`FMEU^YCz0{C_KO1^E9R!oPtlz%SVGzu5CH-v4u##C%06QqTVgQjv<2Pr?NNPzKz+ zqjE0*xYk6OoMq;@`QS0_pA3h)wXWZ5EmK4WcAk?|K=PWd1#o-gCgnlYYn)yHALZ{y z*DT7Z) zH}AhoE?72^1w&W1Yc=1C{ZAAp6p0Qid!4!kX=wJxs@&0->zh-q@*WJ#E73S*kn_zLkTZiKZ|kShdooM>Nz+pBG00Ys9g?ZCg5;hEkmf(q z%9BLS+fwT88J3 zXdTAV!S(|e@(AI2L=_LD08oFOIXmGxUGjUK}NW_`Jnwde{TUs26 zZ>lC^g2a`AQpKCMG=(S_LZM;h+Y8o`%LU25Z&*0wWu0j(%|T7lSA-zO z?1jvas8wEf9A5Lrr!A1|Mf~{tp2&pj6w+=J%$%q3@*DNndBI_=~M3l;(LUhQmf(d0v_@r__DSwVPT z(Db*UhOPxpC6Won5leckUfKP{jC^fqXqek+f{Igi*lFC#$4AGZTAfE!6UEpF*-_`Q zrrzY)vV%7-?B&fKujs9QAjCSb*0$Or>w{8Cx6&(BnfuThO^o<$LSkG(>>QV3f2M5p zV?IO*G0nLT#-4C7RzTS=tD;(2L&ZtBT{Q1&oC_yW(6LM%ZpWX;Tv(48h`B|uO&6OZ z2vQ)t)MN#96l-INN{HReD4>(BZ(d6aw}z-%apEgbS{3rjLD8Z81)${ z58+juDM1&ap|}}zJDDTG38zmUQYYsN#_3=F#rsbbkSA6;Yr{^gSO_yCN3^B?Lz z&R8{Pz?8d?8>13R5HJ4Gax&#`_wHn&TJJ^u2cI6_hstj-rAgRF@9pk>6=CCxxq=x? zVz~Oe*qS0UZ{CB=L;moqz_zMnxQ!i=K<>cX6#G;vki4|T&#C*#O^R>wB-w~hxgb(E zufdelD$brK;GHzIUTJ`quK`veiP^zvm7^bRJ%r85YnTiaBv93y(ymZ42r&AJxXVb{ z7@v~w%Fo!;K{dWamuUUBdLhWC`+jzX1dTcC-EiuR^sTv4krGv9u1f<2*CNUoo@gm#SW`;bq?GQ<#P`t~*xCFQNiLh4S9JjS;7lb0gu1leA_yo zfIjblhcc!@o7NoQl97iSe;U+~c`(8o4y8KkD_@PHF!tUeMv3IB0IJsA zlIhDmvBMWmTSPhD>xDC1boQt{Lk#Sv$V+ zs}xM`TyZ6rS@(^!C^M)Pl9CKU);=)Z=62Cnh}sCG0-mSP0o7+o`f@_OlR^7ym0a?Ek*XD*bKd~!my@yg_yFa|r)+HxzLgMO?&sM{c@{6E zpo`BeuH(LhtCIy1MXxEdhLc3@Vu7LgdwRLCN?V!In^u@Z= z{=JXJrB@cG+~mJGl_taw<-on zL5W47*EEW$J~<&>q@KaX7&>;Zs|37s<@RB$n#7O4vfG402W`*S%k1zLp^x1ks@i*M zLZ52#VLTACkGOEfdcL5K-*mDTy?!o|%=605^-h>Q42ee}r)r!{(w@MZc83J9#1imPiq3_djdW0cOIuiJP=MFFCKZRI+PjIEgS{t0p#5L zdllr2uv3=%qhe?ZHBM`LojAC^ivFoG*`>o_qIl!RTljq_lucp2t+FM>#zq;n}ym%w8)KwaS1iyV?=X}EW=}PHGx~N>1q;tcr8`O?YL(f_2 zp%VK%c*$d>#Z2$;sSgk8qJ2p(xn$((y!WAxZXwA<8~_0h!I7Uf+qPR%P-ZKLEO@;uUba9Ab1T%0C&J%vh(HA=}!R? zmujH&!LxT+Kt#1kmAv^k48c60sAp1wd->fs zpwso>?07T5#a`B!_?5{3LW}ocao6@xNgW=!)mrO1X}4cZyPw7L$8of2?b2$tf=~)s zG)SrA*G3SB(z8V3-9uUTu9upSMrIH(>TRSvqgDYEITW#drC0n(Qp~i1mqRcWkGoQb z?C**kr!~J;5#+OV6E}|TX?iKrR#D$;m9DMr0y&_pw(Hr?3A)fZVMVGbhPK2v$ z`410r8*(#p3+cAoXT-VM;6twmvuwxc;P?O}btu32r`cS}1qdws6Y39nHHXQ%q|EI{ zo1vX@4TQ|$^}$dj8&hf=dvwX~M8LWNqHQLXWOZUyb7`Z@DGZ_Y5j?Aa9&&SvF{Gfn z9^sy&P`UC=P#8|aRKH&6Be6?99x*}DOjNlXa3f0D?Or#arMd^^6hpE;PBAXB1Mkqv zU>?ObRj^}q*R6NbN!+mBZ-&y`0eEcv$ia3GJi(d}`3Y3Ys<kP9NSf;M9mE0Rh=8;mHw) zO&k|K0*BawR6##5l02Mkw3|kN3x6wC`ihNn+RN?Nh>>FnH(TFd_pxSe+Nx^HoUOWZ z$=N25VBT2H!{DE_%flyXN|LAvYx>*HwN6ESb^QBvGJo7!&Wg+?o^}rS@-2@-C8>Ro zZCP!%Gp2fWjZmT-$^zA)-3i9%m6GCbQFqKqEXEu&YsIN1a?4)5jecuIQ>O}W{nckq zxY=R**H$#}j<7SQ5JZqk!5B(7%(DKgeDTJZ%WCE|mgb4U(zPZSBJqz=Xc4ATh09Vs zTSs~d2zJBKG7Bb`6UXc)MU_A3Px7(5;U$fIM5rioN8xZjsn&1Z@}5XI?NL?1T9i?k0q!Kax7cf7ZfUkFT@d22V-4A}De7941dEj;Is1v%o3cv$N&8X7dS z<>t44xZ|e8y4!=AbZj{U%iH-S^kv^q>va5_yXI;4PKCQ(A)os4>g!5ww*oG)%N+)h zimKe2>{P%Wz|mTz;@_`akl_8as+CCKGw`x+sLct3L1d{72llO0--yF)b2Zu+6bxkP z^9d{|O5vb$Oy`L&`Sb`tf0z1fO)iN<&{A)p(u(#ALuK<6iV8nUGJQF5s`xhX!)kl3 znFQyIxDa-zdc;uAM+$?jZ4;u15bw!2S=bMCHudqeb#6tgOt5W(d(-$t(};gI+?#k! zc?YFi`yl(bYnW4#YZ3LG=B2uRvTJR&orD^jejKe>-}@*YITzWXk+{;=p@nU^DhX)A z=zDz1W7RMm7o>(-n!&3L^CAt_c~wapFajiLlQ&{ZZ7(s!Vsra$PV-Js4J_ckKp+q0 zZ(Pv_Xbki92|jwa?0^aLL#Z~c!6Z9w>x);EANP5v5_$FCZ1G_N7Ck%ZR7EQM~sPBROrWH*WS<;^Uk+Xn)UQkJ3Wb zz#;TJp+h5vjfKqkB!Pylix9!{fxz1+?Wo*4NHMFWI{cxs)V)qzQLA+XO#F#sbW+B= z#38>pv|Y&dMDWzeib_&X6|tZk;OVdY;TjpaaW~?}>FwkF$rayFY`>?;TE*g@>hs(; ztwASR_&W_-MLd(Mi9W4#@BR zD##-KK+nz~>xX;>bvd>uV>-=Ui9>v<)KM6nr+A7UvH^u1-NHN@l9R#PN*&1|9)Qsd zJ>`C0^fJ~g{my_E{v8=pqt`4YD>E}d?0P=DiiUqmivx3D!N;)pEhgt5FQn5-=+R@x z>oKy~*r>iR8oBlb_9dw?J_tz{dvf5jo^Ro*Kw!91TB&9#UJ`R9Qv8rVpr%k95E{iz zKBNbCuiU)7pE;iCzPfXB6HSJkDs%Sh5ZYPWfPbvHzZvq46{a)_C)?(ar8pk=&A*nu z{upfi#im?1@RLn>T%cf%?);^q!4JUDimC*MMxq!`US9-1&D(gdd}apy2Gzrd=YRwH z!6HFg;06u4* z{!x^2rkwQYS%#wY^1wYFd)OdvUv3+czcrZ-rzUzS;s$*^du~7{Y(PIucl*n9w#o;N znVg|-4Vi5mG`^mL9FCccntw3X(FwVoo?v{@he_AH*ixn-oJQipcf{txUZD z#on-7mf(eL8p({JR+X4;TI>7|zm>EKq}92(naOXjcQ+6hkObffozCWuMEi74% zOHhhH!zRO?%#LFq$A7(=!0?Do5!_rxU+B{Ll3oQE?9}-lgZaEx)$?%QmaTa>lK=b* z?VDa{?uvAQfg05e>23pPQclQxcVY+@B)cd|s`VJlN8!*<4@ z32K8I*#cG!g0c^q>>#LscFAf6x2>o=M&ZL#VwqC59M|{svwDmaEp%h;p8_0WtMY4F zmCDp69F;ATjq%zXyg61JIc<|$dB6Y#JiGqbhrxnv6?(TkMy{{J{@)8?{Juq)1e-!F zOR$wXZ&)yub0M8bR4x*OM554kiG%poAUPy`ln`ztx0|;=LmcI)yWM;8ln43B1DH;( zj#racjaZ=8J%irUJ)?OZ(JIiVvec9Y25moyl!-qsbkZ=DOIMW!FO7 zH`&KO>va46U4Tq+2A-4|t;W{)Rlq8o_U+Th6B1ij*qR*?Kz@(NvmEt|Mc) zl^l15NHYn_{jKncIs6`F%1w;HMBoXi1xcr&7SF>?TE~$A#I(GhGk~m&R^}Z=-V|aTh~cKf3w(r< zG+<-Ok-j`|qF3g?^9mmGhFIJH68C;w=QdqEaQ(PL+3%Z3!*YDk`cbN_yU7VyMN?rD zxli>YmR^ifiP#na_$n3JMk#Oy)=kw77@>~jc~X-n&Jvn@SDbK{f^Md?<72@!T_07+ z8W&T8AYQtK*s8}&@1>cma?~{GjGp_vMSG*?aZS>@#;~W2iDwWkZR8f6p!Y&B6hGX<-~e(!`t@H z4p4&6%jG4gI{BSQRDp`V>L=?ADMP;Av@P#Sdb7iA@!anxlVZW!*a(%UUMFjgN^x9^ zX0{=|k+i67_*@5w(Dql9Iqhv$2(ytTF}YP9IMlh z;rRu+jyJwOEhx(r=jHKzp^C|UXrO*r-ARb|L$DiQqS|@N!eaYh?skB?`+I}yv%0Cd zxwm4_6TDQw0RhD=%Ao1vDYNwk!u5ui7r|+blol?EEEUL%%3@Ig{om+(fOr6M+tJX1 z6vs?kuvq<;8MN7S5KS1xOS5x*niT>caxjl0zU2;xg()(MtZK+d{QTQD*~>ldoRre5 zYCNGUbM6h|RXm$R7(oKfyAciJ0yaTRWdpf{f zT~fhlfT6HO2O4bM%6BQ8%Z<5NsVg-^XNBn256Yt-t~D$-yurc-KOBg?h%)haPxXVu z^s?jsJSW7+(*Q0rzgb`fL+K+}-qfeP)p zM&`~ttFue~ZB1>bJ9t|B=VVj(C@J_fY_YXny4dlV-j)iwbLy1Q4<`yr%7U|HDvY`x zZY9;OlsK-f^z_%AQXCn83v1~1r$l&i6DTwf@53Yce$V&^PT6#>8j|B>t8G!OX`Y|* z7`V0XM$jQ=sRxiX^!~xZ_38z|zvQJD$&~~pselWrd>DZ>9tH!gr0vM*U0PisUoXJNYgg4FmjY4eXo2FjroDw+vjAo(57+( zu)_V6x*3Qz&OCSgs}w?IN@Es%lD-5FvKC5PDF%KC*~xKJLi$fNw28G4T#F0v_gCv5 zEUI!)C>U`u#sG9=nDb-#hymtsfe*|eR9_9P5TjvI+nhfKj~)Na&(I01b&-T9*fTkS zHf2pduaOYB01BuY@o>A>@Gu9s%n&$B>9dunQU2UuhJPW#mI~)IOU_1hl@;^jtGscr zLJA&5hBtrzp&-$e{EMmX*}@fGE9{S6obKfzNiBu_G5bpLxbHdEXZb2%sN(Vi&wRf7 zV#LtXPba5 zFXWeL8l%e&41Mq1Ipl3U=)3r-5f?pEJrGpNZ7{?{KKp$xL5X_=MDhzu!UfFr9&um< zO2UCn{2fe4HF;uTB9ruG{XHy5Ip`JW=H+@xx}AY|2T}}~oI+~shbKqc#4X#$I3Tw^ zSXKMgEIZ6$TOiv)HayZoR*E1fy1DFnBg$+wE4aj~HGY#?tezM3S&(f|$kJuOgSqIY zo{u+iXP^bC6oj{E5k;p^2g-1op@Jog_Q#I6f`UzQZISaufM!}WvFo0^cdKp?k+alo zVYB>G6!l;Ej8=oog*CBos))W>M!WtZV9RTL#Y5`MUcFYK`7JyV_5qEz}7)h+Wols}sk5^C*fX4z*L4E3E)-0%IS zmSSZ1-f0K1Rq(sWB={v78m5sUv!_)+<2=dt2c1&NJq63Y*x{eR10ony8FN&E1hZZ@ z(kLi=$|S#r(sr#;O{~z#vXJ$(lowfK`Rx_(VLCa&q9vIcUzG#Mg9IZ>p6@M{k)?_w zUEr;A2_LH-c$}AvuW_@@y`DU`8r$3-KWCQNH1M=(BZr6_fY$zjV^`jsz)22uWdS`hi;G{_~#y{|Uw<1z*l%~2y`SxsnrV)!+B-f9b3JtVE**nK> z18!sw<2Oe2R=f=O3(PmhE9z}DMP5u4W-i*byioKdxjLXVCbv+r%LFg1FAU-SNS(Q& zhRE2%N?4ZT{Z-&6mj-gNwd#jsN;vCFX)Q(&!$gEH&=y~uOdh}=1*lj5v!Bzb<@w1( zaf>5EV^T$4JHL#l;6+Q)mIJl2z-TC^gCo)rS2RrLTQP`<&uNYmA(wUIog3mpEi7jF z+muzRf}dRVN!C={Wt{F3%5F8+gr-+k_2eMZufEn7-+JRYn1HlLF3QbXG$)-GCb4ET z)lZ{(H!DShsQJD}E49vF1$2TI2QnYW&fgYuY_OFcRXqB0ynY^h%(*ttl<+a=GC+=T z0)hAOH>h-Ww^(Z3T#g5(yp8_TOf$0+8um=;!O6Y05v~SRBaG>aZ@w^PrUE`L@{dh) z)7hTtO7>COS`bswXUw-JeNhSdINF#&WhgQ3CQElXBn2tHGs<>~6B{~IxvUvzZA!j@_qOEkvZcu-GWpD8%F5M1AP0AP%MFX&cdGAyTqSkID zQPD-jPu=bEh*bWFp7SF$gX%7_9tRFMLy+s&{8509NQ0RM? znW@P=npx0Q!mP!|8XX>|OLo=HXDo8l3|)?x;=|sI5P)QGkB|uG$LD_SWU8R!J0$2~aajJ?BAKMXc!|J3m2K0B zuDt#CD?z&@j&ALL*K3hyp0(&RSNTJt?8q(Ws;nLsV9fYONTEJ^EJZ6tj8O5Hq0OK0C0V)t zRJA%tc0f4&X-(g@V}cPBPnjale2uhm%TMy~)Plg==nf|Q%;l{+Cmh04%AC8#bX#gt z4q=aE6GZ|1J=31xEO?0oZmDY+&GNi|09Sye%BlD{KcJz|Tln>tHTgVv}9v0x@yM_VIoLl*1Xaz58V=S)08V#Tr&;ihEgmDV7h_+Z2Re>DQmyKuabVhG7Ro0?{wV5 zue{F`3m73gEfSqny?ea%hkhb8Y?nHS7{!%5cK_5Px3 zzv4-52nfEjR{FOAG|wq-y`)&}TzonZc9IuX<5ZVpN#HowwT&NJ)&6A~cG8vqQ7=t2 z%U#<4ebZikhu>6H()vkt%7k;VSe3d!7E-dvpj1Q#=^{wfPxllGV#{> zsESFeZhPhlj*mD*{gfdZf!{Ht_Yx9o8=lsdWDV7h>lT|B@?y}C&)Y0H|KaDrhx`ms ze=1e-D)OPa4Ix(G1yL2YE~F=E8JriNXT889+L#qDKuRi^0+9RL~Z+qRoytNt`}Lm;|lZ$+^o?uDhfD0S(?_}0HbZ} zImQ<4E@d?1AI7ov*9=Ub*EH>w6rWOKch<*!d+`l!Ee}FuTz@}ID6?F*WjIM+f6&n5 zT0BtRU^OU~K2vlwn_t+}>>kz~wd4B!&3HljZ~5k-my#Dw8qN1 z^*?%};GYpQAndetGsY$oLn|2ip)%|nx^(gM?;nFPnU=G}ag-U2evZSab@P}_P=~DF z){~y~wFMp8$^oyh%yn{WKD@;Z;Q z(S3PsGNFYzMpDE+I_&a~E0-4vu5QJ`q5j<^1Kb7mD>mfGdp{q8HXq$GYT>of1|q(4 z&T39#d3wm_sLL5`HQv;2#luNiF-_5vB0O0PGXuz8<#tSgO1@s=t!iiGhvE+@q`<&x zFLJc*89s`RoRT*B7VthOk5SVS+095NhWZ;!MD4Bo`qpbzt=l0SBFd8%ZW}KEvt-nz zS0E^-dB7V9ZMpX6XCV>*kL^$UTuRfA;>s0U>5j^{fC+>Cw{f*zu)({Vo=HmTfGI^Y zI&?4}I>na$$7IeZhd~UwW!HrYWRS-xgKNnlIq=A=Ugr@cz!v;Hsp-Z{-<26-<$1EEJToxM}jFH zMmKHi_Nq8*9lP`QsHo|1_+$fgekFhIaK?UhJ;v62Mu={5$X4->P3RFQXw~K8r?`G4 z`F$3^lbnHvJ;Cr%i7jI&PV&C1e1o*9ZF$gNaRbi_(lbX;AcF}w!~j& z6zyh3CT(wmzy8o%RuZfs=$JA|Tjn=LBuEK@NvRYzvqKIaEAiC6(2)8{98at~!Qh`H zK?>oU)nHmEO$N;+g%V{h8jaGe-tP1*{*$1fp!Rus#U&?4(|jzg9J?VV0X^uTn06w; z?=oU=go0HV7q?Oodo@1i0OCMBtl$r%|h<&a$MZ9&DC&PPO&~LGv!w6nj+dV56SuEpFUlHdb7_{ z`NF;V0OpxzD3QA3^<+v--#>@iU)w^|`EBn^RMov7kQ-!~gNu(N7ni*|D6w3JH?&{Z zqjU46m|Z>NzS-2x2AnZD>ekNLGz_+cB7T^juGl5;Trt0nZyAHtW?wFzPP z_S^6%J2$*L*s4iRt9{1*JwvYGLsrM@x(&Z{+0~yP@rwFJsNxpnPexqRc5ynZQXMRJ zUf_8?W%(Hu)(n{m(#NBdf!&#wdWPjs#!^A<`}Y&0o5%?-L!&S0hd}m`$S<}e5 z2)2b}^Za6GtoI2eXCw#Ff@tX4KL7v*V0xJ&2@;EXl|{)4xn}4iOad7Jy^#!OP--*- zd}dlV3P;aPJ24xFdObK$)#i|D&FV5dn7~g@$JhM0uNoy_u-|ZI{Z1b-D|wuM>~^rX z>)h`m4d_46!5UK4u1#>}(2>*269gLC#HV+DHy^1toNoB~tsF*Kgcf`R!cc11xYO_P z&#pn>O>f4i?eGvxbN#)p_VgbGpxDq^@k9G)#M9)CxYef*Xk;^D10`^> zA-K5tv*OO4r%lMZf!;c~Kgs9J=PPyt4_oF!wtSA4=Q!7jZO_Wad)zA%8s`@FOFxD^ zS>r%HjeHR|y90o|<(U;~y%gMCoL-UgoU<(d=vUy(UzvrrFV9dkT39vo!8S#>TClSQ5Pp3uvByv*u7TRcn!%D7Fl9Dscqy8bjw>wQ@(&yniTU)gUxsb8+` zb377Z)ZO0JTibSt`XiE%6rh1zZL-u@bhgc*_A3_hg&ry<+B-m3v}Ri@=+Ks)=JfIP z;mY-jI#wVi9ga0R-hn%2tVLf8nm;su%pK?%T(9mO{!5f-GaIO;E#Eykj5+H&JBjlj zI}eTgK_Xz;_vR(B1?p4NkW};$qz90kE`JL@nj-pO_TiOea}m$NAC%x=Vt7?NQ)3NH z--y}gvUxC_J$*nG=S3bI(&sSb-a|F^n}V}a1)@ydNbN6qaJN_M*Jb%SJT@}}+H#>_g>9<(sIQZS^sS$%MF(nTj#>;;xtv1pgy zs^QPq^VyX3{r3B-A)SqzZwV4%3Die(UjiS3@!V$Q*N@4RVt)S)>mu7yQ8H=ibhKxp zU$5hjkDK#jx;}K@+LJ(RT08NUyMCpD@r>cc%{jX$@0nRoCMzftxzbTq%wZUP89B?= znSA~;AV;h92rpgdL`%D02iSp+Jfs-D-H`CQ08O0G+vpS(iWuJP*b!!+>+opytH-~3 zQak)QV!C-IX`hsk;TU#kJ2g`bO&RA~vxaTlcnOG?jUiI}hXEKuxd7@PH{5!_zqj5J3a!s1bJ~ zs>?sb*}3tvxh#B*sJ@Ib9rWHWGfi_jNgf*0rhp}|v@P_}Fh?=-t{vNUV`Qkn({Wfo6aas&n?-n(|%mgc>o3z2;^;t5_^$;C1nW)HcHzPG?ZY9Q2Ct=PSR5nP=;- zB;O9R#rZzgN>Ek49HxEzL#z&J5V^PKmnr-5F9~ley%GyrWgo!?Q}-r^>fHWQ^77ca(PBo_j*!R{n8o(~~y0)A2<9 zfD3K%d3tD3b+99!+B(OMn{dnJAYI~(Fd5%Z{#tX#gcreL#D-v5fWUCri87w`)u>T8 z`=>l}ZJ}<%$hVv7(Us$ z)2Y+QhTC6!Q?s6I(@gr$G9C|X5;Z4x4JOx=TGnZ$|0K|E>`1?#3(pQV&zz8VNW}9$ z-~Wzk+jZjF>pf$XW`4!W7j=!aQ*Tk7Y>NDm8ely7PA`d~(SALb(UR~)5S#=$)mYAJ z$zRIn9i+g=Nn5tD4OJB43iMLKsTJ@$U5o<72oZ)p#9OloO9hc)q2dqs4TSWxm8MQf zTdD__%Z(J}MCfU`vE$GAtv?S-B_$}#s|P&XUJO2WF3v*fq=-BP0^A5b< zGzpT*TOR=A667M;mJyNLGTR(lUj<0-+)6vWlI>n9is)O>FVIr9VuZc0I&3wpVsdm- zmJ*$B3;5a>lGaX7G-{=c*?(fpMg)*&+-Q5uWB zbZ=a|+)L`h%PKZa$he5-X4W)x; zor2u|=4?<6thhC}wo2Jj18<8!Y^pEDUkFm8j=#(BHUN_oy%SVs_46e^o9)-Q8auzoW$V&x96Vkg&zOm)5L0xX^h|y_ zPVx7y*z`*W?A{ZUTfGn3iHxG+JM{~5p)vK){c*<_5JsGxM*5JE@Vn)FX=U+*J;wv% zO@_xSPGgoFrWXI6eHS|^{@fJQImIR2H+}rYeDmP51n4JAv?1t5(zQ1GySN#-b}FE? z5YLHsA`5UeoE-~aWVE)4PTfwUi3%_qB487e_s@SxU$VRVe%Jr%qJfH_#|-SFG>rz# z<;V#<-{G&LQMMQKg(Gydb)TQfb+Y`_Z>rRw0v>Y4{b19Kewt{u`APkZ<&M5;&4Yw1 zF-L}3H$?VI9X>!($avi1#@zyI=p1-Y1L(I(@CB!H5bBdQeoI2<1j~A{VRZLytj&*i zRfIQ)ttgio#=mQ&%0{y@kw7t0QX?M&9H)1Ft?{Kv;5YH+4?yk|Jh3Qpc7FK$5S{jo z=Npt+ZfMRtJW_F~815ouaLD;XB72sUESCMaTE$|i#s_v80s3&i6R)G`Ruqv)o=Sy6 z0n0ju`IgbRf*s22Z7H49ZHa26j7$nyIrXVqcr?Fp#H|%mGyaoJUp`k4w3A*sl zb#*e4f?SeZUw_#xH}=U5YbxU z-i5AWm3V&xGZrJL*_mS#* zwy}W+PN zlc81L&Nj~vO82*v8e5Bw>*x`DKl+yeKUfvKGbTJAmv+Z7uqjNuQJjiMe5HyLVN&_p zS>#j=y*=dI&bj`)`HHy^yoVU+e>K=ip9N!zP91(P5(!QU0lL-}D&1sVo+qOkK4>`) zY1rwPp958xHLcwD>$^32C$L*^s%Y~fl~r(FW;)m#NB(zF@j&)?PU&p!Jwaevs5tYX z%x|>X>-*EE8~*ENVS8qCGEZzySIj1ZrgpM{rZLR6x8@LGcQsNyZHDMXvgNEMK2O?# zxgz3e6sA^XXF29^Bn9s~8q$Hk=pVqu>O{53Wg`we(EG)w%n6FP*+|P3H-cUsA}J0c~Jxzc+-QeyeYE zRCOSSN!nAyrz@oZuNDi&4Wx|k*(S%O$5WfdaDo0zH(WQ^k`nuUE9gMMjxo0HiC8dp zb@}KzZ>ACGDAgKt{RYr?b@x!`>v0a+Q(c^s(TEwt=zIq@o?ILrLBttwA)x2$x7Z9x zo|J4OA2ZQ>1J9+SoZLPaJpL$S^QHMGhZyU?5!d6ju;Zs$O##@`4O)-gLNZ-k-O5ky zpVF3nw)*riZkSsRg(mwtnf!cn2G|KK4?S6+<|C#TH4N7icchB;eV926EU zr%`zI)-3h2OH{z>knY+?-!C;R<;SUFrUO?$MCWQeJ;zfhnV*XITl($e8`s@Ds1>n0 zN2?#}NYXvsrwAiZv?>vcB%*I(H=hb*i=)nX&ImE3j~@Oy{#mTZqN)1%QYIxN;Opk& zHqWzHEErGAR;L-KNrYUhAy?K7E@cx^jx|GXZ-9B6zGYGyHgm-EZ0EUO0bzeRMHW?WCa;Y3Od+ewk6?fK$J zi%_WVrr6@}e7lVj-#zQtdJKJK&qiq5jq-tE=5D}=CLzfB3R-x9D(~qYj;tPa+Fd#w zwIm98-KLe4QgE3Y9ajSs2&HhqCh3KCcq*AevmSO@V&|S@ZZKeNHUyuiHo;LR?i(6| zOdHGg{JhP@s6#vf`|vLG8adfah3#`lnfqc##vh9HMj3d``H_Ik_;`>0r%@~KN_YwS zPF0Sk$|C!@P@Ts_5>W@Xo^nhP04>fL>^B4%CR3_WDkC0V8~i5wEQ(jSopWQhDBK%i zmwKow%kc({FU(vkRr!7v*Fua-6A^zRxJkhPE29uJ^(q8&E8LJ50x6x$(65JV2HE1S z)1IC%QZO%tTFH^oT4d$^c?jZPXKvKt02GA| zmmosx9{+kjhMk_A1$_5g^OWZ295Qy*EER0Ok`r9$$@4%;pbr3;%e0@L83iIt)<%JWG5W z{$9nx8#ujKv|&z8m^KF8@OFy1SHi@0Qh5iX$^giufNH=E5CNB3n5F{Fp~vqnTC}>S z@#&66er?ajZ+k>V^n^60^0M;4OfwJZ&0jA-A+`?SeD7YSAlY0MAMwe91I3nkL49$= zO``UALoi1{y=#|DOdIvSXnkAt_(F%LS??#?L!Rl1*KuqUuhgl7?HA5c5-5xZ6h73@ zzHZ=B>Hym?dj(Ihr3g>3)tS7~KcLTLA8hu7%AmT9p-bR-&9RxCuDA8fBhL)vxo$j1 zZBf!I1{j4CZrOKU!4w2*3y%@Rd<*$L!Zt7GboCGk@bhi>{z2IJ!OO@1;#sszql(2^ zdBX-3^;cn=t!FDk4>>$;Z6Sg-B}{+DuUd_IpiWnpBhGpQZhpc(cQ-J<*{rlM5C!cv(1qQC`ms;Cvhxkn5zY}1=FC14lDy$ zG~`mK=>oFKLQzfQRqp4!35R`Q$7}e0HtRg0NN=dQ^o)d>pGtS7g6{Dzi^?C@1&h;0FHwYz3b+ejVT6hsLIh^*Wvqi&HQI5(Ymqob?a+? zO;|VJZ%{au1Fk1r_07yI>A0E z?j#mJ@@aA{ENuOFKegq+&#dSq;#R*5OSKJG@xpM!P`4RT>sUzf=!P34XFsbvX?VkF zb624JI|@$Q#-pyRNyqrmIR}z5A-SOCdFw{%(90GZ+xM?qj9T8mZnj}3KB-{Wt}hVW z5}W7}4o>D-&4jTrdFnES;vs6w<8^B6sIw}Q%hxZNgV^OCSkDT#D~yCXk~Z*5D=+W9 z=aV!PcS2_s_nzN+RR@Cv(@H=3D-(~6yQN1;8AJAFaJ*a6>Z7}U8NiM)F24DLZ@7T*Qu8x_Boxf0190D`2+rAjiHYa8`Rfj!i);4r`a8?9 zUzgJ|S>CX#8?@d7-{NG7qs0>5#2lHP7LC{a=f40NjN~yXKZO)&IZF@1bE1q(YPH!x z)GDc=+w4Gf+qtjQ>r7bEXFe_M#)&r|g7dR6Xc|+Eq14HsCWXQ7CYm2~E7u<1gdGXe z>i{n`JIv4v4$X{RttTx+L35nb?8P*S`;`gf9%p|&$k+XUduk>hr-m-4zF%%R`t@-_ ze)_PVz4Vct^0;H>&r0=u(88mpKkFIk3AT|{Z0Xq$eOZr@i`gb(R zc;w&6a<-5@2StQZxM!m%F8!c%rNu-_usbs#?*cobIZp?7)468u7?5&#fhn_3;MDH; zf<|!y@;Gi8A|`Q%r!>yZL-$wpx|l3*tb6UIE$y#r(pr-+^dsy zJqvQEGgal73Uy4w6^6D%3yHQQTZ)5YM%m5$-q}OF&xWi4{+6?ph(b*u6iRmqyV-UN zA8->=kyB}4oi^PhR@y&<424@6oe2sPgyW|xfegP9jVp|8+UED|2-B70i9Y%~R9Yt* z>09Gj6rE3_H%e?orp*9-HvKE)YTJl1%Fr9RJqBuMTR@J&{zty?u+!c?Y2R5s`DNmq z>jcetcNVp(?#llK`9KE0s%Hs6M-fi-N!=U`4thk3XiMoLE{&(ZvF^=Gm9o)Z*-(fO zLvbaH@}m&&Fq@`*3lH&WyCuBrz(=2J#=+;^PD8iT)oNca!-p2E9a<78$LlxgC_`^= zul?W$Kd`_0tG|kI=(VqXt^M*Z|8gW_`HtlTDFE<@4}tymZ~u10iwEqv`2FAi{W#gf zvL%y>-}}AatEQ>NAJH-1{N`$Wyp?*Kl!uOuz@qaqNYH_KS3@7pV$#tyNC2N*w9HB_ zzx?u;t!U|?)y_kD4cyQRYz8j|IV=nO#$qTVvOMi0Kf@n6O=_`D{1fVij;< zc*K!gnmOs3# z1rNQ5t}IN17as3S@N)S52)=Q0+u;dz{=Av?(4$-I-p8J`x14mmt#@9dh<1CMk%>GN zfb!s$v?0Xt}Vv!@5Aa zW`uK!quh}x1GF*-XtN@hf3W5cAHmV648#KypC@&|1}^d1q5dO_}wR&SCqtMjl@oVqHP#^UHf85ka2D z1B$esk|03z0XLc-diA>~$msk9w`dU^ zjWDM)QVtXX6n6*A1<%JqK?3Hu#%y(8og=cyRm%js(2m@VgwQIK~+sIq)~?|4KLB{Hy>R zV>L=crIRxV&|qo!>J$h#-f@HvhXWpP!9_=9R?dbRZux50@^!wV8C^zYOs4tv;^9~Fi^?kxfm;NRUr~JGFqgEQ? z#=FZ3z%f+A3|{Iibkwq=5!0}n?M&tT<=?#?u6~7ft_sGy2+l*5S=u9O z{7Bcq-Ojrm*5jMphzp!vSN`I{6%#x|s1KwL&!t@mM{|>T48aIW1g0wnY>Z>r>*}5n zFv9##JBTtA12FDLp!7l^? ze=HNY1>=b0o9YU~20u%==*(I6!EgNBe&xE0{aD}$@x28UEa+giivYyBLmg`a9mnfv z2ka|hgB?PUhnXZo5nQ!|Ulebc{)o@`)K9d9A;2!@R~ZD7ae|qD`IE=#hC-A8e$x-> z*YJX|M6mFw%ZYv(<);6*J$Q7Yj{yf=Q5X(~lHaNTu9dzPKWn9f=6d17QJ!N7Mpgih zf!d-2(n%PcXymnoKWPZZ=NXDqeg;N_8?%0dJ`XscWvEU#5X5vweP{>ak89QBUymLH zGy^{~f4)MInFuRJc+T@8nS%^QZ!TVT~Z_unT{Leazg! zMW74si5%~nGObV$JYSgj$?W*!*rUIPBILhAdGHN@n3e(d$2ZrHm*JEje@xs-IY5oW;%N|XocmHH(PI;wndc!Up! z117L|QuxCGJEalWvJe29vay*E9#B5^59kg8ECej^kp=~d2YyWSrK#!DyKVK_r|rM| z)*sj>|LJ3)EF5x$G{w~}?a&9#>_(phSpgV*p0|XN0f>Ps21FN>jXInZ0Mhb~-za@8 zLX?;M?4af0vJ#HqaSu1NAh0#S)`Bru7=scpz`>d=NBmgp2M_QSKNy^P@o5>bT%g3T z(XY|PLG@q}E*|0wHa?0GGbg?ifB>oPg>d19@Y}p)o7rt^?4-;4>}fyx-V>Yk81TJb zVg()Gn!)p~`gNg=V09kBix4>oiir2HgM01JE*GfN=7e=%qT`A+bWhK$-HCKqfb+CR zD$0a|3qTIXf%2>0Ioz@$OG!1RY>l<)w>h6|6l6o~y396|G|l&t6EY4+DQ-)C>Q_Ch=L zB>!Tde|3>K)fZ)=o@lFTv-k)S0{R zT_}?&Kfs^jt3^*a44S}^-RN^5D*&U<^QJMP86yzsPzcLkxcMcL`muvq+6n;K6-I zeD)lI6=7b?Vnf`xP({#^FBE}3-#E3)uNY=$eUA&)*kSLGD$0PX3;u(4gR;gz4N?BM zAT?&_1(6Bdy1IHfG*z^y|Yyfl#MB{Pwhb!LVzJ zAM3W89G3Rmwbe9C}T0a(D7QiFoj zjuUKVDOg1Ax#yl(??-Xb;F-b(FSFz)RzL0NcKZ#0zC+oCWe7He8-iTUd0#9Act8YG z7YJlBRGh!ZFBhs&RVV>omarsv*r5YHA7+N_!WMyB2w2Zcx~dTO-<~-(4F>EA9#Dv; z(kTI=3P;+ajTF3a0gr;V#?=wO-UAQv2+IK%$+_J{3t*Is1v=FISZQ!JAO&>DZJ z8`TwMq5gPq#3AHa3c%u(S_jxnh|)k?l0tyuKzjyWstgD%@aj2m63+u2jndYGF&yF0 z4*sHEi{%rP0l7JIX8Y@Lw({{c?(erZoW0miJZV9EkAuAxlpo>Eashl{3}LJgZ{dqv ztum61u?{N)j^DM?35NV;(ve247R+W~DF1l%vjT9G)hq*p%;2rg;?>Ag4ScZT(Fqs? zz=a8P&6+h~iq{Mv?IarL27_6S7Y=3$Yb$H!9KmCkGI1J!%lVr9ox*2HfieJ}HC}iL zUTAN}0UO@HTg;@)h%otMMn*v3#CP6{^j&@eV81=({512O=dKJO3J};h?E@g~1O%!3 zjwIqdmITF935l?HC){6D7SGE%dYIr{3Fz~ybdOBj;lkGiFPraNz{b9S$m=*NaJzhf zKH|cx0LCC$8FVYw1Afc;5LrPQCi@~S0v`(l!ke-Oo*v(rh!Vmh1;@)3^kFe@7im3x zEFpNB5DsBYs0a!8;E-eEpwr(kI*RZBLK<+ozs-gHd2_nm0{$&?Ll16`m z=kTH8(z+vvwuk^EoOXjEThpLnU?wel)WIZk;Luixc#o$xy=07p?Rr>4nPg+Ilh1JkJKXoEQ#J&q2GAmbJ- zgeB&3W;VRzfTs=ud|<2NM1n#7kRBWimf&E$Rcq$pWky9=o1wWDsuaF1CjvfYK-mzz z@@v7U7be)$3l;@76QZ;*SXI~iOPR^jnq)}RE zLpdU!w}n0z{%W70KvdI#pny%#0avVC#G1{VWdHV~6?WA{XT}#7u@2N~qk>s04tO}? z6i;0iyeRq>vs;!7IG&10T0J--cwu=F-CtxpUN#7 zC;6`boxAqfXTSJ$v!&;|0pPHmar$XNKYc>{pg+JT`h-qg(D&dGRx@oVgg1-@ILa?R zpa3jhyg0%MOX-Nf${)pn3*OVmNvrTQE#nvXiQ{3M;Yrg%EA%n`q~R%?@&29_fTN&> zX?!$T8n%RZJ7j4<(cn@Kh8jB^LLb3llh4H$UmP6@!JN(p1PA!(AXcxVk{%p5W&m_> z*83S4b$Jlw6}|YRSDwf?$RDLG!)Hw%+6A9ulEIf7qlTz0awMRv@ZBPEb0V-&e21JE6%-wCyv+S2ILiKo+$46Cw-L1 zmC=f&B6v~}ehP~;l{QkKal%nxfnUYz)+x&4@Mw2@Es!iC9d+e}eFyFA6Q|i{e(}>N?Iv4x+TyCf zO!)yEgegLvv4l8;G<_eQaKeBz>2d+e6#bXJijPbAoAN_+tz5a%xc{Huy}iBmM}PE3 z!DH1YI8aVL^q~)hC6H4;?B)2a-})`P_S$QM29_H?{_&3oP1Mn!{K=o#(xpo)ZA@xy zihcaPX9Zx4>y`#j2cQABGwT=m>L6){R+!S@)3Isvbasxft#wVE?{Ra6osL5u;>r%V z8X(GX<==t<9^JZE{$2p^NQX<`%dl$wR=F5R>D&woe8I2R2L#H1fQFZ?!UdOLqZ~1a z3pN7@x2kEdtVI`bQ~a`&t=qQSSH69lC%G_0*+CCyr+&Ct20)6*-?7^i_(M!0}r6BAkKY$bKNf&06BG@0ot5x4h!D7sHdEs zGU)TLE)U}(BCG(RBpf(scii{5oqtAer1PY8UGS-HZ2gEdA7!MS$+72+~Qd|lfDPMSnli&2+?K^hbKY!_)W=pQI^_#cZyi2aI z6BiV-ecGJTi>|UlpbeHNNR*!$HwpoJbXZPSy7baZ1B{d-Xb^n8lmG2+f4lwN z-~FAv@|CZ&4}bW>_7DH?5B8^j`lpds>G;839TWvD3ruJPi@3k}o4<)V_~=JJYPZ~S zi@obz@3QZG?|Y$yP!F}nKk|)7mlc39tW`RU8s|`izluO3r}5H=(+012ItT*}I5`(j zr%X-$V`wr~(nC&s5^GXV05U-utz7kl-SovT+OjLIv8}!rpg5!MsVJYr_CN8j zn$N>ymLBmEj3nb7hrq_u?+C02V+4l__AW0cN(Ms`CJZF9O&7zBp%JK<^I07`?KaegLu-69VfWF9{6|4lzj8kQRb}M=zD#g_jkxe8P9^n8tv-$7e@l*Ti zXZ}CC^wn3|y^s307hZLd&G&wG*vqN3Dfn_&Ss{>u1blb}f3U8xf8Zk@`AB@^hne_w z*Ij45y}eallAeCWkzPIuhVaL|``z!h>#x7w-uvG7Mjr*Agg5+0frqD99w`rwKK<|i z{_n<@197i^{p;;BpZSbE^2j4$0gb>8`jZuaF|2Gl3=Ov2aoQybI6QQi)M-$nGoXDV zm^t>ut#zbFaH`{^`O5F4p_6IANaGcTDUZPolRtwRCV3QsdTZvu0Iu?uPeMI<;4jZq$>M&zIsv{b9~Uy!XUI;`K-fi$hgb4OLaYtI zs5}5Dxv&d(+5vQuCTQ^3Qe0RT2&}pS_5`Nt=pKrzGJ!pSCx=+$}F|5h&>E$0ximDMyvgp}hcC zu<3*J6@@Vyra!QM0EhLB*)Ycr2fMEyx`8M9$uB&-55)mTaB1J*p{;M;veiEQxv$#1 zD=xGH`}dixf5EQ0_+0Du{Vhl8C;(+TQX0XLDoEh!?d=WA9s6q#>TKM-`R1EruMKf! z$9@Z}2{_;(*ja91yg^BqBMyrlAN+xaZ+zn$@$i{+_>Hww3IXw?1x6>jtN@H*t<(6@ zPBIiyX!O8BQ)b|iqeG(&6UU&#?&Wsw*(X16$}?f`2sZ+hFX5r7b40G!J2A9imTx3H zfej2?`W^xPNAMY#SWCoufoV_2@Gz&Q@CA$flpmP&oYJDWNI7DF&_{?;*tKZuhwBQ2 zo3PhEztKMSi#V(pT>f)_z!?+mJ9n?NS6p^hSP05>Uz>g~ z48NnyYIopOzu{eb0r};_8o!|MLfU2^k zDAI8aID#c#<$!*c1>hNRto^_9o$riE17ir|QR={zjvr~{rarUFNfYzbSE# zD*&gSZKs~HDDbrkzaDNx!70S7afyY9LxIwWRut@qQZn(0`;6}+^f2%`ft0N#7=y|Lbp0;6MigTW^r7`2>W zlZNF423q*T>|cXSnRdae1(Pz6@75pu#L@NIZao>U>IhI{Nw-jxSm z@1=ABG>NPJR=GVc;QDXY*M&R96?ymroC~ z86Sa28)u1*i>%nB%UuKrJQP*hD`s=rET(H4))#Bao&LGs zAAIXy?dskI_WafzcGwlLv(H>;^X4Asd`&sp2=d@@;6?!gKT9My=qEgN`rX^xYoGk& zC&N-lJo^XW6MYs%fMdEijem?CIM(<102E6wSY1#gm=NKxSbpkLp9*VY1a#P+YylX< z3a4T7#NalJbO4i%gejf|j>(@HGo1}w8YEgpNj^HO1`Gxr)-Rd)FIloAj?y5k8OWN& zk(Z`RQBj z#0!_%hTZ$E+Xqx1hAUaf>+hbnY9>#G`ha(`cv_AER&s|#7(QUd!i5EtoK(PMXV+(eu$34L9d7T#Xz^+x7yxx-6b|{s(;DS_hcxG($;DhRUD)A z0xfA9PI*atq8|~)L;K{oA%c!RjKZz^2ef~Hc&rFm322MJ1_uxD@a0lmgb|OM;tIqj zU;F>NfBur$8LzOt&b#A!+&cEiW_$bpb*0UA9MCmf4jSoil!-Fdb1DnBGv0XPjd7&! zEpK^CoC5gT*S;3nZoBO^d+S@@8s%hD;a~stUx&35`0shodt#TraBvFXh8u3MKl`&k zv#)&REAg;510I!mQ;(+c5}p-+@lums4jM2sas(Tf;xJG!V@A7f)*N4p9qJTxLuGZ66(K4vqR`!W0Tks#IwTZIoC4Q`Z6gCIO&w!kcE$&w`l zz7PohT26xlzThaG;_iLGXBAuTvKeQ*!M5z&R|s#Xq3XDNCv|CDBok~OhZ)TcQPc-;Sr*8 zLsxO~faEFvz~wj?SUtRIeV`mH?AM|dxZV!pONRsYEd)j|1#UnTL=-+(CG0$S*p|8i z@Z|^B+wVWK!7g z&!9Lf4aW{?@9JapU-3bhi{F0p^5jW&*U#^_pWOTwG+pM`rt6j_2*36|vaQv-c2WCsdurEwoPmA2Invp+tt5Kp=)&Lp3Q zkp|j|_M=~fNA7-Lq@vuF`r>3jO>=VvUu-;d_yO0IBGB(Sbl4X3c=-?a*-f|KYm*P} zwyD!*`1$F}DNJ1s-KcW3}Ndz1!)Nm|N|&$y0^I2fDY%bp+4&gjl2X2I4 zosy1=)dJ;$nG4DVWkW!>1D`z5$Ad+I0Z&&R0~=)n9yJSFbOMh&z{8*9JK2By(|hbo zpZO0r|1Yw=E{F=Tm-4;f;Q&5RFD}4k68DPo=rj+hOTSA$i#!#mEE!bw3>Sn>XOX`K z?#SZ6DtY3%-~q`}M1u3uwgfMrVSqsQu&@ZEicBS4@Fgr75HJ8?|8Gt$3@CR${wR~< z0~|*HxeSPZP|M%_E}(4sB>VAGTkY+yneHDx-0zz^pYnE8SSC;|nK9FrX%o;-J>o}M z9L`Yti-TzBgsWXVX+B`33IW2NzR5#)x?I317t+)2wOoJ?yq=z^cIQ3!*{xr?$e0;vvD<6suWn;X+DGl#wI{j99 zerx>Xz0+J)0LHk+)u>w~gT{_y5YxGMIyQoz#?M0jW7fhEza>kSL>U;c+bsidp`Q+pH36#vOBGysjB8bK5gpJBTzC%* zOpjmx_AL(4KNagjX(tK*<#Oc!v7|DBvwswIAr?ylNNg8hPOIDi;nyD=(F5X?q3Tx> zv)Rw-3!#ohAYFP$@D$m|<21Sz;U!U3xFm5pBf&s|B6ragtEnk3yYEo~4napXh8KQS zLOk67q`@;6{$5|eD3&{l%z>Jq;%EG&4gyvM4%-EH9y(%YOq*msTD#ppjMQW2oq4+7 zXTLj?NZK!ZVpzJOZE!IS0+97{1SestTl}D*f{(c2bkMH$G*q}LZS;EtDq%b*(#-xj zW=C6X~n!W~r zp%jlnidi|cWOmXbM44HzA+K4pc_aLkgN}>v$85=tX7&?s9E!mTSMJ!2;MIy_Ab>{* zY1aDSQ#*SJfDKQf8NM)sgwKyUU%IDGvmJY&cb`pmd`H~0&Js5c!76!IgjD`=d!NIL z$nXcmCO-xfJjADBQCWRPd;*zL0}eUcBLwO!aThb8fgivMH6Vs6m?CdK~wY7e; zqIK=JS-vb_Pwudb-m%<{b45uGo}_pb-i{9}#+ae@^LB9OJict4nQ&}zfDWS>xi;ak zmaR{gQW^~~?Us!EI0i-C)y=G&YdTqG&I+&u2*?^gLaW z4`rZ-_Ef% zivo@~Xm5vZ(ZGNL9~jgzaWZ)GEduz`u3`Z_&<0<4I9|DV^X5=05F+3C{#PCS!*U+)>>A7X|7U6hR--5>q&f+)U?@Y=YeD+Q2g32k`9`FWA%TH`#fo`w0@a9LTAi@teA( z{~(0eRUflr-y?uUiaMqZ(HDrvihx2uAEX`7*3{PoyP38NejZ@pkEToMW0NKHuXyw^ z+rHusJJWamKfLBSo8i0tso#Bm4A4$q=;fXrU)E#!faMZdO6XhgM*LK{C{K!~QTf0R zPl1!&=yRY}Pal0MGRB4+pkvXY5NsUv!5rV)+Z)H%h(oZc!>L0Oj#J)-2r7}JMNl#b zYO`E=j8Fp>10Q(sg|}Jwz#uQT?yX+EI&Sqth~9J0Jwp=yDldaGvp9D8!xyX;C>so3 z?G7@O6S(jNC5SR`X%GY8wr$((dq2L%i|<#u9!Rbjt?=I&RALSFQVL7b4447GBwYzq z!=H|$Dq9MC@!bJz=}x~lj3PI<^#|Cb{XInRrD^7ts7SZ6vG^QWhvz zpe<+sM6@jw01r9>5IJb8^2dd^UrK|b5cTch)J>G9l0JTWd~Evj8SH)jZ2%MiaHw3+ zleSxgkgQtJfuK)s<ubkkROEr zexV>qF%l0{P8BWXIbr-}GJ^uptP{*~9j5t}Mq$*cTse;LG+yaC#m@@BQC`ywCJZn% zTxOoEXLH1*eC}Qin~uRd`Sd& zKr#8#@lh@qKy+F`X8@b*E38o; z6&%p6^3hg!aD>rjSuUWzp(xN+u@2C-SRNw{N`N*KVogXD0>Kk}d~geXyo|vgXn@Z9 z{LcSxeE!pR@vGiyYc_1ODKmSbzwh>qlQwyZz2U50n>k~8GzH2CFUZT7z@8oYMY@jy zVV>@zpkImy_~N^G01PPr3aiHs@&ix2)bBLC;42MrX*$BwxHNAkzGnqsjBCA7W27-7 zXwbS@Z)PwW?A$#agna42z^t6jgzOqlH_N5)()YpO6Hj`|!5S;d1p^7f zpR&s=&pREpRYx8MGJIe&I5RtER)Qq}-0e7(5uWo|QI-_e=>D>Jv;;3IR2L41|#b{b3Diy2+uYAY~p+-9AN;Rbi24sY(OGA(DpVlkGx=i=-&po?A%$=MjG`u;FRML`f|h};1T$`l~2|I z+L~s?%!Z{9V11%}LaWx_X{%-0g<~{)a8L4fy3@y%FMjhoj`CbP;2TESU^#ngw_CQR z`fPuTFaP)1`KK?oIew4A0k>|EPkog3U*1RY|FicdfO;0y{r?Hs*PES$5OQw_VF%fi z7AUw>6x3RmU#(hOtySBff3>y#{a5|}+-O};(W*t@uUNrq-JpsfqDTSDsw`niNJ0W3 z`+jq8cHsa0oO#ak&imZ=eRJ;(NKEb|?|tT(IdkUBJagupIkUlgCYH!HU?t+{)97>J8<00J2dW!T0QSKz zAB`0O!?Bi({M*~x-JA{c(1@dOenL(ygJ0Bsx|1_p%yL6a;;NvR_}Z3!2=;xEtgA8185v+w_kfBUu_ z_eZDL%1v90S@_Ws{u?^C*jOn78y9S{Sy}^t0<&Ysc9pmG4WSp_;z>UPJ`|r&jIg{R zsHqEXIK)wY>Vf(|F`_Q0gQ#QZ%HKVa{t!+c;JNU^3%!oX_ah(qh@bF>A4gig!NnI} z?7MzA7>FrT;OKX7Kl|Cw?4pY<@(Y59V<7T|H@v~)4#YpfRR?0(_sKHT*ki;Hx9i1G zyEJZGFifKYgwTg6n!yFb&^e*8Gje9m6J{Vi`2?Okk6_MV)I6^0&&$D)yKI@mEKdi3 za)BccXyFO&psR|j`W|JY1A&Rb&s={;M@Kojpbi6nHGQjh=%!wv5tBc3p;W{=0^Wf^ zaeDlT<#zGaw^-Y;Gi|+Yo*Sk(6`~G(68^2ency*fv-nQ+Q0MAVWl_qf?2NJYsN?h_ zf)!rp0)d`B{~qrE36|XmT(g_g+yez^>yW0GA}qc|1g0v;^J$RQgOUe0M>Z$yxJ4yFp21R7^VBC+!rRl$UrfcacFguRP!W`l*|t+TYLF2;ce9HD*U0 zXOnH!+!X~AKLrg_dff<4}ReB zQEw<+Opzkg>C+JSfAcqgW7l1Gov+y;?>D~jjrR7pzul*EB3+*6f#e@h0QP~e9UTCq zAk^qIF!>{N5I%G)2<1I>2*E5on>B10lF=*7#+d<_wz*^{_QyxZ)Tmk_e{i&N#8VEK z9_0f68E2ePj3jz07vRn7U{AQApC1eTuq=dv6pBUQ;<+bUBd@40Xk~sE;m?@|)D3ZT zEYT4K&IOMwHhc7Dn|k`oZ0V-0He58N`Rj@xk4)kzy4(qGDfc3*3KQ=QrEo+FAGnw< za6I5I@&JUW?3EyQz`7Dp6O8&2iXuTPmk_4!mP4z)73FpiS5R_1;H}5)^(Z*-B$x13 zbM{4lc_ApP(jd$dy**Qdal1^~lgAFT%O75CA6nn3@56VAysEkM?>*H)!dbNrut=0q zF(Xm<3VvWjj55sB0qr+N#3%-|J=!(x4P4X@{ydNVQCUlcLOM*N^*ipq*Dn9+6?WRo zUt|wH`nXLQFM&Nq=UIrZE!%h6w29jHf6SpaSqhCGDph`?8m`=z(+J(cBZSk}Ypas-__yMjUi52xdAX_QzwA%Y|pFe83sZR)r%RhvLFwMMmbB`{&G=<9X&nG}1;I z=!ULVF6ajj0-9%x&Y?fb8#sg3cyETAcwng;6bkB}D_G;Q72=^U@Z%@!wmZ3eWRlOH z4)J=aDn^Y3n$w|g;ZC?hgu0+B>M)mg39!{)wT%$Ckv?xOqRki3VPz;?04NOW0fNzk z(l;!@+*2usUQ=D5xIw>AfE!`%ZmJx-iee|Ps*``%R={o6F#q5b`E-EJvMETSB$K4{XY z;r7UeE%uV5X4HqettOO=;u(1-uAY)c^JyknP;A9KmPHLy-#}6 zo8IK%`5zP}F7%;JaPTVIlTjpaQ8(eIzR2h1n{W2ZgQ#m9)|ne`ywNVXFm|VKtUk26- zLV{}~WI(w$$Ah0=vqXw>!A-n=h47ZN%@G#=bh(u`TEzt-t%S` z0~S#70Ox_^1{8qj_-oVP5NeqGStJuCe;5Oy*jopK!W^W*Ph;j8%n+m7d?6kkQln9M zvpm5UICB{C1wTy9Vm$D&o`8ixy(vdCyyO9H;=v!*k$lP`%AC^^e$o-&%uzQny@FB% zeAIsi2{Qolh?Jusj87G}%JL~c&1R-I-yNf5u)tQE1t_ake)>?|7o{*v} zCEg;g`tCvW#XD_h@JM;e9LwjEo6qK`zQV(@}94QOv9Jq}{^nq@`YX@(UOF#5-~HHSpsP z|BHJo4%LW%kgeF*Wd~0fqXn&+VpP41)f#|d9HqSePCNeSf3Ok5IaI4}eNp7u>KT1o z^iROjZy=}}Irc@fJ*jDcA$IK(o9xw(uCOzXpVc_S-(hI9KiU#) zmG%VRhojxbR6w*xOyIOtlwcGD1Q}L5ekc#{IqAR!?~{%n`1$eWfVp?xL-xhbTx=)4 z@C@tR#$sG4A3FJOr*L*^?tiQ-63IX{MN<{&dflMwnS4SMbg~$b^#S36=6DAVhqZ)( z68!*ce%P{n<&{_3SHAKUFEeWhIIDp&GX2OxL6mx|khrLuyicJ%nF?Sr;eqM~6oBXG z%SXee!D8}<;bCkTfX3Qi7isBGz#Y~FOo)tb(UcKzk$+$EAx$t+(24Np@Euk;W5TPR zAJAL;;D$NFfgjwASeWmJZs?~>y`euk5}vt>aPVOI!%P_S{oo}&>M&0i{$WbDS)JBx zT12HT)$5T1Z0f%ldDaAHqJXlgLk>jl3u&mQCka~hPRCc$4jjLT2T9G}^IO*9fJD%f zGZ;!&;x0nGqsmhU>+M(2!CNh5%6Psp~jV=n)pr`1?@$RLZ{PK zM}}U2D@ue{!Zg8Q0V_IurQM}p+fd=^-fg$ed)&@Ap+j zaee@uPngd8nny4?w2&7g96C3w3M|HBA1@sldG?l3p#fas@W-etwpfNm0lHh&f8gPJ zEqh{x{nss80661h>(Uk~A2OAT0BZR~zW#JN3V{_7qOTO>?vDuJ$mlL%BHFw2Ciyr_ zgM1Lg2uVi$TAeq8oo)q{i~YWY>T~bS5ij!o^>=tX1{}Hw53-0=W)R2 z3PJ3-k1Kv-cL5_>R?Cx?=>*mfjME`L!!++dS_1U$d+)akzjU=7bJj^d+HaHa->!Br zR;&M)ui5Bu!}PvxG zzQ60PyS&WQ)17zT>FB5z6bJI5PQb$gL#zeVDZ2=cKmK@!eJ7$beOm3AP{gc7S;tu4hT40-mx@A)FOiP;AIqY+ikabC&S2`un_(& zk~fV4Z`1=q784&o7!-}ZH>2|^au}ZajURZS5sL!!wwmv+;^G||5ij@L|9~yK{V|() z(!sWQNA2q0l%^1I`S%nJH5Vd~kUYEw;ewR(o=(qVq+Ek&lCB2OzzTvEF8}~Q07*na zRHu5-mOsA`2un*{$4(wqy2xK>2+kY=6|$IL@8Rmo8L<@vWpFXV_xA=F1zeB#fJJ~K z{L{9%08a}A;|)4>l(mXGNrwV0SiN0a+_c(W?ZB@q78pE3`v?d8v+c+e=j|!WyH&n9 zUGPaL^0aYAz4>882tKR>jDVMDE_}GGR#UVJ-sk>x)<2)VvumqoH*Pdr|F+uD5t>5K z=>N&>3SYm;PCWW>KLVJ%o@qyYLpSvRt@wN6NXKA?UpT@TcyLfxIF$NE4p@|iP#gk= z@VtH_{6KgPC;vC;IFyi zW?h~#QuLVZkV$(;^j|KUYQhAX>m9ISYsJK3)U^7-<6lfBxPb3ZBA&!)edLh*CONBnq&e zkEqZhhT;c`pbZSUVpN1b76%3hl|`{gYC}y7u%GvLv7@!F)lc07qmOzXb;g| zXghsvZ#~P=I3#$fzxZ?7pqC$xaFkrigo&GQ78CB2()&Mm+-0Bn#J}4KFFMYatzK_j z_uJXM-A0bkB0o*Djnz7Ur@Z`B_9&Zq$iZGOVVU#{p5Z^xN8c2D-y8H%R$!wZ{2S~0 z1s~@yfs;7kt8sxN;10CsfCBIwe{mWhE;@z2k`w7*dNh6x_Mwq?badFLkt2K#y)O*b zD1Y*ZRkpD*8IwPw4g?}D%F!r&tFT~R=m$5q0y00(=sW~&e{lzM(b~x^Lyn)M(Z8lb{PPvkDTI_K)QbxvJ^dT2lXigxPzdxwvN;GN zbl&+Pz`_q6(%=es_h)!STj?A$iz6c9p#x#T4}Z|-(lx^oX&hIH8AP7Wzga1fvv~3; z+H(p`2v}`S6zBb?keW&psmZ;}P9FS(C07@l%2ST~%Tx{uZHa`GCRBl4yvhpj(DE(z zQk@gLQAhPIUHX`&n$q5T%;n6T(^hCVxiSFa{rnvM8_zr-mH>B( z+Z#t72xLtDOe-*d#>!eo7=-N&L#56-O(|f~#IgYH_V)I@TCYHP>0}YuJcI5y^eFGt z`cgNPb>rqvNitn%H0nHCy>5q%^wF@eAmMOc-+7%E;T4fq0PCT6rKK)P2M&MN6My+$ zK)B$>uTiR-Psy)H&m%X_m-H0z{_a4YKBa4rM?qIAk3hpIz0;cKQ4`%tRkH$6Uu^kf z9=UG6zta#1izjuUa1Ntkl|eym2^17lFN253QY2-HGa0yo*cTd7Uj^Qhh9ZpR4vug5 zVEfIgE}J-Ix*dJw5jxAEQqu>?hwa#NUT7P1=Kol2?Hi>vHoG#Fc13K8G2 zst}QXo>wz^T7^R&eM<~b_7-2OyawVL*T8@R(70ZHr=fjq_#rGpFtbn&CS}yjD2A;< z-eDGUflfHByzYIm$OF3Q09cF{iwT);WMw`bLSO05a|b_+4-Q7=C@^fpV@d)1xc=e? zHys~!g2ECO1)d0h!u#XsupWPck(3k+ zKmQmTrTOnREt2D8J}gn<=`^V-UDGJq zIYR!r8-692O*XM@to=sj|6vLrJ$j_BvR-db4%KaVe(t|46B78#pFGI^_@u+FdrPOz z>AzdNGh90mCX@Dwj}`dS{+qHw{N1Ma!0fdK{JxvBSl=Ng>{f*Ik)>4B`0 z=QJ&t60IF3Wj-%@$(_4)ySXpxD_iCd9)vSLw)ZX7LLwI9(NS`cP%J=diKIm&_@SRp z7sY}OfQ30I72wA8wKE}2bVf`kuqB9#Mi|MlZ>vPNIOrX!j&i$9r@y%MF72W^ zL1xokHXPG7K-G#xpqF8#s0wuHG`vs{JiY#ekgh5Nu0WJYGrU=t=<^Y_r_I9g_6UPE z1Q&Fn2ql5)a3#$}upSOpO4I_D@G_!|^w2WAN4~XsNZ*$CYT9&*UxBE6@a?b+0RNPd zXxW8yo}>_1h_N7&O| zSlfc4$jN-HtY`X&wuy2L3_q*`A-HHC{G$CqL96x)T*T*TNcT)R>V?IQiOSPCfo8KMixI6!G<1H^bT) z4kMypiQNS#MC{a|-Y7$EA)stgzGmei9{#5u=?ezjb5jEY3czzy@BPY^P6j4~dE&TT z_P=sz4a^krFeQSN&W{CK-P)?gbGXvh=3yI?01g;3w)im`XE7d{dI9k0TfM%64 zi~X2N;Px&qCPHZMspDu&wYvPGQ>9ZzVWHFJ9s=^AbHep!R3T3qI@O^H(lE*el5p0=HwN|eNyP{NJ;6)*BDQ>27@>n8s*Eqd@`t1H zKY~-ea2p`6)imXqy^b}M3A9Xf#i0+&~C`J^pUMEG~9$v+r$AL&c1<@wJ zMZ1oLe-fbyv(uf*?iL5HAIOrj;Bl!2L*=p2L+#fKbR(Pu;252Gze@)N;h;H`Pu^)q z96rOrR)k|6pe^xZiVa#2f+z#5LPwC%zQ7a05IneQn})}pZAa}u%a5=9x!J~tt!>P? zwt0Ixf_R*^{h`dVg>{0g0=vW?n>hzy{8+P$RAfNKjG9Bb&3+fuMwOFR{oH-imw9VudU!(#WljUKZgw{0KX?+mqv?V zh7mFO!(<_t5z;VJFb16y3IS;_{lmyx#r(s^N_j{;*`w6J;zBqxGDkQwdIm4}$RFJO z>A-^Nz`50~RZyg*WoZT~GZdcB@AJGe!@wA1q#P|L?lZc2xT`)3Udc#Jp{QA0-$^I06n~$DVU| zkXaOo~o!w%@bP8K^jgcQPq=6aZ zbUcJJogJNE&;WekjbNi7*WO zv}KfI1R~+I;kd{r7l!PTK_FrsAPsF5{s}xh!(SF+FFjsr%B;M6^#Ax48__Yzc4@@V zRzF7jVJ3k!tf`CUUqA#M4 zK$wR@z+z;SZxkWwfG4~ZWsfpbZ`2b(E{c{)t#BpW(xgk5vf?Pe1=9yN8`b!d=T1x;nROBatI zQolkZ=Ha=&Oc);0q;h&lOISe=&ks7V1Po%4UzQ#uycT5e+CzzG7*^o)1ooPVJeUN=TQgh0u!zMdK zM*t7g8kg0px7eS)_+(dF*bbP}lKV*)f*a!C-6$VC2wx*)W4eGc!lz*!K;ftC@MH`O zqCTs9N;vqrqWiO-{jAf>8jXMa$A9!Zqu%0)#qX1!{G{hk9v}M9hkPm!I$6tb;e{8L zQ#izN?RumI=0I~jrN@C*1N&%sY4`{sv|cU?V!w z=hk!@ARPgYJYY<;_uFs3-5HnBGa7pc=V~4>DCuCDR*ranFgqO~rv@@&;>0fKqf^Ln z6BcP&;Ux|@(o#Q5HQ&z&okfILE$En{(+K>)_r?JS9dt@`INTF(+G(e%&Zpa=N0->$ z*WYH-X13W*SyC7=*9o)8!X-&vOb?-!#8D(@L7C&q<4MyZ9px;`TEJFt)V~HUS_ppV zjfcKDP_q)`x|f9}fO$Z*TW4t!!(JX)dg5 zD5|tslmr$SLKp2AK7_Y;kDnv!!yhlBaO{@w{>gPWn5|oA)3v{U#pW)Xq`7{qA-?@j z%D~8>625}n#9}~&b6Aiq@6-)|M_AjqvA%mCt+(h&K$Jb6&0%paXZhc7!wsbZK-rrKf0PNX?tRN!-eTYU<~Ln=`07``>V!c9`SZkm z{No?D6HYk6zVVH3`1$|ue)qebCgMK*=}-F<1~^{xn%CId-u5=1&IugA9B6L90)`dV_+XL7;po^RAN-lCgC%J$%p#rA*1<(Mc;wIH@Rzhjp0)g{^$IBUmhjZz{9WNm zsmR0e7w}~;op-vyGA-H*m=ZvqBydwmEj{pMJA#fVi=JFNtXAkJ08%RsCzvhbg=0qz zwc8hMuqRe;v?&wD3XUm|{=1Tf0tNiR5638!g@gz|+7@k;1+OT@@C9&~v{^e4(*?o5 zw7Z-qs`(Vzl@GoJ?RCMT#rEM3e9dN^_Iz#iTdR28hoA+2W3}~fOINoP*e7k$xG`E2 zu+95|Vd@W_6kWLXnKPWfX#ddBTbI)Y9h5a3{R|corV9vXs(^Aau({@%YkaiM0bN)g z=_jg=%YNwpUiiWndemS4^%TNkRQ8s46+C?=JAFZ~&$HbDYmC6e04PCX^1P7 z24A$N_Ts>G351a}$wE_yzC@hj>&ry4Vj7|-A8p6$n{s51u=oxHd#wJHRguG!RV4Sa z#Y;lyRRpaikV@n`Bec9tpxdRi4Wr20cH4r-R@uo%&q`$1qe;m6`8V1f$!W9jLWCiR z_)!;Z-DMFSKP-H)m=I-vABq9IL|WQcX&UgnW0`{8geyN~`ZT4ZO+r5u0u{NFa z&otm}@jNh{S`RRN>Lj)IF1HG7l2R~^$`M_&XC3N`hV$|!Ki;WZ`U(^b zrfYyFuD!k8aRlB2@j0LX>?2>DMu`wY^JZj=iI9{z9RuG1+#_|I@`_b!>_^=Gci2qZCJPGA z58x5aNgCBU<@D|#nFQk+Bnfj&PFI{CPV6_F4TTxH{xDaL3mu4&2bGbq|&3 zCs_sPdo~BzP{noaR++UU=C+4d*sEWtEwYMhL|K+;zrB<#z9Z#A5Yo1A2s#89=5lxk z9$|FM)Bwsq@Qj>M2H+)x7YYFm`1pDK33i0q@dB;<|L_McvyRhVYAe@m^tt~L68>Yx zjJC&DtoGIZQ#Jah4`5;Nl(sR(IU${z8X0}Wc$+Nc_emCoHf#r;b)WwA1#e$@g%L#LY`qt0Yo_X@l27x&olb31JIoFhDWFi0a@)v#a~aE3Av zjD&z@1u!f9ng8qP=y2wP*_#Py@KtdIUh)pybbQPoE?l_K@BU`wK!=G7n*1gELwMk? z!bF~Q5a3*@d08wWtWzi#KNcO$W6PefHFy8aj(ownwyIOdA1eguAQa2&!={s@M$(l85 z9F}^aZBTbu^vDlCbrr6aMKWOa_h<8A)BaxaeW-jiP6QBhVysGK;Si0tmEdefKIuXM zps_~>$(`U!mON@p7tOcH(`VX|M<1h3quxB(3}+q=t>EN`i~L|-7S1uJhWVWJ1PIc| zKgZF?A9(Y)z>PwHRfM@ytRl=M(jkzB-=1918gRTvXTwwiqkj}GI*J@`SWf0WG~bgB z*Gg&jl9zdU-l-k}cM|^5$rQBZG!j&%(@QD1i2zsa_q6uttET;*i@_X^gXMWgx}XHy z;V+)mcanIA7!i8%U5js9rq1%Mt$TWCx6l(KbW(0xVs&#U5X|Ui?y1HVSOd>I1&a72;@5v`dun z;2TE$?d|QpsEvhfjFw5q?iAMTz!NCNSPX{ieE)|ZS!AF1(3kC~Gmh52f6V(-f&`;> z1#r<1)xdGUY!T!Vpz&H0z!@Rqv_8l-?y%<_Jx6#(xR4B*o{fqCH)Vxx20OS``E_)3 z*hLpz*ndda3ufyyxG; z0S=Hqja9A#r0~ymF<}B4K8(QOIINI|Awt`2#T2b*4+Q`P0uv&g%02h|RwlQt_Q&U( zX$w|twoRQ|Y>dp?oE6>|7mNb!{Lr+)4`VS3i7AD?YS%el=%JHjM1+};=>)WVgfxPi z`U&_}v{Qflpc5A~gO`qjj)W0NSX=0L123@DW0$64Zo8XKLw6L&ObVtPCwS{z%1S|{ zNB*KGCd`BRLzo=274#)#zyVSmFm9r!uE{quZ>wC*odYWxm>Rx82}v!Pqo~DugySWw zf#7uj;*_vi_C4rI@I@-ekZ2-Au7LNEi2U)boNVP$K=~kEvb(4AGLae;17#@@#^3tl zraD9+KmhK>@*(I49$jUBaKhoglhHrKQ@&6D8u51?2LA$27;TnyI{867OtsN>tXZ>K z%JJj&@Bev~*_@f01xa@RZ0ypuUkP^(Qd+OAfJ4PQTQ&dBlm~5zT>xuzJn-zvbYAc_P$7kmKC&xWI1On1cSy-^;)XPoGH`Q?|}JKpgQd*v%%X`lMkr|i4m{jMir z(cho`>7TmNL>;j^;B~KioqgmZAF;20{p)u1)mQuU3BVU$e6jB&U}4~uS6*rV`Jeyk z7MG~wMl|g2VFL=lbNGdW+0pb*KmBwUI57g<8?u6N5YCv&SlGtGndd+M1B<6o9qs*4uwv_DehD#CBWPwOtFy5K=M1FP;c7&jYa& zarnd4@=pSfIE7_&1!VcmlOx>RQb3vl0<*UJQ$QK-(M4Ro8^Obi^o|QUli;i&EpR(s zilSYG5wh<1sdtsK*04WwKH50ExhF@RrFu*A;Iva5VQ~=8 z)Tsvx*AV;F&A+xw|MfyU_Uu1#p^lmVDYX&GHEPrdUGB5Zjy~cr=dYzY2Y~B;v2cwY zt_8bPr`qj|Ll2=6d3n|y--^De%5!qIs#0pm=T* zc5dD6(*e=O>+{+Bx4;|iu`d`SZKR3*g1%#$=ESeP;U-1R);gb2_VDtxx~^KM{%O^J zo6P)c)@`ss+6k~j7Oriw08CUEeAL;cIe6U!xJ3$(%|6s-h##1Gspb`VK2uNd^F6UN zgy3$5J79Uz7sNI}Zx2eubRfTQgforGFL!~by}jM|4Y*VTRhAp5m*=PkXt*@ey>Z}x zu@U~5*H8dBh@+ze;m@gln*UViJh^SOp9io>qk_K7f70oI8>VGsj*w<+O`QG(?R!&( z5bn%vLOWIwgf!C${lOhf58j}ebp+sLu@!iMX+$%o;$2e6?wR+H=e5If?}DmE5ZNlRUyAe{j?}@MMUJ zETgl{L3@VP7KJvd&pd7NgS~*WZ{jIm{CZE*o*@HgwDmowXbpi<`ew3-nn(H4c;XX>! ze6-gQDi;eNS4!DoDj;nM)G3ClgGV1)Wq$71Srox`lCPcEkyZzJoAC0 z)G_&N3680989s zy|cQ#7M{U7u4aG8BTtui%=q=Ii*JCZniYyjW@0Z>)X`kvttvMHrg-(PIB&v7kX6Nk zBBS8er>MRmxTs%Xu_TZHDr?CoYkK3a?-Xz(wQ#SCEQLpxe>BA;iwS!LQ1B3pvXR^M zw>@Z+NAC9ZAgsY*u^8wTch81LZlO0PG`QIy#%ak`v6y=!F9> zFv+v8mPQVPRi_omhjYGXO`B*p-J=uw4xjGK-Ynn1Plp(D@^n0y{bAm|&iCi}mS{v22C&+$@T)r_&GN3|^~}6I`q)kF*A=E9F?;$qPT*On zyiQjNigf^~3;~7`034?U685w`w&h71KWVBy?sm6oftxvYlmXV~;15AX`(eZhObEkx zMh<&==4tl?+bkX&&~)psZn1BF?F)9|*{`xLZv2#zDjT4${O{D(KW1vy>B!@`Gj)Zv zlm!$Y7WzG}+9ZZB~{KzAZ*ivmJWD6nYZO-1OqpC6%qk)}T%|1!f0QBE0 zbS-N8^huf$(BisEho0kSJ|43?3;oz$gCzjwt>W)(1%aEibYdKOvSi5;?`T=vLr24} zKez)g3JYr#*fPa52-^~SJCzXCz>%tEkLjY6xi)B5aWRqvX%Z|^QdMFp$4XBUL=HlH zqi{v$;dKIAX&@qxuVqpQcg06oqy;|w5f^0oJ825%_~rM)Ci=YY>J$a=MS|?-65FB!s=9Y+ zf3vm#cIkbzwg+~uUndeq+6zuS*-km>crB1!SDs7=|G=M=nMK4=cFLJ61C*gRWskB( zdit$U03vSS`Ma%w0R`Z9Ti*?H>kIP)0y+_x6OEl!usktuv+WN*IGcAgsVAIs$Rv9} z^Pe3vrZ{X81}O*9qWROofj>?r1V4&EbKF5;^Jn0u6JZAd>lK(z;0yt_=k>Q!052Ug zKP(}f`_0~dI2dR=C5nKzXUAkn9PiIUp8`$X7wje@5R zR~NXl+a6xB!sZ+{#S1f9BVb1UD5Fe4vE2@Z0L7PfLHk4?GD?OYP&O%Bv@!g+W_+FM zA@QdJ_VthO4<0wYFkor;XQ5f2%ij>5!jc65?H+|7IyPTnCO- zYmL?i9NRv_PXQdSwE;V~Z?i>fx7aZ+p5^oZoRNT%1JA&lD9rFCNAbe*l#g9BdD-cM z`cig^3J=mpVV%Uff@@Y+9y##mS*w8o1>jk$+P))>j-ge`=v?S@7@;6QtCt1AlrR|# z+sI*7Wr5qQ=@adic}sOc$Tl4#imL|%usOU3W`C-2`Mdn0x5sEZ&kIbv<1$#^O2~We z9IP6KfKFl29`hu1{>Y~;khDq2`*+Fn^r*5TQa0Hu^d6*kND@z0#CbOQljs3Sn&Yp1 z_hfMb4n5A(iv;$>n`S;9T)M{2J2P3GI5dZrNLtzyr$h3CpJ+=6MHB)~odX61Go}kr z4&V>ypiS}%7qk!uOn)3SV3vP$>Ere{pS;)(KjW1e`R~+9{_QqY=6}k=PJj)YHf!PE zU|n|9W*a-Zq|85M2ag-ADJ)HStlMNAQzv56HP?Qz|jZ~ar>(qPyqIk zFWZ}jM}wsS(1C~%=x#f zgJyW6gIck2wcUCB9X9#6cH5~n6V?0CQ+jHWUm;QOhC>G7kTmWTiQi%H6Bu#ws!q#y zH5%9CH+O%WdY>S>)X>Ho;?>wyqzN}N(s70&8wZeagruL}1O)48#f>YORdUD6b zPSLKL5$yOQXKP{LNWb_h_#0}VmmiLB)&`*LOTDF2!j4S;Uped9G!05LBJxy!wx&FtdqVf6;Y=73l|CVV>W)|TwC3(ebNYM z&ktdgNuvv$z;~IYR)NapcW4!&oA5z|!9lpX2e!{rFGqYwyQ}{CzJP4HE5|#u|(JnD0j$ z$CTEHd0^zwwwslokpc?Vz`Vsn8L`9RwU3nky_pE6+VSXytfR4otcCe)j(lSDg&jp7C@#p{PtK zly~YuQ|DceS1@^So^S3iOyC7VrO6W&_yciWTmpJi-&<%;=aMGAul0*9qo&lmUbdiaU#G;p1q#Og9C8 zIBn2Zp&c>^!#c;&)R$fP1BL9A@SWi28K}{^1%)#{fLp(*)1(DBk8an&KRcxDaM=;- zA;#)}AhrQ^NpX0JgQ&($&?T2MJYCdLgJOG7qq4(mjmnPY5Q_oA7ac!NN$59O11j0N~{? zkg=nO+wtww?0@c=Z_`HXbftnjw8=XdwTd6y!DQ9A{2d%I-=CkmPlrdRS*5+Vc*#3P z6X3;^h{8e#j2}GYjia35&uOdrfnlUQRCPV?fd}p1{_94atA3hw%Y^Dqc;eJOa0)5N zGc@GKd!-xAUkjR2qAa2g4`}pF>tIs~|Ld@MKKS!@z2>yJzuHE!yc1Fqxv%~YVFmuc z=|T0dUU&5g^+tX6#0u;|QmPO}H7t=2NQy*c8!qzZ-NQ5Z%p2hGBoAE5rv{h~a#E}a z9Hw8Fws>0GP`i7<3j33DcDaQhr3p;UpWyKJAQ1QsVaPjekkKScG~u-OIJ=+GCrT^A zkP$K#g4k|H{m{O7^231-|4?1%b^VRE*f;+53T^v)sV@BKw#{0nGg88ydp-QHDy3yN z&*))8{J@__)~&H4w8)or6D!wmlmalqMrr}!U`-EfmBnD<+{wBDaIz{%c@_D2roH$R zS>{v(W$2fx@g@EZ3rVctfI?BGFS-(R0={I!yd{?^8Ip-X={EDflCO zUBrI+GN1tLBOkWf=xE58$_Ph;#>8DU$yWtvUKE(688ZP#L#H<#bkGo+G;)yr>(zJJ z7ytLGY=%ZKY#AcH5q{|5w`bfMKTE@JHs9Y2cPqTmO+0w%JP`hLo-FpmHBwl}hcv0Y z4%Xuj@zgaiSQ375<4x8*Pg|%?nqr+YpCUe0Y6+4M3ka6*2f7H;Q}rWcMMUu=teGQ@ z=rTR2gDLz0vstTt_)n^YUx*OInxSWOr#ByEZxj_?@ma)1^V0G@=yAB;XxXfc%|I2kzuPuu0>$Xyz}50_%f zx|H|*%U5ma(TA&DYB8YB@!u{!-=e+z+XeG9iUtaZ{;|Htb625xZr!@gCNMo9WnsIn z2p&FUkS*`pX|Fwbj`Cx1V9zbXPJA!_v?@D;02WckMMhZZ_Q4#1o(7qcv@N*m}9pF5j8o0A-z0J`);6cO3 zh;g(~N#WHq0dhPcRP(U#!+6BQaF{zeF1R}1--yYg10j8*c<>X)D4&rN^@25o{ry-% zBHy4Rp5P*`nTxunUfDfBCp@-|dBXd2liJH|txi%K;&aFuhDP}4I3nD$L+Hc;C%Rz8 zmoT|sQJ#+UMIOADRse?YyyeaW`~dSOgFys1KH}uOldHlg-048T7ik@i1cf8s z^NWDuDLDUTim&C0!i1}Z6ws{06NPy={#4FZ-`)kHAk$UBHATd$D<$_Vz~Oo1_2Mx_ zK9QXgf@{~42iYv`f_dn19iP2&gUy&a-c9Bx2mN)`a=?qUwfG^Nunxr4*6b``>W#&{ zz+w@I`E%A0uxRo>zJHZ1yM3NL|K(@cf+tqnXbHA$YWq_p*eAA)vqj5R=+25!wyv{V zJA9`2Q~+B5$8aBjPW~G!D**!oiZvS3`+aNe@Yf!ulR>2{h_))t&t%!b1%8Avi^1hM96Wf-`u%#U5K?Z`A0EA z*ug+F{;DHB7={t_uwldPB`-SLmT$kyZkxB*Ui+ex#D$F&2XMiR-VxL5Rvfqy{4fx! z@nK{%dl)uM^OYoZuizyu4qVY8;U|t!QYaO1;1Plw7qI@4{UMz6xGD_cjI>x#iSTFN zcYAw>U3vv`tZ4-^XT}eOt^$XU)MT3bL#QOWJtG%x2#=_vNE7b~stz{Ck1%ASIvjD4 z7thR9%apRN z0r`6Y(lBjG>`~}y>b_I^DU^yx<8(Ss&&=`D+%zL$PrH)$ls)+9N_)Wxvx~Zllzac= zJLTUBTCiBbqqJ)*0I zoIYu~G+i}KxW-H1BQQ~1*$KdW{|-&}wQEb@8chRW1#mu-Rba+6O%JGjd)@4PF&kpY z%Z;CY3}pu)Okac&&kuLgO*h$(e)J<(jHn;r=~EEWjP&VCu%JXgK^kD_?{E=LnY<2^ zmN|YF2j6<@t*(3kgYxz2SHIfR@aNiVueFOVy2$?HKmOx@5&o2WKmni@_7O*;r6I>k z`Qwf|&KV&_xH%*3iDYmEGXNV54#S0V(JnJwJB+81>M3)l+hy0^XXpLFv7!N?*)xa0 z(F!*mQ9j=fY;Ua=lrQp#+$jSc01kzSPO&da1vsM~z#Vz;MEJ9Pi_r($DKSev_V_aU zx67`#3CH|_&#`j-ZyIstLaeBqs3hS^5=;;L5f0Je97ZpC5nR`hkBjNlRblze3mf?kM1gh`%Q45}$P& zBv2*Tw@M*qvEVj6SLjfnV`l2cODP6B+4dmE&3}h$@nE+s4%RWn)3Shbw{}C2|L?-l zC(utYqQ{{GkvDb1S{|$eRa&SAgg=TyoWpI)Zr1$75Ll>0yXvp=(6{pwe4 zMFl3u?d?|dFwjEDpsz(CJ5bz#VdwZg_?r318KH!b!#s!<9D=Ja#h~%ixN)S1>1fy} z7YI_e#2hjA2=8QesPzgGM~xHTUJ7|00#$i zllPt+c!7iFU{L%hEGQ7S-+sH}qF!M9{wNjT4Th(_S%}5@3OXFjl9Xl9qf2!;%3U^H z`@uJ|_=&H= zIF1Spbx7hxX>-LTf{SO2{;@RBe3~>Nhwe!N`B!MK;BWOxN-u#B^UbK|s7B%>DEAZJw8YZezyK1#1ZH?`YL`n%FC1oJf8mo7Ny}$Z+eqm ze);9j%LsN9c;@eMH{N)o?_>fWCxM=K-g)+yfBBa_*Uv&krV6NMEGry8&O195KK8MX zIbQn6_(qO!KmRje0eFr-FpZ4{N#nfx?z?^74k&PdF4 z`mwX@?=Qd6-gs`e?-%A^kw*KwQ7)L%k37JQNiP(O-puzmDi?G`xv?m)Ac|2sQz2Lt z`l3_>{tz;B9@GW2G2hRKg1R6c;ZIq*)p`Dc>wiZ-&o*q`VIxFOZ4^j{t~;|~#ygEi{kqQ$8rwAF8rbiv_zXA9uLQVObfMe>o zQ4YhA!DDp+(9>LOwP1^#@P?ThWv2yzjp%r`!r-~+KdO0z0#ps>J$M;&KLQ;q0)vE5 z>WTlr2R`6%z`y+EFLxy${tgG9ab$UE`|c@Q5C zC4h{%Xbl z;GJ41r}A?Y@aFEVsue2pAWh{BalS^m=@PGE?c>D19oqh9bJ}cn$1EwuL!CDmEJVFQ zRWtf}Qry1~`nh~1e{Zosm$x02mwo{k@Ay#$aFhjwABU2VyZ8p=A2dYSpY3wzbnFZM zXRc-M3#Iuzrh!I`S)B&BH;piF+z32I(3~I8-mWw4#dHy0t*SH)Fx<{RX_o!$u7!SI z_+XhAgE4@|&yTq|77}u5Aam!OdWNei79QBkKj4L7nWA8-BFyuQ)cY&^gLZ^J=O{4p z$C`jLl-Gr_mK&rCRvy#yer=Z;?kB9(gobH;J@FP{z`4J8kMJUO{Uh)w zG|)SM!t->+XtqpSzGvyn$ih?78xi0vl#&SQ0YgMyz5s0StfqB(m9`r0C<#wEpg6t{ zAtZwb^5P~){Vmc6stm$|%^VBP8@DYs62w4N?fE>DXVa0$dO@LIQt+E&<{!fr(0wm1`Hp(1OR| zIsvg;3Ioz;Wk>v%s@h1LH)k}BIt(K$qgl){McB2M^{<4MKw%i>dH z5njBMX?PcG{+|2m@&i`5Jv;wHfZiLia~7Y&l;!Z0jsFziJ|gQdMf!LKP)TPQkUWB` zORrGcOzEPml{{;8SVCqV6w(4h$e9hnad-~j6&HE>v%u}E|4CQ82OJCICJt2{ZqWG~ z4=(n04lYjMgRk~hFh+ax{GzaXJa`@eSSfCnwC2@;<}2>aq}) zg~9M1iamV=iU4H}lR5PnbsF^U%@d0Q`1|Xq%jh$59E3$bl*0rpPh2%F(&V@f1pmMk zz<%(RgE3(!RAmkQRuMcoHTT5+=e`o+Mey2=&KlZn~VeFYzQ4 zwZ9dq%!RSL(B8&7r!gI`d5(yB!)?;ZD zf})T?RY0Q6B+P8(laeSsf;KwnpaiT>T4t0+{>vpMwR$T$4F<43_jU|;gh@7T}3_d`4Oj6bw( zD%rl=S~Y22aX#mq5J3tJMCRZ+N{H7+IlUr<(v_edbZfY+ms8_IBSOZp+v(^&`4aa zxb>!Ap?p(bbiV$!x zhd65FF#FR}X4#MLTx>6S{?R_F8Y0}73E8E<3V)^)IFd7_EL!1h76;vQq;z^%E7;%L z-rnwE(aD6i-V8^h@EAq0s{&;OYY2OM>Co_3aXKBMVY&7I<3Jq0TL1D#ciN<5r}~QU z*h=PksBT=SBbR(Ly`B-KawK^L4B?)l7EcmjB7%6wQ^3_HiooKTgheedkf)Fyoi8J5 z)oIPu!+W^XTZA(ez$rLY*T|BPrz_#B^c4|hos@uk@MLs0%qL4;jnAq!gOaxvf$9-T z1);2rCj_N{k5LPg5}+zE?fiJG^C_Ek_*{9PvUO6b*&_~rF^WbgvR)?yqf)p~r0u0| zO$bdOD_07ti!XxwK{{|>b_U+E+|XDP)!M#JuJ&zI}Ow68KyM(K}1Z$ zp$xEi7(RzbnQy1etoK1-CogEG%(+59VH%a2aNvo9zftgGYaej~?zyRf0R`Z>srP3j zR~U~GK8%I1Lx{4v6-I@D`|5B%tbFvmp z9GyU4^MP(UCoB^z%0n=-h>$f4%+-UpStrsgFX{k?pr^A2FH;nBUf?9pMmQsE{(?nH z{jg0OeVDD(-Ra3yI&kt3ZC*(!yW%~-6<5$vK8f;lc(Qn2c}Lhr8pmDWk9dC;e~1s* zGCl}VK*YNM4_b2sfS@2`Y4Apw0@Ue+4mpq4(_uVh&wrZ12Cg~`4Fl^Sn!V_g+7Jor zgG4VRhk$YT3~f^8YVj`ki{cdzPm4g^B6GsrF~jVd1#A2~0Jg-Akn;9j$dWzq#)Ld&6u0(b}JXq;+bdp2LV%t=r(Y zQBKkP{dSG|x%3C~K6JNf{(qc~{@t)?iy!U1LG$}Z%$R0dCBRVt#wpEOofODbz?==> z_Z|p`S%=N(K-o^EXWc<4^C-wDHVAqYX9P7TdeXO`0Kv12+);uUWUw9xUdF*=@uOT( zMvpB15C#tFF&6PPg3HtFji34s*C?O(9szTpJqHwkedvqRK=TIMiZmDvO`MTA+wkao z*wYRJ(^1h8;ehM4Q-8n>firct_{(ODwcBI{9scxo?Tw$WQU6JrQkd+_8~B2GdP_6u zf^J6WSSVPZz#_s>2>Mb7RXV^C^ZnptzMoUVn$br%*Z*#l0&>s12Ze5;kCvXI^YV_P zkRnAVk~)w~vN!>Bq6$+Q{Yicg&;H~>*Wt3b9^Oiam33e$NCmp`4sKxhK}spT^JJAY zxiS!7buA@s0_`)8k8g29b~e|kI}rlx_mPvgbOBB zPa1`y6zo>GTLfoPU;@>W95uB<;ftz#|z&7C+IfBK3Yi*W} z{aw6jtsVS_$J^vd6UEEAkyGuTJ_CA!_ux_J!ZE;rN7jce-jo|fI}`>K0t7nB z0ena~5%ksa0#7(~N1xRn2Yuj)`o!PNMLk#J^R(cL@>k*Z_xAw>U?2Iy%^GTzykMvp z=`!-?`~bGK!Jse~!ar94g0ZW(s_%4)tOFP`dX)XenRD$;Ke*lgd)#Su=9y=Ru{k(s zyDJ^ReAV==-gCO?Fq!Xf^L|xFAls>*06%ac?5LQK`U_8I`*_0WvyPd-hO)X zUFu{H^S##8L!u)D92A8>(UV}`en(gl&cp9r<0$~)P!GjRol`ff_XyXs4u){z3s)DH zg?n`MU4bQh;AsF8xZ~Lnq+wvBs69PDi@fNj%Mn2j22_bpmw=bxn`WeWjAd!bF?z=zO7IMP-S zjB!LQhgabkNn;&AA%I^|2xyN%3;fFZlT8wa7kuGjTYT&NHuv65r1wSK#-#lU=_@jAnFF| zNNybf5BX4T(zO!Yo?(+ekq&zJ<+L|RQWqTXQO>{@;dQ|K`OAO;@O$)uX`nP#1YWa~ z_lOTT8aT{`+f(Bv9ye4SQg_!Dvuzv9R&BGflV^yL$NF|5wguwIA9y;x@CP$FK;=)+ z!)PMt=BQv!?7~_Rv;{L)k~e)Ne>zogU`@b}6$F}4Dx$*$KfgwfyXl`;zS17N>1vyO z@{6_qKiw?ou*iqHbcJ%_s0w_l;{*e+Cm@mj;}4hTo%u=%!$w*Rfs^+X+{gJleQ@^&5q4W47< zi@=NHyHN<3Lyxrs2twL>@C>{FE*A6s>m}c^%fI}6JL+_uy)k5%l!o<}QPk>WKdts3Dqdup;(93nC@9oV8ReH;XIuoecYqpF z)>dDHx^AOA@2J_LcZBU!-tfwvTwdlFWDrJL78>)TT&x`ks|Y&^_|`E+f)WHjQctPe zQc6T44xHf1^JzuXFMjb0`^is!;)=*y-tw06vNY01`(j%0vdb>>b5NjxDI1OiX8jUy zSWdtB&2RcPX%-!W2e?SzD*yc{Za@L}J^7k27@Y{64qJBONMAH^1YnhUc;`pDYFxF* z(dc1N=F8_l{E(e^%p805$?NUscg(jFj+h|^O)J@f4J9Kwvnsy)J?Q4iR7Knn2;J31 zd4V_R$?;VEz%T_uCyVf>#!9-8mbqt!_+-`b_q)(R<6OO^?< zJdk7vN+|{*AShIWFP|i|`_lMShMa<(&j2 zFq!jojyw;{{C)IR(C;y|^2Y$5kKdVoVdq3sM?Bw^pMdf#>&jR}dJ%IVrSW?Uiu9@4 z5-_9W9rAVz8*Fz!zS$mIu}&xKb@&20l!M$s13w(?jvo$ThQMPKOuL6a!)%V}oGGXI zTC97prW1bfWrFhA zt*wCLTq)V41B<3g;O|twz`VbxaQ@_AASfF*X0$!Je6>xN)qp7ht&*|!nKQgEVQPa# zgMD#%nW2pzj&OuKRu860u#_NVLkZ$kPplxoQ$NH5k3$*SlMBB6pa1zEd-I#$Y&^kv z$|s}Nmg3du)4@)5h_l1uFKpZ~ml^rIh5 z?W$;B0lPmx_gwS)bLsc7vcW_&;^P0~1d=LST7Bxe&k#D^cLr&(j19@Bjo*_<3fo@|wjYuai;GR~MOu6{OZh z7cY?HWJDS2fKq+-Y-a^gb$D|^>mm}yT1I6K{%um`9!4rLLPZH^fUAZPE#-9;giRm0vVK`RQ#%2NVzecIV}VaQwnHfbvdpy3lh+OHJ{Lyt078*Nvsh5O zOguJL3c`h7zSO?>>3_DlfA}(6xO$^akg(pO5&tIb?{Cv;f3`Mm(CvN4Oqt}0Ka2Ld zrA*LrxVytG5Xy^F0h#;XthxU~v_+6r5$m-0Z;UJi+-EUd(_EbR*QK;JcdB*Fn(5{A zqmm0A%;~QySc{ixnp_DY4kd^w5bA;f3km`Kz!g_q;m7c@PUq~i&-SgqSVwT+sN&_D z&{q)tt#5tHuDtR}Uy#hAtFwRFH@xD$?@UP5Wgr2gPgB@?Q4Dq0Q9~0 zz3=tQ%ij6Uclvh?P2glDgyXe<`H1JFQ;J8Ru3hwCS%80@PUK|=frZ<$9jt;Fn zE{@-&;|fMClK3MOi##m%gBO@kR#@Q22!MImu!7`xqaJwX{=f;Ib?Z0S|M%^iZSrx) z+B#j3G9>!>R0M%0PfcVHR77H84}YD2ARa`->k;C{ar++v3nzd8_cWeBae@Q1Tx(FD zuAw(0ge|%YpiOD0gq7QO+u6r=XeuI& zmRSdpE8&Uy{{O9(JElkor!Qc{|EgEL%5UGq&l(_36aC6pzG5Hy*vI_6l{Em1A%hh9 zk#O{ftSQ3f`BUdu8^7>{FSw-<1>=Js{Gd-)Fv#Ja0KNkb+PEE+wHqjE(N}one*R}b z0oX@BFpUraMWbeKKaD(u9i0g-Z}5$Z0dqGJ(%{CSSTJXYR*ojjygZ%b&fT&q*bwVD zbh4dv+En}b-HYwalaF+I{J@o>6AV0#toRem!*s=xB};tHpBn;UEQBvoHPp;FVV?wx7)!-z1$vIuesGQ|JTGsN`er@q@KG} zjYYg_uBg{SycFR@0@WFHTZhAOf`lKj{1VaxTZDUk*Wv_Jz&TWUExt*5sYF9pH6Fl9 znAef&q?rpUo5d3z^<9IBR5e;7B*8>@eP}Ix_=vAL1RzrY?keTWG1XF~(DaGqFVcbm zJPHUe&$#(6~Ty^A`a_;qCaz}s<8#dgo_}+inMgMk# zjX!?8!D-`V&DywGH}z@3-*&C;XYt<{?bpZbPpd&tqk!zzNrlkKngHk?p;0|g1pUHg zt8Mm-DL&F)vthH}V}MlvH%JNw(*`@cciYKx57Pobh*gn^^aPCU_KH=YX)8GiN$J(uV`i%TfeHI&fTk@x`um zfa7zY`<(sDzx<1v`k{sV=ug6d6AQ(G=9+W+fmRdyKzV~%LfFx;@4D+QpFc#{A*36P zbQ_TlE*gH-0+2HZD|fj*6N>^untVBl%uiR-bF4-b?>O@ayYQEbY~{L5T9KR<|K)Us z9~wvxBatsF@|mOKT!OeB7}y*>ryIJ6;~6wYJa|!7=m@yp6Z{>`CcT*%n5Jn;c!X95nM(TylC-n7cAeZ@J%%v^I}2exJb@E~XN?^JvH;Waw`mPZoA&i@mn90`n=)~{t~Jqwm;AMr$mrEucBg8^7B z7t)ygVeouKe&9{ejsFJoh9VKP!{F2hcu`g`9b#UjW8G7I1f4mp_%V}q%KCBbuWr-y z|0y#6Ym1|Xf=MJa>gtnFDQYGapb=PJG%8DZB{0&5q7t}r;HuxhKz*PmD>4OlOV=o# zusrXcc~JhE5)>Kqgi8%1;&}oS<;%iV{>GBCij((>^R#(6Qkqn)qBGlR;afKYMIGwe z3*SAp!%CP4B&%@L0dwcWI1fM(4=i43=bU<^x3wCMfNuSyjk3>wvrO?{(-FV_=L7$j z9d*{9*aivqgJccZrBVH=^_zS&PoA5!24JuRJ(u{bCR_r4qRs_)WW^dgbmBO_%x8xz z0Id8UDIW7Jkm3cD38n)!X&u0rNjfY@N(0LLXr=*lO5k*@;gOiLqoe?!MC{T*LzHDt z{L{L0eUBwS)&wyH5Q_lebG{3_%}V`N4l5|rl~@FDx88cIz4Dc>bbs_=`N%%Xhay2A zf@8hV_rCW%`{EbB=>01e2nH6MhKTD;U*&-N!Jh#IU?2L3G*(8nVLIm|yEu)juN@DK z7IV3%-&sc_)mZOQ@j59>wSSA2ZCLD?X`X^I@W{p&O6Vpy6P&2 z!{YFbZ+yf4;UE6N-t(UK_&Nc01+bmb+kj{}P~3n5un&D{8vg?iJm6brFmYp&3TA;p zdh3FrVHkuk4nYbdpgpr_j{UGW6oFtw4=DbSAB&#GYBlm-opFS{>od35hu^Tyo;P=< z?HZI8@da*#Ka7ZwKIWKXe6=%pd&^8wws?XUdT}g##9T*xu$q_pXhd&8c8^EkBqID# zl@HE;M4|IFdYa`*fuEFuV4~y4QRRiV=E(vB&Vl0l=vmI~RpCE!=0l50^yCpzgy=49ImtYf4XF&F67x{ zlXMH8hV1nyD{rLb`$YTw?_b|xuX_KdZQJ9U?9`W@WOwUy!MQW0TBj@ktlA$jbhNhp z$@-)_I5sP6goOKO3GP)A<~vos>0`AUK={}p&?#ZRRp(T&s(L5v7B;`3D$#fSQ=PU zfR*)K?|PS=bIv&tbwE7_6o7r;bBCa0t_$V}(|Mzjd2h)LW`faR@@BgTT6F-7l!q(r zsYf5iq^RBe+eJDP;Lx(uAT4ukoa%>=KJB#Ae6Q`^jL@kIIsud+loj5o4;Z^Y^$|iJ zJh;HmdFprE&Gx_9zVK?Oj{qG+e-J7OK83_P&v3b4VFmu$sFw5i zrO~53tz%?GR$irn53F8)QMLj{pb8ojobLn3N{IC!A>N?`lh_v0xx*fLY_%OfXGX*7 z0v|$}#d?^Zhv~O+&072Z|Igl?K>3y3^?|?I_fl133IYCBpExIGl7!{VekRWz#)N<3}L`HgJU3$7mRJZwvBspOTB29 zYEx-n=li*@Zt1V;l3G$PmfLlufBoP8z5DL=-o4-7en0qM%1`{Qe_6`bwdL-6Hk7Z2 zh}bf;y7n3z_06)5a;eb2CzYFDfLi?@`=ra~X z3`nkf&kr0qmZ}D-mw;eE<~}Kgt*y)BdtxLw94~|rFqyLpLqtgCnmN(tN1HG8;*0P{ z`TBl6^|(hn5dl6ewYJ}V_ua+EZ(RNl{J;;iESG${ZtZ&OTi;qg?V|J7)2?>^H{}-GaDJ|4dznpc<=7JUvBw^(6fiLtB10Xm-Yb=55!sS0g1`H_ zZz_NL4<9an<-1=Km!*}tx7%|7NaM;MeDJ}#_+1Z3xB9giK_T9O_L6|K34N{h56Wy4 z+P6d_vkv=g-9!Cd5M5_lvD?RBM`g)))KJ zQ)Yd>E$791Yr*+Lw;t(wE$Lj|0YZinW5CZoy0?7K8}F%>{D{+5b_e|3cG-)gZ^xy7 z^pl@1|Kw+XwS4~n`)axQ^|zFX6Q?6Jn_pf1fcs?p{Er5_SH>{#SkwV$QsKhXi+DJ7 z`bg$|7m{1mYhvU_6*n3c#NbNOfB=37fTJN4W4XJt#T~LS==q)~j5q-Y%{_UlFcwhN)i~;8-R{6q!DDY>nNIY*sUoX+3y%JRd;)ET6peZ~P zDi`xS#-oj+I;JOnR?)9Nv9lcc!e`5t{^&`&760OpRFKHeu37nR2N{rt8BR@~&kut& zGd=5ED<>JBb*<;-dabv-T3Jtqo^m?Kt9___Os>=E56-WZ0Hk<0^d?sARaD+{k`bDu z9Dw8(kC0YQy_i2$XFl_!i)nXzmM!n~e79#-@6~SQhWjt)6O%qbnwO$nh+A7%U$2pC ztL@UpDX*PrqtJWKb97!FeXDtMqQ=fYd!elBUtIp(*Y?HQJ_dTmCHaA+7X-IYWY6gC z6Myo#@{j(-zb@bV=iXXA|M2ee!1vr;o``Gdidgn9%KX?LOaBpKe;xsOM$7WOEY|xc zVt~3O8}-=5Z)8#spykza|6I6zhJ9lpD*BfvsE@grZ(BI;pEwrPRU+zTER>C?5Xtgv z?pYEezvYn%jHxRd2Aw>eu^L)hzVl5FW{1qXt5*(-dmrL-YDd=h2zm3Ew5f_**Z@Li6fZY>xTtlJe0nJW=l4x~A;ivpa@&{_x=YDZlRDe66Zu_H z!MU1c>UWoW>hHyS0ZNy#Ejj@Uz@?m4Zu_oxxBOjE6@aEn^(CuYTLL|;WaohWx`51N zxXY6PiM3qG@=aU}_}9b;jKSF6@c>u~0%9v8Vm^8*z>tJM;Ag%z+~c7m$2;IpK7nP9 zgnRT3NLslt=YbJg;n?X;#H?vx)AzLgU89p)TibK4C-cYU8uB;gbve5$b=-UUTF>U0 zT{su+zgFar-;rX)7QnqJVk*^N?Z?G|tCdYPXoc zcfhTQ2cWM&_jJa0|G~rM3lIHS`Rxz=Vfl?;{CFwD%gQaU{|jY7h}kDcCd!s|{bfn! z<8izAWh{qq?0HU-c8`a%ACB1mL_pf{%Gd(fx|cZr=8g6aNHoBJ{qo^~x>1)M0k!K) z@(&IV)1i`i~2TqJja&W+-kqC_FGw)la{WxoOb`*_w|p!9073sWA}pJX(C+j zWM~Vl3A}sv?qV|z{;qsrQNKWXCKY@&9nhY$0nA&st}p-HTep}0{SO~2uieoXE9QY( zHfp}reKqCv=CKeoPD9)PL<<##c2^x~GGFg_w0hj%xPVTLkj-AQ@O|`CpHCL31=9kfku-Q*+ z=nwG_!!v0hhNJreS_1(yTYAlvytouTs&0n~va{o&*hKuvuFPx9t zU)pu@&fP>={@=6lh1RKSvcp1t5{O*T(`9R_CwK8E*mK8!s#>){d#(2@vuNy zbcp;NYcs1{#EOOIS{9YdwE_4W2a*L`59!F)cRgJ``0M|*Y`Aw*?7O-}v%BAN+;Os5 znx0)G0I8?_>GJl1%FG*T`vH$s0jN8y_tdhwYx32ocF}8BAdXlWK(yz&GA@fpqR*9x zsBw2K$wN%M^Uy@ODZbvDVxNI~R89q)TnB1n5i>g{`AhH2N4z5dqH|6jL`5ZpYOWHI z^vSX!oZ3_m>RIi?`7f9k50OX+DoCb|ZZxvH9iYyyWdR-lzdRSH6B^)+9#4C7jW0SW z#X!oy8OPwh5)#JZq<16Ft=3D|j{xYJo;KvHe#DYpxaxTe;=I{a-oB<<-(|gc-^J72 zvq%}9iF@n)z@NNg>b%pbXJIZLjHF~>?b7lqAAh)f=2IUp|LIS^R>~LuI5{?#&2PM= z?0CcNRocQ}?_|biOE`M(*)sbnEq9%9tI?J3lngAELl} zbE@2N+s&1mKbhx)05SjM_1z?R=Ahs?VvisY5Xz&YqcsX??^eHv1^_O<1kin2QtoW2 z)yKw18FT3yKLT?Ez&HLGOt;{26K6WZEC(hqF&+IoAT<#g=FH4qaD4>RRh)<>L6YxpybO-i1WI>lqX6X^V|-0X}R) z{Pfe$q^|Yli(h&uIUbLQ`{wf0#PQ^v*#n3$m2mehr9B;5Or3CJJ=Zei_a)~PJXKaL zr#r6wYPaOu!oMI2=ct>@vH;^yIQ%^c=bF=%TLzv$K%zGW-Z*C%xtaNIX7rr255E9RRPR5pipid@Rh86fgzdr@`Z zxppp~wJfTp{8%s%a&|nx8%WZFAM3{lX>^#sj@M~ zdTaqspF5hsJ0WP!9joH|{)phcgA>LfShRS3D>MYJUa{)YUd%S~^V;{-6CK+%JNaOys7qL$|V9pAtBQF&Jb%77*fafi* zIo!%uZY#glp&WC*{qDV9PhR1Va$DK%yWG9K`C4(>CuyVPUiq$6PR;l-{xe4ae3L%h zHVFW<2?a=TxPoN3W+EdDyzK2IzuK<$md(8nSAP5LuPC?PwYGfwA3s%o{!4qxD~9Hm z?Qtu*mek6%@G&~Flb`kyKWeY*Z4tSaaxa$`QKbJZjyQs7&)$9IU;XNTDrNm^%Va>& z8qe4(mDghG!tcnU=v=*4FU>g4Vp8vsp*K^P*O>}j-OmbUoos=DUG7bW;ChYVwkiw# zIIHfQ1hS*?YNGEEK#2{sedp{cg{&f+3PoC^J zTw`Qon|bGRbJ}xB@-ywKl7YqDma_HwbiQ0{&!j%X8TZ$79T-fP+w$esT1>6O4>>yN zz)(A;|UNHPCj-6<9Wt|A?EsWzHLfaA2EL~acy4A27J!7Wd5`{NumMpn{+@-5ySx7 z0n!2MP$LE~AG6sG)N#m|n1@aE*Xp{OZ}-n^b2~Tl{+apO_12&EySKmX+_%#A90Bl6 z__W(g4@mj<15%Oqgt!ExCbtQAJz(0(cE4y^KDO<|MPCRf_O93NC?Egj&y`O-av%%K z?7RBSHl%Ih(IL_=Qd7*_+(>&($kvA0%DI|z?r)1%(^={dLgTX!Jyst2qyJKFd*BUG zjYfZ+_09s^7W!I#?zaG^G2Qocr4~oI+fChJy|)(PzOE>dZeFvh z8igG=5!bGWhjDEO0^9=I3}=zWb<&l0_oSStaWw^i7OG#y~lMUtbX%>;)=8lJQs;BfC^rMbl?1 zsC8umAx@GTpi+pC&1EN zI~6tA!O7!g`?|q$V#@dvs7~ZLZpyhMAtLK=Ja(kq|2J-r{(pJe(T%14_7VVfJnLv( z`bWMtHu}&A2=kP50N^W!8jo0?x`3ojq9W+E9cXjgyGxpIxjHXx!Np(lYl}_MKJt-| z)V93kEpI8?wr#6rxz_SfY5d;r{a$U8HBz10DH@A}lQb;p<1r*ipOinRr(&UAc|o7ls(uFx%!v+U;|-?VYVwu#vIrUu|b7Ip2%Ef2;G zt3O}|@Ns=CvK<>Qx_4avvMlZZ9QySKND&S6`dGk{cwE_P#qATN18LFZ zd;s!^lt0{Oq3`su{Smd}=GO&yG%T{|R^^Qwzw1_^gdmC?q>^)$b_u_=v_p>drI z(P8{ZJLbjh4}4#m+V6bld7KUC_GN4lDcbF6UVNDHT)f-6VIZ{OS*hOZY|jlXljmOV z%f0bDm`o6&1LH>m`0>vV;I4`o-g7&Z@lf_6TzT@FFTRb@`lwh|C$5DEfG71Y_jqo7 zc{dOa4DPXjz{k= z>Yw*r-R5yAC(G?OZ>u&5&bO{>m$uG27Omes-z~3i^AmyHm<=GykWds*9!Dbbkr>!) z40FQ3mxqt-c7Z>Fq1gavhtHK#Yb!e*+xVK-BK>XOzP+wdtxc^A{k9(N*s-Iw$$dZg zgFiUk*I)njU$2alQ0vN2R!dhq9xeYj`}dp#;0AhvZIWGXLYR0sSrf16`vEEOn@QD8 z_^w0B=p_IgyZ>@8xcnwEgE{{FsAJ&ozE-wHe#cGO#`Dgh@}WO^vi#YThs*uwY(%WX|{C3avB==dES~<#aZb`slG>hav{rUfpg*?{jtCp2Rr?W`r z`o*Oui=i8)7o%PvaiRUrcYwajN3@G`?Z9m)gNCYhW z3(~&*(GzdXUh;?1?;1reZBP%JfQO~_c7J=%Io6VV*Ug9KFD&Pm_Laf&$LhGbx+MM3r)txXet7@JBZ?>S5vA|+*%)Hddga^*!-ZHV1~XQ5nkaao={?-1Ju zFU|9_sn9?;ee+7J0XqXm$5ZwHQyuHR^4FMar%xT1=_}8D;GW*({zi1WhRX)3>M-xA z3Ty!MiBEi@#^Y~cd2hMpmbxA&>q>1AxBJ_B<)o~gZO{NEpaxaoCe74*%yf9YX=c>4RYH3dm>0DjYE?wAa)l==NYaZZF7OIeTZFz_b zsSyG6);4Y-?#AJ*TTQGRYws-F6ItMoMUrCiek$WOn&0!&2-20^0d?2u#sQd+v8%Gc z6DPOz);HoIRvopu0NeJlICA$<3kd$g;0zbdTL2*J4-ueeo~LhRRF|{-HU=!w#@b1U3Yn zgrNIMOY(9S=f@A__e{^^%_h-okZg7x zQANMI5(0n!@BjVsV?Xv|RZ7tApY0p?OSVQAzPBZ7$)k@xT7R17Z4Gfk8zia;TLFK^ zcYH_v?TumAPGUVic?f3m#qeebK13HA2gKb`qyelT-}z0AA&&DT9IrI!6W%%=ffZ|~ZD&b7(jd);x%1N<5C;LwTj*b|sQ?rUXRP;h7f>4Cepl(Kqp z`NY>p%SZq8(aLqEu<7DwD_46h0*IB%wMRcYz@7rw=@PGepsj5jl+ltV>D!00z!A`s8uZ2Z%ryx}F)jqy(uukOj%VMgIO<7% zav2zj6@aT{SKy9Cxepi$ma{xSY&;aDepLW^>IYr(Ao1)0#n0turLI*| z+-veuW00mO-4Ekdjz}m4QH1>7W+$6JOPq#ECib8 zuN}hr4(EBPZ-0z55txVzTLI&259ZxK)HxPh$d7GZ6KF{tN$0Dp4lRl-J+; ziZU2?K%mE1w58tp(mNLI`t^Prcb~Rys7KxAJ*nbtx7}8amH+WS{>SBKfA(j~yWaJ# zYA+xkdn2lXUAuNof5bDPij?HB4z{G-%APH?`f>jxnQiX?_x$h=|8V(@-}sHX36OC_ zAavu?5k|9>_i{ZqrvkVEpRAYEB=YS=_VFS!fVuzv`>Sz{3DqXs<+|XyWa6~rmRH#* zYv+Xm6)w2#K!|$3T-l@mH{Uc|zWdfSi~R9xC7dnp?}}Y-NgjQKj}Od*)7l zt=!gz*}m0#IRas3etiJ`tuMXTl_4DJpM3tS<=sE_fpYJI8_VH6yUMYMV!5bIE$O+% zq90(ZcrXPDIx+POpd2x)@W&NcbMJ5#)LrrMULFn%r(3zt04^@gh|{?PpRs`RSh&2~ zH?NCzc4EMX007n_Qye*tov2viHxDR{AuwRL7v*olFemVj1wZ#|lX&hs4~>=O$q(d# zDcb^Ij&oc>YLGh5Fpx`Yj)m~xp5|EjqB?5dQ;8culE5$!VA4Hv$rGa;})NW+~GX%tu=)pTTwh&_Z4G;nHWW>MKz%4p!k_3ZHe~pOL-da!E#af=e z8wab8)wr*Yq>(YAQ_m&Wrc=-9Y6O>ZmqzzawSXug9$#=WHWC^0O@pfxg0KIQ++*AT&+*LVmC2J~>qPzQ*2DntWF1@K zeouuc=!^CL=tN?0q}`q|sgcyi`^?F>{^dDy0c4z5`>^L!x#x~kY^qFKpd)ijM?1Fylw|;B7P4WY096V2djbT$em zoN{sDS1Px+E_J(aW}AESIClMV;18%7+CBWl)8&o-;1`0(2{v>5SnzbRifb#!pT5*& zXnMy-CjmiD10-DSV7TGoY@@ns?}7BqxqwW79JuvoVcr`+M+A%nu-9j7gc^P#1j2&I zaQWPe6L0ZXD3C=^D3^%aKJFvo>PQM$kdFnh<;7VF*X!KF`$o&n0h4M&Fx6JT+n4%R z1Lc4XcQE%HPFu9wGhG_7fJlAw@=S)ak&I*DOr&4CnSM49-y2c#aN-Ye^IN(gpbjvn zuH()R=iW^jgJZGy9}6+k5gg}h2x8I@#;UZpvxCSu1O$OI z0tB(dsy*3=1M(*)>J!ME` zR}y^aNC*edpC^SrNXWP@B(W2E*R`g*+`06vST$JYogPbBTmEJGm(s9|mz>MZNI1$s8>CNc6(P#rN4 zML_Mz^V$$2OST&)@~k>lQEXB2?jiWnjD#g#T5h{}TUywSXuhV?Y)rxQ#TV(o+{U<= zw_FM(fDXi|9sl0n`+HZcne>1`VH4+*S_*62l6PLReyEG;@dF?DK$UDdZ_Su(ZF{b@ z#=S4sH2X^ba_#LmLBG6=K0uRyKixUQIq~ruK``4(JX_aH2%k3DXDgdEE`u2qvdMU* zvTIpct#PQYyL)To>kGx~nb$pPr=@^BYV?m1BY$M=392vN!{+3@F?aAoIC@sXZ-nPmAuFSw%^)H%)U0< zwjeAkLI800!m+Z}2C@xExoQP*D_6a?KSp;U0<78s!&ELg0LM~Xv^$A)_v%O{Sb|%x z5E07ibi<1i1SnyBv)BVzZgx0Vu8PFLxM#tFi0ZY4zdexko8jP!ltuLGe&8X`a6sZ{ zxJkx>T>SKn?zwW`DYx|#I1@i3I_S>Vr5#49*&7%xu4O>huo*S~5F4Zehz~%k&xjsk zZ&4O4$yH*I_eL51u#?bE{jY>nju}JNHm6TLPo%!TZtHHYNA-_#C8Z|XMResSD{*Hm z^{GbMOC~BB?~5^FyPz%0jcuM;XRRL(g!Gy80P$juRM>45^j4e(>w}#LkIhdkixUBw z{6@gJmqf_@slEcWckyWI@hpu$k^jo52abk-I!rPXf1i`$tZ)fZdzWdF01t2$uc<7!_s+vyjx!O57 zebe(isPigof%Sj{Kr(-ys~y@ZF#!D0AN|q7h>&zG5HnE#06+jqL_t*X`@jGDs|vw2 zwgA)$)&*;b^wE!gw7ULr|MgtdMWVosu&EY)=4XDUs#HlCz7^BdYWS_#^&7L#CPUNr zGkT%#=dNNg@p-Pv(t72Em)B%)>?5_Z*%Hvnd-1QeY_YigaBW$*rG1A+%a6YAm&&I< zv#UHXw5ps*-UX-P7d^e0XMw0|M8>2p7J(e!Ar3^Ot8w>}7ueZ)6?bS6!W|EUQxl9u z02^Xvo97G#WLIY~CT6t-D{!O#296g0^)d7UxWI`Kpbs6mWY2fpak5pA>jl563IN_I zkBeXiChh2m4f=WDNn%jB@st5X=zn=x(k z5PN`onzm6b7^D2J^>B4FR-GL{PA0imelY~-@vF2Wga~y&-G(;pP!2t|I0e3`L8^oz z_XB(tARdmSpWON=XSz4FRxx#~6|iiGz9BFgi14N+B%_NJZ< zf<+&+!I&~6tZ>uY2nA1#zeDv2LEP6jn0q>jgJ+JRt>^&=TGswU%ypcd?BbtD+(3F6WQL5Ko7 z6B=*om!mr;%D27xuBx`_-2b{UY;)=xK>!#!0NvCPe)p!fPR;z*X6Lx}aZhIX|I5Gp z%jJ*%_>aqd_jPVajS-Nq^-;vaANi3VsZ}&0!uWAbbpQL`|Ncsd@Fu`qxp(H8(%Zh5 z{fSEV zA08|J%TNDG`OjZ?y4-c!n)1lu$ugKgGmFoiET^;RT1cm@_&X{f)es|hV)HctcxmKl zjF$ouHmx}uj)h9$LL#@}%%ZJ@t~D0E#oCxWweL{nmX~KSCjveZTLaeT5N`gI)Byki zXX+5`0H%N$$4WPE_r6o%CRwoahZot6o|t7n6~W3#A{Yp=4(E@*A_h^o`;jxz!KW-z zhI1hZ=(pvkckf&X1@B30_~`zLlIu3JsUK0YIQ8o1u}TD_ZTTJy@Bo%hP0{K3mxKty z;ZuJwcBArukvQ9BAsqOr1FR(}mk9n$>O)BIIAGl_uRi0cFEV$}{J;G3eo7Paa$Glv~%*8S~>lqIVYd2io=I(Gd4Ib)+go1+HehUc9e zSQ>-Ag^S98k+ItE9U&;kV}Af+x;d&95|jNg7+fF8%dtq6glO(h+<={X_EoUg z9+E8pdN4%8js*Jvr9fK089#C~b_l6A|BT;ClBKvsRs+EC(Q^0gw^UDo6CoI80$ZC$ zQ=|Eke3`R|pUbHOXU^%Fjk10L{092%y7L&N_rCYN?NpxIh4P;Fyr=$l+z15TQ$POW zKR%tW9mw-s^|!n)r|*spc{%myo3t-pK#3(RcRca@`O}&#*ZX3&`x^KYozaWY%U^HV z%G6Zf_iP=gYJd}zsR@hvM~@#V=jO*sSDf~#JlFa-EL3eQ+P!`^@Yk2tuXf#H2B7yI z94&wQy}wv~^FMx}yzaJj7KLq;S1KYKch;1m&5*-pcqMU_f`Vb=EHHP!MRcZjp+ zzrShyQ1xdAh9g;QakzB4<1v^EI|(1^&U;+?+~WkUoQU6oZ4AzaTNE_LL<6|%!VTh( zPo{hzuqL340MN(IJ&XjR<77`q;s8V>051kSkis1T3}SQO4F~}~&JO_j&EH>e9NfP0 zwTq~pLE3@rIbZeCIY$T2-(NzYaQ(B%&$qMA6h1H40&#)ky3QaM2wB z)wNSRd*!lIpr!!QYoikfV_12%NPc1U)BXlIGVocYnHSiP_IBTvlBI zwu z=)pIV0PwEQBuKhZz4BlE(C)TAIF@GXUppsnJC@h)cE8`Pyxu&{f2*W90^kOD((MIg z)Pg&|R(=1>ZnT%2$R9!R=}&*UE+Q=b*%w@kUcR^Nu3VF9es&@2D{oxAusoX8?%tJ4 z%7-2~SU&LEAIq{z@?c@@S*3dmxx%r9p-pVrl+g?Ry|=nXylElb`OLoZKmYR&l>hYk zUFD8ft}TCdaH4WijLpuSNI;b7wsdXsH?6EdByK*RYTO9`1oi{r7>|b2Ivju-4o6q@ z^Z`GcxDa=*&LRxhY}x`)xB%jC>Cfr;5etmG5CeR|>n?0<0ZZLk(O^y_2ub< z_yh6-L!ggP;7@OiJjW9mm%)^c+Z1DfsM|H-bw2l%089ab)F5I+Dg?OP0evH4)Jufz zF#y$*;bdz}{18$iBC-ppV4QMWGAHu?SxMb_pp~HNj`A!+be;sF8CdEP1e5m&p;e_J z`Q3qexWN?ab37t>#()d~-6z^2QU03je7ST{_q~2~Kt5xDc)R!xrBV5PVO~W3^Ao)< zOlbbG!4YV^b{s5t&bo9ZTIEumSkK5FRyLa3VD# z3J`deNX#=qd>Z?unbXw^AZ_T2ebAv0hW!5zM}SpDu5`BSA(YTR)Ta@xHYy%0K+QhsuA; zcBzb#0OYkMeQyLn1Ak>S5nd~=F39<@BEa<3$99$P`MdvT`QXPNDX-tLwv2`Qx{yAO zWD)I)WMbZ#BH)bStoVqKwNte z1QmdD9VeM0x|I7;RBf{|5Hg+*-F8)3{xaeOd*I0 z5(8}~aiKSNzIS0(xhAxCxob=ZWc3-=QE9m{e)$bBjg|fyL>BRk`$$8o%|`0#2!iuv zB_b&O_IxGdJhTc?cfLke9F3G@GMu|8i`@M}57aS1OtnJ`eB-}pf1Ifmz z%wNw{TaV-!>kCx~8asyef6+qTt3!1e1XyDb_d z3IcO*^XrR_^84PBkegXVt&69loW5;nMftv8`or?rlY7INCr(=?{I#Se;t%`y5+{?aL1QE$|406KN~(6=T5ove>7O zt^t6uSOa9DbHQ5Z$N{iGL8AS!+FzPQU0Fod)dxRqMQCX=fR%qN27`_p1Zf|zu=C!A z5CQbvMCphSW%4M%T{4OT{3}yGU^Auz5l0^lz)%}74z#Upims@O*s(u=asZxv0e~gf zK2L(xtJq!@Y+KJfjOTOWuZ1Ef75mDk@pe9+rX0S>&5Qf4EFUWg>(5pmg-jCxU zQu<;Kf>_{^>YltY<-WUaukMD{fyM~5Ic1G$>x{AWzFyCo;NELZ#?!jdh>w@5-fL~= zw@Usw0^kOF($8l?m|SgQ0ce|V+;r1TwTErOUr+hUZA1lk8@u`4d+$9JRJW`jD1YNs z8_Of3Bm#WXMJPK{e&YQfDhCtZmWcXVi2yrvX)jK}0w!sLk422Q)1v+#KJ;J9TfYBa zCVI+=cqFVU6Iql2`uP3(fhzCap#EJS@mE)Lk`oJ8BuHi$UW1Kya?Gz?ei*Dj=iC4jrEZU85 zuvh~*ed)>$$DiK-l8*ggM7jXKf^Yzc6hZz<5TyPCiE2okJQ7vGKzz&Bg(IX&(68KH zi6nJO+Dkk?7!fm(KhVdaB5aNXM7YKQBw#PvoWMo>%R84qpt&X>0~j4s5md0tI1%k@ zA2@c+Sh92n^gt|^>IFXT2o%Hu7_R+Fn;i2Zz^QBVrRiBGYI&EOO&!{z0q4#os7~q@ zw*f(+JVt#I-9L$mHX+UybdjGbU=F392O}v_2o7Fx-nBHX5?-EZPSxi|B`}!$Cqf`p z!Y9Pn@dXL@oQQr~vkmjc=-&IX6W&cjYZBqk?tdYA0?21W9E} ze+^6(^%epEv1~4JDK@`Y$eUkHLeSO+WA$PtdyUvB@Osvv7qea8s%6g+056j#Z^1Ir zO|U=tlRv4w;)idtHABMdnRFH^t}_y%Lc}hSvuvnw99J9=`1AMQT*`RFt??Awcj9b$ zaLvl{dtW0SPfrpe0o!E*$A7}Au<3k(DVWG-T@+Npe%2iRDqa|TAt@j)0y=6(&YKonqmlu`=%%N z?1VcuFAz7{rXpkEnl^YYLJs4ijWJ?|t{!*a?L)vzBF{Aac8}}E!F~wqVkbbgKnidw zhObA?lQ3nQpJoF^?I z6%fVpa8y_tib%UHMutG1YNfjUmbZ z0C2$*Oh-^1JNS7Id`KZ?stPV0<>&uPJ^b()AWN0Vdj7@EmGC@9V2%KI9)tH{+-j0s zOEQUkQ23~N;NQFH4)DC*iRNC5Mk6Xn0mRi-mj$Dy)cwlax0JGZW!V$(UzYZd#31So z5y$`B#~v&1{lM>4PP(cC0&JISi}OVOY#K303{;-BApLvV=fhD!sq^Rhszuet;Rh>R0mK8_H`ak&GgK%f3ss^adH zaUpquBhj-cfbIp_q!NfFzUT=pIX)wr~!D!dR2UlSPDS{ zxN=MS*x6A`Mr3c&@5<@QlR{3N4v+?T z8DJtgm8M?6u1?04fEyt=9GyQv*RQW!wDY)k9QCy(Wm^=e5rhjC+(+Ob3xh?W>guz& zSnO|FKUjYJHJi)7`fpE^wL=kAr?QD~K(E`_SAOOXA1(~!e(F8nTQ)~Tsx1~ay7jiy zTVQbiQd+j(2+dQvqxe1O&i|Gm&DX4I7pwl3ch)KK(dj7>rwIRgUODB(Pfmh=>h5 zEkXf^j;GCi5m7e-L;dino8DyH5FbR`R1#~_j*jyV$C`UdF!V#xziI4u!fAsvOM0%e zz4GAAvauz_pLri+;z|Oc0Y4D4C2zdQUy|R29t=>-7S@>kVA5rs2w!Q0dCF z4}nO87LI0==elw&pw7|BzxkCrthbdT1wtj<{He?0!dko5n|plgKaj<}9Ei^IBcJ?Y`RU*IX!-YFJ5*lzZJWx5EXq&ESZi&> zutMW9s#=ziY<7{O$KM)Vcu8=a6p3iAmQczw`a8`zk@h zPQXrle77ykgZ-gyg3FBj{A%)KkCaWx#fY2BLWjF*+| z3E{eEQ8>AjXYm%d!1E&o5Gl}jTFk3;e2y*r>>1Qm>+4rv$9*G;aE0=_2(0!Vl! z^!!K@27mH`LEwV6Bk|WC2}9`**_w55*LIY5qMAb+;MeDInPcxXK<2yc{XQa2q^Te_eVXz+B;>8 zsDT6`%G0UOAMr|;&-P(e!dfGQxCN4GAb1d5-ti1!%8Cd-;{;Ik!+QrD2O;v%dpN{Cfbrky3jy-e=gRK9)1CtZ!O;-MR6uoG=SY}{ z`@Ls~C(ki2M)gEsG&q2GP1y&-{j)`xIAXWJ#dQvOre8fUUf%JB+sc;B8|!P z9p`Tj2a(R}0fP2M(ExTXW)I3L*1SH3!WtbFG``uF93{Y$SZfAKqCRc_h3v2x`=o~V1eyeztO zvSRuF@#p`n{KD^ks(kKC`@>q!&%Of*!5iP}bLkWPE^vOuaDU})p4fY^oKJf;gd1o1 zo`ASzd%%~(0(KxYK*-W8m}BAchr_9}oMs#*qT1&C9tqbkik}6YSQ)I$PalZ!5efj5 z&wDD;jHP8H%BT0-f@-ON&7<>u42;Z-rx zOBsCnf##_MlhF>f86!t@%dWF2(0*Js2?4r8vrmMpwW*A)XaS%x)=u66j34u}_Ca45 zpsmRLK-_uxsWO16c9Q}OC(zA-7?+LXrqz|c4p*@@RYb-o?QkC(4BYCWjMq&AQJJJb zst*Bqj%K&IQvoq?5za+=uvlNZAvBE7nYs~}nN0x~3qjybrhkj7A!lOfr+>KlH5pg7 z3v}^I7AK@^#+bbYOL{~N0#1K{G9sapEmLD)%#BSY7~?_Wbr%g0fi@n(b12>yA{lN9 zVR0sm|FW|gr*QYBA;k9O9a$F%Om3|~iq^)x?_9>iJh>^mOCm6y8ktNa zz!;q7o#p5P}uR zix&X=asIZ$#rad`ncr8F+PYy3tSt?=pO19Fd}Mq$mu`#^m?HpgjOTDwtuP^dQ00RA z5Y6-Nx4k7UAIQy|+$ODSZ8Bf3B9j^+p$tTY-&fipumQFO*gYE~KL5ejY$-qa!LP=3 zDZ15gI%`*U9snZ&h})uXJrke&fA*ihRQ}m-{b~8(H{DVmynR#Im}q-+qL%T=aOMXR z%J&mr{(AX$pL#rKl#}xlWazG;?7+GpBL9i9vbsa@hYyhvq&gm*B548xJx{>NZvT%SPzC6;9tEy*4$s)^-Kjf zR`yTqJyf|^#(X$rV%oLg&UYR-T*rQU+zV^`_K1kN4B{%q1faWR$j7*jj-Ic4a3$b* zG=#yqa#OA|OkAAv8wV1-FZUq;jIX^24#v0)@F04$kwtj}ZDS|m-zcN8fbQOm*q^TdY=}Ey zgApLa4dc)Fd8 zAfsB{0TU{(a{cNtzWP%G{-htIFeEzF$3C|rQoIvF1+Ps$$@o*Rkg||=Fs4)|qjFB982=F&&9PO(y|Z$+yJHCGV6)CKQH+R%n<-D z`H){H2=wW= z8d`ET2i#UH$)Xd$9E?7EeDrY67#bjX|Lt`9s7t~Cq`@kartW^o?VtjUio$l9F1Gq?OQk0U_yZ0 zI5GH2lPa)~1n?*TGTOxm5GT4nfU=$^L)yfTUmL3rdI*(0Ap*b?q64QYzcE$kyhuhI zQBj@>3UxPM`jaohCKTppJ#2w{+RP39R_RYeE_eLfP&21&zgNgwKim~k=1 zWcVU|Nq_4bnD08E`CE@Nj?_TMLinxjj~Xezc~p3QR2f_oYn8e0V4lUgNCFTeR3J+u z8Q=m~?LP9(^1H9ZO!~o#!1FV{1V4y??%Q+lmenrSw%rs0aOA-L%$ull5*_PIK1-IZ zOu(r`lMGQh8f*U=&?iQHdlEuAP0D*76+t8tA~dqwm^jLqF_8pg`@$t~ML6c&2S=(z zZAmzP1PdUgvN{^VVI{&NQigl}{4Hh2wyo8^=1ArR1_5&*x+n>r`T9bowgwmz^Tcl- z^UB9uv35wUUgr@eJbmWUjWGf*+#7LY^yg*S3KPpDGI?wi#Mn=`U=y22Pcji_OT7eu z>uut;W54P6xc3PcylvaIOLp?RT)EEsnCQ4e?p$}=ys5nP&V=DTdb-#>?r^*U&Jkxu zPm1fsc}+wzz%1?7NC)2Vz^3xbZGGi|d)AlRv$4>F`F+oW!{yHC_KBvyHX5!s%j!@> zr#lm7^>jF+J0tRs4Sw0Z?_dR)9qR_lZTYN@D_~ZhvSBF3fad~g(Us!p?TyZ&ALq=l z52%3)Zdo>hVF3={fHaPs*c$j-bSsz#fGf8e5CH7%NAM`ab^WqAPgO01yMJ>3NCoGG z5r+e5S=Bf%_m8DqIOQqfUX9 zYvi6ra-mHO&Z=%W&j#0tI~O9DmxGo|z72);(1mm7bT48lU$ss0RN^FKVqEgSCijS} z=lLUQSfK+~Muxy&HX)3?7w$uF)rBS(7*FbFE2}H9{q0(({_be%yc0=As=bfeFm<}S zHZ_iZoDb0lAOW}`vNK>EleCSR0CBQ9Ag-_SG*1lUaX!ed&sAMei874|pykWBdl!sT zSAUn6B*yqHh|fP4w{2TDSE7^w=vatg9taa5>>k^5D1=bwJtFP|mp@(sRV*G5y(uvS z*nK<`0<4a4`f&4<`=31*1|w&6-54Vq3#DBye+dli&fm9E^u9n?iXJovkFB~jp4X*(W?r|!`+G=blu>pNL)|GV!B&k z05Hy#VVZv81o=x_utx&s4DN`@M!Js@k3zMe*RG5amq-0_9_W(* zxc_8~Lp_TQHjD-?QO~cOyp#G?bLP0E`kpH<^(pss2!IPyJMGD(zqn=rd=Ma&084$w z*cgg}NM+%98jmG;cO`^^x=3n#BP0-i##nRJt9%4!I{NS^RquU13#`59iMjy{u{IHQe}Kat@q+3AA{^O3I4wKY(JxzgiCH%eX3>jV z-&gk@uC7~L`|jE}lto=k{CUxt_QygyTyqxU3W6!yrae9k=9FU$R)m;f7XZ+y5^%5# z>FDVvB8H{kw!i{d;$Y>af+~`==mHbmH+{M?aA91uAOg%dtP-QDXLV**tue{K_!!Zy8r0(g$@^<2UzwuT=|T% zdTcQZ#BJGYyI#BW0lu|4Z%hy+M9%#IBt#CPLOTJZO_6Z%Ct?8S`Pmo<;H(ZIrB3bP zi;fFrDA$SHYfCEFvU@-bAw23km2$OlX6k)E+qEuV2PV_!+QE!(dF4zs0oZv}d=BW1 zr{^fhlXq@3HV+6xDi53WxNp_eI|)$r2eH7A6am2C(|O}C7|E3Ai0Y9~pDduH3M5E^ z5CQRt5R9v$=N_b^&vlywot?KRy6^t#QF1h6d3t_8aWHW#QrMpHW2p-f5o6M{jVtAn zcn(aQi25qePkYzKi)3%qI)iD4`bll918f`Y(s<9??}<0d>hgupf1%uc_g&NA-{yaA zdC8Nt0pmjC|H)5&vTo1&!4H10dLlH&;U#a>T!oj6z#IW^$#C3QKY;(fefz3te*5#N31w0}7eKe3$Y*i&actt@h7k~-`OIf3;FBmIE{F=RWx?nr04$_9=4~6-mLL7L zZRN53$I1W#Al%c1Tx3B%pV%E26tEMWj}aQ4l^Fa~xWd)pK#4zrhJ{o4JJv=|ee_7# z6)S(D@GY?es3BCd3{FNGA}XK-_(CKDT&3)>PWNbx5F6bFF>y%@m(R6tEL-!EL~sQh z#8ssp06G|*JAkKyWeWFCM&;v4%UPKSzW<6dTQX4 zaz-Lzu5N+>FZZ^c_XMT!+(qZBL_?1CS-TKmpc+u(jJ2660TE{8ULoey8YqP5GkVO{9MakFH2}^rW;$Qg?6iOS`)oE?%j*F3no=b?TOm zJ6Gb_GYG+s(81wL^3=dZIZktGAFKUJR3@G8zmU=LcEHXRa9v+4*3B93GJ+uYiFj%J zMFjLN8ABg-1B=7qS4lxggGEatmC3g)e|ca4ZSf+;vw7C}*le)L*d0(uh=@qYcyv?j zPgW*K(Ef1!=BhqyOtirIY?6O4*AG>Dm1Q-WpcoN56M697?HkIzy}QfJx7>0=fPV(0 zwqGPt%*$Ke@|NBSj=IRfCt8Q1GB)kHTrQ1_zX(fPmTHLoe# zwsoR;1zQ-eHnoXnF;O<6Vq#*VeCku5Dgz0&bN~JK*YbeowUn!VWh=KO3#?6FzUK|M zlkwNhb%?A_#nKsw*nX9$J~n6J#^%RmuhV1Wc({_`K8JIR`?_ghdBA$Sj7B$3jA)VE z8&1hqsjt|$CR~!8(V|+2UNqUOCH9#tu-x1b0>EFmRo2lAz%1UY!r61N!pX9>Cthud zFjvqUKL6AdxVeGk2c|$-+i-UqQy-9cCPrDFf*sj>!`3;tc$%_&gW=Hjc1Ie$oQ>;KG3{X~4;}XMczb#L|Mi8vqL!X`5IA9hcqR2hcmq z^bVSiL=~Vf;{430kLJ6B!?FaW*CDjO7^7*_?>`W4Pwrr(HOG&&xIKuf#+0 zAe?wB*os%mQP{yBs{=Rq)7z@Xw9zeS< z5-dqsUZ>;6D&r!!(}LKc0Br7fHk_g&Q}61yMN&tIU4hWrv8Jy+ADacuiyZ-Cm3qdG zfQv&cYc==39qR_u#t=98LxM-u#T~Is##UhSy7KB*y{d-yZBjh#`bH)1Gw`RjeB&G6 zSnZa$Ap(DGpG!B!2+R=xH^y_AZiR); zF^J;_z2H2*`sh1j6nHdT?5f4{V;z0CMp_GolxG;`g+@ht%~gi4xF0gk!^u~H2MbKQ9#Ts7j0ZGd~avLXdY{rX=eA_{cj zno`$}-dpr{_T+q5fgaf<;bg#jLEgXqj4h+ZP9@N73?!S7SEMS>fWdVj-AaO46zMZ z6T*O)e{G2HWpPtv|ggT z4?p~Ht;wc4K>J!n^D~u;`{2|rUtR24FF-GbWMb2uE8g?CRPqY>R7&O$etMUaYMcK~cnMB$qvy51G{HwJs_ z!bL9+TedV~vLl97C!;fE@qOw){N=zy%H*&zVbbwh(y{EFkp z@!}3QC&VfdHW4`EJ;8wJ<@tYeC+mwh0ZGK6w7C*8>59erK&-im;ECK1MOwj@02tRG zGs$mkj1wz=U_)n%bG8u>$pvt+z%%SK7QlX8YzK&@0gACAiZ*fppYEDeg&JW~#seNw zEg%?xzdQ^Mm0R`M0UySF0Fcx}UyK0|r?;-amwWnhjV}2_${8Z~nj|Km4oTQ|@?0B6UW(B^jeO zC+mrq@DGs3`G4grUn$@9UEfvJmpgXss7#IdGna0x5tt(YZmj1~TXXZxH<$bFyRRN! zufEJ;WFeqR06?OFarJxf!3V48zqtl=i|4fug8L8^+Sig}+%cee)2k8_U_~OpWr0oP z-YkqU9LpjJ1Ztfj3UJfBs2PtPizk36XuD(eZzB_ifR85d&*E@F_eSKuWZt>*$TR!O zuEPSYWI>LI7m#Ibu3(A#t_w{1s?QaLQkQ)Q7Gz;P6OL`)=y>H)L4-wAJbjV>wC7ZQ zA5PwlF($8UPRjz!*LqiWcB2Qk=mQakaK!INr?_AK`9Kxm(CM>#kZ{d7ZlH=I zws!#40?rxNtbPEhUJEy=1%S!V3b2)^NI8B3cpeJ7Bh~O+*JjKBzWaPCXPYvO5ru&I zh@DySb9Y=F@x6qI5oE>~A{2nya0o7*1D*%tMRpCjR1Fd3BP}3Z;lbb@;|PvPNO&0l z#3ltmJg8fp)%HN2W=k-8wxnVeTvbxYROWp7MMW_YUu86Uy`a&aA<4ezg$_LTJ zu7z}jip)2|)Zq|X8xt9c6l+~1Mq7s0;N_Z4g9a8*h_9{1c18KW1`c&1KV4-t}j25 z4T7GWu-AM<=n05&EDI|uYTQiK!v>1%PINF_@J0lK*zTvJ{>Y-& z9}E5s0lei|XmLPQjGu+hVn^@lWBF&yhkK{*$C(?$Gm$WmEUb%;pZbAVb|`vTfIUni z5D9|&k4CCc7t8?2o{_QaxE8O0EV=s*jaJYk+6U%ma~}|;lja8Jcx9vnIjo?dUgKfy z1?p-$qaBi&zh)u6gT7bLlT;4p9E~ytW z1?+)keIHV8f1ai1EZ%-47?N-HG~;|Lct+++Pj~mIt3I#kyq&5AFp>_1ys5^2^yb9# zhymkH!eR`y9muotr(R&BCx!*iAAq0Cn0Q9oFSTdZ>>o%F06cO3i4dhLmkQfUV$j7` zprc3Jde03Q+kityPsQdUb|HiP$(tyYiRQ+0sK{~`pV9Ihw9FQ zhy&x#l~tR@6I0;!H*6`lZrdE`-s3fhmk{0zhZ*^r|0IYFeA`%EI$qNQ_q-0g>%{)H z@V0jdw_<_H=D`1X4eN`3^Be*2q7U)4S76dyZ)%g=#0UJJ{p@E08i}JHO58V~_WeSn zndSHmY(<9y>VM(eVhngRe$`oMaYlQN+kSUyE7fqKI7$E~PW(`Sb>ER}!yAz2k9{tS z@|q9;tot85Fj}6B0iZJMn6^8MWVZ>5fG@26#iAEr#kh59h{Yjvh`kvm+PTewZ&7z1 zcPAf%#9`w0{5+WjQ!D_BG$SvT_Kf;gMjD|lVzmPcmfQ#nQUzSQkij^AqG5rCfTr?R z1z;`i)CDBw#Wje~saW#1q(iXe`p1Z0?h1j*!`N;Ar{YH@>>O@xeEgyY9ZH&T$0(?%lhq z4bhjr^riCj(@$4Pfw_qwASG+_wyhPow0nWSiO&cJ70$n3ta0gj%bwXDby-YA_F}B4TnkKFWN;v_e8rY> z|80Zi&jd%wn}%DRNC?;?+7cjN8Fzp+akB!_7NyPo%X9u*f}X?&Ixc=+e)>R_7TgjJ z{pMIoug)Iqz-dFm)zX`uCv`|Ybo)S>Sd>ggQMX{ZT|~^iC+8T>c=f@cl7pr zp{H*adi@qZ-{L-+&)tx5U)=^b7Xc=)#_bc$vnU6oP4xhzyaAPaPrHuctOHh(@(?n9Y7YX6M1gdq z0&nt=HXuZl1GKeY{XnaMwDA!77oZ6XOr6n}bIR;=*rghxb!}1>dxhl9J;%f80!ew* zgJY$ZP#3Nmall|sIuas+F=hqJdZzt(_L+92Hog6r`hHgK=}gbMQtxn(RU) zOJl1-(v_^?`V){T?FH;&EeMZIl>+2#UUhTC{DKdePq!wNFfsqmsJ{@jAvemVwVi~j zs&zt$rrH|dsT@92-u&SGmDtcnA-t&%1=<0?evgih)>t1vnGKNNfLUF=(t+!>8^BQD z{GWXC$r?!PqaXcfc~!g-=D^<*x$)DSRmKhYjIIZy0h7hZhYoK4{{1y*kcfeZ0oPxs zqz-BXW!rvOXky849t*w~WO3UVo#x+s+dbuP{PZ7{!8_KJ-2wO0;cAwLr6Q8%1~qmh zYJmXzp|~cB%|0G5p3JJbK7;`W*hylii<53IX;txc(zKEd~KmP+GmFpgz z@+`3E+lb|CK*i#3;jby*h>!UJj1%F4hZB*I3If=u2R9hSem0v4Arj#1kDl~sT+O;2 z4Le&3e)T{T$fV>7{`vvT5ip1qn1=vhtj48}7}~pnfY6?y5E=(!q=>uMv`Uoa9x4F@ z2CzGt`w>a2sU2cmS-JY0C-sn@pFe2>70dkjQNiTCinS@v*djPaQpZ4mY~iWA2gyrH zq1^}?1e!jM#5%q1@^~@`*v2Vsp~657XiqJ-4&cQ)O}UT6#`~j>3?KRaA5R-eLTsFb z07NJdawv$T;Y8POQC6Xn>#Kn34x%qki>?~I2U9k_xk!&)w zB6W?7jaLGM>nLs?0eCj=ye)&xJ1V2m7|Xu?yKXD@-1&-1Oqk!m*tdR(>p>rH`(JK> z<6LS3{)hkxXmbNLSLHRYfVNLtE0lAkB(HL;FAV*-7E?1KCYsveO83uQ`^JyJf`5|q zjc>+W3wlSu1iTjgzx*hA6B4C=B4S4o15Iq4IO9JP)h=A}JlDkTZOd%O%22LF0|Bru z?nmqvEHoqnTn)#v*jPTqFaPm!@RwDO(;Ky>}BrGsvr63Mmt(J}-IKg}B@kTMFs`<6y@?Dk!Z%5Hg6g zOW^7dg(CQQ9}q}XJ7;p;x98+NRG(w6g1*I)h&G@SLLMLDLSK%gJSwcU8MozO!43!T z59RsT^J3BbPBO7QyJUJ7ZFV&jw?r{35Z;Vk#UA+Tm-d#w^FMrVBmfW8x#*fy$0gt1 zbMv%;vAItYXucv05DQd>xd|GN{`>kx_YOddU>#YUn&%juH ztu5`ox$hfy1m+|FH}12!s+O5>CT-ivPt0v%e-jr-qr^=}`Cf=*;`=lT-y)@a;zo6= zlez)7@zm4LRHER&-?F^?>O-UDj=^wK$$R|lf)D_Leps0;>nwh?BP{YbV4UK`L}AL4;= z9P#*}NHI_aBOx>x(+O6zHtZ0ZZ)G0R7Q`0F}VxlodKl`mqo& zfLEWi1tB57cK8jDZ50gI4@Oz66IH8-fr3X=OfI z+a+gG;nc6^{%ZLzmT>VN1QXkgi>Z|>cgG0_h2t?2X~gnXWqzJ3f?yzo&1m{cRYBq) z@KM$MQ~xA&p@8jJnDEHyclBrp@qm!5Mt`}lssZwi2GFNf_Bz5L7;1?Lo_Tq?l_mK*i&xi&@v~$Yv@vgWJ zL&4baO>cTr`S6E7T)&;41ONI8ebfAzBLHr|M`gnGPT<*^*eG+NY7>{$ElPabwrv%J zU2o;Q5Cnkonut+(t$n`R;-Hfc$Wb%vkFKWLzOm z6YQn<3jE_CEO2l*ap%FKeF0KTB?5pqb;z-(V2J32v}LOxCc|k9xPp@0dobZ>8J!VT zGe8u*jsyd@x;Vj|w1-DVjpUeq8%O!*))%Jiqv;<>!Br=XoFy6OnTgbAs;Tu-eIIjw zmoJ_zO&IJuv+o6>Z7|&br=L7ne)uoHuH3qPYu*KR0#{DjRL3XT zdgrQh%R(sQ;n8!$v4D8yqYZ~aAT2DTNKWQV*pR4k}{kQ+Ny!p*f5$EL*43NWV$ACP zwXeLR{H+HbDgW0ecb2=?E(`D_3~j{N0}BIkahn3fYhnj*CX3|SEPgnW&+R%CH^8|4 zCF&Wm|Ct2)S(LH>B9LZ)M?bnci}As@>j5fJ1_3d>Iq|*S??lW4NQbhp-n?N=7BzZv zk=>FnK#0qI=J05^;^mb9cq}eg8zL3ZCt$%{4wnUZ0rm;81VV5O53a5RfZYhIE;rHx zQUV+x!emkElmN6KKoWy89O1e1u>}YK>5F@S`A9%jS`#9MQ~;Lcf4r4x&0^8WS zm8Su5Gh;Vm{mmhw5It-xYNW*UQ~xi_vtiOjI|1VUk=PidYn8~rEh~F*MA-z41OEghhSiW@=2tvG9(qi)v~s;vu{Eoyhs&Jnx|p12zV-Uw{aeH?JG01nQCT zQ`L~LJTy|?`QC4jD8G~B^v=KO?qB9$W?o#ok7RBGdy+ioY(mx~1Gs&vMB{wREw_~S zz3+YHcYpVHtJvSY%AHxZ^KlU;>(`Kz-&;el1*Z??iO`d z1F^oOoU2J~F~I$FqUMF=FMa#nVa4LAmsPt)VNYbV{l~HcUqqgh$BxF`FXH7Al`U7ylLi8?-F0}w|UZN-Oiv$4iVUGceO23~RuyIXYjMISr(*bxy3_?KA zA<3BPOKz)NC17zsUwb-fV+{j6Fa28{(Jf=gHWoN>fUkW*8Yg=lU-bcT#&Nyey7x?( z2pk4*)mSh<2>3E+)&79pgg6Pj47d!+uiWz?u;~Bm_}12UfBbg+Sr>ZpTyhB0J&fy3 z>4R}Q6Yxbq0dnsj?mdEws{Zg!;K)~GT-aXJU`{!w{{USOCX$Xd(Zer}C?9}}(?Csg zGVinTB(WL^ktS*))fMB^(6br0MU{|1bdc;Ak%bxSa|xM<^CtDFghJ*Q1H-=P<_FkV zgus3LP-mCGretwk2-&i186GUV@@_ovu2+`V+;>-f-rD(mQk$#3l|{0KiSQn=71*|I zTea%v+kekJ_tZ6qhru8G!5@^JJ9k!_BlEW1|9sjqckfF-0#~=9zx3^zt8tbQFe!2U z0NGA{M0RZ_Ka+MQK)1yK!O*DuS&F^vDfd_qlsmKR%ic8Of~a-^1HJmrZRM}Lc2jxe zNc7x6KstLd(Zxy^W_Khy*@ao?hQhf%5x;2B&~DCxw>riGbk_GKun^AsRDk7Z7Fh;r z7G5rZME6AB4C*YfT=$5j1s1~n>R_v|GhE zZVH4~q@9fW_NT7>313`AyUADU2#5eiKrT8Uk+G$j^guiSI0pMfN|XfGVTi{-uSSc@ zn9#=y5iDHo)1#9Wba^dM1t1bU1EGo;|5^8aNHdhHHekfS(N@d?+5md)Y=D{yL4yDw zqN>B;)EPC&Qfh#yK9Wk!6e*D;1rMoAf9Hn-Ce`Rt z`_|T+NgGRHh z)DLP8{X_Uw4~*nh-?Sb-2&Vd8=`W1qz$AH3>6hwme=2V4 z>-=TmdiH2+R=xFWFFEeMPfPYLknxU++$Sb_tZXf%I%OxPCnZfHDyP7K7Q^Gh1D= zodXmC0|C{)`qo#5wN6ZctjV~d^VuVPc{V~BKXJB#=D`pFE8@9u$J+if@7(F~%>IJ` z>~LG*w1{mV*>gAxXTT^Y7OtT=4dUOPvUCwnmBQU!Yvzy!u6djU*CVIf`ypwBm=-0s0jT_#|+^R zHx7XccSoSkfXrBorfft8mpt4h34?$@{LLx#tc;(h}9wDH8|RvQC} z!bW!ru=-*rFcE-WpY4Njp>{x{Q{9vRK{#B1jx#@!^R{nYA3eIoi0&LGk2hZ$DS^#W zaJMTXRy`W<7H;@(tn<0hIR?l$wZ2FXYzzcENEx^U5{0vD2Ub2SvM5&ZZ-}H8+uwpu z%zG$Ai{yT8jV2i&#U&2M!vV(+Ko`@u_?^LE=dy@UA>t;|@6-kX$K*wn2tETGwORj!_O1Hs zlq2FHTM9b^BHS9}RD8}2eC!b+RWcCnzc;0*F3kewEuVLIwlwp7^)4FAndfJ_w{~PI zFHk-nLqNfyg#4}6`FZBX&X^dtg`vFk-ScB0I1-|h2;VM#KpFvuNRx~ibqsJODX0>V zIuC0Bl^`;P>^nSTgu(HI1NNRiv45-@zCM{fEyRPMsyP_@0_8lh=U{n!tg|&4iyh3m zNQVYP6z<7JL8M<_j$O>(edmMawjEn5apxJon8|a&bl7v?J@0u>`D=geuT>&IovyX# z#wc(ueRGb$9070xKQ{{^3aEx3oKUkyMjqp1L{lwW<+3nn)3f1#2( ziOFpu^Ecn||Fd@zJJm)y=6-J3<mp&wr%z5T4xA}vycaTN!z1sB6Zg0HYZvrreywU2PXgy4n4XZJnS3I!>b zKfn_S0~|aYARlzv(`D#aEElbnvW9OJP~%k)arwmSZ!^&^+%Q}`3%H6ImyY@Ym4np> z@Tp#2G}Hos4`l!oq$1cK5N{sP!O#%SUD>Ulk%|HxbtGXxokHAyfEI2R)dWNbfCPBK z6(Vsksf6w!0H?-nH$?6&gaeTP5j9&(28a>?3;7v8Vr_^M`UKaERDo-qx&{FUfVnuH zKHx78F@{PZPhcs&Sba}lQX-I|f4!&Pr2^?90or8@5x1QNe7#oQeG*XD3t-*=>Rg1OZXh#Uwtf-b@2n}kSZs0X4k{>_{jsi1P~4#@9G zB53)ILF+gfa1s@8VN<)=0_;^*7&a?F5H==z$%BLqo{(})*7I+h42b}x&LBI{bKZNB z5xYD`+k2OLpXLV57rbuN!qz*Ai0C1^#1XZIS3Qd z7YkhU{rK>@Pj6ql&Wp+mO`Y3HTz6wXZ+BaH2XbC(pIr7fg<}udOYzHRhL=_=7>_~$ zYZVFAGK>yd6b+mdIc|@#61kjB3OO>mT3%fnUR~8_hHY&!gqm5DABtq5-k^O0;ONkE zv#=807Y-XeD+@Z{2tefv=&NP4#i9;~5XT=XcP^zs;C&6f2% zhFipEN5_ju90CK6g#v*HZi2=c8^}fmWpKgd0qEfX0gPJdjS(FqE!eDcjUqlEIQSqQ zhWiAB0DIyan86s+p_AAjM3w}RI$@&9v*@D=0Iay3^q;x`Zd3(`s8JCB&;S$lKm--3 zF7)|u@jM3+wh5f{8zZpXdTuRQ8xsxJE0vu1Bu#Dm1}1nD8z&nAtJd5AcBNn})>`-lC+ zcyas8DaL*fL!G*xG>!BKVl7X089Xa=2m*}x;u(P>Cnr)IjtSbsy*Il7%^~+;qikNd z4;}J8*r%dcFNIWHwT+U7|!i&0HHw!XpFSPpat03?u|VKPGE9vm$zf`hND= zXYGYM_Sj>MBF7-7zm)fm2mm+sIRsL1wdE5Pf~zdapPJr35z;G#(xd# z$C!8=eR!vQ;4zM45k{X}D8PkdVxu6U8`cuvU5j_QVyLrd!kHt!o}`6&mBi(!5D=9Y zD3(9g()xXZ;yU*c0)Z|P(a{Vh zbw5#Aq0T{i#IqCiBYK}uTWt~Wa}SvNX?kw+W&GFK*=~{-TrsgLu=#9hH9~lA8Nf!* zF?(i6lJjg`IRA{3ZCmPy`%jyP3?1^!BXL9x`1#L&-XxA!UU{X}4)_n%`jDaSC)M9zhr7mG0F^+AjQ-%T09>L5B3fnKcn$#A zlf>CK>Uxbl5he-jkQ_jWU;tX;5CuGboy|H z^ba6_!v{#fD8&23@CWeW5ZRd!F+5;Q8|G`S`p69t0A~o$07%B|i1q81Yo#xq-NVhi_6@#MJbH z=y}wyp;=lj7M6baadeiyQ61gK(J$s;+ngjV^h}Y=qUX8jC_QibF_IE-&`xSadi6MXi>@013 z_4NEz;U7P7d^r4|eeK!b_Pq5rKYc$3zytn#-+1GVW^44ZkA2K4U=D|Tedv2vlbjiv zfrJ3a&_rgU8V6}!z$MyUYoC4gu`NE~`Z+g{7vB~?fd%Ege)`JqOj-K(b?ks13o3?9 zSD$^96sd66;*une#eHwwq^vN1MXfke?Fc9!Em7pTvU~x~2I;M}08dwRKy>_5P=lzR z#d5{N$PX)O#LxKdNA6Y=6je{6Zq<$<7NACmUJU)31!4pR!s6cgqj`b*2XM>fQGnzH zfHgxwan9yK2jQUnU>&YsMUqZnaSyj$B<>$WMYwu&>}>lB2%Afo(9i-1=^CI6h{0L2 zkq?9bb_ATq7!nl&ss%WGz{$1}R=o@Y0std0MDWp=bnhgd%~oFt6-(O?8IyJ0k~#{l zKz|t{?bT|`xRRA?Ufa}fI9mX&L-UHEAzUsJgYhXVexw47$J+zw)~HRQBw{@e(4z-8 zbq8*L+WD@x#B-)*woW>)hj6R&^tad4OWt_-u5#&B+HpiaFdCu);$6YmA)p`ta&&(Q z`%Ac9{Z~k3xngSWC4o2HK4VAqf!e@4HZ*SI`1K9w=ZipqNXRv?mpVYt8WAeTY4abp z^KDejifUQ>H!Ds>t?oIhfMp7KU8v`=MBWv&vr+weapCeXzph0$PsW@gR%R#&7Qe%a zb>Xt}&QQ>+D8l7H)`4Ee(pleU8`sC**oq>b^7)3Y6 zUu?b!vq^YP*Z2Fifpj-60iYfSe_m`j$-&g!-~O;r!;Ky-DZn+S?it=}>XcYHB2dO> z<%b#C_q2O!e8z5W=+@iRZh_7LL=Xx9Ee3Yzi%EbC zqIFaW06Su44EHd&nT0)Vy5aNb7+?q|?i=I51(JEOVjuu!JTrJ`(mMxnniqp>v6?uvjs1Uuc%J+Y{+_0J@ zmS0faXx#7wiBt*wJ4J4Q#QqP)M&2M=_i_JA$HoLyjPk;?fc z(hk%Fs01Jah?7qqK+;^mkQ#VW9-}V!tKZBcbI57hi&8g8>vDI}Y0B6?3ZxR}NaXF# z9=mSWA|l5!#{a1QUMeC!rKgwSU6$?(WhOV2q(18Rb>pgml0#z|((ij4}yQ6>{>0?#m?uYUEb_I=Y$H(3)2 z_-AsaM`v9L0Wdo6i!Fqi>KZS%Q}GZLzA$I1{{Gg(0*Sx=!F$gR|M&dE!kce5g&ESB z!gX#@T!5Cf?CQ5pkpgqW9xQO@x=zswH$vd0T3B) z)B>F?;$^D55vC~Kz!U*K9Bz|1K8zO^%T4jERSgpLMh0GQ*EEGUme*00qzG$6U5k9m z1^#$0jMsRkiBJHT^0Dk!pGg2FKn-xQ+;hyrPd@;6K$XzL5DpL}Y$}rtUwX#Pbw!{n(kCRk>i~80eb-_$M^Xwq_^1{0C#V>py z9H+oPzUj~Q$J%q`J34akNA!RD?YEnBjcYDBJABwk~KbzvS%W!(j)PhgTFLxKxWCw&v?6=7kz9o)7}`6;6tc zQKrZz9}((1^(|V+w+O^#lqO(?+lISdqlK5iEr`S!SBVG$;?l#(qtge4>jn0Re9L5L zh2GkN1F4SnvOB=@0FDsg!|ELnhEoI#aOcB_Zfc2w4=Efm+&%jJrP@4*KstzuQ5g_# z9(_G+5XKj)d58xr!6Cx1wyzbD0{Eh)AOKIPBovGbVgmy`Yzl~R0Kox(16EBkc;py> z0t5jf64DG*AB3MJ4ghd!MuP$%I7U@#bt6u0meC`jf^nbB(lL5&!WOR)s6${tG!XoT zMS=On1r7-S+&_RIyf;7u;Em~KujriT?7r50 zX6ti)&A*L^P&NoDPZ41ewLo7?3ZjeT=+MSH8h)gD(VSxBxLMy#dIqp@Fm7LSL=R2p zuX|0^s04C|Um*#?A{ppGbh2MSv|qvm3!TipM=gSS2m+1$0&quNF1Nm!CHdM&Npq|6 zNU@fS7+k6IZK4V=sD%5+JHhUSR9f_Q|2?LK#)f(`Uc@h-5ABlt-Q~N#i89nfeRtk@ zXJ}|>2)EsKn^iMZ;6IrD_IG?q{=UsXLICt_o`yYFPtIsy%~J;!{=Dee6TJQQ+u?uy z=YPUG@4RCng{c*S0Ahf=!~TM*&76s2!np^QNd7G@7LH4XQ)9&4&n&Od_P`2+vbsgy z0L@{E)BtP*w6v~Ki>T}Zq&}KmAL)rT`7aEAFdzi$5<`A zMdI9XzoQJm1Q3rw^sB|+=0u>1Yae+Kl~0${p+fq6hzvj)&JPYB$pai7TsiUk;lu%E z0GcHkgFwI-AOO(O;iEQSzW}&sILj3R5+oGd10o8dSB7{fl5rTKXe{*^4_hZg9B_{& zD=)G8O=PS*=_4)B zJ!OWl<3PXPERPlJJnAIL;u)D;DE+^J{Y(~izrIRDqt3H!bcyn?GvONTcDY*TQ2mr@ z?}2$Q)`V|-`hsxu;RlB$OP1Pp-!HxNl724<9U^=gv%kA8-HVTzac?V;k1~M&H^2GK zux#0~aMxXTS$6jb+E&?to~KRv?4NaMX0TUOG8z$0@i4;#r<#d+Q?-Kx2*~p{IQ7;yTg?Vz)=Tuh;xGLtCIm1qF0qL&bqX? zC^QLt;oMoE(UlYH9s|H4sTGJjfK37V_i2juM?fAx8M_4h<^f5{Ap$OuO#uOcn#F;$ zIgU`oM7^360j{`wfox;DhzIEVU(pN|XqQ{XK zpN&I(0{FQX7@Q$GrY!T%v;k7)-Y3vto8m@s^8lL7sQxsl@0@41MFQ0rfuQp|L4Xqy z0>spe58PwtL?jDLspfFK?vZiV>R&uX z*&-sS#}x*#&9wEksD@q&q}78oCH-BAm$g1nJ>yhf)Fy;-9ydYb)BN&WkdW+Pq?xOG z2_Xluhbjo8zbTR+u_4fc<__)45N#E_T6QJGsK90fVgj`Zwi)@#+pPH_ra~T$93nP& zHs7pkGJ~SXS^B%nHRgbI;M?E+c6j5BH|(2p&MZ(zCX<6dYZ31} z#4i2y^Of+mX^(rqfx1c(36TFnw z+V7ifJ;?_+^WjCM9zLY>#iT#|A+Nvi0Nmomi^IZ&3$?}QK^EG0+LXNT?Z3_oII@k? zBD_hU(<=Y!w$>KYkHg`vlQ?yP`USu&Q@CJoXQC{yYh<|(;7$-2j~^e2zTwc}t`L?s ziPOjZZM?P_X5$|$_boV%fR{R2rJDyF;h3?cpDHda3!S=T-f+_p8y>-r?R?E*U*G_T zOz;=R1rQeqyy46N9BdE_M6~FU;1JitH4A*;(%I^_L8=1)mKXr25a1pm7>L%#9BDy`M>mBD@~Q@-p!7;mk(dZa9H`P>KGMj>_penk9uakMH3 z6Xo={>AuQ{Rrw*%h$P6|Hflb3-T?JNJr_mV^rv0#R+hkhxjX}y%NdGsu~IIEdg^Q^ zKz6?D>}U(~WjDh8EEeZKK^q0FSkV!FaLY%+!Taqkkk^ia>N9|ebw2McyYd0xysx~s zH8nLR(F0(4XSqgx%JFP6H+=JP@W1!odrcbn-S2+4$NIpWW;)R>efjy%e;$rM{`l~L z4}2g@pFZ9CMSqesax>7o^X|y?Cuun~1H3Q;O}y|d2Grp%m@fwI0l@hwQ>GYaL7ahB z#o}*jY6?r2F13BWahrqdV9{Yg;U$Jcpl(n7>7y?`v^#I!JZtNyqmD8n1c}5xd&~&; zU4Bxy_+K6j2T3KcRQmtPS-H}oYvJl#9bTI{|TC1(9! z)3DkA4!>;twj*P|4l^8NGoa;C9{`xs1mb`x`dYvpaK#{y7zHyFr497~0JvPifbj30 zv_XW0<`UyV3=c_^Cze0Pgh(1d2aL!Vr*W0a(;<#uhsH&+zyrYOkqO^RdB7jP|LFpJ zw(o^|UI@N(A7O|AWWe1X6SV;sjH;nobwJGIWXr;QQ=85cZb9}C za9rvm1jtymiR}cE55m#v{a`X+B5leO8M7XwW6_9WV7FM_D;v^wGP)i`+|#+p0Y!*ro} zo49vE{Xz&K(P`1Mz;nQ!3)~x0mi+tQloYA5FvrC8D}Fo_cq7RJP2t6ttHVtn zKQkOJ!$0g({2gQ7dH+a&JcIxr-dzX*-d%`+B~lUaz7`i3+b2hRXc#f6a;ypbW((gZ zo_HeMci(+o;P2~%HITtSE>&PJlP8~i()JYaab&QWq^beVKtccvaN>ren7{n}X^WSO z4=;3Y<^7y5C>)cwq@=_M0tf_OP?j%WZd^K21BeCoAwV)f8OrnV1>fuKt*_)`(c+tZ z4d~y|M;~qHqR__Tjs<9EpK?UF`jtiD-pAe!2h7Y5%e7I^#If2KW&D~jNBUB@+K%=% zg$>>q4wzk`#Vc~P!k4;53v`V>RWh_90vq^)cz*1d9D$WMbpbaV+ca^##QjINj~*NY zK@G_E2Y_p;lQcmrzN8I=4IX2J0hmLem}y$|;k27NTg|wSZG-XO20RJP3tqIYZ8f7Z z4EYeZqh_EUxJo!|{^Qbz;TQk|2Wr0s7#fb{CBmG6uFlaWhJxsa*>aaQh>ZYMjw+87N&jF=Ii#!oxN;zU=8GtHYs9DDS%(Oslo zfUHPj>Sel)L7W~I?W4d(IqHZfeo}^Wd{lvfv0vv~=Tp_j&qn!sp0OjImA)pER(PjX zNJFqOZuLyky;P;BNm?ac3X=qo8WRK+&lkjy5o+S%3q}0N)I>y{R7jf3))d%tA8;$h zJ+d0(Og)cC2~1t0JsWFe$)B&eohE`|p}7A!6LZ5H*$=#>9RT+bF>vtoDWS2kG0YQD zfc*xNfeB+aYL3^01CE#-ESh;eg zy{`}iynl1&&JDl$&2P*G=>GffxA&ehuJ-eGGWAVAsT;!M?z`^}1Qz04EDFz+E6~E=2yQB8;mS zw@-bvf#dd&^q(l0|_#hGhdy{CW9~RM11c?y=8q*ftkBQ>qC#(}ktTK2w;K@LF@@NPa@en%% z_9&pA5EVo|M3O*0&arLsSoN`#5V@MSiIZim&OOt0OAI9S+c4)v-^+bjX3e{ zll&sLLv#0)R8?MmS3BJ~(2_)4v7ckCMtpcbtWA0r)o;(6?g4~QOIvFw$r1seapIzv zud-O|L+o+iu{$B)6gC=o2Gq#bqX>dXb5V~GpCVOK5BFfITmx5WQ=ipsn?srI2U3Du z<(nzvP2yP0RuoIz|Jnu6D{9*8-t4EKKX`|1Ql!iA>$im3hV|jcw_FseW=%IY!#Mc+ z`1p>$W4vE{{5M9lyWZ8I0)Ouh-w!BbMjspr1$(=pV!pgalFqIQ?49 zO`FcM84+>&*~@_Z3|CyI`IjMM{RoYWDj zC!Qiu)wv$0>0xw&XQ4ZB-c$U%pNpOyKZ->9eiZ+X*VBC-BffZh{Em)O`BTS$D>46Z z>BCr$y%5&vz5w>^Qg?7~AQ~aeHs~0l4oLyT7t$Og1*=7j&JgftiH&M(j?_=4+EINa znlC~MqsDi%%wT4g5?v^v!Jj+5#KCQE^5Ro87@Cg2q032SU~K*_uhMl%F0S3EU0%|{oEwZt8!+A zxUUfY--Gi)RYi7a7B@|Rpeb5uixd%v2zM-;=xhlLME|hrF3eY)djWLw8o6Z&d|2S& z?f~Bg0X&>KV9PlI0Kw5Ol#3yt3bxNuFAC>NtK z00HhYM+<$Y2pH4>1dqW`jB@&E$AAdHOrP0T0QWoK4`PNG`xqRmI!P8Hw?ga+;M6fT z<9waQHBN1p$O8f+M6C1yF+|7&5o%I>#@VY*EYnGB0)&*HOdLK0P&;Z3bmXdsaWX#o ziE$!o3H1CJF+x<(H|`Pl7(+*hg)n}y1l-+{BWuL+MZ`+)Li-4KQ}COrH&v2Gc90+p z>8}+`&)Zvh@-pe$Nmie(Qyi(xByr7vGQNKVLc&PTLM|&3A4Y%Nd&K>eXxDGNICLwG z2$+&5!#@$47*Y}(sZA0j?mLkt@dUuFvZ_>DOK)5kUe!Jq5P*3y+AP4ph?>#Wwc+EwMuRsLGfqVbN-(AK@91S?|AN}Y@;m1GzvF$H%;)y4QQ%*U>-eJ@S zq;0*Ac5wiIXq!48ee}^lWI_6Y`T<*jhaY~}b{RxU;rRy8L{cOJGmxkNGBAlFRfU7I zFVsWPkNAQDcLCS}3UN^*z>nS$RRM-ba2~weND**ZI`YUPjr(x7CHmRhi%(mC>NtHa zdH+daz9Q)T+wWcv2hS}EOT`p-jLQvE6fB5fKkLU#P^3U{qgue*Wh;Q5oIpU!B=WBi zn8E21r438#4O+nW2WGIuFM%-7KSb^Yst<8{n}Y7NtIiT!=`mAATCbXAUh0RVTlTV{vF zIc%e>eqbX(J_sOpU*tEgiR-j^4IztbWypwe8`ki}0xE!im8|5)jTiJ!()PoWwx~h6 z*Oj=JTOqXC(M?Xv=|B^mr__!TRWm>7frYkPE?%+qEuK44G7ExBp=N!>>D8>Q*%LV2|&J7qW#T!ewNlWXf9xt zD;r{Oe6;~|xMsi*F}HKDlq12hk5?WOwX2Ri0V(ShL2#xhk|6zT79dR2_|elNA~()o z#o1qAUvb`8_vh(%q1wbv4-0-!v9?WOcX5(<3UCkUGfN?T0ASF=(-<5%=ZI5)5h1RE zNI3{a+$dlsq%bax5I7JfaQsLxY9wN2YE2&a@MAS{xz2+HVLnS0eY*w{e&&PBT z1Lg^#Iwp${UD4ETDybQI#tLOXNtDSM()E|f1rQZaaqi?WR+QL$8UGdN8N;1(uhJsT z?Uqm{b=FiF|1DV78Xmp%lCaO*O4Iv0aQc`A@@>z(!2eHw`ct^|)?1ATfYaxl1N^z} zbCrqqw%11ti1~OzLb%5sdzj4){oy!CBR2zy3Si{seupjdg7lUEU*lkg5&YBp5hn&% zOz0EmSWvJFK;#GK59kxT1jzvE0PF?Gk6|E$gD(KSAbGviS0qvS(fiH{zkPOISfItW zQbt#;;{1!WAkWO#4tN5#1{r1%qy>NMI@txlabuiEz#u{d=Lwh_rHjP_Kx?+?@ZsbT z?c@KCE|*0f5C!1S?-H>O^}uw+WCuuy_*bGZ#Bl8TZy|H(cZ;}ixNXGgP2#TMlxY{@ zr9o|WNCb?VohHSgsL*T_L;$HD*gK#vE)(H`uAXFVim>Pu*tiSYH1&ac2+0c&;*(#e zc9|a{2O_dY70{&q6VZ=EKqJ%y5ag95yIOcDVmQ7`r`oh!|`a@rHV+P)4GQYU*u%2_bEUn@AB<2|EWC>p6RU!Rqjn zTQ3b~{=*4fR{iO7VqNaXAJz;27hV6SKJ}^a$Rm%K2TvSqZ7vwUK0}@Sy`~+0pL*)4 zCLQGc_kIj@ACmgFJp%~=uLP!|xm1h0aY7P(<5K?v}n9Ch(wZ0ixJieox;znS6QKmT1Eu(k-+!nt0$ z=p3!O5CC~vxTh-*+%E=WaNZM@r@l=Q_|zvh;8`ixK8*f^35ta;E?fYp?B-er;G>tX zmyG}-_`=-CV6Q;j{v?UK;ka95Q$T;v?JlcdZDL{21@Ssqg6@|P!&uZ4Uq43%b>?=c z{uc=t(CIFh8Uf;h$MgjGV~DxJph*-R0rwOeS&CT=D!BidbBy=aYFL0${uD9~EKEj|~O)WokjT0{n>BgYXnV zhor>9{pvZwfRIg>as~36b+rkV<240)r6mGxaobciE&r-L# zC1G@Ppow(>@c-}s{_pUi4}B;+^UO0{QaLBI2U_<)%Q?vVH~j5yKQjeOXwM^Oy5%x6hD zP~FsG`hVQDaD`&lEJnV#UyH=2g`0T(7Q{umSU?c4EGsE8KyH)^mzY5dsIx@A+MpQQ z1_RVk4jw=VA9{@N=qG`R0Acd6V;;eN2&RL`m5qgvG+=2CkwN@`R&mxw1ju-i`Y{B| z6*mtsBT814MraUspkDb=0nkri-l7xgxJ|&t_>llWs8D|qgp1lhBEXp7`b#7U!RQcw zbzBT_L&S{`JOnl&2(V+o&z%i~AS%XaEJ#M0Hxh+T3%2sb6AL=`%WBYX16X}~iU2o~ z4Uo4xsf^0?Jjbc@79zSqbk)|Ww{z(|+$zsjC#+ujf#(KNfSjysn``<40l@uaV;?+C zxX0Z0_1Z^(=b5=<9+B#x_F1jD1LSjM>A$F^!PG;jX$ZR1CQp*d9( z!{@I!HC%uB1tvwHJnzR~5`6+(0sS-1IK#Yj*#Cp`qiMaMKg@ah{U6NOld=P!frJ1U z_}mR?Nfsd!MXD8k^Wu;OyHHpq8Tv!N0RQ^>dh_uo&)lgfO>qT6ovKrQzRx_-t64C+mM1`fV61QT}F06vW}<9>^vv z8{|`sh#UhpIBGWA!M7VxF2QmLfP;%3oN=L~1(N`58De#AP=6&sU@P2ufhDp1;dJ5F z0a7+}Ax1u8`yz4DfIlD)7f+xZ02UvA#O>x!F94>EGP&;IQP3%XF4i^1-lF3uiQv%~ z5xEoom;6Wq&>v$H0I&n*xCQF8>Wm!0DugT zfk1#8Etls6HV+-TPhCF$Y6k;20FN=j&2AYdI|;Li@A<&E)WG!nYOhD~W3@$!4r6)r z9Q3~UZw=6l&!hbHXd?~mdsjhFU%cI(HAGc#J-Y7@n$`ntZwxWTvxCh9(iD!T>OO7~ zk+xKl9Fr!gZ$w?>K42)xeVaHIj~T^-P_0N0@Sb1~iV7J9Ho^G|{Qq9x8LmF_2u1t5 z*kb<^?QgK}H17)F|NQgMo4-Aqea)OXbJT%9cfi*KekW<4 zv1pDKN}m4FAHW$uUp5pWMgWF@-q-X+o@4(oV2A;ia6m*5T!`{c3_w&6$qf4eTb#8z zAGpt)aLSQ0!}lI~DKv}UpDJ#*O*_(+OjcN9Ewpg|8)ccDBe6Q1E+B|OSGMZMfR9aW z5c6h<>t+KN;l3p->es42#MH;I58ZExR0g5|B*I;1=K(#U>H)A=hGYGYrvQWk96ww+ zpv^Y95CbH*b}ae12GFouAMSz{)lYfN_CR9d7TEw0$frUZ6R|IVR&EgqA__DHz<-kn z3pHzCuj>A}PH$PH| z}6;hbQezSK`C~ix}fXa)6qks;Vj+dg!6r zDCj_w5(Qe*kM@Lz z5TibbfeGT4(V3fppz50>;zM0C&JlZnT!Az$b#Sx9=*L(Pu93)dNEI-`!^jMmK6Z$M zi)U9jqR;`xggfRR*V!o$E*&Vde(N`YLw_M+07ASYsGIiG4eJLs2pvW(XctE8 zS26|uc-&xcyI{eBP*qixfWO_}`?%wdGwOc! z?AZp8bO-f-i`qFJtQe6xc9d+P6z>#(YXAU107*naRDh6(1jtT;#~gEvsR020f%bv% zdh#(*7eo)D%^5RhnDJnf2!Wc~dTpUPHhkxU2ZfLQ?0Jb{Cxin_a>I&d#Rkx(LYQ35 zkjRw{SuoC;g4kS^-Nd>_56}MYCb}2kqvvHIhU3E(5HKad$ph{H0N`P6f|gG}p=)r5 z^ceBEZJr}hFQ9>UflCl@$7AyyKpoB-6~I(o$H`tU>V-2`-pKL26q6$N~tUW z?{X0Us3!n+0Gfa`X4$X2NFxyY!-W#XkUnl!V@M0~rSB(r5K#)5R`w`hheP@cq_BZ|s>=*{-DiM72?CEa`A-kv%7sU-eQ?JaGiE z!1$xz-B+2X^pswn>n4dv&lo#4BElpnQD3bN>qqx7^Aw$uokpkTKS!!3sR9W2G`o1B z>G#`AH4>?K)@wCf5ZQ)bb zo)-S;std*WYqKCpm3WQ@lbrMCJtRse0e0rhnUjG3U?z1pESsnRM%hjG1($gI=bUp+ zIQZa$ZSU`2{pwd1&%2?a!Bhco%6t&RlUNX(V~PW6riMq|s3ZXBzy0lR2KFpGaKoG% zDEO!MkG^~VX_IlWXwx>)`^wA9v=2d1*t2p*IPb8kuyX0D@XRZ$>=zWmYC!e_2H-_%b?83wD$bMSxt z_18`Hh6-+Q1?L=7A90NUA-8W2htZXbicHCn_0ZuIMf;f25;jy&RixbW2?4nVhZtnc-k zeq&HpstA6okvn9Ih!#TrPLl;X(F&QX6%t9alOF^?f$Bx#L9ijl&;A1RZ>8)Ah?6ij zE6*eV*2&Z#xax+3-^M18DP zP$|x{>mU05PN@b&hKC~goh;DKejA^y4gdC|tHb3No-Ml*j7Vi@sCy9gdtjgn_(S;5 zpFcmaxejVVTy8l(Si&MUYt~2l>4d$%=fm}`$NQ4}9=aJwBmhGyELV`7$>Q$W{f@vs z05E#od>ICUIat88edS+N-S_qcG4ue^H8j%QFhhe%DJm+BfYM0k44u*lC?%pG(kdN8 zBhoG1-8u9#+~427@XVWeG3T?->~r>BYwfkJ>smBjCOLZJ+5tl7?4r<5YK>~qm#Q1% z|E6(Ckqv~)f06>BbKVt;}Zb)qF-syBaGK0UFt0PoiAuV zP;JUCddCi)wSLzHOyJUMGIXoS4|s02mJP^LzV6ZFPC^)pBoX@O|-uDqhFPZ5&pu1E9uu%_GZ zvWf`UfVuYxV*Am$8=nk&aC$(m4HW3Z9{U7rA^7Dw4_v!h+(iY*HWM_}sFT2|C!L2H z{Z9)OPmnRk7h4CoPw%P5aJrne(pEg}@4BaUkt7j2V`CvcU-_pbmhfttgr-N7Jzku4X2ZT zIl%mqj$-rf7@~&ah07Kk)<6kGM*KE>#Vbv^TwK>zU&Ht0o4p}dU^Drv<>-ZAQzSys zkfHTPj4QlvY_-e~`0Cy7{SRweZ9-*jYO!WAhF(%pV%F;a=*X(7eO3hSRrO1_9XQ7h z6NvDWvNZWjVFuN zGW0@tiN0?yUm$E;9Fj&O?HG20yqEy)5x-qh`R|DgjR-z(KSnKNpHLy5seCQ$5#)|$IDC|#?d2;u;t9?J8 zrKZGN(}01en(!;5P2Y?b4JSr9eb+(-W>p>~RR=wOl&zM9wbq)hq&)wGpiGe|gUM}c z#eSqvBe&lLJ59)o;chCHFRjxrMP94fYQXjO$jMp>+&`{0EDwKanicYw!&t7w)d|>2 z8#t@r-fT}D9A)256Z>z_eDXGS8&p$A4|EP5-gJKBIhR4*ZDerTuLPbJBK-KX4w@E@a?NyVU$&g;ix)buZv@_6ZGgnn z7%$X5zCNdzN*t6Y9wC)$F^5G+GlN@%#@U)RZy+#H>5R&s;{F!cds-rgT(Smpqop0HL)ej#UKLiz11-~A~%$3hD; z9{Rp~_ybwr%QU8k-(|NSZ3cCW{_Hd9OX^f-`N3%l$@|dwtWQB1TAy`nX{__r^*L+$eV^Y8J^!)U64k^{;5qH< zjoE&BOWV!i@RBc=!ZG`d@;6Q4#QXfdnp@44rqQU`LNAl3C&Tb6787cbJaDB_9HgO|~|3YxS9cC+=~l?dPslyv!_ z6Z`C}=B>3-Pm*-gGlt@6m%aX0CS86uqgT={&pZrS{RJ+(Pl?fHjx}z#2%Avpw;iO; zt?A%j@1u@%V$LYY!AmK~lxhz^ikWaRAviitkrA2IJw}kcFp&R%3g|Wd7vmg6z5GrK zzpS#8Gp<6DP8gv2$h2UtfaK$^C8J=~3wv}DySsAcHnBlyL}XVQbV)gnplDB%c!IV- zMHN(gSu^~A>P5*Y+cZ!cZNdZN5Gt7U-3dWYg_EHm7;_AE^ssgbC@>XL zRx%Pq*wAWmYm)MJSE(f&#!DY-cH30*9e>Tv%0O*bGR+gZt9$3#NRFy+*E@zhAG~(( zKd2wv$Xa>wqM_eJG?PuUj(M~hDk1j0{ugMlO@c#6eZtG%GJ^GU+RfsrNMDFE!w>oP z!&~IScxkA!vlixh9L_xJzgYf_7jf8Hx45H6(s`p4s($>)W8m13hMAG*DN6`w6O{FK zwzEB&=2Ob;6I}%>)b`#FPp5*W)UKNdfsh28A)$^GC{RUVOT`q41yo%#lGzdQ6W3>K z*ap?s)}luDO>ZxrQ*JGYy0s8+ry0&g0@rq5xXFLJWesPva4$D|zey^nIyWq(A%lNjYT>K~TKn6feqhMMU9kRR63w^(-Sh6FP#7>GlyY zAiVnbGc7XVN%n5jhH5zK=c5o%n5lLp_%qv$`F{SKCVoX*yE$V-IB;z7iK^zjmY%>U z(8m)J4KnW3B(uOV40%#Yd4S#nxoju3e_v`0#{sz9zIF-!kynBK8R+`N>aC*E#oNg@ zL;1?Q_#XG$!cZYP^rgU(LD1CrYC+b=Z;}jrszJ>6LZ4CR%i{Tx^HEwWy4j_;!{^quIZXRN`9*u{sI0~MH6x;_OH&uel9<)K^ddu~`6v9SC9s^?NYzmNGRAq4}O1hLM4^`ZkY_W~7 z2$0AZwfIFh5=iR6vaL#BhpgVxE6JZ~f;oiWPH`kr(iEvq0(23FFUw2d8PUAEvQ*de z^~PH(eyUn-#u1zI@wJcdKa<2J=OL@Mg=@y6uf7`wl!m{vUWzXi&86%rrWaq^I^ol0 zOJw^O0R`tQq<>vN8oR%%-mN%lSqR^DI%7zHpB$eknDhG$MFI_Q3h)MwzJ%Yl3k8t( zVYw9hC#A>+JMIPpsqo{!U&)-WX_<0^=4a>yL9aFOQw<#ddu`3Z^F$;nR^^f02uip` zZln^lY`$#5@95K``*jSx1Ov;j$>VKIYSVho}zjgGT8KqZgjwmKcYNWRO71&bQle zS{UP=Y$P(2q63$roh z*b}fo3h*% zK$IoGkpDjJwOxYw%S-DT{Z(o@1ieP_5ozkLeg&H%a@0HZNY@Xgmp-aI&gF94C(RSH2jPk>i*o#l@VrOab z-;xwBfCIF>>3K5x{FBJu_1T=0>aIi8jJ2;LF(=e_rhs{3F4!N>S?=^mj$Zej|5_S0 z4L>2X`=rBXgNS5~Uy^jW0N?P9W9 z@I)N8S#!;2{Klf20E$|S(R-kCcwHxNlY!#ZkEhSSNc&LC;%Zl_dSEjUXD)~@`!q~|hTYn&7ADVW^DuRT zFbKuq+|PbRyIf7ltw36sI=l_Q=E;p$&`HB~%d8iEVc12Z=$7Pl-)ug@@t&Psm8E8B zlwvNSlDO#)G$QWi#A%^6Tao(Z+IUzn84tmdq*J`Rr*8pGLCc^Wl{hJb5l>Q~S%dWP z%Z*{+_&cUj9dA6t*V5z=FE;26)s`RsAv-o<8x~G+_6f{uO2z`ROS(xfqSEMDQ=z{Elrmr^*xJ^g_ z$nV`b&)z)K3(=|xt+YY3ng4i#W{SYSa8dor27p;h&tw8t?)hzLj7!MTrMQ1rk_DQ% zd!(+HJU*&_n%nrRe(XbL+(r0QOBZ{{&0i|m@$Cy)D>$s^dYQx!mu_ultSkVS{@Arev3xw_N}gwobR7-qtw=WrRK!H z+`mwoTX3x6PAF3|ShveD<5m)k_zWLeL=ve%O(^FLp;K zS82+g*=|gQ5>fA_8eWc8a05hpEKJSHA1BOB=tcBe4)njer)X?5oay0Bs>kf7*3|Uu zA^>y+NPn^~kSdqGC$6cAsP;G&y-(UWIa z`p9M8`?=8q-WxWRT-AD#bU#*B^yNy)w+5V-@ySR`>#HCiB3~1W<6q+`nJpxWN@D5eiTM zxN=s#zU1a6c09B(TQXkLEFba8-yzr~(K8oq09S>Kgu-u6kSz*iij*ea+Oo4PZZb9dO0#&BLz&GJkSCwD{PBCtK8jCiz}i$cKdIg-}5T95>|ZOn1wY6q0N=X zAIwnRyKgltpv8;QxXMLd7V73((Q(<8uE}cs%(^4r(y{U&HpLG=W>4Ig zoy`n1y{{aisQRa=M*aQtg~;bj8TB3^3!>H@wNeNsnmcss_RM8rhMSQtR{j{J_ww;H zm1eZ4pJ+Pcm3LK1{bBe1%~=!N$nT(i)B5<4eOy8BYk0k2lT$#3-(Ky15yo!A&Ujbl z$z#AZiKf@Pcl9yQ(vNxRF{(=t@w_;(fa{Q+htQe*6KZ#83=d{4Hhv_(bAEM6Rqew; zEK)GW$>b6@`sR7GY~_*3K~Q3McJ7M)};E$GDL%`8TvXe?FwrDd+LS ze6(yCrD=dOzC+J)-JgWD2oC!M>`Bg{;QgX)+ZynT!TqeW`p)BBBg=rrJ%+U~u>+I# zq?xr5S2}q16_$&OHDw;NzEW%oabQQr9-#LrJMy5N<~=OO6$&;aeg5p(M@)QsY-p^L z>BN<|f@jmKzn1zwN zdZ=njt8?i~h6A$#DdQnr?U;?-f7{{4*A-v?mL@oWGT?w$PoMF}1(H-O+dQ27O{l>~ z5Ikww1gIqLqmYWeLB4qMLzzaGIH}@DWoz|-OEj)ODYCPUb(G^?12EQQWAh1_r5xU8 ztT7T?I;`t=5s;nySYSp`kosaT!6Fd^-x~twGSUYsO>J=Q7#RdfHW1rvPc*6_3j2>G zG+cG5qM!fmI$TL%Wt3*W>eQwQ2cJzn?C>8}4h+&NenmDIr6_;I2)||nmeH0go0*|} z@6fbAN@}}HPCSIRl@Gbo1Bve#La&`gJ*x^0(SDz!D8c-1s>#S7MO+iWNcAYCMNrX{ zk$$9$jGbHaX{rp)n!7aGX7{EOdA1xhWbkjtWta`X5i<)nCYen<^@>PyuYlR0{h7;b zqQvXJyv`Mp-su*fgXD1L5kuC2G}HdSM{wIf=(IQzf7`qto$h2>h|iK7EfZuuV8v8_ zJYnYLm-!K&_{m{7r-6ou89mk@loK21G5#gg%oHs$4!9q)u-VpJ7}YhQI}KldmLH(g zCkXw`%C5rHs?xc1so~n0K|sMdhCi^Akr|cqDO;A#e;alZ?)pIuN&IW!-hEw(87plI zM8bp&sc|B-pJB9hbXOIbJf#C@8%q%{8PVaz*H-<<4eJm;{rT_`;QVN|K-`UiHiC52EC7nG5cUEC_WtgRo~ zaprQ{3w1`EeOkWJun#y){@au-5EY0x&CO*4!mxkfdw=JjY08WnmKk=HPv(!ljoHwgNWB zv37ON(yhqh&Mk(%(_cGeeC>zF)<;6Tj^|3hK>G(^3*yEro7WIUh_|$~C~rmoASk2%c1J!M5$TIx3~{Q9Rzd+XZ*Y5Mu*1!he578-D{N%3MW682V) zqq$IiV{=83OVBSk0zV?`{p2G&7thM-GkR}Q*k<@rw&0B7xCnE|U!kXTMYkfcNjo_R zo{x1Y0+~?{VV3YwEXO35)#^Gf891(33H_!xjXNdzhG=;jq*hGXF9h2FPHqOXmIQx7 z54B(l{p9IB#|K%nLzBsJVWf-8QZ35vKJFP{>9<_9x0>P7KP>%Yg2yqnI`2&Aq{+_q zy#F}4Z>o}g0x`K}`H}YP%K@4qaDOMZ*MIl>4wCJC^-Z|JNpd|xY_Y;gR?ct!_(_XV zV1W0(WMQqdL~O#dVC2p9K!4&TP1TBtX44AZ} z)o*@Dm&ep& z1&q#zOn=d9c}(yoW(9ZB4$+3@a=~^BXv=|81!~aX;1FokS9~|FW7bKI{+LU`Tg$Qo z^alKrIi&lzsk%9B_#1!RalT)xaz(>zQX!vTpvt-AIsZV=~=28v%X`uC84OK@H>X{+ow1wrYJV?W(53MSXc+IhJ2^8 zGT+@Mqs@*vujxV^ny9cMsroT6`T z4{@kq^xs327}k=jEciJaLr>GOp>!X#s)Ol_%N^fls@SzF0d|!)@Vwodi z2GJ*#jx_$i<3w}f!oi)tON_%%d;BJ>gV(G3%ftQQ{d4E>MLn-yOFC32ldXkSFxGs$ zua4%R1FimXYI>0yz(z8rsiwzD)grwkxmJ6l#m0E(`it)~DqA3ul{x}u$Y&8u3qr8G zy3!djIdK@mjY%nBNdYE;t4|>G8{DqTXc7Q>NnG2qeio!`_zVbRc{T6F40v%73^Dp{ z(59ArJf$b^o?JUNgWo@9{A4r6e^5CuS8wLQ)yjnH68)c=$nTP6m6Zar`ZRpab9`(ashh_pa;}+kd19)|C}a|y zVh%LrE&Vch{)`-{Wnu%0)6}LbtDkn~Ei{b-qt>7sNvc2D=66<>c9lT)W3)9oF!>8+ zvGcs&Z&gekT6QKA7P5ZxuLzohjq96h-{M~t(+a9g# z5m2t$bI51fF_^+?FdP$1Hf?$@IH65>Cx1y z!9K}qkOzCGl+zpN;@otzfISXbXFFf9CGs0q^{kUufAa=EN|J3Oz^D4v`1K;yc6?q@ zO%+3h4q!q*w0!Unbg4|PP;-&yAKbUlQH^0b-w>k8hd&R?%k`S|bbBg&fv#CtZza{! zTyTHansEfcOog3-wintoaIGO}7yjX=(LsxJc7b;Xjhd8FqLgRJ;R_

GATAFSd9QJ8?f)rXOE`Uc7q6VB)(Y z5v{EG#=;(V_N2t~pE_s6yZf!6U0aYT`po{7?g@_9)C=&Kb$3aETKXm>um zLNb-CN%eO0_X!shx_AHII88Xbp_(lu1$1ql@*|mVPTZwhx$Aq)(B8vQGakG$!odic zh#xSDh+bRN9LLQ^&iY0YFZax!D93Te2^r&jOQTVd58&^^9({dJcsc2E1V@W5fp0v6 zdGn?C>?`PQziwKfN93O|D%M_f3G30l)y3snk4CAPgiY={8&>A39Z+F!^;rdK6V>qOa5#7ZtO>x$`=*dmRhn7Y~B!ogKUgGr7s51VeG;r6ybv z5GWzdZ(s;h8(;{iypY%*2`mlX^tJ2HCYv^FIw^KaSK7+-oPFy(oV{aGJF)j6Qsj-&r)fC9i6hK$yP7YaI z)Fwoz;8mlM$F9K~1evyff1Pa-S)34bgHe__nmMfWC_d9t%Wm^Gog%)_uB5&I@ZPih z67b6TpwQ#5?j z=FGGSKFzQj?Ro_T*r)dX#0zY0h@}KGKKWpiLKbO>?7*mJ-YKqg_k5D-w|`kMm%X#g z{CYLD>N?u=iml9j|Gm^KZNPpoi(y+&9P(;uEMo1?qi8Jwxl3%ohVBc%7aKsDiZA7) z1lr;o;6__IA$nWd!;@!qsozo?Rx?L{K)6c8+gOwIxxRiHkuFl}hThr_o#fr=WVujc z!3;2G5qb%iRjWAXoe7z^`!Oo&91lOsMkKr1MKJQ5DX|TXzFAXN7Hn#GQzK7yuhC<% zZGZZ&Q;5x)06|>;^N<#gjhZ8Ag-0hH$!7AkJfgR+jsE0w< zYbrwX$UItx6?$*4w<{qfpt7KUPvj}NYGO z=>b=$vtFd04tkUWksX&(KSG|z|6=G%qSq5PpFcQkt>m?Zwp`kmlh{#@b2 zsqgX{@2Z?AP`H}|<2!ZWgk9*as82Wh+xr=;bn~C&3-I++4&E0Ad1DUZkc>B&PI%ZJ zmhW!;TsV1t%Qcv(yHoMBe7|n)x-ZMU&(-grRcd;zJcifjR&K4i4fUD;|bV=3alI7BHzWz)PukF1-zP8L(^ZemeTj zGWnj8!-v$AP!iAuqs!{oLwQ>6GENV)Y~(-kn$|!|Xv9n_&OZr*0y0iDT!; z=tI(QnbYGMUp#o9|0ErWLQ@vQbfY^*TISpu|{%#s8Q8 zz=G(r2w+edtWn{7IqAKxhALs&lQHSRWJs#dd3-hAI$W zq2~=tv2s0%fossGfpJ!V-m~wJMEr{l|F2O0_bg}Mb2)|`Mec}f(X-a3Z!lctfVTc} z=Vk%eH&yvJM3;iXG%?X_hW?K-RK$ti?AI(t@%7lUNHk*`=5iigTXRdckf;!&Mf91= z@TV~(fv8Yv^Oyu?GvnTtO;_2BC;0oP#IuyYbxim3&wi;Ph+(eGOKob=)S>4`Yg3HN zt2A9@#ZOLDKU~Os%r*iVc(>z;d8v$FPui8*kyfR}GRM+v5T5Mw1FxK~6mLq(8f;Bc z#X5F;uAyE9MrlV00A%))#wD`{w?;!PVjYeteqZQOX3dC`1P$i$_$IGnjMw6c)uEKC z(C445UNz6V(@mB#dm|>P( zqP6g(qq$kLbt0D4m8zy|2N=44DGu(~HgwuevLca3iq(gz>_q#Y&Db=#z&Xw{NUbil zX5{`eyM3;uqC-zi0o444(8j}*(h!!L{qO}F_+>h7dCkVRPemmCNeOrw=A1G8z&1`i zH_8UlqoC1KF+#1F&~q%N_YIZ=F9kFzEG+EVocsGkP)E;M{w2kdDo2d@1jUQMC0=~J z=JJlt%vsZJCD8>|0Orue0$?l^aM(kI14slyK->4z@FBp}Fj-A-CPVkIhNAP$7p5x~ zzx`(PW$O)nh|GC?-Pci>sQ3q^P>fK#5M{jXLhB{+hP}n3 zph634A3KXL5dkb_?^xK@d2PyV-qbEdAF$c<9_Q}+`5|?CjZ zN4Yw8U+y(E`3u&r=jw6$pJQuNi~+}ASYBiRbII9W7Gq(n62O&(JRtXKnm9eRArf1? z1Mvx%HcopO;pBPZ+{6UdI5TIJi>Pff zUXqAUjVPqgEiFFFrWLjc2#{_SN9rL5CluT>9!13$LyLlf7!ekmRSc13ORZLD*y8 z62e~8BQ)FJ``@7NPs2Le)B^&0 z^vQ^(1k>gWN*a~p_cRN~`W(3>B zW?&E_yU)@!ktkX>cNaD7diK{z^M>`cME#{8V(2j9Z)r^jkm8d~GVPvvia!JJ%yM6J z%d84q%V&XbAj+ciKAs@51yRVpKAtj)%spPZ(?&Q_agaul`6I0zGWIRsY=85w;na!% zI2~nMcohlx^57ASP_u`Cf}8B?!xwEgJtP_MxeSKwtK+Y8{(qL{b@?NI_R0RuCC|kw zg=7eCKQz|LdX1;vdiUqvR1?*?unf_7QTyY+PssXk-1zk@e9KRB!=HfdC6<09TzvVN;PJoZt39=NW6ycUM}uGzH4m z-vgD2JD@CkF@M(L_uOEeP%;a^9{?}dgTGBUPBSQ?mCiMTHpkC&H?&I4WOL0|e6!~| z;DT{bHv-rQ^x^;+4pjdWUB*Vx~1W-92{0>k)8Vj7GXi=W>k6|MEx7LAFr9*uW< zE^{wyM_eIR$ZR9iT4c*XDVTxFZ=;0A;(fn{ta&htLK~V=SKho0V8pd(4WeAI2=ii` zZ?0h8@&+VEgrXMo93LbH%;EX=Ft#;km$z}!n85q-LV$D0fdI4#AeW#84JMNjna3e_ z#Zf9NUi-T>H&_)6FTc0W;+VaCC7m(Pf`q4mA74$!cElU$TAOLyy-m4~pO-IC)+k&% zjp!61TNE7v!5?rcs3A`p-l?2Lnhf6T;o)DlJoAp~ry4ud#L4p6y#Iuzj2l=I>UVensNHyyXxlbHI@{}l6tqwrbAYx)Q9DlqTs1dcbSCQZH{p~ zaWLsLP7muN-e}QiZX%gzI_mlA@8*zBHj=ZjQF#VRAhBVOFB5nZtz#GwI@8IB|1JWk zI#BQcz)^i&>c4|kX8ciF>F}P8xA0z7+F8Yo!pm<#2Qnh~Ic$R{luP{XLnKp=ci3VZ zs*~QiET)q_;vrtWylX}ADdW{YIB=2hK9LDQfK!-{Vgr)}>$Jy|!{-OSx@8HJ!0&=h zSBE7oPCsr{W}5PDjP0v%gq~^Z(-4i?AdA-17N+@vk?Uuq^y1X9C%bp!rjm z@tVCnVnz5F>+Rl#@c{_LspSFb`oku>I*Tw z{rn>C&860AEQvEgAe}Oc8^T?}PKR!6;{=ZYyL_2UxV>e_^0kTarIGex@%6-ON9#*1 z6Y)0#WCqMMM$Sr@t2H=_4*dGqd?&77_prh4u2Q-E@yxwVV}M%unQo_zrk9q zxq;K}+(Ty>;d-;aJBT%jM{_3d@7XvgIVH4+rH2LJl8_=1Xdw&@g<3?mF^Y8h31isP zD2Kk^UiIG+cd{Vsjn8U}5n(q2yOocfcCMGPfnHL{0ZXo7NYpB7=L z1q$Lmp^q)jmaG>iR7i^e!io3a!5Nc>x4M_h=WI4re+N?`xtg1*8i%+dMcA0kGO#h7 zJFuUM0ytL&$;GJyN{Y601Zii-7sZ!K3GWuqJj%9Y)VXq*b5^=KL*AaD66_C6Ui(yf z&Rt*^!CAu@yW%@EYLSU)fwXO{nFiT+Ajj0hiWiZO+Wkld z3wn+A6m(m901+tB&sv^Kr{}$PRZyTl{v|gupFQh&fv~XG)X$(rJ@!3t8@;N?(Mse5ZO=+^)vRrPJ#$l* zAGetBz@0@3>@#}osA*#+O)WDyzeJGJB@XqO3BEpUJng6f!xT*qwgA0}oDXB5FhW*9 zupxuv5GOc!NuDJ;+KUOiA$0;UJhuDQc7MNIMg%RD2$UxbIR9%JhVA1uwi-Hftp}WU zQ8G~Ow%SbJuT8V7m#>EFa17!dg*NmAT<+M+^;~1KiFC7A0pR^RHCH3TjFi%`BA@o{VO+h>mz~M4foDCf2G;M8qCSy&D5HhKk35hf=2@3ymHsXzfV|? z*9I`tc=rid0v5k+ATx<|LbJHzloMy5*PZ;+vz#u}EEMM}%PBF(`EVw|nq`WmD+2Ws z?DLz9Ks^*{7&&}w&2gbEPh1b&tzN=1MDJ?(kM0)SYajKb3y(rNgi)+76GPp+MHL?> zzFgyy5xXkr?v_Qsq8?V92}Rl;u5%rTey72)er!o9HH`RGb3`{YdHI;&+S-2>-8>8-_uLQD|>6z6_WH~Pe z{S)55X4dpJyGG5uwZW{*WlTD+&{F?XtW)=d49 zt+ap4@5g-v*m-k+aVCJr3p!dTg;QK8LC^Oeq>DMSiLM0YLG23aNS0enr}i-!#`nRV z%|-bdL#)%}=_Q3z+!6b~QZLiQxw?-3mZS?yl0LRq6##TKlnKj`E(s@nb6w<^-^+|o zl}*%`KnyjG?@zbMdOHNj#VBA;5DQtpyCToE^T1d2N8yh`YgV8sn*iQCBEZk*jQwwRP_{c`5 z`U7>b9Lp-L$Ji<&$)D^QKRgO{^vo4-VkV5m6a&YKhZ(+^!J=k(uU4g+2r$M=;N79X z83)G;Q4{pN{nbk%Q%ulzfKh5+04`Mjj-aOn5ys(g6PUBIvicSr4n^D}6PXV|>p{WC zWQ)OWN4-+7E>+LjU3w!KsXt4-W5hCGWTvAF2Iv7xw^g1ug6{nK04(XMjM4!MHQ^s3 zq^2&X!ioMhsi|q6pdOukpF@eCDnt_@{(kX^2`n{j`X&vw7kQoiPO$(0 zz}wbRRWkS(x)I!EWGIP;@xPMn%M|BHhMQzOFE#m^?PzbG=lZKS>FmXGkPPJjpzB5L z>CQ9I(@?#J#*a&85r6l#SLgPXwQe8iAI01Zgde(s_Ty{f?{#FukveZ< zT4}41w*iDh!Je#RBm=W^{$D0(8Y35N2It1co<%uV*VLpJl02>)_3#l0{01EWyoutl z??gaC7K?wp_8HWi)O(hA@P2M4ZmrdswDH$(+krBwqpK@0zdku7G45bs##M$;l{!jzij&KVr1 zrlzoP0=4qGx=)+)+Usk%eTDUfP)D+2VD+zP^88 zkjjQc2oU!GFjQ}8)1fSLI7 ze8>O}cIqXIROB~R-_i{x)x__F-irY$Q&3i z*y9n^AnxBv0a}n7F*h7y9C0_cpUpMoh=T+~I7K(8{q%=Bu;xVJ;>oiM;hX{F^aUA< z;-k*P!*`;1`^58`ozjMehMEL1`)Ou4W{4Bz^I@*o!bkB;bHqz4OPz6Vqk@*U06y|FO^wlmc%b)an(g$QW1Oer)At2|t)an>B`7p4ug;oY5JigZAFllUWt~gY z4Ntx_cCfncz{-RkobF7$j+wu4ipF0PlvH%$0xAG{lcS4m>455~Jjm%8p4;K!Ve#Ak z>xxA?x)0)uf^oGAfy&k+3NkY3v1Sx!OSKGZdN;4JjqX|KT!aYm@LA&`9rkx;>K3sH z)b@!9!^u!Y1;>M$fUCZrI3E!Ra1O!aGLm90)MJ&;{Y6g0HwY$YNd0sz$@blZ!_BOv7oI|Zb z4W!L6e-ApQI-{yfy;Bed=4=tx0IZ}ZonEJXpu09cQ6@iz`00(HH&w6t`;QIV*tq@*P3>({Rq zj*h!)Eoh*)9gm{S8zBO`uAZJpvY?41Zr#VQ_e9f{drP!W-kNU7xiut%n$Yj5=m@&@orZRpgMhd^A(dm!vh zXO6~{HZ@yJE>8YD^_jxM9Id;h$w4wT^b+*>^m!xmf}VCoCJgTlfDCFl**;1z&e4WDQ{pS)*h|Py@T@vu2PzjvxOFfjE|4EZ*CTTcK9SD`HCkn7W-G- zLjHDBbY%nB{l2k5tW_*N+4N_9^}@+p{H=)qonuW+m&S)vRgf{~{>}B}B9;=^-qxo2 zKv!N=bZGSKQj|c`tPDFHyuWj$4|DW!A{_dsZ&QU#qrQCkVu7^`p*5U%H*qZR$FB}Z z4LM+e3*2TL=!MkJ3h{_>NLx$O-@kv8-#c4cvTNhUr#t)mH_$RM30T|MymuUzSZve; z1+$wmqMFOk{!+Z*2g(C`&c!CVA1o{`S{z`zPb3$083Nu@eIiLhXy6?GZzgZ>ffbY; z8E~!0G(y;lfEqZ5=@x(6>IVD!vXG@OPavF4Z+0(E3gr%9q$bdWk%8y|-d%$x*vV4F zqi|%|=HJc6I=^@U+TlQ$dA4c9YtgS0v(az<}66 zgzg6UGd%l$gV>S}0186?Jh&SWJLh`rQ(orO^<~FiioHvL8g|oAbMa~Y?~d4^f$^#G zJMaHDRt`0g1obklh~s}3cSU23rODwhss9`6CH7hW|GIG?67a{%gWv~0XAl7U(Nfb< JEm1ZL`hUrto)Z87 literal 0 HcmV?d00001 diff --git a/torch/functional.py b/torch/functional.py index ab8f70f6bffaf..78f833eaf5417 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -330,23 +330,54 @@ def meshgrid(*tensors: Union[Tensor, List[Tensor]]) -> Tuple[Tensor, ...]: return _meshgrid(*tensors) else: def meshgrid(*tensors): - r"""Take :math:`N` tensors, each of which can be either scalar or 1-dimensional - vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by - expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs. + r"""Creates grids of coordinates specified by the 1D inputs in `attr`:tensors. + + This is helpful when you want to visualize data over some + range of inputs. See below for a plotting example. + + Given :math:`N` 1D tensors :math:`T_0 \ldots T_{N-1}` as + inputs with corresponding sizes :math:`S_0 \ldots S_{N-1}`, + this creates :math:`N` N-dimensional tensors :math:`G_0 \ldots + G_{N-1}`, each with shape :math:`(S_0, ..., S_{N-1})` where + the output :math:`G_i` is constructed by expanding :math:`T_i` + to the result shape. + + .. note:: + 0D inputs are treated equivalently to 1D inputs of a + single element. + + .. warning:: + `torch.meshgrid` has the same behavior as calling + `numpy.meshgrid(..., indexing='ij')`, and in the future + `torch.meshgrid` will also support the `indexing` + argument. + + https://github.com/pytorch/pytorch/issues/50276 tracks + this issue with the goal of migrating to NumPy's behavior. + + .. seealso:: + + :func:`torch.cartesian_prod` has the same effect but it + collects the data in a tensor of vectors. Args: tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be treated as tensors of size :math:`(1,)` automatically Returns: - seq (sequence of Tensors): If the input has :math:`k` tensors of size - :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also have :math:`k` tensors, - where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`. + seq (sequence of Tensors): If the input has :math:`N` + tensors of size :math:`S_0 \ldots S_{N-1}``, then the + output will also have :math:`N` tensors, where each tensor + is of shape :math:`(S_0, ..., S_{N-1})`. Example:: >>> x = torch.tensor([1, 2, 3]) >>> y = torch.tensor([4, 5, 6]) + + Observe the element-wise pairings across the grid, (1, 4), + (1, 5), ..., (3, 6). This is the same thing as the + cartesian product. >>> grid_x, grid_y = torch.meshgrid(x, y) >>> grid_x tensor([[1, 1, 1], @@ -356,6 +387,28 @@ def meshgrid(*tensors): tensor([[4, 5, 6], [4, 5, 6], [4, 5, 6]]) + + This correspondence can be seen when these grids are + stacked properly. + >>> torch.equal(torch.cat(tuple(torch.dstack([grid_x, grid_y]))), + ... torch.cartesian_prod(x, y)) + True + + `torch.meshgrid` is commonly used to produce a grid for + plotting. + >>> import matplotlib.pyplot as plt + >>> xs = torch.linspace(-5, 5, steps=100) + >>> ys = torch.linspace(-5, 5, steps=100) + >>> x, y = torch.meshgrid(xs, ys) + >>> z = torch.sin(torch.sqrt(x * x + y * y)) + >>> ax = plt.axes(projection='3d') + >>> ax.plot_surface(x.numpy(), y.numpy(), z.numpy()) + + >>> plt.show() + + .. image:: ../_static/img/meshgrid.png + :width: 512 + """ return _meshgrid(*tensors) From 2b303f3f315b566d1103859b34d55e4d6ee21cd7 Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Wed, 18 Aug 2021 04:04:43 -0700 Subject: [PATCH 013/530] enhance comparison tests for c10::optional (#62887) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62887 Reviewed By: VitalyFedyunin Differential Revision: D30305044 Pulled By: dagitses fbshipit-source-id: d0a3a9e4ea186915ef087543aaf81a606f943380 --- c10/test/util/optional_test.cpp | 86 +++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/c10/test/util/optional_test.cpp b/c10/test/util/optional_test.cpp index 1e34377282898..cac325f9188ab 100644 --- a/c10/test/util/optional_test.cpp +++ b/c10/test/util/optional_test.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -8,6 +9,14 @@ namespace { +using testing::Eq; +using testing::Ge; +using testing::Gt; +using testing::Le; +using testing::Lt; +using testing::Ne; +using testing::Not; + template class OptionalTest : public ::testing::Test { public: @@ -90,4 +99,81 @@ TYPED_TEST(OptionalTest, Initialized) { } } +class SelfCompareTest : public testing::TestWithParam> {}; + +TEST_P(SelfCompareTest, SelfCompare) { + c10::optional x = GetParam(); + EXPECT_THAT(x, Eq(x)); + EXPECT_THAT(x, Le(x)); + EXPECT_THAT(x, Ge(x)); + EXPECT_THAT(x, Not(Ne(x))); + EXPECT_THAT(x, Not(Lt(x))); + EXPECT_THAT(x, Not(Gt(x))); +} + +INSTANTIATE_TEST_CASE_P( + nullopt, + SelfCompareTest, + testing::Values(c10::nullopt)); +INSTANTIATE_TEST_CASE_P( + int, + SelfCompareTest, + testing::Values(c10::make_optional(2))); + +TEST(OptionalTest, Nullopt) { + c10::optional x = 2; + + EXPECT_THAT(c10::nullopt, Not(Eq(x))); + EXPECT_THAT(x, Not(Eq(c10::nullopt))); + + EXPECT_THAT(x, Ne(c10::nullopt)); + EXPECT_THAT(c10::nullopt, Ne(x)); + + EXPECT_THAT(x, Not(Lt(c10::nullopt))); + EXPECT_THAT(c10::nullopt, Lt(x)); + + EXPECT_THAT(x, Not(Le(c10::nullopt))); + EXPECT_THAT(c10::nullopt, Le(x)); + + EXPECT_THAT(x, Gt(c10::nullopt)); + EXPECT_THAT(c10::nullopt, Not(Gt(x))); + + EXPECT_THAT(x, Ge(c10::nullopt)); + EXPECT_THAT(c10::nullopt, Not(Ge(x))); +} + +// Ensure comparisons work... +using CmpTestTypes = testing::Types< + // between two optionals + std::pair, c10::optional>, + // between an optional and a value + std::pair, int>, + // between a value and an optional + std::pair>>; +template +class CmpTest : public testing::Test {}; +TYPED_TEST_CASE(CmpTest, CmpTestTypes); + +TYPED_TEST(CmpTest, Cmp) { + TypeParam pair = {2, 3}; + auto x = pair.first; + auto y = pair.second; + + EXPECT_THAT(x, Not(Eq(y))); + + EXPECT_THAT(x, Ne(y)); + + EXPECT_THAT(x, Lt(y)); + EXPECT_THAT(y, Not(Lt(x))); + + EXPECT_THAT(x, Le(y)); + EXPECT_THAT(y, Not(Le(x))); + + EXPECT_THAT(x, Not(Gt(y))); + EXPECT_THAT(y, Gt(x)); + + EXPECT_THAT(x, Not(Ge(y))); + EXPECT_THAT(y, Ge(x)); +} + } // namespace From 4a390a56c4a345a917f5a4d8ee6cd47818fa7e84 Mon Sep 17 00:00:00 2001 From: CodemodService FBSourceClangFormatLinterBot <> Date: Wed, 18 Aug 2021 04:18:47 -0700 Subject: [PATCH 014/530] [AutoAccept][Codemod][FBSourceClangFormatLinter] Daily `arc lint --take CLANGFORMAT` Reviewed By: zertosh Differential Revision: D30391472 fbshipit-source-id: d4eb1e7debea8905e7fee5f026c082bee65e78f3 --- torch/csrc/distributed/rpc/python_functions.cpp | 9 +++++---- torch/csrc/jit/passes/shape_analysis.cpp | 1 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/torch/csrc/distributed/rpc/python_functions.cpp b/torch/csrc/distributed/rpc/python_functions.cpp index 2d6533d797175..60d67c558dcae 100644 --- a/torch/csrc/distributed/rpc/python_functions.cpp +++ b/torch/csrc/distributed/rpc/python_functions.cpp @@ -155,9 +155,9 @@ c10::intrusive_ptr toPyJitFuture( } catch (py::error_already_set& e) { py::gil_scoped_acquire acquire; // FIXME: this is a temporary solution to add a special-case for - // ValueError and TypeError, as those are already used in our tests. - // We should have a more comprehensive coverage for other types of - // exceptions as well. + // ValueError and TypeError, as those are already used in our + // tests. We should have a more comprehensive coverage for other + // types of exceptions as well. if (e.matches(PyExc_ValueError)) { child->setErrorIfNeeded( std::make_exception_ptr(pybind11::value_error(e.what()))); @@ -165,7 +165,8 @@ c10::intrusive_ptr toPyJitFuture( child->setErrorIfNeeded( std::make_exception_ptr(pybind11::type_error(e.what()))); } else { - // py::error_already_set requires GIL to destruct, take special care. + // py::error_already_set requires GIL to destruct, take special + // care. child->setErrorIfNeeded( std::make_exception_ptr(std::runtime_error(e.what()))); } diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 3024811fef6bd..47cd30b3d43ac 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -41,7 +41,6 @@ bool mergeTypes( return changed; } - namespace prim { using namespace ::c10::prim; } From 30e1c74dc19ae2b622b46ebcdb7972c42775ac80 Mon Sep 17 00:00:00 2001 From: JackCaoG Date: Wed, 18 Aug 2021 06:42:51 -0700 Subject: [PATCH 015/530] Update cuda amp to also check xla device (#63413) Summary: Fixes https://github.com/pytorch/xla/issues/3086. Pytorch/XLA:GPU also use cuda amp. I verified the pt/xla `test_autocast` with this fix and all test passed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63413 Reviewed By: ngimel Differential Revision: D30380785 Pulled By: bdhirsh fbshipit-source-id: fd1a1de7d224c616fc3fa90b80a688a21f6b1ecc --- torch/autocast_mode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/autocast_mode.py b/torch/autocast_mode.py index edf36d25745fc..ec9fdb0326d62 100644 --- a/torch/autocast_mode.py +++ b/torch/autocast_mode.py @@ -135,7 +135,7 @@ def __init__(self, device_type, enabled=True, **kwargs): self.fast_dtype = torch.get_autocast_cpu_dtype() else: raise RuntimeError('User specified autocast device_type must be \'cuda\' or \'cpu\'') - if not torch.cuda.is_available() and self.device == 'cuda': + if torch.cuda.amp.common.amp_definitely_not_available() and self.device == 'cuda': warnings.warn('User provided device_type of \'cuda\', but CUDA is not available. Disabling') enabled = False for key, value in kwargs.items(): From a00d5878497af2a0bb599fe939369de49256b1ea Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 18 Aug 2021 07:36:22 -0700 Subject: [PATCH 016/530] add `OpInfo` for `torch.linalg.tensorinv` (#62326) Summary: Fixes https://github.com/pytorch/pytorch/issues/53739. Pull Request resolved: https://github.com/pytorch/pytorch/pull/62326 Reviewed By: H-Huang Differential Revision: D30136376 Pulled By: zou3519 fbshipit-source-id: 04ec9450e8866667649af401c7559b96ddc91491 --- aten/src/ATen/native/LinearAlgebra.cpp | 9 ++---- .../_internal/common_methods_invocations.py | 29 +++++++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index bbb6fce844524..10576a0c63a49 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -2651,12 +2651,9 @@ Tensor linalg_tensorinv(const Tensor& self, int64_t ind) { shape_ind_end.insert(shape_ind_end.cend(), shape_start_ind.cbegin(), shape_start_ind.cend()); // If the reshaped self is not invertible catch this error - Tensor result; - try { - result = at::inverse(self.reshape({prod_ind_end, prod_ind_end})); - } catch (...) { - TORCH_CHECK(false, "Failed to invert the input tensor, because it is singular."); - } + Tensor result, info; + std::tie(result, info) = at::linalg_inv_ex(self.reshape({prod_ind_end, prod_ind_end}), /*check_errors=*/false); + TORCH_CHECK(info.item() == 0, "Failed to invert the input tensor, because it is singular."); return result.reshape(shape_ind_end); } diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index b281c5e474c41..f06d3ce899749 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -4986,6 +4986,22 @@ def sample_inputs_softplus(op_info, device, dtype, requires_grad, **kwargs): SampleInput(make_input(low=1), kwargs=dict(threshold=1)), ] +def sample_inputs_tensorinv(op_info, device, dtype, requires_grad, **kwargs): + def make_input(): + input = make_fullrank_matrices_with_distinct_singular_values(12, 12, device=device, dtype=dtype) + return input.requires_grad_(requires_grad) + + # lhs / rhs shape can have any number of dimensions as long as their product equals 12 + shapes = [ + ((2, 2, 3), (12, 1)), + ((4, 3), (6, 1, 2)), + ] + + return [ + SampleInput(make_input().reshape(*shape_lhs, *shape_rhs), kwargs=dict(ind=len(shape_lhs))) + for shape_lhs, shape_rhs in shapes + ] + def sample_inputs_mse_loss(op_info, device, dtype, requires_grad, **kwargs): _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -8673,6 +8689,19 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): ), ), ), + OpInfo( + "linalg.tensorinv", + ref=np.linalg.tensorinv, + dtypes=floating_and_complex_types(), + skips=( + # RuntimeError: aliasOp != torch::jit::getOperatorAliasMap().end() + # INTERNAL ASSERT FAILED at "../torch/csrc/jit/passes/utils/check_alias_annotation.cpp":159, + # please report a bug to PyTorch. + SkipInfo('TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32,)), + ), + sample_inputs_func=sample_inputs_tensorinv, + supports_forward_ad=True, + ), OpInfo( "nn.functional.mse_loss", ref=reference_mse_loss, From 061b36e2f58fea2ec9c06577c4bef70a4519af20 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 18 Aug 2021 07:36:47 -0700 Subject: [PATCH 017/530] [fx2trt] Add dequantize support (#63448) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63448 Only available after TensorRT 8.0 Test Plan: buck run mode/opt caffe2/torch/fb/fx2trt:test_dequantize Reviewed By: 842974287 Differential Revision: D30296863 fbshipit-source-id: 44b9630ef0d210e7f20e650dc81c519f7e41f5f3 --- .../fx2trt/converters/acc_ops_converters.py | 32 +++++++++++++++++++ torch/fx/experimental/fx_acc/acc_ops.py | 32 +++++++++++++++++-- torch/fx/experimental/graph_manipulation.py | 2 +- 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index 0bca6e28c83b6..88a74fe9e32c0 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -1138,3 +1138,35 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name): layer.axis = 0 layer.name = input_val.name + ".quant" return layer.get_output(0) + +@tensorrt_converter(acc_ops.dequantize) +def acc_ops_dequantize(network, target, args, kwargs, name): + """ + Currently just a no-op. + """ + input_val = kwargs["input"] + + if not isinstance(input_val, trt.tensorrt.ITensor): + raise RuntimeError(f"{name} received input {input_val} that is not part " + "of the TensorRT region!") + + q_scale = acc_utils.get_field_from_acc_out_ty(kwargs["input_tensor_meta"], "q_scale") + q_zero_point = acc_utils.get_field_from_acc_out_ty(kwargs["input_tensor_meta"], "q_zero_point") + dtype = acc_utils.get_field_from_acc_out_ty(kwargs["input_tensor_meta"], "dtype") + + if dtype not in (torch.quint8, torch.qint8, torch.qint32): + raise RuntimeError("Only support (torch.quint8, torch.qint8, torch.qint32) " + f"quantized type in dequantize, get {dtype}.") + + if q_zero_point != 0: + raise RuntimeError(f"Only support zero_point == 0, get {q_zero_point}") + + scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([q_scale], dtype=np.float32))) + scale_layer.name = input_val.name + ".dequant.scale" + scale = scale_layer.get_output(0) + assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in " + "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ + layer = network.add_dequantize(input=input_val, scale=scale) + layer.name = input_val.name + ".dequant" + layer.axis = 0 + return layer.get_output(0) diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py index bc4dfb3c4fe5f..9b2c7f95e0000 100644 --- a/torch/fx/experimental/fx_acc/acc_ops.py +++ b/torch/fx/experimental/fx_acc/acc_ops.py @@ -462,10 +462,12 @@ def quantize_per_tensor(*, input, acc_out_ty=None): ) -@register_acc_op_mapping(op_and_target=("call_function", torch.dequantize)) -@register_acc_op_mapping(op_and_target=("call_method", "dequantize")) @register_acc_op -def dequantize(*, input): +def dequantize(*, input, input_tensor_meta): + """ `input_tensor_meta` contains extra argument of quantization + parameters, e.g. scale/zero_point and will be using for + lowring dequantize op to TensorRT + """ return torch.dequantize(input) @@ -1174,3 +1176,27 @@ def packed_quantized_convrelu2d_mapper( ) relu_node.meta = node.meta return relu_node + +@register_custom_acc_mapper_fn( + op_and_target=("call_function", torch.dequantize), + arg_replacement_tuples=[ + ("input", "input") + ] +) +@register_custom_acc_mapper_fn( + op_and_target=("call_method", "dequantize"), + arg_replacement_tuples=[ + ("input", "input") + ] +) +def custom_dequantize_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node: + assert "tensor_meta" in node.kwargs["input"].meta + new_kwargs = {"input": node.kwargs["input"], "input_tensor_meta": node.kwargs["input"].meta["tensor_meta"]} + # `input_tensor_meta` contains quantization parameters that can be used to lower + # acc_ops.dequantize to TensorRT ops + with node.graph.inserting_before(node): + new_node = node.graph.create_node( + "call_function", dequantize, kwargs=new_kwargs, name=node.name + ) + new_node.meta = node.meta + return new_node diff --git a/torch/fx/experimental/graph_manipulation.py b/torch/fx/experimental/graph_manipulation.py index 9d0af5343ae9a..6daa000f609d1 100644 --- a/torch/fx/experimental/graph_manipulation.py +++ b/torch/fx/experimental/graph_manipulation.py @@ -412,7 +412,7 @@ def get_user_info(user_node: Argument) -> Any: def get_arg_info(arg: Argument) -> Any: if isinstance(arg, torch.fx.Node): return {"is_node": True, "name": str(arg)} - elif isinstance(arg, torch.dtype): + elif isinstance(arg, (torch.dtype, torch.memory_format, torch.qscheme)): return str(arg) else: return arg From c508433617687130bb45a0aad95ae454de425fc9 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 18 Aug 2021 07:45:45 -0700 Subject: [PATCH 018/530] Implement subclass priority for __torch_dispatch__ (#63411) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63411 In order to get this behavior, you have to use append_overloaded, which I forgot to use in the previous implementation. I exposed an internal helper function which is more appropriate for dispatch to Python where we know that an argument is definitely a Tensor (and this test no longer needs to be done). Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: zou3519 Differential Revision: D30374489 Pulled By: ezyang fbshipit-source-id: 43b08c00d1958c9b26d82a025d19f0b67bb85590 --- test/test_python_dispatch.py | 33 +++++++++++++++++++++++++ torch/csrc/autograd/python_variable.cpp | 7 +++--- torch/csrc/utils/python_arg_parser.h | 10 ++++++++ 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index adacc7efb7093..0f5b6b9cbd70e 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -246,6 +246,39 @@ def test_version(self) -> None: x.data.add_(2) self.assertEqual(cur_vc, x._version) + def test_subclass_priority(self) -> None: + class ErrorA(RuntimeError): + pass + + class ErrorB(RuntimeError): + pass + + # The big tests for code coverage are test_precedence_semantics in + # test_overrides.py; this is just to make sure it is wired up at all + # correctly for __torch_dispatch__ + class A(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + raise ErrorA + + class B(A): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + raise ErrorB + + self.assertRaises(ErrorA, lambda: torch.add(A(torch.empty(1)), A(torch.empty(1)))) + self.assertRaises(ErrorB, lambda: torch.add(A(torch.empty(1)), B(torch.empty(1)))) + self.assertRaises(ErrorB, lambda: torch.add(B(torch.empty(1)), A(torch.empty(1)))) + self.assertRaises(ErrorB, lambda: torch.add(B(torch.empty(1)), B(torch.empty(1)))) + def test_format(self) -> None: x = LoggingTensor(torch.ones(1)) s1 = str(x) diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 9496d668b3468..303584603aaa0 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -1562,7 +1562,7 @@ void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHa if (ivalue.isTensor()) { const auto& tensor = ivalue.toTensor(); if (isPythonTensor(tensor)) { - overloaded_args.emplace_back(py::cast(tensor)); + append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr()); } } else if (ivalue.isList()) { const auto& list = ivalue.toListRef(); @@ -1571,7 +1571,7 @@ void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHa if (nv.isTensor()) { const auto& tensor = nv.toTensor(); if (isPythonTensor(tensor)) { - overloaded_args.emplace_back(py::cast(tensor)); + append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr()); } } } @@ -1620,7 +1620,8 @@ c10::intrusive_ptr concrete_detach_fn(const c10::impl::PyInterpreter // TODO: fix the constness of target Tensor self_t = Tensor(c10::intrusive_ptr::unsafe_reclaim_from_nonowning(const_cast(self))); auto self_p = py::reinterpret_steal(THPVariable_Wrap(self_t)); - overloaded_args.emplace_back(self_p); + TORCH_INTERNAL_ASSERT(isPythonTensor(self_t)); + append_overloaded_arg(&overloaded_args, self_p.ptr()); auto args = py::reinterpret_steal(PyTuple_New(1)); PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr()); diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index c9a1e4a39aeef..d132185ccaefb 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -810,4 +810,14 @@ bool is_tensor_and_append_overloaded(PyObject* obj, std::vector* ove */ bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector* overloaded_args, int argnum, bool throw_error); +/* Given an argument that is definitely a tensor and is definitely overloaded, + * append it to the overloaded arguments list. Use this instead of + * is_tensor_and_append_overloaded in situations where you have a PyObject + * and you know it definitely is a Tensor and it is definitely overloaded. + * + * 'overloaded_args': the vector to append the overloaded args + * 'obj': the input tensor that is overloaded + */ +void append_overloaded_arg(std::vector* overloaded_args, PyObject* obj); + } // namespace torch From 93582e3bba33fbb7aade5ce3560ebde6e889f5fc Mon Sep 17 00:00:00 2001 From: cyy Date: Wed, 18 Aug 2021 08:04:08 -0700 Subject: [PATCH 019/530] A tiny fix in MT19937RNGEngine (#63219) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63219 Reviewed By: VitalyFedyunin Differential Revision: D30341484 Pulled By: ezyang fbshipit-source-id: 0ff4499d0f4a3dfeb991c0f10fe3248c6ca1c992 --- aten/src/ATen/core/MT19937RNGEngine.h | 1 - 1 file changed, 1 deletion(-) diff --git a/aten/src/ATen/core/MT19937RNGEngine.h b/aten/src/ATen/core/MT19937RNGEngine.h index 033df304e4a8e..40c1ba5f584ad 100644 --- a/aten/src/ATen/core/MT19937RNGEngine.h +++ b/aten/src/ATen/core/MT19937RNGEngine.h @@ -157,7 +157,6 @@ class mt19937_engine { data_.state_[0] = seed & 0xffffffff; for(int j = 1; j < MERSENNE_STATE_N; j++) { data_.state_[j] = (1812433253 * (data_.state_[j-1] ^ (data_.state_[j-1] >> 30)) + j); - data_.state_[j] &= 0xffffffff; } data_.left_ = 1; data_.next_ = 0; From 383a33a0eb28ae454c0c8965650aea8ce1608943 Mon Sep 17 00:00:00 2001 From: Erjia Guan Date: Wed, 18 Aug 2021 08:47:27 -0700 Subject: [PATCH 020/530] Make DataChunk support list in-place ops (#63422) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63422 Fixes #63095 Make `DataChunk` delegate to list method. Then it will support in-place operations: - `sort` - `reverse` - `append` - `extend` - `random.shuffle` Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D30379027 Pulled By: ejguan fbshipit-source-id: d176bd0cc8b89b915c7bb184ff243ab1f605616d --- test/test_datapipe.py | 48 +++++++++++++++++++-- torch/utils/data/datapipes/iter/grouping.py | 10 ++--- torch/utils/data/dataset.py | 14 ++---- 3 files changed, 52 insertions(+), 20 deletions(-) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 9a7876e334639..9c2380112705d 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -110,14 +110,54 @@ def create_temp_dir_and_files(): class TestDataChunk(TestCase): + def setUp(self): + self.elements = list(range(10)) + random.shuffle(self.elements) + self.chunk: DataChunk[int] = DataChunk(self.elements) + + def test_getitem(self): + for i in range(10): + self.assertEqual(self.elements[i], self.chunk[i]) + + def test_iter(self): + for ele, dc in zip(self.elements, iter(self.chunk)): + self.assertEqual(ele, dc) + + def test_len(self): + self.assertEqual(len(self.elements), len(self.chunk)) + def test_as_string(self): + self.assertEqual(str(self.chunk), str(self.elements)) + + batch = [self.elements] * 3 + chunks: List[DataChunk[int]] = [DataChunk(self.elements)] * 3 + self.assertEqual(str(batch), str(chunks)) + + def test_sort(self): + chunk: DataChunk[int] = DataChunk(self.elements) + chunk.sort() + self.assertTrue(isinstance(chunk, DataChunk)) + for i, d in enumerate(chunk): + self.assertEqual(i, d) + + def test_reverse(self): + chunk: DataChunk[int] = DataChunk(self.elements) + chunk.reverse() + self.assertTrue(isinstance(chunk, DataChunk)) + for i in range(10): + self.assertEqual(chunk[i], self.elements[9 - i]) + + def test_random_shuffle(self): elements = list(range(10)) chunk: DataChunk[int] = DataChunk(elements) - self.assertEqual(str(chunk), str(elements)) - batch = [elements] * 3 - chunks: List[DataChunk] = [DataChunk(elements)] * 3 - self.assertEqual(str(chunk), str(elements)) + rng = random.Random(0) + rng.shuffle(chunk) + + rng = random.Random(0) + rng.shuffle(elements) + + self.assertEqual(chunk, elements) class TestIterableDataPipeBasic(TestCase): diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py index 1bd8c4cf4c315..e6304c2de8217 100644 --- a/torch/utils/data/datapipes/iter/grouping.py +++ b/torch/utils/data/datapipes/iter/grouping.py @@ -133,14 +133,14 @@ def _dive(self, element, unbatch_level): else: raise IndexError(f"unbatch_level {self.unbatch_level} exceeds the depth of the DataPipe") -# TODO(ejguan): https://github.com/pytorch/pytorch/issues/63095 + def _in_batch_shuffle_fn(data: DataChunk): - d = list(data) - random.shuffle(d) - return DataChunk(d) + random.shuffle(data) + return data + class BucketBatcherIterDataPipe(IterDataPipe[DataChunk[T_co]]): - r""" :class:`BucketBatcherIterDataPipe`. + r""":class:`BucketBatcherIterDataPipe`. Iterable DataPipe to create mini-batches of data from sorted bucket. An outer dimension will be added as `batch_size` if `drop_last` is set to `True`, diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py index 5b8102c235607..7a069d61de6cc 100644 --- a/torch/utils/data/dataset.py +++ b/torch/utils/data/dataset.py @@ -25,25 +25,17 @@ T = TypeVar('T') -class DataChunk(List[T]): +class DataChunk(list, Generic[T]): def __init__(self, items): + super().__init__(items) self.items = items - def __getitem__(self, key): - return self.items[key] - - def __len__(self): - return len(self.items) - def as_str(self, indent=''): res = indent + "[" + ", ".join([str(i) for i in iter(self)]) + "]" return res - def __repr__(self): - return self.as_str() - def __iter__(self) -> Iterator[T]: - for i in self.items: + for i in super().__iter__(): yield i def raw_iterator(self): From e2ddaec5cf6608b8e06667d4873505609ff1d674 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Wed, 18 Aug 2021 09:41:37 -0700 Subject: [PATCH 021/530] Reverting launch bounds change in topK that induced a regression in perf (#63431) Summary: [topkwsyncs.zip](https://github.com/pytorch/pytorch/files/7003077/topkwsyncs.zip) Running this script on nvidia containers 21.08 vs 21.07 we see the following perf drops: topk(input=(dtype=torch.float16,shape=[60, 201600]), k=2000, dim=1, sorted=True) - 0.63 topk(input=(dtype=torch.float32,shape=[120000]), k=12000, dim=0, sorted=False) - 0.55 topk(input=(dtype=torch.float16,shape=[5, 201600]), k=2000, dim=1, sorted=True) - 0.55 topk(input=(dtype=torch.float32,shape=[1, 10000]), k=1000, dim=1, sorted=False) - 0.33 The relative perf drop is reported as (21.08_time - 21.07_time) / 21.07_time I narrowed down the source of the regression to this commit: https://github.com/pytorch/pytorch/pull/60314 which reduced launch bounds from 1024 to 512. The perf did not seem to regress in the original evidence provided to change 1024 to 512 due to the input shapes in the benchmark being a lot smaller than the input shapes of the tensors which I am witnessing perf regression in. I suggest reverting back to 1024 as with 512 there was no considerable improvement in perf for small inputs and a major regression in perf for large tensors. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63431 Reviewed By: mruberry Differential Revision: D30384087 Pulled By: ngimel fbshipit-source-id: 11eecbba82a069b1d4579d674c3f644ab8060ad2 --- aten/src/ATen/native/cuda/TensorTopK.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index d6b4fe2620191..c0bc353110b6f 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -15,7 +15,7 @@ namespace at { namespace native { namespace { template -C10_LAUNCH_BOUNDS_1(512) +C10_LAUNCH_BOUNDS_1(1024) __global__ void gatherTopK(at::cuda::detail::TensorInfo input, IndexType inputSliceSize, IndexType outputSliceSize, // aka `k` @@ -255,7 +255,7 @@ TORCH_IMPL_FUNC(topk_out_cuda) dim3 grid; \ TORCH_INTERNAL_ASSERT(getGridFromTiles(inputSlices, grid), "Too many slices to sort"); \ \ - dim3 block(std::min(at::cuda::ATenCeilDiv(sliceSize, (int64_t) C10_WARP_SIZE)*(int64_t) C10_WARP_SIZE, (int64_t) 512)); \ + dim3 block(std::min(at::cuda::ATenCeilDiv(sliceSize, (int64_t) C10_WARP_SIZE)*(int64_t) C10_WARP_SIZE, (int64_t) 1024)); \ \ /* This is used as a template parameter to calculate indices. */ \ /* We only specialize it if all collapsed dim sizes are the */ \ From 565578cdab2cbf18fac0ea97c1a1f954492cc8b5 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Wed, 18 Aug 2021 09:42:14 -0700 Subject: [PATCH 022/530] Use `fastAtomicAdd` in EmbeddingBag (mode "max") backward (#63298) Summary: Rel: https://github.com/pytorch/pytorch/issues/62695 ### This PR | n_tokens | num_embeddings | embedding_dim | mode | bwd_fp32 | bwd_fp16 | |-----------:|-----------------:|----------------:|:-------|------------:|------------:| | 4096 | 4096 | 4096 | max | 0.000326228 | 0.000181448 | | 4096 | 4096 | 16384 | max | 0.00102805 | 0.000618136 | | 4096 | 16384 | 4096 | max | 0.000907326 | 0.000530422 | | 4096 | 16384 | 16384 | max | 0.00334988 | 0.00264645 | | 16384 | 4096 | 4096 | max | 0.000366449 | 0.000320232 | | 16384 | 4096 | 16384 | max | 0.00126421 | 0.00104183 | | 16384 | 16384 | 4096 | max | 0.00087738 | 0.00065068 | | 16384 | 16384 | 16384 | max | 0.00379229 | 0.00298201 | ### Original | n_tokens | num_embeddings | embedding_dim | mode | bwd_fp32 | bwd_fp16 | |-----------:|-----------------:|----------------:|:-------|------------:|------------:| | 4096 | 4096 | 4096 | max | 0.00032407 | 0.000188231 | | 4096 | 4096 | 16384 | max | 0.00104356 | 0.000624001 | | 4096 | 16384 | 4096 | max | 0.000902069 | 0.000527382 | | 4096 | 16384 | 16384 | max | 0.00302202 | 0.00255153 | | 16384 | 4096 | 4096 | max | 0.000384343 | 0.000403249 | | 16384 | 4096 | 16384 | max | 0.00126445 | 0.00135069 | | 16384 | 16384 | 4096 | max | 0.000880814 | 0.000825679 | | 16384 | 16384 | 16384 | max | 0.00337611 | 0.00319515 | cc xwang233 ptrblck ngimel Pull Request resolved: https://github.com/pytorch/pytorch/pull/63298 Reviewed By: mruberry Differential Revision: D30383583 Pulled By: ngimel fbshipit-source-id: 14dd9d67002c53a153721812709033c198f68c1e --- aten/src/ATen/native/cuda/EmbeddingBag.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 34a9d9dd82133..35094681a79c8 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -17,6 +17,7 @@ #include #include +#include #include @@ -235,7 +236,7 @@ template __global__ void EmbeddingBag_accGradParametersKernel_max( index_t *max_indices, scalar_t *gradOutput, scalar_t *gradWeight, int64_t stride, int64_t numBags, - index_t padding_idx) { + index_t padding_idx, const index_t numel) { using accscalar_t = acc_type; @@ -252,8 +253,9 @@ __global__ void EmbeddingBag_accGradParametersKernel_max( index_t word_idx = max_indices[bag * stride + featureDim]; if (word_idx >= 0 && word_idx != padding_idx) { // If bag is empty, we have max_indices[idx] set to -1 in forward. - gpuAtomicAddNoReturn(&(gradWeight[word_idx * stride + featureDim]), - gradOutput[bag * stride + featureDim]); + fastAtomicAdd( + gradWeight, static_cast(word_idx * stride + featureDim), + numel, gradOutput[bag * stride + featureDim], true); } } } @@ -289,7 +291,7 @@ Tensor embedding_bag_backward_cuda_max(const Tensor &grad, scalar_t, index_t><<>>( max_indices.data_ptr(), grad.data_ptr(), grad_weight.data_ptr(), stride, numBags, - padding_idx); + padding_idx, grad_weight.numel()); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); From 7df2324120513c387524db604f8bcc7281aaf95f Mon Sep 17 00:00:00 2001 From: John Shen Date: Wed, 18 Aug 2021 10:35:55 -0700 Subject: [PATCH 023/530] [pytorch] Make qconv forward() thread safe (#63432) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63432 There's a race condition in quantized models when multiple threads call forward() due to qnnpack packing the weights the first time the operator is called. This locks the entire apply_impl function. Test Plan: https://github.com/pytorch/pytorch/issues/58055 Ran the script before and after, original crashes went away Reviewed By: kimishpatel Differential Revision: D30229520 fbshipit-source-id: d06cabe24199a80325cd57f24a7fd60624be2cf7 --- aten/src/ATen/native/quantized/cpu/qconv.cpp | 2 ++ aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp | 9 ++++----- aten/src/ATen/native/quantized/cpu/qnnpack_utils.h | 1 + 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index bf5c596a9e0d2..3c0d79acac18c 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -563,6 +563,8 @@ at::Tensor PackedConvWeightsQnnp::apply_impl( const at::Tensor& act, double output_scale, int64_t output_zero_point) { + // QNNPack is not thread safe + std::lock_guard lock(qnnp_mutex_); const std::string func_name = transpose() ? "quantized::conv_transpose" : "quantized::conv"; TORCH_CHECK(!(kReluFused && transpose()), diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index dff28b141f6b1..87294c11adda0 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -276,9 +276,8 @@ c10::intrusive_ptr> PackedConvWeightsQnnp< // during the first invocation of operator run. Refer to qconv.cpp for more // details. TODO Update to actually call pre-pack here once bias is removed // from pre-packing step. - c10::intrusive_ptr> ret_ptr = - c10::make_intrusive>( - PackedConvWeightsQnnp{ + auto ret_ptr = + c10::intrusive_ptr>::make( nullptr, /* PrePackConvWeights */ weight_contig, /* int8_t weight */ bias_fp32.contiguous(), /* fp32 bias */ @@ -289,10 +288,10 @@ c10::intrusive_ptr> PackedConvWeightsQnnp< groups, transpose, c10::nullopt, /* input_scale */ - {kernel_h, kernel_w}, + std::vector{kernel_h, kernel_w}, w_scales, std::move(w_zero_points), - is_per_channel}); + is_per_channel); return ret_ptr; } diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h index 161be5a2f8fa3..91ede920b87e2 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h @@ -292,6 +292,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { } private: + std::mutex qnnp_mutex_; template at::Tensor apply_impl( const at::Tensor& input, From af3cbfed9510747c776418c260c5116f662c6452 Mon Sep 17 00:00:00 2001 From: Shirong Wu Date: Wed, 18 Aug 2021 10:39:53 -0700 Subject: [PATCH 024/530] Add validation check in fx2trt interpreter (#63424) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63424 Add validation check in fx2trt for missing converter operators. If any op missing, interpreter init will report missing operators. Test Plan: for call_function and call_method: manual test with feeds benchmark and verify init failed with expected message. {F642390780} for call_module: specify a module as leaf node and make acc_tracer trace it as a node; then in fx2trt.py, in CONVERTER initialize stage make it skip recording all modules; initialize interpreter and call validator function, verify the output includes the missing module name, return value print as screenshot below. {F643458718} Reviewed By: 842974287 Differential Revision: D30294832 fbshipit-source-id: 243dca3fdfc6a174ded65248938e2a234aec19c6 --- torch/fx/experimental/fx2trt/fx2trt.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py index 160b4a7317a69..9879fd7e0952e 100644 --- a/torch/fx/experimental/fx2trt/fx2trt.py +++ b/torch/fx/experimental/fx2trt/fx2trt.py @@ -229,6 +229,10 @@ def __init__( self.input_specs = input_specs self.input_specs_iter = 0 self.validate_input_specs() + missing_ops = self.validate_conversion + if not missing_ops: + warnings.warn("Interpretation may fail due to missing operations \n" + + "\n".join(f"{i}" for i in missing_ops)) self._cur_node_name: Optional[str] = None self._input_names: List[str] = [] self._output_names: List[str] = [] @@ -290,6 +294,19 @@ def validate_input_specs(self): len(shape_ranges) == 0 ), "shape_ranges are provided for input that doesn't have dynamic dim." + def validate_conversion(self): + missing_converter = set() + + for node in self.module.graph.nodes: + if node.op in ["call_function", "call_method"] and not CONVERTERS.get(node.target): + missing_converter.add(f"{node.op} {node.target}") + elif node.op == "call_module": + submod = self.fetch_attr(node.target) + if not CONVERTERS.get(type(submod)): + missing_converter.add(f"{node.op} {type(submod)}") + + return missing_converter + def run( self, max_batch_size=64, From d565a7bd6871b39b2aee978f6fd1c15fb290c3ca Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Wed, 18 Aug 2021 10:46:09 -0700 Subject: [PATCH 025/530] [6/N] Enable opt-asan for elastic and launcher tests. (#63442) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63442 Continuation of https://github.com/pytorch/pytorch/pull/62051, I've enabled elastic and launcher tests to run in opt-asan mode which is supported with spawn multiprocessing. This allows us to completely get rid of fork based tests from torch.distributed and have all tests run in spawn mode. ghstack-source-id: 136057123 Test Plan: waitforbuildbot Reviewed By: cbalioglu Differential Revision: D30384267 fbshipit-source-id: ad3447cfb9d6e31e7ec8332d64c8ff1054858dcb --- .../server/test/local_elastic_agent_test.py | 100 +++++++++--------- .../elastic/multiprocessing/api_test.py | 15 ++- .../elastic/timer/local_timer_example.py | 8 +- test/distributed/launcher/api_test.py | 14 +-- test/distributed/launcher/launch_test.py | 6 +- test/distributed/launcher/run_test.py | 28 ++--- test/distributed/test_launcher.py | 4 +- 7 files changed, 90 insertions(+), 85 deletions(-) diff --git a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py index 9becdeb663ef6..2536b1033d56c 100644 --- a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py +++ b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py @@ -36,7 +36,7 @@ from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer from torch.distributed.rpc.backend_registry import BackendType from torch.testing._internal.common_utils import ( - TEST_WITH_ASAN, + TEST_WITH_DEV_DBG_ASAN, TEST_WITH_TSAN, sandcastle_skip_if, ) @@ -406,19 +406,19 @@ def dummy_compute(self): self.assertEqual((100, 100), return_value.shape) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_dummy_compute_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.dummy_compute) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_dummy_compute_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.dummy_compute) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_dummy_compute_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.dummy_compute) @@ -431,19 +431,19 @@ def run_happy_function(self): self.assertIsNone(res.return_values[1]) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_happy_function_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_happy_function) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_happy_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_happy_function) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_happy_function_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_happy_function) @@ -465,13 +465,13 @@ def check_master_addr_port_override(self): self.assertIsNone(res.return_values[0]) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_check_master_addr_port_override_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.check_master_addr_port_override) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_check_master_addr_port_override_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.check_master_addr_port_override) @@ -484,7 +484,7 @@ def run_check_env_function(self): self.assertFalse(res.is_failed()) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_check_env_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_check_env_function) @@ -497,19 +497,19 @@ def run_function_with_return_value(self): self.assertEqual("foo", res.return_values[1]) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_function_with_return_value_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_function_with_return_value) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_function_with_return_value_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_function_with_return_value) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_function_with_return_value_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_function_with_return_value) @@ -520,19 +520,19 @@ def simple_dist_sum(self): # _dist_sum internally checks that the sum computed is valid @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_simple_dist_sum_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.simple_dist_sum) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_simple_dist_sum_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.simple_dist_sum) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_simple_dist_sum_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.simple_dist_sum) @@ -556,19 +556,19 @@ def run_distributed_sum_homogeneous(self): self.assertSetEqual(set(range(4 + 4)), ranks) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_distributed_sum_homogeneous_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_homogeneous) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_distributed_sum_homogeneous_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_homogeneous) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_distributed_sum_homogeneous_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_homogeneous) @@ -596,19 +596,19 @@ def run_distributed_sum_heterogeneous(self): self.assertSetEqual(set(range(1 + 2 + 3)), ranks) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_distributed_sum_heterogeneous_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_heterogeneous) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_distributed_sum_heterogeneous_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_heterogeneous) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_distributed_sum_heterogeneous_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_heterogeneous) @@ -636,19 +636,19 @@ def run_sad_function(self): self.assertEqual(int(data["extraInfo"]["timestamp"]), failure.timestamp) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_sad_function_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_sad_function) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_sad_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_sad_function) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_sad_function_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_sad_function) @@ -668,19 +668,19 @@ def run_bipolar_function(self): self.assertTrue(agent._total_execution_time > 0) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_bipolar_function_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_bipolar_function) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_bipolar_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_bipolar_function) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_run_bipolar_function_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_bipolar_function) @@ -711,13 +711,13 @@ def correct_rank_assignment_heterogeneous(self): ) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_correct_rank_assignment_heterogeneous_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_heterogeneous) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_correct_rank_assignment_heterogeneous_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_heterogeneous) @@ -744,13 +744,13 @@ def correct_rank_assignment_homogeneous(self): ) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_correct_rank_assignment_homogeneous_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_homogeneous) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_correct_rank_assignment_homogeneous_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_homogeneous) @@ -852,13 +852,13 @@ def double_agent_fault_tolerance(self): self.assertEqual(0, p.exitcode) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_double_agent_fault_tolerance_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_fault_tolerance) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_double_agent_fault_tolerance_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_fault_tolerance) @@ -905,19 +905,19 @@ def double_agent_elastic(self): self.assertEqual(-signal.SIGKILL, p.exitcode) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_double_agent_elastic_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.double_agent_elastic) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_double_agent_elastic_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_elastic) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_double_agent_elastic_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_elastic) @@ -955,19 +955,19 @@ def torch_rpc(self): self.assertEqual([f"{msg} from worker"], list(master_retvals.values())) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_torch_rpc_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.torch_rpc) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_torch_rpc_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.torch_rpc) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_torch_rpc_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.torch_rpc) @@ -993,13 +993,13 @@ def workers_drift_success(self): self.assertEqual(rank, output) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_workers_drift_success_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.workers_drift_success) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_workers_drift_success_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.workers_drift_success) @@ -1024,13 +1024,13 @@ def workers_drift_fail(self): self.assertEqual(rank, output) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_workers_drift_fail_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.workers_drift_fail) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_workers_drift_fail_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.workers_drift_fail) @@ -1047,19 +1047,19 @@ def barrier_failed(self, barrier_mock): barrier_mock.assert_called_once() @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_barrier_failed_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.barrier_failed) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_barrier_failed_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.barrier_failed) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_barrier_failed_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.barrier_failed) @@ -1081,19 +1081,19 @@ def shutdown_called(self, start_processes_mock): pcontext_mock.close.assert_called_once() @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_shutdown_called_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.shutdown_called) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_shutdown_called_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.shutdown_called) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_shutdown_called_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.shutdown_called) diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index cb1db294d2791..c27d932e43cb9 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -35,6 +35,7 @@ from torch.testing._internal.common_utils import ( NO_MULTIPROCESSING_SPAWN, TEST_WITH_ASAN, + TEST_WITH_DEV_DBG_ASAN, TEST_WITH_TSAN, IS_IN_CI, IS_WINDOWS, @@ -222,7 +223,7 @@ def start_processes_zombie_test( # tests incompatible with tsan or asan -if not (TEST_WITH_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS): +if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS): class StartProcessesTest(unittest.TestCase): def setUp(self): self.test_dir = tempfile.mkdtemp(prefix=f"{self.__class__.__name__}_") @@ -386,7 +387,7 @@ def test_void_function(self): self.assertEqual({0: None, 1: None}, results.return_values) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" ) def test_function_large_ret_val(self): # python multiprocessing.queue module uses pipes and actually PipedQueues @@ -548,7 +549,7 @@ def test_multiprocessing_context_poll_raises_exception(self): # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows -if not (TEST_WITH_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS): +if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS): class StartProcessesListTest(StartProcessesTest): ######################################## # start_processes as binary tests @@ -646,7 +647,7 @@ def test_binary_redirect_and_tee(self): # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows -if not (TEST_WITH_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS or IS_IN_CI): +if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS or IS_IN_CI): class StartProcessesNotCITest(StartProcessesTest): def test_wrap_bad(self): none = "" @@ -696,7 +697,11 @@ def test_binary_signal(self): failure = results.failures[0] self.assertNotEqual(signal.SIGSEGV, failure.exitcode) - self.assertEqual("SIGSEGV", failure.signal_name()) + if TEST_WITH_ASAN: + # ASAN exit code is 1. + self.assertEqual("", failure.signal_name()) + else: + self.assertEqual("SIGSEGV", failure.signal_name()) self.assertEqual("", failure.error_file_data["message"]) def test_function_redirect_and_tee(self): diff --git a/test/distributed/elastic/timer/local_timer_example.py b/test/distributed/elastic/timer/local_timer_example.py index d73aa67ee75e7..7845c4b5001e5 100644 --- a/test/distributed/elastic/timer/local_timer_example.py +++ b/test/distributed/elastic/timer/local_timer_example.py @@ -14,7 +14,7 @@ import torch.distributed.elastic.timer as timer import torch.multiprocessing as torch_mp from torch.testing._internal.common_utils import ( - TEST_WITH_ASAN, + TEST_WITH_DEV_DBG_ASAN, TEST_WITH_TSAN, run_tests, IS_WINDOWS, @@ -55,7 +55,7 @@ class LocalTimerExample(unittest.TestCase): unittest. As of now this will SIGSEGV. """ - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible") def test_torch_mp_example(self): # in practice set the max_interval to a larger value (e.g. 60 seconds) mp_queue = mp.get_context("spawn").Queue() @@ -80,11 +80,11 @@ def test_torch_mp_example(self): server.stop() - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible") def test_example_start_method_spawn(self): self._run_example_with(start_method="spawn") - # @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible") + # @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible") # def test_example_start_method_forkserver(self): # self._run_example_with(start_method="forkserver") diff --git a/test/distributed/launcher/api_test.py b/test/distributed/launcher/api_test.py index 954b7e201a351..d2bfd360f9c31 100644 --- a/test/distributed/launcher/api_test.py +++ b/test/distributed/launcher/api_test.py @@ -30,7 +30,7 @@ _get_entrypoint_name, ) from torch.testing._internal.common_utils import ( - TEST_WITH_ASAN, + TEST_WITH_DEV_DBG_ASAN, TEST_WITH_TSAN, sandcastle_skip_if, ) @@ -128,7 +128,7 @@ def check_works_ran(self, world_size: int): ) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_launch_script_python(self): nnodes = 1 @@ -145,7 +145,7 @@ def test_launch_script_python(self): self.check_works_ran(world_size) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_launch_script_python_local_rank_transfer(self): nnodes = 1 @@ -162,7 +162,7 @@ def test_launch_script_python_local_rank_transfer(self): self.check_works_ran(world_size) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_launch_script_bash(self): nnodes = 1 @@ -177,7 +177,7 @@ def test_launch_script_bash(self): self.check_works_ran(world_size) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_launch_function(self): nnodes = 1 @@ -193,7 +193,7 @@ def test_launch_function(self): self.assertEqual(expected_res, actual_res) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_launch_dist_sum_with_static_rdzv(self): nnodes = 1 @@ -224,7 +224,7 @@ def test_launch_dist_sum_with_static_rdzv(self): self.assertEqual(expected_res, actual_res) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_launch_elastic(self): nproc_per_node = 4 diff --git a/test/distributed/launcher/launch_test.py b/test/distributed/launcher/launch_test.py index 2d27269014246..73aed1a4ea59f 100644 --- a/test/distributed/launcher/launch_test.py +++ b/test/distributed/launcher/launch_test.py @@ -14,7 +14,7 @@ import torch.distributed.launch as launch from torch.distributed.elastic.utils import get_socket_with_port from torch.testing._internal.common_utils import ( - TEST_WITH_ASAN, + TEST_WITH_DEV_DBG_ASAN, TEST_WITH_TSAN, sandcastle_skip_if, ) @@ -36,7 +36,7 @@ def tearDown(self): shutil.rmtree(self.test_dir) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_launch_without_env(self): nnodes = 1 @@ -58,7 +58,7 @@ def test_launch_without_env(self): launch.main(args) @sandcastle_skip_if( - TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" ) def test_launch_with_env(self): nnodes = 1 diff --git a/test/distributed/launcher/run_test.py b/test/distributed/launcher/run_test.py index 7318bbd630687..a63ec0382dfa4 100644 --- a/test/distributed/launcher/run_test.py +++ b/test/distributed/launcher/run_test.py @@ -22,7 +22,7 @@ from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer from torch.distributed.elastic.utils import get_socket_with_port from torch.testing._internal.common_utils import ( - TEST_WITH_ASAN, + TEST_WITH_DEV_DBG_ASAN, TEST_WITH_TSAN, sandcastle_skip_if, ) @@ -138,7 +138,7 @@ def test_launch_user_script_python_caffe2_bc(self): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") def test_launch_user_script_bash(self): run_id = str(uuid.uuid4().int) nnodes = 1 @@ -169,7 +169,7 @@ def test_launch_user_script_bash(self): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") def test_launch_user_script_default_nproc(self): run_id = str(uuid.uuid4().int) nnodes = 1 @@ -198,7 +198,7 @@ def test_launch_user_script_default_nproc(self): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") def test_launch_with_env_vars(self): run_id = str(uuid.uuid4().int) nnodes = 1 @@ -256,27 +256,27 @@ def _test_nproc_launch_configuration(self, nproc_type, expected_number): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") def test_nproc_launch_auto_configurations(self): self._test_nproc_launch_configuration("auto", os.cpu_count()) - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") def test_nproc_launch_number_configurations(self): self._test_nproc_launch_configuration("4", 4) - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") def test_nproc_launch_unknown_configurations(self): with self.assertRaises(ValueError): self._test_nproc_launch_configuration("unknown", 4) - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") @patch("torch.cuda.is_available", return_value=True) @patch("torch.cuda.device_count", return_value=3) def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): self._test_nproc_launch_configuration("auto", 3) self._test_nproc_launch_configuration("gpu", 3) - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") def test_launch_elastic(self): run_id = str(uuid.uuid4().int) min_nodes = 1 @@ -304,7 +304,7 @@ def test_launch_elastic(self): ) @mock.patch("torch.distributed.elastic.events.record") - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") def test_launch_elastic_worker_raise_exception(self, record_mock): """ Asserts that when the worker program fails and lancher raieses exception @@ -332,7 +332,7 @@ def test_launch_elastic_worker_raise_exception(self, record_mock): record_mock.assert_called_once() - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") @mock.patch( "torch.distributed.elastic.agent.server.local_elastic_agent.LocalElasticAgent.run" ) @@ -364,7 +364,7 @@ def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run) launch.main(args) record_mock.assert_called_once() - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") def test_launch_standalone(self): nnodes = 1 nproc_per_node = 4 @@ -386,7 +386,7 @@ def test_launch_standalone(self): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") def test_launch_run_path(self): nnodes = 1 nproc_per_node = 4 @@ -408,7 +408,7 @@ def test_launch_run_path(self): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") def test_launch_elastic_multiple_agents(self): run_id = str(uuid.uuid4().int) min_nodes = 1 diff --git a/test/distributed/test_launcher.py b/test/distributed/test_launcher.py index 85ba293966f2d..53faefba95f95 100644 --- a/test/distributed/test_launcher.py +++ b/test/distributed/test_launcher.py @@ -11,7 +11,7 @@ sys.exit(0) from torch.testing._internal.common_utils import ( - TEST_WITH_ASAN, + TEST_WITH_DEV_DBG_ASAN, TEST_WITH_TSAN, TestCase, run_tests, @@ -21,7 +21,7 @@ def path(script): return os.path.join(os.path.dirname(__file__), script) -if TEST_WITH_ASAN: +if TEST_WITH_DEV_DBG_ASAN: print("Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr) sys.exit(0) From 2f615f63135e834499b5d11a4fbced91d70913cc Mon Sep 17 00:00:00 2001 From: soulitzer Date: Wed, 18 Aug 2021 11:29:51 -0700 Subject: [PATCH 026/530] Improve custom function docs (#60312) Summary: - Adds some code examples for `ctx` methods and make requirements of arguments more clear - Type annotations for `save_for_backward`, `mark_dirty`, `mark_non_differentiable`, and `set_materialize_grads` (BC-breaking?) - Refactor `torch.autograd.Function` doc Pull Request resolved: https://github.com/pytorch/pytorch/pull/60312 Reviewed By: VitalyFedyunin Differential Revision: D30314961 Pulled By: soulitzer fbshipit-source-id: a284314b65662e26390417bd2b6b12cd85e68dc8 --- docs/source/autograd.rst | 8 +- docs/source/notes/extending.rst | 134 +++++++++++++++++--------- torch/autograd/function.py | 161 ++++++++++++++++++++++++++------ 3 files changed, 226 insertions(+), 77 deletions(-) diff --git a/docs/source/autograd.rst b/docs/source/autograd.rst index 5958c639813f1..6423d5d6d088c 100644 --- a/docs/source/autograd.rst +++ b/docs/source/autograd.rst @@ -189,10 +189,10 @@ When creating a new :class:`Function`, the following methods are available to `c :toctree: generated :nosignatures: - function._ContextMethodMixin.mark_dirty - function._ContextMethodMixin.mark_non_differentiable - function._ContextMethodMixin.save_for_backward - function._ContextMethodMixin.set_materialize_grads + function.FunctionCtx.mark_dirty + function.FunctionCtx.mark_non_differentiable + function.FunctionCtx.save_for_backward + function.FunctionCtx.set_materialize_grads .. _grad-check: diff --git a/docs/source/notes/extending.rst b/docs/source/notes/extending.rst index 1c89bcf10eb0a..a8d3983f9f0d9 100644 --- a/docs/source/notes/extending.rst +++ b/docs/source/notes/extending.rst @@ -13,60 +13,110 @@ Extending :mod:`torch.autograd` .. currentmodule:: torch.autograd Adding operations to :mod:`~torch.autograd` requires implementing a new -:class:`Function` subclass for each operation. Recall that :class:`Function` s -are what :mod:`~torch.autograd` uses to compute the results and gradients, and -encode the operation history. Every new function requires you to implement 2 methods: - -- :meth:`~Function.forward` - the code that performs the operation. It can take +:class:`Function` subclass for each operation. Recall that Functions +are what :mod:`~torch.autograd` uses to encode the operation history and compute +gradients. + +When to use +^^^^^^^^^^^ +In general, implement a custom function if you want to perform computations in your model +that are not differentiable or rely on non-Pytorch libraries (e.g., NumPy), but +still wish for your operation to chain with other ops and work with the autograd engine. + +In some situations, custom functions can also be used to improve performance and +memory usage: If you implemented your forward and backward passes using a +`C++ extension `_, +you can wrap them in :class:`~Function` to interface with the autograd +engine. If you'd like to reduce the number of buffers saved for the backward pass, +custom functions can be used to combine ops together. + +When not to use +^^^^^^^^^^^^^^^ +If you can already write your function in terms of PyTorch's built-in ops, its +backward graph is (most likely) already able to be recorded by autograd. In this case, you do +not need to implement the backward function yourself. Consider using a plain +old Python function. + +If you need to maintain state, i.e., trainable parameters, you should (also) use a +custom module. See the section below for more information on extending :mod:`torch.nn`. + +If you'd like to alter the gradients during the backward pass or perform a side +effect, consider registering a +`tensor `_ or +`Module `_ hook. + +How to use +^^^^^^^^^^ +Take the following steps: +1. Subclass :class:`~Function` and implement the :meth:`~Function.forward` and +:meth:`~Function.backward` methods. +2. Call the proper methods on the `ctx` argument. +3. Declare whether your function supports double backward. +4. Validate whether your gradients are correct using gradcheck. + +**Step 1:** After subclassing :class:`Function`, you'll need to define 2 methods: + +- :meth:`~Function.forward` is the code that performs the operation. It can take as many arguments as you want, with some of them being optional, if you specify the default values. All kinds of Python objects are accepted here. :class:`Tensor` arguments that track history (i.e., with ``requires_grad=True``) will be converted to ones that don't track history before the call, and their use will be registered in the graph. Note that this logic won't traverse lists/dicts/any other data structures and will only - consider :class:`Tensor` s that are direct arguments to the call. You can + consider tensors that are direct arguments to the call. You can return either a single :class:`Tensor` output, or a :class:`tuple` of - :class:`Tensor` s if there are multiple outputs. Also, please refer to the + tensors if there are multiple outputs. Also, please refer to the docs of :class:`Function` to find descriptions of useful methods that can be called only from :meth:`~Function.forward`. -- :meth:`~Function.backward` - gradient formula. It will be given +- :meth:`~Function.backward` defines the gradient formula. It will be given as many :class:`Tensor` arguments as there were outputs, with each of them - representing gradient w.r.t. that output. It should return as many - :class:`Tensor` s as there were inputs, with each of them containing the - gradient w.r.t. its corresponding input. If your inputs didn't require - gradient (:attr:`~ctx.needs_input_grad` is a tuple of booleans indicating + representing gradient w.r.t. that output. It is important NEVER to modify + these in-place. It should return as many tensors as there + were inputs, with each of them containing the gradient w.r.t. its + corresponding input. If your inputs didn't require gradient + (:attr:`~ctx.needs_input_grad` is a tuple of booleans indicating whether each input needs gradient computation), or were non-:class:`Tensor` objects, you can return :class:`python:None`. Also, if you have optional arguments to :meth:`~Function.forward` you can return more gradients than there were inputs, as long as they're all :any:`python:None`. -.. note:: - - It's the user's responsibility to use the special functions in the forward's `ctx` - properly in order to ensure that the new :class:`Function` works properly with - the autograd engine. - - - :meth:`~torch.autograd.function._ContextMethodMixin.save_for_backward` must be - used when saving input or output of the forward to be used later in the backward. - - :meth:`~torch.autograd.function._ContextMethodMixin.mark_dirty` must be used to - mark any input that is modified inplace by the forward function. - - :meth:`~torch.autograd.function._ContextMethodMixin.mark_non_differentiable` must - be used to tell the engine if an output is not differentiable. - - :meth:`~torch.autograd.function._ContextMethodMixin.set_materialize_grads` can be - used to tell the autograd engine to optimize gradient computations in the cases where - the output does not depend on the input by not materializing grad tensors given to backward - function. That is, if set to False, None object in python or "undefined tensor" (tensor x for - which x.defined() is False) in C++ will not be converted to a tensor filled with zeros prior - to calling backward. However, supporting this optimization means your custom autograd function - has to handle gradients that are represented in this way and is thus opt-in. Default value is True. - -.. note:: - - By default, all the output Tensors that are of differentiable type will be set to - require gradient and have all autograd metadata set for them. If you don't want - them to require gradients, you can use the `mark_non_differentiable` method mentioned - above. For output Tensors that are not of differentiable type (integer types for example), - they won't be marked as requiring gradients. +**Step 2:** It is your responsibility to use the functions in the forward's `ctx` +properly in order to ensure that the new :class:`Function` works properly with +the autograd engine. + +- :meth:`~torch.autograd.function.FunctionCtx.save_for_backward` must be + used when saving input or output tensors of the forward to be used later in the backward. + Anything else, i.e., non-tensors and tensors that are neither input nor output + should be stored directly on `ctx`. +- :meth:`~torch.autograd.function.FunctionCtx.mark_dirty` must be used to + mark any input that is modified inplace by the forward function. +- :meth:`~torch.autograd.function.FunctionCtx.mark_non_differentiable` must + be used to tell the engine if an output is not differentiable. By + default all output tensors that are of differentiable type will be set + to require gradient. Tensors of non-differentiable type (i.e., integral types) + are never marked as requiring gradients. +- :meth:`~torch.autograd.function.FunctionCtx.set_materialize_grads` can be + used to tell the autograd engine to optimize gradient computations in the cases where + the output does not depend on the input by not materializing grad tensors given to backward + function. That is, if set to False, None object in python or "undefined tensor" (tensor x for + which x.defined() is False) in C++ will not be converted to a tensor filled with zeros prior + to calling backward, and so your code will need to handle such objects as if they were + tensors filled with zeros. The default value of this setting is True. + +**Step 3:** If your :class:`~Function` does not support double backward +you should explicitly declare this by decorating backward with the +:func:`~function.once_differentiable`. With this decorator, attempts to +perform double backward through your function will produce an error. +See our double backward tutorial for more information on double backward. + +**Step 4:** It is recommended that you use :func:`torch.autograd.gradcheck` +to check whether your backward function correctly computes gradients of the +forward by computing the Jacobian matrix using your backward function and +comparing the value element-wise with the Jacobian computed numerically using +finite-differencing. + +Example +^^^^^^^ Below you can find code for a ``Linear`` function from :mod:`torch.nn`, with additional comments:: @@ -151,12 +201,12 @@ And here, we optimize the above example by calling set_materialize_grads(False): return grad_output * ctx.constant, None .. note:: - Inputs to ``backward``, i.e., :attr:`grad_output`, can also be Tensors that + Inputs to ``backward``, i.e., :attr:`grad_output`, can also be tensors that track history. So if ``backward`` is implemented with differentiable operations, (e.g., invocation of another custom :class:`~torch.autograd.function`), higher order derivatives will work. - In this case, the Tensors saved with ``save_for_backward`` can also be used - in the backward and have gradients flowing back but Tensors saved in the ``ctx`` + In this case, the tensors saved with ``save_for_backward`` can also be used + in the backward and have gradients flowing back but tensors saved in the ``ctx`` won't have gradients flowing back for them. If you need gradients to flow back for a Tensor saved in the ``ctx``, you should make it an output of the custom ``Function`` and save it with ``save_for_backward``. diff --git a/torch/autograd/function.py b/torch/autograd/function.py index 4fc25c5951d11..4d6122924ec14 100644 --- a/torch/autograd/function.py +++ b/torch/autograd/function.py @@ -8,24 +8,53 @@ from collections import OrderedDict from typing import Any, List, Optional +# Formerly known as: _ContextMethodMixin +class FunctionCtx(object): -class _ContextMethodMixin(object): - - def save_for_backward(self, *tensors): + def save_for_backward(self, *tensors: torch.Tensor): r"""Saves given tensors for a future call to :func:`~Function.backward`. **This should be called at most once, and only from inside the** - :func:`forward` **method.** + :func:`forward` **method. This should only be called with input or + output tensors** - Later, saved tensors can be accessed through the :attr:`saved_tensors` + In :func:`backward`, saved tensors can be accessed through the :attr:`saved_tensors` attribute. Before returning them to the user, a check is made to ensure they weren't used in any in-place operation that modified their content. - Arguments can also be ``None``. + Arguments can also be ``None``. This is a no-op. + + See :ref:`extending-autograd` for more details on how to use this method. + + Example:: + >>> class Func(Function): + >>> @staticmethod + >>> def forward(ctx, x: torch.Tensor, y: torch.Tensor, z: int): + >>> w = x * y * z + >>> out = x * y + y * z + w + >>> ctx.save_for_backward(x, y, out) + >>> ctx.z = z # z is not a tensor + >>> ctx.w = w # w is neither input nor output + >>> return out + >>> + >>> @staticmethod + >>> def backward(ctx, grad_out): + >>> x, y, out = ctx.saved_tensors + >>> z = ctx.z + >>> gx = grad_out * (y + y * z) + >>> gy = grad_out * (x + z + x * z) + >>> gz = None + >>> return gx, gy, gz + >>> + >>> a = torch.tensor(1., requires_grad=True, dtype=torch.double) + >>> b = torch.tensor(2., requires_grad=True, dtype=torch.double) + >>> c = 4 + >>> d = Func.apply(a, b, c) + """ self.to_save = tensors - def mark_dirty(self, *args): + def mark_dirty(self, *args: torch.Tensor): r"""Marks given tensors as modified in an in-place operation. **This should be called at most once, only from inside the** @@ -35,6 +64,28 @@ def mark_dirty(self, *args): should be given to this function, to ensure correctness of our checks. It doesn't matter whether the function is called before or after modification. + + Examples:: + >>> class Inplace(Function): + >>> @staticmethod + >>> def forward(ctx, x): + >>> x_npy = x.numpy() # x_npy shares storage with x + >>> x_npy += 1 + >>> ctx.mark_dirty(x) + >>> return x + >>> + >>> @staticmethod + >>> @once_differentiable + >>> def backward(ctx, grad_output): + >>> return grad_output + >>> + >>> a = torch.tensor(1., requires_grad=True, dtype=torch.double).clone() + >>> b = a * a + >>> Inplace.apply(a) # This would lead to wrong gradients! + >>> # but the engine would not know unless we mark_dirty + >>> b.backward() # RuntimeError: one of the variables needed for gradient + >>> # computation has been modified by an inplace operation + """ self.dirty_tensors = args @@ -44,11 +95,11 @@ def mark_shared_storage(self, *pairs): 'Tensors with shared storages are automatically tracked. Note ' 'that calls to `set_()` are not tracked') - def mark_non_differentiable(self, *args): + def mark_non_differentiable(self, *args: torch.Tensor): r"""Marks outputs as non-differentiable. **This should be called at most once, only from inside the** - :func:`forward` **method, and all arguments should be outputs.** + :func:`forward` **method, and all arguments should be tensor outputs.** This will mark outputs as not requiring gradients, increasing the efficiency of backward computation. You still need to accept a gradient @@ -56,20 +107,73 @@ def mark_non_differentiable(self, *args): be a zero tensor with the same shape as the shape of a corresponding output. - This is used e.g. for indices returned from a max :class:`Function`. + This is used e.g. for indices returned from a sort. See example:: + >>> class Func(Function): + >>> @staticmethod + >>> def forward(ctx, x): + >>> sorted, idx = x.sort() + >>> ctx.mark_non_differentiable(idx) + >>> ctx.save_for_backward(x, idx) + >>> return sorted, idx + >>> + >>> @staticmethod + >>> @once_differentiable + >>> def backward(ctx, g1, g2): # still need to accept g2 + >>> x, idx = ctx.saved_tensors + >>> grad_input = torch.zeros_like(x) + >>> grad_input.index_add_(0, idx, g1) + >>> return grad_input + """ self.non_differentiable = args - def set_materialize_grads(self, value): - r"""Sets whether to materialize output grad tensors. Default is true. + def set_materialize_grads(self, value: bool): + r"""Sets whether to materialize output grad tensors. Default is ``True``. **This should be called only from inside the** :func:`forward` **method** - If true, undefined output grad tensors will be expanded to tensors full + If ``True``, undefined output grad tensors will be expanded to tensors full of zeros prior to calling the :func:`backward` method. + + Example:: + >>> class SimpleFunc(Function): + >>> @staticmethod + >>> def forward(ctx, x): + >>> return x.clone(), x.clone() + >>> + >>> @staticmethod + >>> @once_differentiable + >>> def backward(ctx, g1, g2): + >>> return g1 + g2 # No check for None necessary + >>> + >>> # We modify SimpleFunc to handle non-materialized grad outputs + >>> class Func(Function): + >>> @staticmethod + >>> def forward(ctx, x): + >>> ctx.set_materialize_grads(False) + >>> ctx.save_for_backward(x) + >>> return x.clone(), x.clone() + >>> + >>> @staticmethod + >>> @once_differentiable + >>> def backward(ctx, g1, g2): + >>> x, = ctx.saved_tensors + >>> grad_input = torch.zeros_like(x) + >>> if g1 is not None: # We must check for None now + >>> grad_input += g1 + >>> if g2 is not None: + >>> grad_input += g2 + >>> return grad_input + >>> + >>> a = torch.tensor(1., requires_grad=True) + >>> b, _ = Func.apply(a) # induces g2 to be undefined + """ self.materialize_grads = value +# DO NOT USE: This is only defined to be able to load old serialized models +_ContextMethodMixin = FunctionCtx + class _HookMixin(object): @staticmethod @@ -81,7 +185,7 @@ def _register_hook(backward_hooks, hook): return backward_hooks, handle -class BackwardCFunction(_C._FunctionBase, _ContextMethodMixin, _HookMixin): +class BackwardCFunction(_C._FunctionBase, FunctionCtx, _HookMixin): def apply(self, *args): # _forward_cls is defined by derived class return self._forward_cls.backward(self, *args) # type: ignore[attr-defined] @@ -103,28 +207,23 @@ def __init__(cls, name, bases, attrs): # mypy doesn't understand `with_metaclass` from torch._six -class Function(with_metaclass(FunctionMeta, _C._FunctionBase, _ContextMethodMixin, _HookMixin)): # type: ignore[misc] - r"""Records operation history and defines formulas for differentiating ops. +class Function(with_metaclass(FunctionMeta, _C._FunctionBase, FunctionCtx, _HookMixin)): # type: ignore[misc] + r"""Base class to create custom `autograd.Function` - See the Note on extending the autograd engine for more details on how to use - this class: https://pytorch.org/docs/stable/notes/extending.html#extending-torch-autograd + To create a custom `autograd.Function`, subclass this class and implement + the :meth:`forward` and :meth`backward` static methods. Then, to use your custom + op in the forward pass, call the class method ``apply``. Do not call + :meth:`forward` directly. - Every operation performed on :class:`Tensor` s creates a new function - object, that performs the computation, and records that it happened. - The history is retained in the form of a DAG of functions, with edges - denoting data dependencies (``input <- output``). Then, when backward is - called, the graph is processed in the topological ordering, by calling - :func:`backward` methods of each :class:`Function` object, and passing - returned gradients on to next :class:`Function` s. + To ensure correctness and best performance, make sure you are calling the + correct methods on ``ctx`` and validating your backward function using + :func:`torch.autograd.gradcheck`. - Normally, the only way users interact with functions is by creating - subclasses and defining new operations. This is a recommended way of - extending torch.autograd. + See :ref:`extending-autograd` for more details on how to use this class. Examples:: >>> class Exp(Function): - >>> >>> @staticmethod >>> def forward(ctx, i): >>> result = i.exp() @@ -136,7 +235,7 @@ class Function(with_metaclass(FunctionMeta, _C._FunctionBase, _ContextMethodMixi >>> result, = ctx.saved_tensors >>> return grad_output * result >>> - >>> #Use it by calling the apply method: + >>> # Use it by calling the apply method: >>> output = Exp.apply(input) """ def __init__(self, *args, **kwargs): @@ -224,7 +323,7 @@ def wrapper(ctx, *args): outputs = (outputs,) err_fn = _functions.DelayedError( - b"trying to differentiate twice a function that was marked" + b"trying to differentiate twice a function that was marked " b"with @once_differentiable", len(outputs)) # Create aliases of each output that has requires_grad=True. We need From 50a3b6a6a81395106cbbf0f2a80885d4d43a614f Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Wed, 18 Aug 2021 11:30:44 -0700 Subject: [PATCH 027/530] Make SkipInfo with expected_failure an XFAIL (#63481) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63481 This PR changes the SkipInfo decorators to use unittest.expectedFailure so that the test reports as XFAIL as opposed to PASSED. Note that changing the expectedFailure here https://github.com/pytorch/pytorch/blob/30e1c74dc19ae2b622b46ebcdb7972c42775ac80/torch/testing/_internal/common_device_type.py#L879 to an XFAIL is not possible because the decision of whether to decorate is delayed until the wrapper function is called. fixes https://github.com/pytorch/pytorch/issues/63363 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D30397154 Pulled By: heitorschueroff fbshipit-source-id: c5e4911969ad8667763eec4203dbbc6a51178592 --- torch/testing/_internal/common_methods_invocations.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index f06d3ce899749..5d55f0ec64291 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -6,6 +6,7 @@ import operator import random import numbers +import unittest import torch import numpy as np @@ -21,7 +22,7 @@ integral_types_and, all_types, double_types) from .._core import _dispatch_dtypes from torch.testing._internal.common_device_type import \ - (expectedFailure, onlyOnCPUAndCUDA, skipIf, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfNoCusolver, + (onlyOnCPUAndCUDA, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfNoCusolver, skipCPUIfNoLapack, skipCPUIfNoFFT, skipCUDAIfRocm, precisionOverride, toleranceOverride, tol) from torch.testing._internal.common_cuda import CUDA11OrLater, SM53OrLater, SM60OrLater from torch.testing._internal.common_utils import \ @@ -86,7 +87,7 @@ def __init__( active_if: whether tests matching the above arguments should be skipped expected_failure: whether to assert that skipped tests fail """ - decorator = expectedFailure(device_type) if expected_failure else skipIf(True, "Skipped!") + decorator = unittest.expectedFailure if expected_failure else unittest.skip("Skipped!") super().__init__(decorators=decorator, cls_name=cls_name, test_name=test_name, device_type=device_type, dtypes=dtypes, active_if=active_if) From 4e1d84ae8fae49995c8966ccbe0f34360978492f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 18 Aug 2021 11:37:07 -0700 Subject: [PATCH 028/530] [doc] pre-commit fix instructions (#61717) Summary: fix invalid instruction Pull Request resolved: https://github.com/pytorch/pytorch/pull/61717 Reviewed By: zhouzhuojie, driazati Differential Revision: D30359218 Pulled By: malfet fbshipit-source-id: 61771babeac4d34425a61ce49f38a7099b521eec --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7d8659a8babff..2d820a360dd7c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1151,7 +1151,7 @@ formatting and semantic checking of code. We provide a pre-commit git hook for performing these checks, before a commit is created: ```bash - ln -s ../../tools/git-pre-commit .git/hooks/pre-commit + ln -s tools/git-pre-commit .git/hooks/pre-commit ``` You'll need to install an appropriately configured flake8; see From ac1ece054b89ec8e69d5e7f5df08dcc8fd145a1e Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 18 Aug 2021 11:38:11 -0700 Subject: [PATCH 029/530] [DDP][Grad compression] Fix fp16 cpp hook (#63375) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63375 I think tensor.copy_(tensor.to(torch::kFloat16)); will keep it as float32. Tested by add the following line: ``` LOG(INFO) << "Type is: " << compressed_tensor.scalar_type(); ``` before: ``` I0816 17:03:09.823688 364141 default_comm_hooks.cpp:21] Type is: Float ``` after: ``` I0816 17:01:16.779052 353924 default_comm_hooks.cpp:21] Type is: Half ``` ghstack-source-id: 136056092 Test Plan: ci Reviewed By: SciPioneer Differential Revision: D30356256 fbshipit-source-id: 8208a705acd7628541cd43c8bf61d007dfdd2435 --- .../csrc/distributed/c10d/default_comm_hooks.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.cpp b/torch/csrc/distributed/c10d/default_comm_hooks.cpp index 9d13099c424c6..91700baa2e4a5 100644 --- a/torch/csrc/distributed/c10d/default_comm_hooks.cpp +++ b/torch/csrc/distributed/c10d/default_comm_hooks.cpp @@ -16,21 +16,23 @@ c10::intrusive_ptr AllReduceCommHook::runHook( c10::intrusive_ptr FP16CompressCommHook::runHook( GradBucket& bucket) { - auto& tensor = bucket.getBufferRef(); - tensor.copy_(tensor.to(torch::kFloat16)); - std::vector tensors = {tensor}; + + auto compressed_tensor = bucket.getBufferRef().to(torch::kFloat16); // Apply the division first to avoid overflow. - tensors[0] /= state_->getSize(); + compressed_tensor /= state_->getSize(); + std::vector tensors = {compressed_tensor}; auto allreduce_fut = state_->allreduce(tensors)->getFuture(); - auto decompress = [](c10::ivalue::Future& allreduce_fut) { + auto decompressed_tensor = bucket.getBufferRef(); + auto decompress = [decompressed_tensor](c10::ivalue::Future& allreduce_fut) { auto result = allreduce_fut.value(); TORCH_INTERNAL_ASSERT( result.isTensorList(), "ProcessGroup::allreduce should return TensorList"); + auto reduce_tensor = result.toTensorVector()[0]; - reduce_tensor.copy_(reduce_tensor.to(torch::kFloat)); - return c10::IValue(reduce_tensor); + decompressed_tensor.copy_(reduce_tensor); + return c10::IValue(decompressed_tensor); }; return allreduce_fut->then(decompress, allreduce_fut->elementType()); From f0f5cffde9196dd5ef46a6d6ba17975ea995c3ca Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 18 Aug 2021 11:38:11 -0700 Subject: [PATCH 030/530] [DDP] Add a debug check in cpp fp16 compress (#63379) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63379 this codepath has been prone to bugs as seen in the below diff, this will help ensure against changes/refactors that touch this, as a basic sanity check. Enabled it in debug-only builds to not affect the perf. ghstack-source-id: 136056093 Test Plan: CI Reviewed By: SciPioneer Differential Revision: D30358440 fbshipit-source-id: e1b3893a223722c2593ceed8696a09c7d07d47c1 --- torch/csrc/distributed/c10d/default_comm_hooks.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.cpp b/torch/csrc/distributed/c10d/default_comm_hooks.cpp index 91700baa2e4a5..30bc96b16f7db 100644 --- a/torch/csrc/distributed/c10d/default_comm_hooks.cpp +++ b/torch/csrc/distributed/c10d/default_comm_hooks.cpp @@ -1,4 +1,6 @@ #include +#include +#include #include #include @@ -31,6 +33,11 @@ c10::intrusive_ptr FP16CompressCommHook::runHook( "ProcessGroup::allreduce should return TensorList"); auto reduce_tensor = result.toTensorVector()[0]; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + reduce_tensor.scalar_type() == at::ScalarType::Half, + "Expected reduced tensor to be fp16 in FP16CompressHook, but got type ", + reduce_tensor.scalar_type() + ); decompressed_tensor.copy_(reduce_tensor); return c10::IValue(decompressed_tensor); }; From d4593d9d08ac7df2e5cf6f0f24c364c120be51ec Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Wed, 18 Aug 2021 11:39:12 -0700 Subject: [PATCH 031/530] document why wrappers exist in `torch.functional` (#62847) Summary: Fixes https://github.com/pytorch/pytorch/issues/62844. These wrappers are not super obvious, but ultimately stem from the lack of support for functions with variadic args in native_functions.yaml. https://github.com/pytorch/pytorch/issues/62845 tracks that issue. Pull Request resolved: https://github.com/pytorch/pytorch/pull/62847 Reviewed By: VitalyFedyunin Differential Revision: D30305016 Pulled By: dagitses fbshipit-source-id: 716fcecb0417b770bc92cfd8c54f7ead89070896 --- torch/functional.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/torch/functional.py b/torch/functional.py index 78f833eaf5417..81b3de234e1ca 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -66,6 +66,7 @@ def broadcast_tensors(*tensors): tensor([[0, 1, 2], [0, 1, 2]]) """ + # This wrapper exists to support variadic args. if has_torch_function(tensors): return handle_torch_function(broadcast_tensors, tensors, *tensors) return _VF.broadcast_tensors(tensors) # type: ignore[attr-defined] @@ -96,6 +97,7 @@ def broadcast_shapes(*shapes): Raises: RuntimeError: If shapes are incompatible. """ + # This wrapper exists to support variadic args. # TODO Movie this to C++ once the jit has better support for torch.Size. with torch.no_grad(): scalar = torch.zeros((), device="cpu") @@ -277,6 +279,7 @@ def einsum(*args): tensor([[-0.3430, -5.2405, 0.4494], [ 0.3311, 5.5201, -3.0356]]) """ + # This wrapper exists to support variadic args. if len(args) < 2: raise ValueError('einsum(): must specify the equation string and at least one operand, ' 'or at least one operand and its subscripts list') @@ -324,6 +327,7 @@ def parse_subscript(n: int) -> str: return _VF.einsum(equation, operands) # type: ignore[attr-defined] +# This wrapper exists to support variadic args. if TYPE_CHECKING: # The JIT doesn't understand Union, so only add type annotation for mypy def meshgrid(*tensors: Union[Tensor, List[Tensor]]) -> Tuple[Tensor, ...]: @@ -1042,6 +1046,7 @@ def cartesian_prod(*tensors): [3, 4], [3, 5]]) """ + # This wrapper exists to support variadic args. if has_torch_function(tensors): return handle_torch_function(cartesian_prod, tensors, *tensors) return _VF.cartesian_prod(tensors) # type: ignore[attr-defined] @@ -1076,6 +1081,7 @@ def block_diag(*tensors): [0, 0, 0, 0, 0, 0, 0, 0, 0, 5], [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]]) """ + # This wrapper exists to support variadic args. if has_torch_function(tensors): return handle_torch_function(block_diag, tensors, *tensors) return torch._C._VariableFunctions.block_diag(tensors) # type: ignore[attr-defined] @@ -1163,6 +1169,7 @@ def atleast_1d(*tensors): >>> torch.atleast_1d((x,y)) (tensor([0.5000]), tensor([1.])) """ + # This wrapper exists to support variadic args. if has_torch_function(tensors): return handle_torch_function(atleast_1d, tensors, *tensors) if len(tensors) == 1: @@ -1199,6 +1206,7 @@ def atleast_2d(*tensors): >>> torch.atleast_2d((x,y)) (tensor([[0.5000]]), tensor([[1.]])) """ + # This wrapper exists to support variadic args. if has_torch_function(tensors): return handle_torch_function(atleast_2d, tensors, *tensors) if len(tensors) == 1: @@ -1243,6 +1251,7 @@ def atleast_3d(*tensors): >>> torch.atleast_3d((x,y)) (tensor([[[0.5000]]]), tensor([[[1.]]])) """ + # This wrapper exists to support variadic args. if has_torch_function(tensors): return handle_torch_function(atleast_3d, tensors, *tensors) if len(tensors) == 1: @@ -1479,6 +1488,7 @@ def chain_matmul(*matrices, out=None): .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition """ + # This wrapper exists to support variadic args. if has_torch_function(matrices): return handle_torch_function(chain_matmul, matrices, *matrices) From 9bb1371cc20a14907dbc47bc98e3ac5de866e34b Mon Sep 17 00:00:00 2001 From: peterjc123 Date: Wed, 18 Aug 2021 11:41:42 -0700 Subject: [PATCH 032/530] Disable RDYNAMIC check with MSVC (#62949) Summary: When testing with clang-cl, the flag is added though it is unsupported and that generates a few warnings. Tried a few alternatives like https://cmake.org/cmake/help/latest/module/CheckLinkerFlag.html, but they just don't work. Pull Request resolved: https://github.com/pytorch/pytorch/pull/62949 Reviewed By: zhouzhuojie, driazati Differential Revision: D30359206 Pulled By: malfet fbshipit-source-id: 1bd27ad5772fe6757fa8c3a4bddf904f88d70b7b --- cmake/MiscCheck.cmake | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cmake/MiscCheck.cmake b/cmake/MiscCheck.cmake index efac9e0dfa8e6..1497b0044a0b0 100644 --- a/cmake/MiscCheck.cmake +++ b/cmake/MiscCheck.cmake @@ -178,10 +178,12 @@ endif() # -to add all (including unused) symbols into the dynamic symbol # -table. We need this to get symbols when generating backtrace at # -runtime. -check_cxx_compiler_flag("-rdynamic" COMPILER_SUPPORTS_RDYNAMIC) -if(${COMPILER_SUPPORTS_RDYNAMIC}) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -rdynamic") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic") +if(NOT MSVC) + check_cxx_compiler_flag("-rdynamic" COMPILER_SUPPORTS_RDYNAMIC) + if(${COMPILER_SUPPORTS_RDYNAMIC}) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -rdynamic") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic") + endif() endif() # ---[ If we are using msvc, set no warning flags From 11fbd3958cb7ab69a9764d9ae9b000f24333ec40 Mon Sep 17 00:00:00 2001 From: Basil Hosmer Date: Wed, 18 Aug 2021 12:06:53 -0700 Subject: [PATCH 033/530] MaybeOwned page for dev wiki (#63450) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63450 Brief guide to understanding `MaybeOwned`, aimed at C++ PT devs who are obliged to interact with existing uses of it, rather than encouraging new usage. For reviewers: I haven't yet added a link to this page from anywhere. I'm thinking the right place is the [dev wiki main page C++ section](https://github.com/pytorch/pytorch/wiki#c) but happy to put it wherever makes sense, suggestions welcome. Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D30402313 Pulled By: bhosmer fbshipit-source-id: 69b15909ecafcd8d88e44f664f88c3ad4eb26d84 --- docs/cpp/source/notes/maybe_owned.rst | 59 +++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 docs/cpp/source/notes/maybe_owned.rst diff --git a/docs/cpp/source/notes/maybe_owned.rst b/docs/cpp/source/notes/maybe_owned.rst new file mode 100644 index 0000000000000..8fa05f1b6aea7 --- /dev/null +++ b/docs/cpp/source/notes/maybe_owned.rst @@ -0,0 +1,59 @@ +MaybeOwned +================== + +``MaybeOwned`` is a C++ smart pointer class that dynamically +encodes whether a Tensor is *owned* or *borrowed*. It is used in +certain performance-sensitive situations to avoid unnecessarily +incrementing a Tensor’s reference count (at a small cost in +overhead from the extra indirection). + +.. warning:: + MaybeOwned must be used with **extreme** care. Claims of (non-)ownership + are not statically checked, and mistakes can cause reference undercounting + and use-after-free crashes. + + Due to this lack of safety net, we discourage the use of MaybeOwned + outside code paths that are known to be highly performance sensitive. + However, if you encounter pre-existing uses of MaybeOwned in code that + you want to modify, it’s critical to understand how to use it correctly. + +The primary use case for ``MaybeOwned`` is a function or method that +dynamically chooses between returning one of its arguments (typically +from a passthrough or “no-op” code path) and returning a freshly constructed +Tensor. Such a function would return a ``MaybeOwned`` in both cases, +the former in a "borrowed" state via a call to ``MaybeOwned::borrowed()``, +and the latter in an "owned" state via a call to ``MaybeOwned::owned()``. + +The canonical example is ``Tensor``'s ``expect_contiguous`` method, which shortcuts +and returns a borrowed self-reference when already contiguous: + +.. code-block:: cpp + + inline c10::MaybeOwned Tensor::expect_contiguous(MemoryFormat memory_format) const & { + if (is_contiguous(memory_format)) { + return c10::MaybeOwned::borrowed(*this); + } else { + return c10::MaybeOwned::owned(__dispatch_contiguous(memory_format)); + } + } + +Using the vocabulary of lifetimes, the essential safety requirement for borrowing +is that a borrowed Tensor must outlive any borrowing references to it. Here, for +example, we can safely borrow ``*this``, but the Tensor returned by +``__dispatch_contiguous()`` is freshly created, and borrowing a reference would +effectively leave it ownerless. + +So, general rules of thumb: + +- When in doubt, don’t use ``MaybeOwned`` at all - in particular, prefer + avoiding using it in code that doesn’t use it already. New usage should only be + introduced when critical (and demonstrable) performance gains result. + +- When modifying or calling code that already uses ``MaybeOwned``, remember + that it's always safe to produce a ``MaybeOwned`` from a Tensor in hand + via a call to ``MaybeOwned::owned()``. This may result in an unnecessary + reference count, but never in misbehavior - so it's always the safer bet, unless + the lifetime of the Tensor you're looking to wrap is crystal clear. + +More details and implementation code can be found at and +. From d661e646add3256b85687faab4197d2d1aab3dec Mon Sep 17 00:00:00 2001 From: James Reed Date: Wed, 18 Aug 2021 13:16:01 -0700 Subject: [PATCH 034/530] [FX] Fix GraphModule deepcopy to use deepcopied graph (#63090) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63090 Test Plan: Imported from OSS Reviewed By: ansley Differential Revision: D30252471 Pulled By: jamesr66a fbshipit-source-id: cafd7d7917935a5ea6ffa2a7fe9e9b2a9578b3e3 --- test/test_fx.py | 19 +++++++++++++++++++ torch/fx/graph_module.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test/test_fx.py b/test/test_fx.py index f0a3291d07d4a..1708634653a64 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -1943,6 +1943,25 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> boo with self.assertRaisesRegex(RuntimeError, 'cannot contain a Node'): traced_graph = MyTracer().trace(CallsModWithDict()) + def test_module_deepcopy_edit_nodes(self): + class Foo(torch.nn.Module): + def forward(self, x): + return torch.relu(x) + + traced1 = symbolic_trace(Foo()) + copied = copy.deepcopy(traced1) + + for node in copied.graph.nodes: + if node.target == torch.relu: + node.target = torch.neg + + copied.recompile() + traced1.recompile() + + x = torch.randn(15, 15) + torch.testing.assert_allclose(traced1(x), torch.relu(x)) + torch.testing.assert_allclose(copied(x), torch.neg(x)) + def test_direct_param_use(self): class TransposeTest(torch.nn.Module): def __init__(self): diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index 0cbbd9373027a..85479f069d53d 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -615,7 +615,7 @@ def __reduce__(self): def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) - return GraphModule(fake_mod, self.graph) + return GraphModule(fake_mod, fake_mod.__dict__['_graph']) def __copy__(self): return GraphModule(self, self.graph) From 99b154b8bebb8eed79d10f5c83798263ae04153f Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 18 Aug 2021 13:25:19 -0700 Subject: [PATCH 035/530] [ONNX] Support lstm_cell symbolic (#61476) (#62757) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62757 Support lstm_cell symbolic Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D30349061 Pulled By: msaroufim fbshipit-source-id: f236177e3e5c62a30b7e4d91a623bcaef21b5eb1 Co-authored-by: jiafatom --- test/onnx/test_pytorch_onnx_onnxruntime.py | 16 ++++++++++++++++ torch/onnx/symbolic_opset9.py | 11 +++++++++++ 2 files changed, 27 insertions(+) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index b92568c03cba2..dc376dc817e4f 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -3723,6 +3723,22 @@ def forward(self, x, h0, c0): c0 = torch.randn(1, BATCH_SIZE, RNN_HIDDEN_SIZE) self.run_test(LSTMModel(), (input, h0, c0)) + @skipIfUnsupportedMinOpsetVersion(9) + def test_lstm_cell(self): + class LSTMCellModel(torch.nn.Module): + def __init__(self, bias): + super().__init__() + self.lstm_cell = torch.nn.LSTMCell(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, bias=bias) + + def forward(self, x, h0, c0): + return self.lstm_cell(x, (h0, c0)) + + input = torch.randn(BATCH_SIZE, RNN_INPUT_SIZE) + h0 = torch.randn(BATCH_SIZE, RNN_HIDDEN_SIZE) + c0 = torch.randn(BATCH_SIZE, RNN_HIDDEN_SIZE) + for bias in [True, False]: + self.run_test(LSTMCellModel(bias), (input, h0, c0)) + @skipIfUnsupportedMinOpsetVersion(9) def test_lstm_default_init_state(self): class LSTMModel(torch.nn.Module): diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 36c1753ab252b..611749f83f8f7 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -2316,6 +2316,17 @@ def lstm(g, *args): return _lstm_full(g, *args) +def lstm_cell(g, self, hidden, w_ih, w_hh, b_ih, b_hh): + input = sym_help._unsqueeze_helper(g, self, [0]) + hidden = sym_help._unpack_list(hidden) + hidden = [sym_help._unsqueeze_helper(g, x, [0]) for x in hidden] + weight = (w_ih, w_hh, b_ih, b_hh) if sym_help._is_tensor(b_ih) else (w_ih, w_hh) + has_biases = True if sym_help._is_tensor(b_ih) else False + _, h_outs, c_outs = _generic_rnn(g, 'LSTM', input, hidden, weight, has_biases, num_layers=1, + dropout=0, train=0, bidirectional=False, batch_first=False) + return sym_help._squeeze_helper(g, h_outs, [0]), sym_help._squeeze_helper(g, c_outs, [0]) + + def _one_hidden_rnn(kind): @parse_args("v", "v", "v", "i", "i", "f", "i", "i", "i") def _rnn_full(g, input, hidden, weight_v, has_biases, num_layers, dropout, train, bidirectional, batch_first): From 3a7bbf5fb70f288050287dd6168c7af0f10de3ec Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 18 Aug 2021 13:25:19 -0700 Subject: [PATCH 036/530] [ONNX] Add support for opset14 in PT-ONNX exporter (#59486) (#62758) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62758 * Add initial changes for opset14 * Fixed flake * Add onnx submodule changes and removed utility func tests * Add updated batchNorm symbolic * Add triu/tril symbolics * Fix lint * Fixed test failures * Add reshape with allowzero * Added tests/refactored opset versioning * Bump onnxruntime version * Fix clang/lint failures * Add reshape shape inference for opset 14 * Changes for allowzero * Fix lint/clang and test failures * Updated PR * Flake fixes * Fix flake * Remove new_jit_api tests * Add opset14 models * Update allowzero * Fix test failures Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D30349063 Pulled By: msaroufim fbshipit-source-id: 54724246149b01a2f627c43d7396253a7e9c9eb9 Co-authored-by: Shubham Bhokare --- .jenkins/caffe2/test.sh | 2 +- aten/src/ATen/core/interned_strings.h | 3 +- scripts/onnx/test.sh | 2 +- test/onnx/test_models_onnxruntime.py | 2 +- test/onnx/test_pytorch_common.py | 10 +++ test/onnx/test_pytorch_onnx_onnxruntime.py | 75 +++++++++++++++- test/onnx/test_utility_funs.py | 30 +++---- torch/csrc/jit/passes/onnx/constant_fold.cpp | 13 +-- torch/csrc/jit/passes/onnx/constant_fold.h | 1 + .../jit/passes/onnx/shape_type_inference.cpp | 23 +++-- torch/onnx/__init__.py | 2 +- torch/onnx/symbolic_helper.py | 50 ++++++++++- torch/onnx/symbolic_opset11.py | 23 ++--- torch/onnx/symbolic_opset12.py | 6 +- torch/onnx/symbolic_opset14.py | 54 +++++++++++ torch/onnx/symbolic_opset9.py | 89 +++++++------------ 16 files changed, 276 insertions(+), 109 deletions(-) create mode 100644 torch/onnx/symbolic_opset14.py diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index e66b7ae958a1e..4c577ed437439 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -170,7 +170,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # JIT C++ extensions require ninja, so put it into PATH. export PATH="/var/lib/jenkins/.local/bin:$PATH" if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then - pip install -q --user onnxruntime==1.7.0 + pip install -q --user onnxruntime==1.8.0 fi "$ROOT_DIR/scripts/onnx/test.sh" fi diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 2f527cdde5e91..c1dbc75a26b99 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -466,7 +466,8 @@ namespace c10 { _(attr, keepdims) \ _(attr, cache_id) \ _(attr, new_axis) \ - _(attr, warn_id) + _(attr, warn_id) \ + _(attr, allowzero) // 'prim' symbols are synthetic operators that occur only in the IR // and don't have corresponding implementations in ATen. diff --git a/scripts/onnx/test.sh b/scripts/onnx/test.sh index 4ee0cdad92ad1..f39d4f0fa5abf 100755 --- a/scripts/onnx/test.sh +++ b/scripts/onnx/test.sh @@ -79,7 +79,7 @@ if [[ "$BUILD_ENVIRONMENT" == *ort_test1* ]]; then fi if [[ "$BUILD_ENVIRONMENT" == *ort_test2* ]]; then # Update the loop for new opsets - for i in $(seq 10 13); do + for i in $(seq 10 14); do pytest "${args[@]}" \ "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset$i" done diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py index be7f8c62176e8..59909db5958cc 100644 --- a/test/onnx/test_models_onnxruntime.py +++ b/test/onnx/test_models_onnxruntime.py @@ -7,7 +7,7 @@ def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7, opset_versions=None): - opset_versions = opset_versions if opset_versions else [7, 8, 9, 10, 11, 12] + opset_versions = opset_versions if opset_versions else [7, 8, 9, 10, 11, 12, 13, 14] for opset_version in opset_versions: self.opset_version = opset_version diff --git a/test/onnx/test_pytorch_common.py b/test/onnx/test_pytorch_common.py index 0695a989013c7..09ab7a26f4967 100644 --- a/test/onnx/test_pytorch_common.py +++ b/test/onnx/test_pytorch_common.py @@ -60,6 +60,16 @@ def wrapper(self): return wrapper return skip_dec +# skips tests for all opset versions. +def skipForAllOpsetVersions(): + def skip_dec(func): + def wrapper(self): + if self.opset_version: + raise unittest.SkipTest("Skip verify test for unsupported opset_version") + return func(self) + return wrapper + return skip_dec + # Enables tests for scripting, instead of only tracing the model. def enableScriptTest(): def script_dec(func): diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index dc376dc817e4f..fe877948666dd 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -17,7 +17,7 @@ RnnModelWithPackedSequenceWithoutState) from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion, skipIfNoLapack, disableScriptTest, skipIfONNXShapeInference, - skipIfUnsupportedMaxOpsetVersion) + skipIfUnsupportedMaxOpsetVersion, skipForAllOpsetVersions) from test_pytorch_common import BATCH_SIZE from test_pytorch_common import RNN_BATCH_SIZE, RNN_SEQUENCE_LENGTH, RNN_INPUT_SIZE, RNN_HIDDEN_SIZE from typing import List, Tuple, Optional, Dict @@ -2489,6 +2489,18 @@ def forward(self, x): x = torch.empty(2, 3, 3, dtype=torch.double).uniform_(0, 1) self.run_test(Bernoulli(), x) + # Enable test when fix for allowzero is in ORT + @skipForAllOpsetVersions() + @skipIfUnsupportedMinOpsetVersion(14) + def test_reshape_allowzero(self): + class ReshapeModel(torch.nn.Module): + def forward(self, x): + x = x.reshape(3, 4, 0) + return x + + x = torch.randn(0, 3, 4) + self.run_test(ReshapeModel(), x) + def test_reshape_different_rank(self): class ReshapeModel(torch.nn.Module): def forward(self, x): @@ -3283,6 +3295,7 @@ def test_batchnorm1d_noaffine(self): x = torch.randn(10, 10, 128) self.run_test(model, x) + @skipIfUnsupportedOpsetVersion([14]) def test_batchnorm1d_norunningstats(self): x = torch.randn(10, 10) model = torch.nn.BatchNorm1d(10, track_running_stats=False) @@ -3301,6 +3314,7 @@ def test_batchnorm2d_noaffine(self): model = torch.nn.BatchNorm2d(3, affine=False) self.run_test(model, x) + @skipIfUnsupportedOpsetVersion([14]) def test_batchnorm2d_norunningstats(self): x = torch.randn(10, 3, 128, 128) model = torch.nn.BatchNorm2d(3, track_running_stats=False) @@ -5709,6 +5723,52 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(SiLUModel(), (x)) + @skipIfUnsupportedMinOpsetVersion(14) + def test_tril(self): + class trilModel(torch.nn.Module): + def forward(self, x): + return torch.tril(x) + + x = torch.randn(2, 3, 4) + self.run_test(trilModel(), (x)) + + class trilModelwithDiagonal(torch.nn.Module): + def forward(self, x): + return torch.tril(x, diagonal=1) + + x = torch.randn(2, 3, 4) + self.run_test(trilModelwithDiagonal(), (x)) + + class trilModelwithNegDiagonal(torch.nn.Module): + def forward(self, x): + return torch.tril(x, diagonal=-1) + + x = torch.randn(2, 3, 4) + self.run_test(trilModelwithNegDiagonal(), (x)) + + @skipIfUnsupportedMinOpsetVersion(14) + def test_triu(self): + class triuModel(torch.nn.Module): + def forward(self, x): + return torch.triu(x) + + x = torch.randn(2, 3, 4) + self.run_test(triuModel(), (x)) + + class triuModelwithDiagonal(torch.nn.Module): + def forward(self, x): + return torch.triu(x, diagonal=1) + + x = torch.randn(2, 3, 4) + self.run_test(triuModelwithDiagonal(), (x)) + + class trilModelwithNegDiagonal(torch.nn.Module): + def forward(self, x): + return torch.tril(x, diagonal=-1) + + x = torch.randn(2, 3, 4) + self.run_test(trilModelwithNegDiagonal(), (x)) + def test_mish(self): class MishModel(torch.nn.Module): def __init__(self): @@ -7586,6 +7646,8 @@ def forward(self, input): x = torch.randn(6, 4, 3, 3) self.run_test(FakeQuantizePerChannelModel(), (x)) + # Tests skipped temporarliy as latest onnxruntime release does not include training ops + @skipForAllOpsetVersions() def test_batchnorm_training(self): class MyModule(torch.nn.Module): def __init__(self): @@ -7709,6 +7771,8 @@ def forward(self, x): np.testing.assert_allclose(ratio_pytorch, ratio_ort, rtol=0.01, atol=0.01) + # Tests skipped temporarliy as latest onnxruntime release does not include training ops + @skipForAllOpsetVersions() def test_conv_bn(self): class MyModule(torch.nn.Module): def __init__(self): @@ -7745,6 +7809,8 @@ def forward(self, x): [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in zip(ort_outs1, ort_outs2)] + # Tests skipped temporarliy as latest onnxruntime release does not include training ops + @skipForAllOpsetVersions() def test_multiple_conv_bn(self): class MyModule(torch.nn.Module): def __init__(self): @@ -9563,5 +9629,12 @@ def setup_rnn_tests(): keep_initializers_as_inputs=False, onnx_shape_inference=True)) +# opset 14 tests +TestONNXRuntime_opset14 = type(str("TestONNXRuntime_opset14"), + (unittest.TestCase,), + dict(TestONNXRuntime.__dict__, opset_version=14, + keep_initializers_as_inputs=False, + onnx_shape_inference=True)) + if __name__ == "__main__": unittest.main() diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index 71f52b306b8c4..06faf410f865e 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -5,7 +5,8 @@ from torch.onnx import utils, OperatorExportTypes, TrainingMode from torch.onnx.symbolic_helper import _set_opset_version, _set_operator_export_type, _set_onnx_shape_inference import torch.utils.cpp_extension -from test_pytorch_common import skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion +from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion, + skipIfUnsupportedMaxOpsetVersion) import caffe2.python.onnx.backend as backend from verify import verify @@ -635,7 +636,7 @@ def test_aten_fallthrough(self): # Test aten export of op with no symbolic class Module(torch.nn.Module): def forward(self, x): - return torch.triu(x) + return torch.erfc(x) x = torch.randn(2, 3, 4) _set_opset_version(self.opset_version) @@ -643,8 +644,7 @@ def forward(self, x): operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) iter = graph.nodes() - assert next(iter).kind() == "onnx::Constant" - assert next(iter).kind() == "aten::triu" + assert next(iter).kind() == "aten::erfc" def test_custom_op_fallthrough(self): # Test custom op @@ -731,7 +731,7 @@ def forward(self, x): assert next(iter).kind() == "aten::dequantize" # prim::ListConstruct is exported as onnx::SequenceConstruct for opset >= 11 - @skipIfUnsupportedOpsetVersion([11, 12, 13]) + @skipIfUnsupportedMaxOpsetVersion(10) def test_prim_fallthrough(self): # Test prim op class PrimModule(torch.jit.ScriptModule): @@ -836,6 +836,7 @@ def forward(self, x): x = torch.tensor([1, 2]) verify(MyModel(), x, backend, do_constant_folding=False) + @skipIfUnsupportedOpsetVersion([14]) def test_fuse_conv_bn(self): class Fuse(torch.nn.Module): def __init__(self): @@ -857,6 +858,7 @@ def forward(self, x): assert len(list(graph.nodes())) == 1 + @skipIfUnsupportedOpsetVersion([14]) def test_fuse_resnet18(self): model = torchvision.models.resnet18(pretrained=True) x = torch.randn(2, 3, 224, 224, requires_grad=True) @@ -917,20 +919,10 @@ def forward(self, x, y): (TestCase,), dict(TestUtilityFuns.__dict__, opset_version=13)) -# opset 11 tests -TestUtilityFuns_opset11_new_jit_API = type(str("TestUtilityFuns_opset11_new_jit_API"), - (TestCase,), - dict(TestUtilityFuns.__dict__, opset_version=11)) - -# opset 12 tests -TestUtilityFuns_opset12_new_jit_API = type(str("TestUtilityFuns_opset12_new_jit_API"), - (TestCase,), - dict(TestUtilityFuns.__dict__, opset_version=12)) - -# opset 13 tests -TestUtilityFuns_opset13_new_jit_API = type(str("TestUtilityFuns_opset13_new_jit_API"), - (TestCase,), - dict(TestUtilityFuns.__dict__, opset_version=13)) +# opset 14 tests +TestUtilityFuns_opset14 = type(str("TestUtilityFuns_opset14"), + (TestCase,), + dict(TestUtilityFuns.__dict__, opset_version=14)) if __name__ == "__main__": diff --git a/torch/csrc/jit/passes/onnx/constant_fold.cpp b/torch/csrc/jit/passes/onnx/constant_fold.cpp index 901844cd62380..76c0674e11fd8 100644 --- a/torch/csrc/jit/passes/onnx/constant_fold.cpp +++ b/torch/csrc/jit/passes/onnx/constant_fold.cpp @@ -266,9 +266,7 @@ c10::optional runTorchBackendForOnnx( if (node->kind() == onnx::Slice) { if (opset_version == ONNX_OPSET_9) { return runTorchSlice_opset9(node, inputTensorValues); - } else if ( - opset_version == ONNX_OPSET_10 || opset_version == ONNX_OPSET_11 || - opset_version == ONNX_OPSET_12 || opset_version == ONNX_OPSET_13) { + } else if (opset_version >= ONNX_OPSET_10) { return runTorchSlice_opset10(node, inputTensorValues); } else { std::cerr << "Warning: Constant folding - unsupported opset version. " @@ -351,7 +349,7 @@ c10::optional runTorchBackendForOnnx( } } else if (node->kind() == onnx::Squeeze) { assert(inputTensorValues.size() == 2 || inputTensorValues.size() == 1); - if (opset_version == ONNX_OPSET_13) { + if (opset_version >= ONNX_OPSET_13) { // Squeeze version 13 input axes is optional, inputTensorValues.size() == // 1 means axes equal to None updated_val = inputTensorValues[0]; @@ -415,13 +413,18 @@ c10::optional runTorchBackendForOnnx( std::vector shape(inputTensorValues[1].sizes()[0], 0); auto shape_a = inputTensorValues[1].accessor(); assert(inputTensorValues[1].sizes()[0] >= 0); + // Set value of allowzero + int64_t allowzero = 0; + if (node->hasAttributeS("allowzero")) { + allowzero = node->i(attr::allowzero); + } for (size_t i = 0; i < (size_t)(inputTensorValues[1].sizes()[0]); ++i) { // All shape dim values should be >= -1 // onnx::Reshape supports a shape dim value to be zero, in // which case the actual dim value remains unchanged. However, // at::reshape does not support shape dim value to be zero assert(shape_a[i] >= -1); - if (shape_a[i] == 0) { + if (shape_a[i] == 0 && !allowzero) { if (i >= inputTensorValues[0].sizes().size()) { throw std::runtime_error( "Dimension with value 0 exceeds the input size dimensions."); diff --git a/torch/csrc/jit/passes/onnx/constant_fold.h b/torch/csrc/jit/passes/onnx/constant_fold.h index 1c54412ccd7a1..8bfb0dd081c39 100644 --- a/torch/csrc/jit/passes/onnx/constant_fold.h +++ b/torch/csrc/jit/passes/onnx/constant_fold.h @@ -13,6 +13,7 @@ const int ONNX_OPSET_10 = 10; const int ONNX_OPSET_11 = 11; const int ONNX_OPSET_12 = 12; const int ONNX_OPSET_13 = 13; +const int ONNX_OPSET_14 = 14; namespace onnx_constant_fold { diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index 634d8d2e1db99..f630cf023f7b5 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -404,8 +404,10 @@ c10::optional ComputeConstantFolding(Node* n, int opset_version) { // When the Reshape node's two inputs are constant, compute the output shape. // The reshape value 0 and -1 are converted to the real value explicitly. std::vector ComputeShapeFromReshape( + Node* n, const std::vector& input_shape, - const std::vector& reshape) { + const std::vector& reshape, + int opset_version) { TORCH_INTERNAL_ASSERT( input_shape.size() > 0 || reshape.size() > 0, "Reshape node should have at least one input size > 0 when constant folding."); @@ -427,6 +429,17 @@ std::vector ComputeShapeFromReshape( auto reshape_size = static_cast(reshape.size()); auto it_0 = std::find(reshape.begin(), reshape.end(), 0); auto reshape_has_zero = it_0 != reshape.end(); + + // Allowzero is set to 0 by default + // When opset version > 14, assign appropriate allowzero value + int allowzero = 0; + if (opset_version >= 14 && n->hasAttributeS("allowzero")) { + allowzero = n->i(attr::allowzero); + if (allowzero == 1 && reshape_has_zero) { + return reshape; + } + } + auto input_shape_size = static_cast(input_shape.size()); auto it_minus_one = std::find(reshape.begin(), reshape.end(), -1); int minus_one_pos = it_minus_one == reshape.end() @@ -594,7 +607,7 @@ c10::optional> GetValueFromListConstructNode( : c10::nullopt; } -void ProcessReshapeNode(Node* n) { +void ProcessReshapeNode(Node* n, int opset_version) { if (ConstantValueMap::HasValue(n->input(1)->debugName())) { auto shape_temp = ConstantValueMap::GetValueInto1DInt64Vector(n->input(1)->debugName()); @@ -602,8 +615,8 @@ void ProcessReshapeNode(Node* n) { ConstantValueMap::GetShapeInto1DInt64VectorWithOneUnknown( n->input(0)->debugName()); if (shape_vector_0.has_value()) { - auto final_shape = - ComputeShapeFromReshape(shape_vector_0.value(), shape_temp); + auto final_shape = ComputeShapeFromReshape( + n, shape_vector_0.value(), shape_temp, opset_version); UpdateShapeFromVector(n->output(), final_shape); return; } @@ -865,7 +878,7 @@ void ComputeConstant(Node* n, int opset_version) { break; } case ::c10::onnx::Reshape: { - ProcessReshapeNode(n); + ProcessReshapeNode(n, opset_version); break; } case ::c10::onnx::Gather: { diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index c859419cf38a5..c143d0ce8984d 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -13,7 +13,7 @@ ir_version = _C._onnx.IR_VERSION producer_name = "pytorch" producer_version = _C._onnx.PRODUCER_VERSION -constant_folding_opset_versions = [9, 10, 11, 12, 13] +constant_folding_opset_versions = [9, 10, 11, 12, 13, 14] class ExportTypes: diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 5b378ecc214ce..66f276ce42f55 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -298,7 +298,7 @@ def _select_helper(g, self, dim, index, apply_reshape=True): elif index_dim is not None and apply_reshape: if index_dim == 0: # Index is a scalar. Reshape it to a size 1 tensor. - index = g.op("Reshape", index, g.op("Constant", value_t=torch.LongTensor([1]))) + index = _reshape_helper(g, index, g.op("Constant", value_t=torch.LongTensor([1]))) index_scalar_type = index.type().scalarType() if index_scalar_type is None or index_scalar_type not in ["Long", "Int"]: @@ -367,7 +367,7 @@ def _topk_helper(g, input, k, dim, largest=True, sorted=False, out=None): if not _is_value(k): k = g.op("Constant", value_t=torch.tensor([k], dtype=torch.int64)) else: - k = g.op("Reshape", k, g.op("Constant", value_t=torch.tensor([1]))) + k = _reshape_helper(g, k, g.op("Constant", value_t=torch.tensor([1]))) if _export_onnx_opset_version <= 10: if not largest: _unimplemented("TopK", "Ascending is not supported") @@ -704,6 +704,48 @@ def _index_fill_reshape_helper(g, self, dim, index): expanded_index = expand(g, unsqueezed_index, expanded_index_shape, None) return expanded_index_shape, expanded_index +# When using reshape helper (opset_version >= 14), if reshape has -1, +# allowzero cannot be set to 1 +def _reshape_helper(g, input, shape, allowzero=0): + shape = _maybe_get_const(shape, "is") + if not _is_value(shape): + shape = g.op("Constant", value_t=torch.LongTensor(shape)) + if _export_onnx_opset_version <= 13: + return g.op("Reshape", input, shape) + else: + warnings.warn("allowzero=0 by default. In order to honor zero value in shape use allowzero=1") + return g.op("Reshape", input, shape, allowzero_i=allowzero) + +def _batchnorm_helper(g, input, weight, bias, running_mean, running_var): + from torch.onnx.symbolic_opset9 import _var_mean + batch_size = _get_tensor_dim_size(input, 0) + channel_size = _get_tensor_dim_size(input, 1) + + if weight is None or _is_none(weight): + if channel_size is None: + raise RuntimeError("Unsupported: ONNX export of batch_norm for unknown " + "channel size.") + weight_value = torch.tensor([1.] * channel_size).type( + "torch." + input.type().scalarType() + "Tensor") + weight = g.op("Constant", value_t=weight_value) + if bias is None or _is_none(bias): + if channel_size is None: + raise RuntimeError("Unsupported: ONNX export of batch_norm for unknown " + "channel size.") + bias_value = torch.tensor([0.] * channel_size).type( + "torch." + input.type().scalarType() + "Tensor") + bias = g.op("Constant", value_t=bias_value) + # If track_running_stats is set to False batch statistics are instead used during evaluation time + if running_mean is None or _is_none(running_mean) or running_var is None or _is_none(running_var): + assert batch_size is not None and channel_size is not None + reshape_in = _reshape_helper(g, input, + g.op("Constant", value_t=torch.tensor([batch_size, channel_size, -1], + dtype=torch.int64))) + trans_in = g.op("Transpose", reshape_in, perm_i=[0, 2, 1]) + running_var, running_mean = _var_mean(g, trans_in, + g.op("Constant", value_t=torch.tensor([0, 1], dtype=torch.int64)), + False, False) + return weight, bias, running_mean, running_var def _avgpool_helper(tuple_fn, padding, kernel_size, stride, divisor_override, name): if divisor_override and divisor_override.node().kind() != "prim::Constant": @@ -787,8 +829,8 @@ def _handle_reduce_dim_none(g, self, op_name): _default_onnx_opset_version = 9 -_onnx_main_opset = 13 -_onnx_stable_opsets = [7, 8, 9, 10, 11, 12] +_onnx_main_opset = 14 +_onnx_stable_opsets = [7, 8, 9, 10, 11, 12, 13] _export_onnx_opset_version = _default_onnx_opset_version diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index eaa49c29e1546..ed7abf263f31c 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -179,7 +179,7 @@ def index_put(g, self, indices_list_value, values, accumulate=False): rank = sym_help._get_tensor_rank(values) if rank is not None and rank == 0: values = expand(g, values, values_shape, None) - values = g.op("Reshape", values, values_shape) + values = sym_help._reshape_helper(g, values, values_shape) dtype = self.type().scalarType() if dtype is not None and dtype != values.type().scalarType(): @@ -266,12 +266,12 @@ def masked_select(g, self, mask): def masked_scatter(g, self, mask, source): - from torch.onnx.symbolic_opset9 import nonzero, expand_as, view, size + from torch.onnx.symbolic_opset9 import nonzero, expand_as, size index = nonzero(g, expand_as(g, mask, self)) # NOTE: source can have more elements than needed. # It could also have arbitrary shape. # This is not supported by ONNX::ScatterND, so we need to flatten and slice source tensor. - source = view(g, source, torch.LongTensor([-1])) + source = sym_help._reshape_helper(g, source, torch.LongTensor([-1])) source = sym_help._slice_helper(g, source, axes=torch.LongTensor([0]), starts=torch.LongTensor([0]), @@ -453,9 +453,9 @@ def _prepare_onnx_paddings(g, dim, pad): # paddings = [[..., 0, dim_n-1_begin, dim_n_begin], # [..., 0, dim_n-1_end, dim_n_end]] # Reshape back to 1-D paddings = [..., 0, dim_n - 1_begin, dim_n_begin, ..., 0, dim_n - 1_end, dim_n_end] - paddings = g.op("Reshape", paddings, g.op("Constant", value_t=torch.tensor([-1, 2]))) + paddings = sym_help._reshape_helper(g, paddings, g.op("Constant", value_t=torch.tensor([-1, 2]))) paddings = g.op("Transpose", torch.onnx.symbolic_opset10.flip(g, paddings, [0]), perm_i=[1, 0]) - paddings = g.op("Reshape", paddings, g.op("Constant", value_t=torch.tensor([-1]))) + paddings = sym_help._reshape_helper(g, paddings, g.op("Constant", value_t=torch.tensor([-1]))) padding_c = g.op("Cast", paddings, to_i=sym_help.cast_pytorch_to_onnx["Long"]) return padding_c @@ -695,7 +695,7 @@ def _get_im2col_indices_along_dim(g, input_d, kernel_size_d, dilation_d, padding # Broadcast and add kernel staring positions (indices) with # kernel_grid along dim d, to get block indices along dim d blocks_d_indices = sym_help._unsqueeze_helper(g, blocks_d_indices, [0]) # Reshape to [1, -1] - kernel_mask = g.op("Reshape", kernel_grid, g.op("Constant", value_t=torch.tensor([-1, 1]))) + kernel_mask = sym_help._reshape_helper(g, kernel_grid, g.op("Constant", value_t=torch.tensor([-1, 1]))) block_mask = g.op("Add", blocks_d_indices, kernel_mask) return block_mask @@ -766,7 +766,7 @@ def im2col(g, input, kernel_size, dilation, padding, stride): output = g.op("Gather", padded_input, blocks_row_indices, axis_i=2) output = g.op("Gather", output, blocks_col_indices, axis_i=4) output = g.op("Transpose", output, perm_i=[0, 1, 2, 4, 3, 5]) - return g.op("Reshape", output, output_shape) + return sym_help._reshape_helper(g, output, output_shape) def narrow(g, input, dim, start, length): @@ -895,13 +895,12 @@ def chunk(g, self, chunks, dim): return split(g, self, chunk_vec, dim) def repeat_interleave(g, self, repeats, dim=None, output_size=None): - from torch.onnx.symbolic_opset9 import reshape input = self final_dim = dim # if dim is None flatten # By default, use the flattened input array, and return a flat output array if sym_help._is_none(dim): - input = reshape(g, self, g.op("Constant", value_t=torch.tensor([-1]))) + input = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1]))) dim = 0 else: dim = sym_help._maybe_get_scalar(dim) @@ -983,7 +982,8 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None): loop_block.op("Constant", value_t=torch.LongTensor(input_sizes[dim + 1:]))] r_concat = loop_block.op("Concat", *r_concat, axis_i=0) i_split = expand(loop_block, i_split, r_concat, None) - i_split = reshape(loop_block, i_split, g.op("Constant", value_t=torch.LongTensor(output_sizes))) + i_split = sym_help._reshape_helper(loop_block, i_split, + g.op("Constant", value_t=torch.LongTensor(output_sizes))) # Loop outputs cond_out = loop_block.op("Cast", loop_condition, to_i=9) @@ -995,7 +995,8 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None): # the zero'th dimension (by default). In order to avoid this and concatenate # along the dimension provided, some post-processing is required loop_out = g.op("Transpose", loop_out, perm_i=perm_i) - return reshape(g, loop_out, g.op("Constant", value_t=torch.LongTensor(output_sizes))) + return sym_help._reshape_helper(g, loop_out, + g.op("Constant", value_t=torch.LongTensor(output_sizes))) def normal(g, loc, scale, seed): diff --git a/torch/onnx/symbolic_opset12.py b/torch/onnx/symbolic_opset12.py index 58420a2bc7749..8e989cccf9867 100644 --- a/torch/onnx/symbolic_opset12.py +++ b/torch/onnx/symbolic_opset12.py @@ -123,8 +123,7 @@ def celu(g, self, alpha): def argmax(g, input, dim, keepdim): if sym_help._is_none(dim): - from torch.onnx.symbolic_opset9 import reshape - flattened = reshape(g, input, g.op("Constant", value_t=torch.tensor([-1]))) + flattened = sym_help._reshape_helper(g, input, g.op("Constant", value_t=torch.tensor([-1]))) return g.op("ArgMax", flattened, axis_i=0, keepdims_i=False, select_last_index_i=False) else: dim = _parse_arg(dim, "i") @@ -134,8 +133,7 @@ def argmax(g, input, dim, keepdim): def argmin(g, input, dim, keepdim): if sym_help._is_none(dim): - from torch.onnx.symbolic_opset9 import reshape - flattened = reshape(g, input, g.op("Constant", value_t=torch.tensor([-1]))) + flattened = sym_help._reshape_helper(g, input, g.op("Constant", value_t=torch.tensor([-1]))) return g.op("ArgMin", flattened, axis_i=0, keepdims_i=False, select_last_index_i=False) else: dim = _parse_arg(dim, "i") diff --git a/torch/onnx/symbolic_opset14.py b/torch/onnx/symbolic_opset14.py new file mode 100644 index 0000000000000..443aac97c9404 --- /dev/null +++ b/torch/onnx/symbolic_opset14.py @@ -0,0 +1,54 @@ +# EDITING THIS FILE? READ THIS FIRST! +# see Note [Edit Symbolic Files] in symbolic_helper.py + +# This file exports ONNX ops for opset 14 +import torch + +import torch.onnx.symbolic_helper as sym_help +from torch.onnx.symbolic_helper import parse_args + +# Note [ONNX operators that are added/updated in opset 14] +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# New operators: +# HardSwish, Trilu +# +# Updated operators: +# Reshape +# Add, Sub, Mul, Div +# GRU, LSTM, RNN +# BatchNorm, Cumsum, Relu + +@parse_args("v") +def hardswish(g, self): + return g.op("HardSwish", self) + +@parse_args("v", "i") +def tril(g, self, diagonal, out=None): + k = g.op("Constant", value_t=torch.tensor(diagonal, dtype=torch.int64)) + return g.op("Trilu", self, k, upper_i=0) + +@parse_args("v", "i") +def triu(g, self, diagonal, out=None): + k = g.op("Constant", value_t=torch.tensor(diagonal, dtype=torch.int64)) + return g.op("Trilu", self, k, upper_i=1) + +@parse_args("v", "v") +def reshape(g, self, shape): + return sym_help._reshape_helper(g, self, shape) + +@parse_args("v", "v", "v", "v", "v", "i", "f", "f", "i") +def batch_norm(g, input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled): + sym_help.assert_training_mode(training, "batch_norm") + weight, bias, running_mean, running_var = sym_help._batchnorm_helper(g, input, weight, bias, running_mean, running_var) + out = g.op("BatchNormalization", input, weight, bias, running_mean, running_var, + epsilon_f=eps, + momentum_f=1 - momentum, + training_mode_i=0 if not training else 1, + outputs=1 if not training else 3) + if not training: + return out + else: + res, new_running_mean, new_running_var = out + new_running_mean.setType(running_mean.type()) + new_running_var.setType(running_var.type()) + return res diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 611749f83f8f7..616c2fcaff5b8 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -70,11 +70,11 @@ def _shape_as_tensor(g, input): def _reshape_from_tensor(g, input, shape): if (isinstance(shape, list)): shape = g.op("Concat", *shape, axis_i=0) - return g.op("Reshape", input, shape) + return reshape(g, input, shape) def reshape(g, self, shape): - return view(g, self, shape) + return sym_help._reshape_helper(g, self, shape) def reshape_as(g, self, other): @@ -461,7 +461,7 @@ def expand(g, self, size, implicit): # Expand with -1 dim value means dim is unchanged. # Since onnx::expand supports two-way broadcasting, # -1 dim value can be exported to onnx as 1 - size = view(g, stack(g, size, 0), g.op("Constant", value_t=torch.tensor([-1]))) + size = sym_help._reshape_helper(g, stack(g, size, 0), g.op("Constant", value_t=torch.tensor([-1]))) dtype = 4 # dim type is int64 ones = ones_like(g, size, dtype) neg_ones = mul(g, ones, g.op("Constant", value_t=torch.tensor(-1))) @@ -566,17 +566,12 @@ def permute(g, self, dims): def view(g, self, size): - size = sym_help._maybe_get_const(size, "is") - if sym_help._is_value(size): - shape = size - else: - shape = g.op("Constant", value_t=torch.LongTensor(size)) - return g.op("Reshape", self, shape) + return reshape(g, self, size) def view_as(g, self, other): shape = g.op("Shape", other) - return g.op("Reshape", self, shape) + return reshape(g, self, shape) def prim_ConstantSplit(g, self, split_size, dim): @@ -1349,32 +1344,7 @@ def conv_transpose3d(g, input, weight, bias, stride, padding, output_padding, gr @parse_args("v", "v", "v", "v", "v", "i", "f", "f", "i") def batch_norm(g, input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled): sym_help.assert_training_mode(training, "batch_norm") - batch_size = sym_help._get_tensor_dim_size(input, 0) - channel_size = sym_help._get_tensor_dim_size(input, 1) - - if weight is None or sym_help._is_none(weight): - if channel_size is None: - raise RuntimeError("Unsupported: ONNX export of batch_norm for unknown " - "channel size.") - weight_value = torch.tensor([1.] * channel_size).type( - "torch." + input.type().scalarType() + "Tensor") - weight = g.op("Constant", value_t=weight_value) - if bias is None or sym_help._is_none(bias): - if channel_size is None: - raise RuntimeError("Unsupported: ONNX export of batch_norm for unknown " - "channel size.") - bias_value = torch.tensor([0.] * channel_size).type( - "torch." + input.type().scalarType() + "Tensor") - bias = g.op("Constant", value_t=bias_value) - # If track_running_stats is set to False batch statistics are instead used during evaluation time - if running_mean is None or sym_help._is_none(running_mean) or running_var is None or sym_help._is_none(running_var): - assert batch_size is not None and channel_size is not None - reshape_in = g.op("Reshape", input, - g.op("Constant", value_t=torch.tensor([batch_size, channel_size, -1], dtype=torch.int64))) - trans_in = g.op("Transpose", reshape_in, perm_i=[0, 2, 1]) - running_var, running_mean = _var_mean(g, trans_in, - g.op("Constant", value_t=torch.tensor([0, 1], dtype=torch.int64)), - False, False) + weight, bias, running_mean, running_var = sym_help._batchnorm_helper(g, input, weight, bias, running_mean, running_var) out = g.op("BatchNormalization", input, weight, bias, running_mean, running_var, epsilon_f=eps, momentum_f=1 - momentum, @@ -1771,7 +1741,7 @@ def tensor(g, data, dtype=None, device=None, requires_grad=False): input_list = list() for t in sym_help._unpack_list(data): shape_reference = g.op("Constant", value_t=torch.LongTensor([1])) - t = g.op("Reshape", t, shape_reference) + t = sym_help._reshape_helper(g, t, shape_reference) t = g.op("Cast", t, to_i=sym_help.scalar_type_to_onnx[dtype]) input_list.append(t) return g.op("Concat", *input_list, axis_i=0) @@ -2060,7 +2030,7 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None): # if dim is None flatten # By default, use the flattened input array, and return a flat output array if sym_help._is_none(dim): - input = reshape(g, self, g.op("Constant", value_t=torch.tensor([-1]))) + input = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1]))) dim = 0 else: dim = sym_help._maybe_get_scalar(dim) @@ -2115,7 +2085,7 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None): g.op("Constant", value_t=torch.LongTensor(input_sizes_temp[dim + 1:]))] r_concat = g.op("Concat", *r_concat, axis_i=0) i_split = expand(g, i_split, r_concat, None) - i_split = reshape(g, i_split, g.op("Constant", value_t=torch.LongTensor(input_sizes))) + i_split = sym_help._reshape_helper(g, i_split, g.op("Constant", value_t=torch.LongTensor(input_sizes)), allowzero=0) final_splits.append(i_split) return g.op("Concat", *final_splits, axis_i=dim) @@ -2128,12 +2098,17 @@ def pixel_shuffle(g, self, upscale_factor): if any([i is None for i in dims[1:]]): return _unimplemented("pixel_shuffle", "only support static input shape, except for batch size") output_channel = dims[1] // upscale_factor // upscale_factor - after_view = view(g, self, g.op("Constant", value_t=torch.tensor([-1, output_channel, upscale_factor, - upscale_factor, dims[2], dims[3]]))) + after_view = sym_help._reshape_helper(g, self, + g.op("Constant", value_t=torch.tensor([-1, output_channel, + upscale_factor, upscale_factor, + dims[2], dims[3]])), + allowzero=0) after_transpose = g.op("Transpose", after_view, perm_i=[0, 1, 4, 2, 5, 3]) - return view(g, after_transpose, - g.op("Constant", value_t=torch.tensor([-1, output_channel, dims[2] * upscale_factor, - dims[3] * upscale_factor]))) + return sym_help._reshape_helper(g, after_transpose, + g.op("Constant", value_t=torch.tensor([-1, output_channel, + dims[2] * upscale_factor, + dims[3] * upscale_factor])), + allowzero=0) def _generic_rnn(g, variant, input, initial_states, all_weights, has_biases, @@ -2277,7 +2252,8 @@ def retrieve_state(x, start, end): # Transpose, and then combining it with hidden_size # with Reshape. prev_output = g.op("Transpose", prev_output, perm_i=[0, 2, 1, 3]) - prev_output = g.op("Reshape", prev_output, g.op("Constant", value_t=torch.LongTensor([0, 0, -1]))) + prev_output = sym_help._reshape_helper(g, prev_output, + g.op("Constant", value_t=torch.LongTensor([0, 0, -1])), allowzero=0) else: prev_output = sym_help._squeeze_helper(g, prev_output, [1]) @@ -2525,7 +2501,7 @@ def narrow(g, input, dim, start, length): def argmax(g, input, dim, keepdim): if sym_help._is_none(dim): - flattened = reshape(g, input, g.op("Constant", value_t=torch.tensor([-1]))) + flattened = sym_help._reshape_helper(g, input, g.op("Constant", value_t=torch.tensor([-1]))) return g.op("ArgMax", flattened, axis_i=0, keepdims_i=False) else: dim = _parse_arg(dim, "i") @@ -2535,7 +2511,7 @@ def argmax(g, input, dim, keepdim): def argmin(g, input, dim, keepdim): if sym_help._is_none(dim): - flattened = reshape(g, input, g.op("Constant", value_t=torch.tensor([-1]))) + flattened = sym_help._reshape_helper(g, input, g.op("Constant", value_t=torch.tensor([-1]))) return g.op("ArgMin", flattened, axis_i=0, keepdims_i=False) else: dim = _parse_arg(dim, "i") @@ -2868,7 +2844,7 @@ def try_mask_to_index(index): folded_adv_idx_shape_list = [g.op("Constant", value_t=torch.LongTensor([-1]))] \ + [dim_tensor_list[i] for i in range(rank) if i not in adv_idx_indices] folded_adv_idx_shape = g.op("Concat", *folded_adv_idx_shape_list, axis_i=0) - self = g.op("Reshape", self, folded_adv_idx_shape) + self = sym_help._reshape_helper(g, self, folded_adv_idx_shape) # Transpose folded advanced indexed axis to its original location. adv_idx_permute = list(range(1, adv_idx_indices[0] + 1)) \ @@ -2887,7 +2863,7 @@ def try_mask_to_index(index): *[dim_tensor_list[i] for i in range(rank) if i not in adv_idx_indices], axis_i=0) - return g.op("Reshape", self, final_shape) + return sym_help._reshape_helper(g, self, final_shape) @parse_args("v", "is", "i") @@ -2919,7 +2895,8 @@ def baddbmm(g, self, batch1, batch2, beta, alpha): def meshgrid(g, tensor_list): - tensors = [view(g, t, g.op("Constant", value_t=torch.LongTensor([-1]))) for t in sym_help._unpack_list(tensor_list)] + tensors = [sym_help._reshape_helper(g, t, g.op("Constant", value_t=torch.LongTensor([-1]))) + for t in sym_help._unpack_list(tensor_list)] tensors_shape = [g.op("Shape", t) for t in tensors] out_shape = g.op("Concat", *tensors_shape, axis_i=0) out = [] @@ -2959,7 +2936,8 @@ def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled): return _unimplemented("group_norm", "unknown input rank") # 0 in the shape list keeps dimension value unchanged. shape = [0, num_groups, -1] - input_reshaped = g.op("Reshape", input, g.op("Constant", value_t=torch.LongTensor(shape))) + input_reshaped = sym_help._reshape_helper(g, input, + g.op("Constant", value_t=torch.LongTensor(shape))) # C is always divisible by num_groups # Due to shape difference. we need to apply weight and bias after @@ -2970,7 +2948,7 @@ def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled): "torch." + input.type().scalarType() + "Tensor")) norm_reshaped = g.op("InstanceNormalization", input_reshaped, weight_, bias_, epsilon_f=eps) - norm = g.op("Reshape", norm_reshaped, g.op("Shape", input)) + norm = sym_help._reshape_helper(g, norm_reshaped, g.op("Shape", input)) if weight is None or weight.node().mustBeNone(): weight_value = torch.tensor([1.]).type( @@ -3027,7 +3005,7 @@ def item(g, self): def take(g, self, index): - self_flattened = g.op("Reshape", self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) + self_flattened = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) out = index_select(g, self_flattened, 0, index) out = reshape_as(g, out, index) return out @@ -3071,7 +3049,7 @@ def kl_div(g, input, target, reduction, log_target): def as_strided(g, self, sizes, strides, offset=None): sizes = sym_help._maybe_get_const(sizes, "is") rank = len(strides) - self_1d = g.op("Reshape", self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) + self_1d = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) ind: Optional[torch.Tensor] if not sym_help._is_value(sizes): ind = torch.tensor([0], dtype=torch.long) @@ -3088,7 +3066,8 @@ def as_strided(g, self, sizes, strides, offset=None): r_size = [1] * rank r_size[i] = -1 size = select(g, sizes, g.op("Constant", value_t=torch.tensor([0])), g.op("Constant", value_t=torch.tensor(i))) - tmp_ind = g.op("Reshape", arange(g, size, 4, None, None, None), g.op("Constant", value_t=torch.tensor(r_size))) + tmp_ind = sym_help._reshape_helper(g, arange(g, size, 4, None, None, None), + g.op("Constant", value_t=torch.tensor(r_size))) tmp_ind = g.op("Mul", tmp_ind, g.op("Constant", value_t=torch.tensor([stride]))) if ind is None: ind = tmp_ind From e182401062323ab613ca9f1e3786272e5ffc6eb4 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 18 Aug 2021 13:25:19 -0700 Subject: [PATCH 037/530] [ONNX] Remove aten parameter (#61652) (#62759) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62759 * remove aten argument in export() * add export_to_pretty_string default value OperatorExportTypes.ONNX * add DPYTORCH_ONNX_CAFFE2_BUNDLE description Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D30349062 Pulled By: msaroufim fbshipit-source-id: d9738f3aa8b80eac54548d0b9494f9f1e544f20f Co-authored-by: Gary Miguel --- torch/onnx/__init__.py | 24 +++++++++++++----------- torch/onnx/utils.py | 26 ++++++++------------------ 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index c143d0ce8984d..b726b2b55e8b6 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -30,11 +30,11 @@ def _export(*args, **kwargs): def export(model, args, f, export_params=True, verbose=False, training=TrainingMode.EVAL, - input_names=None, output_names=None, aten=False, - operator_export_type=None, opset_version=None, _retain_param_name=True, - do_constant_folding=True, example_outputs=None, strip_doc_string=True, - dynamic_axes=None, keep_initializers_as_inputs=None, custom_opsets=None, - enable_onnx_checker=True, use_external_data_format=False): + input_names=None, output_names=None, operator_export_type=None, + opset_version=None, _retain_param_name=True, do_constant_folding=True, + example_outputs=None, strip_doc_string=True, dynamic_axes=None, + keep_initializers_as_inputs=None, custom_opsets=None, enable_onnx_checker=True, + use_external_data_format=False): r""" Exports a model into ONNX format. If ``model`` is not a :class:`torch.jit.ScriptModule` nor a :class:`torch.jit.ScriptFunction`, this runs @@ -116,9 +116,12 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM input nodes of the graph, in order. output_names (list of str, default empty list): names to assign to the output nodes of the graph, in order. - aten (bool, default False): [DEPRECATED. use operator_export_type] equivalent to - setting ``operator_export_type=OperatorExportTypes.ONNX_ATEN``. - operator_export_type (enum, default OperatorExportTypes.ONNX): + operator_export_type (enum, default None): + + None usually means ``OperatorExportTypes.ONNX``. + However if PyTorch was built with ``-DPYTORCH_ONNX_CAFFE2_BUNDLE``, None means + ``OperatorExportTypes.ONNX_ATEN_FALLBACK``. + * ``OperatorExportTypes.ONNX``: Export all ops as regular ONNX ops (in the default opset domain). * ``OperatorExportTypes.ONNX_FALLTHROUGH``: Try to convert all ops @@ -303,9 +306,8 @@ def forward(self, x): from torch.onnx import utils return utils.export(model, args, f, export_params, verbose, training, - input_names, output_names, aten, - operator_export_type, opset_version, _retain_param_name, - do_constant_folding, example_outputs, + input_names, output_names, operator_export_type, opset_version, + _retain_param_name, do_constant_folding, example_outputs, strip_doc_string, dynamic_axes, keep_initializers_as_inputs, custom_opsets, enable_onnx_checker, use_external_data_format) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index f5dc2f2270165..41ba20f3ad102 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -71,15 +71,12 @@ def select_model_mode_for_export(model, mode): def export(model, args, f, export_params=True, verbose=False, training=None, - input_names=None, output_names=None, aten=False, - operator_export_type=None, opset_version=None, _retain_param_name=True, - do_constant_folding=True, example_outputs=None, strip_doc_string=True, - dynamic_axes=None, keep_initializers_as_inputs=None, custom_opsets=None, + input_names=None, output_names=None, operator_export_type=None, + opset_version=None, _retain_param_name=True, do_constant_folding=True, + example_outputs=None, strip_doc_string=True, dynamic_axes=None, + keep_initializers_as_inputs=None, custom_opsets=None, enable_onnx_checker=True, use_external_data_format=False): - if aten: - assert operator_export_type is None - operator_export_type = OperatorExportTypes.ONNX_ATEN - elif operator_export_type is None: + if operator_export_type is None: if torch.onnx.PYTORCH_ONNX_CAFFE2_BUNDLE: operator_export_type = OperatorExportTypes.ONNX_ATEN_FALLBACK else: @@ -529,18 +526,11 @@ def _model_to_graph(model, args, verbose=False, def export_to_pretty_string(model, args, f, export_params=True, verbose=False, training=None, - input_names=None, output_names=None, aten=False, - operator_export_type=None, export_type=ExportTypes.PROTOBUF_FILE, - example_outputs=None, google_printer=False, - opset_version=None, _retain_param_name=True, + input_names=None, output_names=None, operator_export_type=OperatorExportTypes.ONNX, + export_type=ExportTypes.PROTOBUF_FILE, example_outputs=None, + google_printer=False, opset_version=None, _retain_param_name=True, keep_initializers_as_inputs=None, custom_opsets=None, add_node_names=True, do_constant_folding=True): - if aten: - assert operator_export_type is None - assert aten - operator_export_type = OperatorExportTypes.ONNX_ATEN - elif operator_export_type is None: - operator_export_type = OperatorExportTypes.ONNX return _export_to_pretty_string(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, export_type, example_outputs, google_printer, From 2aa19f33c6272cb016c629792ddc70cf9636fc9a Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 18 Aug 2021 13:25:19 -0700 Subject: [PATCH 038/530] [ONNX] Fix for batchnorm training op mode (#52758) (#62760) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62760 * Rebase # Conflicts: # torch/csrc/jit/passes/onnx/eval_peephole.cpp # Conflicts: # test/onnx/test_utility_funs.py # torch/onnx/symbolic_opset9.py * Update symbolic_opset12.py * Update test.sh # Conflicts: # .jenkins/caffe2/test.sh * Merge * Fix utility tests # Conflicts: # test/onnx/test_pytorch_onnx_onnxruntime.py # test/onnx/test_utility_funs.py * Fix for comment * Enable BN tests * Fix for test * Update test_pytorch_onnx_onnxruntime.py * Update test_pytorch_onnx_onnxruntime.py * Update test_utility_funs.py * Update test_pytorch_onnx_onnxruntime.py Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D30349060 Pulled By: msaroufim fbshipit-source-id: 93312c17607974731c17099ae181acb6e4c1c409 --- .jenkins/caffe2/test.sh | 4 +- .../TestOperators.test_dropout_default.expect | 16 +- test/onnx/test_operators.py | 2 +- test/onnx/test_pytorch_onnx_onnxruntime.py | 148 +++++++++--------- test/onnx/test_utility_funs.py | 13 +- torch/csrc/jit/passes/onnx/eval_peephole.cpp | 23 +-- torch/onnx/symbolic_helper.py | 18 ++- torch/onnx/symbolic_opset12.py | 8 +- torch/onnx/symbolic_opset14.py | 2 +- torch/onnx/symbolic_opset9.py | 12 +- 10 files changed, 135 insertions(+), 111 deletions(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 4c577ed437439..75e269d6f6909 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -170,7 +170,9 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # JIT C++ extensions require ninja, so put it into PATH. export PATH="/var/lib/jenkins/.local/bin:$PATH" if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then - pip install -q --user onnxruntime==1.8.0 + pip install -q --user flatbuffers==2.0 + wget https://ortpypackage.blob.core.windows.net/ort-nightly/ort_nightly-1.8.0.dev202107131-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + pip install -q --user ort_nightly-1.8.0.dev202107131-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl fi "$ROOT_DIR/scripts/onnx/test.sh" fi diff --git a/test/onnx/expect/TestOperators.test_dropout_default.expect b/test/onnx/expect/TestOperators.test_dropout_default.expect index dcbc25a55045f..550bc65f2700b 100644 --- a/test/onnx/expect/TestOperators.test_dropout_default.expect +++ b/test/onnx/expect/TestOperators.test_dropout_default.expect @@ -5,7 +5,19 @@ graph { node { input: "x" output: "1" - name: "ReduceMax_0" + output: "2" + name: "Dropout_0" + op_type: "Dropout" + attribute { + name: "ratio" + f: 0.5 + type: FLOAT + } + } + node { + input: "1" + output: "3" + name: "ReduceMax_1" op_type: "ReduceMax" attribute { name: "keepdims" @@ -31,7 +43,7 @@ graph { } } output { - name: "1" + name: "3" type { tensor_type { elem_type: 1 diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 9fe38ca7b2455..b9e391b540663 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -681,7 +681,7 @@ def test_dropout_training(self): def test_dropout_opset12(self): x = torch.randn(3, 4, requires_grad=True) - self.assertONNX(lambda x: torch.max(functional.dropout(x)), x, opset_version=12) + self.assertONNX(lambda x: torch.max(functional.dropout(x, training=False)), x, opset_version=12) def test_dropout_training_opset12(self): x = torch.randn(3, 4, requires_grad=True) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index fe877948666dd..67903fb0bd94c 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -100,7 +100,10 @@ def run_model_test(self, model, batch_size=2, state_dict=None, input_names=None, output_names=None, fixed_batch_size=False, dict_check=True, training=None, remained_onnx_input_idx=None): - model.eval() + if training is not None and training == torch.onnx.TrainingMode.TRAINING: + model.train() + elif training is None or training == torch.onnx.TrainingMode.EVAL: + model.eval() if input is None: input = torch.randn(batch_size, 3, 224, 224, requires_grad=True) with torch.no_grad(): @@ -281,11 +284,14 @@ def _run_test(m, remained_onnx_input_idx): def run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7, example_outputs=None, do_constant_folding=True, dynamic_axes=None, input_names=None, output_names=None, - ort_optim_on=True): + ort_optim_on=True, training=None): import os import tempfile - model.eval() + if training is not None and training == torch.onnx.TrainingMode.TRAINING: + model.train() + elif training is None or training == torch.onnx.TrainingMode.EVAL: + model.eval() with torch.no_grad(): if isinstance(input, torch.Tensor): input = (input,) @@ -3295,7 +3301,6 @@ def test_batchnorm1d_noaffine(self): x = torch.randn(10, 10, 128) self.run_test(model, x) - @skipIfUnsupportedOpsetVersion([14]) def test_batchnorm1d_norunningstats(self): x = torch.randn(10, 10) model = torch.nn.BatchNorm1d(10, track_running_stats=False) @@ -3314,7 +3319,6 @@ def test_batchnorm2d_noaffine(self): model = torch.nn.BatchNorm2d(3, affine=False) self.run_test(model, x) - @skipIfUnsupportedOpsetVersion([14]) def test_batchnorm2d_norunningstats(self): x = torch.randn(10, 3, 128, 128) model = torch.nn.BatchNorm2d(3, track_running_stats=False) @@ -7646,50 +7650,79 @@ def forward(self, input): x = torch.randn(6, 4, 3, 3) self.run_test(FakeQuantizePerChannelModel(), (x)) - # Tests skipped temporarliy as latest onnxruntime release does not include training ops - @skipForAllOpsetVersions() def test_batchnorm_training(self): class MyModule(torch.nn.Module): def __init__(self): super(MyModule, self).__init__() - self.bn = torch.nn.BatchNorm2d(3, affine=True) + self.bn1 = torch.nn.BatchNorm2d(3, affine=False) + self.cv1 = torch.nn.Conv2d(3, 3, 10) + self.bn2 = torch.nn.BatchNorm2d(3, affine=True) + self.cv2 = torch.nn.Conv2d(3, 3, 10) + self.bn3 = torch.nn.BatchNorm2d(3, affine=False) def forward(self, x): - bn = self.bn(x) - return bn - - model = MyModule() - x = torch.randn(10, 3, 128, 128) + x = self.bn1(x) + x = self.cv1(x) + x = self.bn2(x) + x = self.cv2(x) + x = self.bn3(x) + return x - model.train() - out = model(x) + x = torch.randn(10, 3, 20, 20) * 2 + model_export = MyModule() + self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + model_export.train() + self.run_test(model_export, (x, ), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) - # state after 1 train epoch - running_mean = model.bn.running_mean - running_var = model.bn.running_var - saved_mean = x.mean((0, 2, 3)) - saved_var = x.var((0, 2, 3), correction=1) + def test_batchnorm_training_mode_fix_layer(self): + class MyModule(torch.nn.Module): + def __init__(self): + super(MyModule, self).__init__() + self.bn1 = torch.nn.BatchNorm2d(3, affine=True) + self.cv1 = torch.nn.Conv2d(3, 3, 10) + self.bn2 = torch.nn.BatchNorm2d(3, affine=False) + self.cv2 = torch.nn.Conv2d(3, 3, 10) + self.bn3 = torch.nn.BatchNorm2d(3, affine=True) + self.bn3.eval() - pytorch_out = [out.detach().numpy(), - running_mean.cpu().numpy(), running_var.cpu().numpy(), - saved_mean.cpu().numpy(), saved_var.cpu().numpy()] + def forward(self, x): + x = self.bn1(x) + x = self.cv1(x) + x = self.bn2(x) + x = self.cv2(x) + x = self.bn3(x) + return x + x = torch.randn(10, 3, 128, 128) model_export = MyModule() - f = io.BytesIO() + self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + model_export.train() + self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) - ort_sess = convert_to_onnx(model_export, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs = run_ort(ort_sess, input=(x,)) - [np.testing.assert_allclose(p_out, ort_out, atol=10e-3, rtol=10e-3) for p_out, ort_out in zip(pytorch_out, ort_outs)] + def test_batchnorm_eval_mode_train_layer(self): + class MyModule(torch.nn.Module): + def __init__(self): + super(MyModule, self).__init__() + self.bn1 = torch.nn.BatchNorm2d(3, affine=True) + self.cv1 = torch.nn.Conv2d(3, 3, 10) + self.bn2 = torch.nn.BatchNorm2d(3, affine=False) + self.cv2 = torch.nn.Conv2d(3, 3, 10) + self.bn3 = torch.nn.BatchNorm2d(3, affine=True) + self.bn3.train() - model_export = torch.jit.script(MyModule()) - ort_sess = convert_to_onnx(model_export, input=(x,), opset_version=self.opset_version, - example_outputs=out, - training=torch.onnx.TrainingMode.TRAINING, - onnx_shape_inference=True) - ort_outs = run_ort(ort_sess, input=(x,)) - [np.testing.assert_allclose(p_out, ort_out, atol=10e-3, rtol=10e-3) for p_out, ort_out in - zip(pytorch_out, ort_outs)] + def forward(self, x): + x = self.bn1(x) + x = self.cv1(x) + x = self.bn2(x) + x = self.cv2(x) + x = self.bn3(x) + return x + + x = torch.randn(10, 3, 128, 128) + model_export = MyModule() + self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL, rtol=1e-3, atol=1e-5) + model_export.eval() + self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) @skipIfUnsupportedMinOpsetVersion(12) def test_dropout_training(self): @@ -7704,7 +7737,6 @@ def forward(self, x): model = MyModule() x = torch.randn(10) - model.train() ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, @@ -7741,7 +7773,6 @@ def forward(self, x): nb_elements = torch.numel(input) model.train() - ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) ort_outs = run_ort(ort_sess, input=(x,)) @@ -7771,8 +7802,6 @@ def forward(self, x): np.testing.assert_allclose(ratio_pytorch, ratio_ort, rtol=0.01, atol=0.01) - # Tests skipped temporarliy as latest onnxruntime release does not include training ops - @skipForAllOpsetVersions() def test_conv_bn(self): class MyModule(torch.nn.Module): def __init__(self): @@ -7785,32 +7814,11 @@ def forward(self, x): bn = self.bn(x) return bn - model = MyModule() + model_export = MyModule() x = torch.randn(10, 3, 128, 128) - ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs1 = run_ort(ort_sess1, input=(x,)) - ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.EVAL) - ort_outs2 = run_ort(ort_sess2, input=(x,)) - [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in - zip(ort_outs1, ort_outs2)] + self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL) + self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) - script_model = torch.jit.script(model) - outputs = model(x) - ort_sess1 = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, - example_outputs=outputs, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs1 = run_ort(ort_sess1, input=(x,)) - ort_sess2 = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, - example_outputs=outputs, - training=torch.onnx.TrainingMode.EVAL) - ort_outs2 = run_ort(ort_sess2, input=(x,)) - [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in - zip(ort_outs1, ort_outs2)] - - # Tests skipped temporarliy as latest onnxruntime release does not include training ops - @skipForAllOpsetVersions() def test_multiple_conv_bn(self): class MyModule(torch.nn.Module): def __init__(self): @@ -7836,16 +7844,10 @@ def forward(self, x): x = self.relu(x) return x - model = MyModule() + model_export = MyModule() x = torch.randn(2, 3, 224, 224) - ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs1 = run_ort(ort_sess1, input=(x,)) - ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.EVAL) - ort_outs2 = run_ort(ort_sess2, input=(x,)) - [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in - zip(ort_outs1, ort_outs2)] + self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL) def test_script_custom_class_error(self): class BoxCoder(object): diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index 06faf410f865e..02da90dd3066e 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -5,7 +5,7 @@ from torch.onnx import utils, OperatorExportTypes, TrainingMode from torch.onnx.symbolic_helper import _set_opset_version, _set_operator_export_type, _set_onnx_shape_inference import torch.utils.cpp_extension -from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion, +from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedMaxOpsetVersion) import caffe2.python.onnx.backend as backend from verify import verify @@ -37,7 +37,10 @@ def _model_to_graph(self, model, input, operator_export_type=OperatorExportTypes.ONNX, input_names=None, dynamic_axes=None): - + if training == torch.onnx.TrainingMode.TRAINING: + model.train() + elif training == torch.onnx.TrainingMode.EVAL: + model.eval() # Need disable onnx_shape_inference for this test because it puts const node to initializers. _set_onnx_shape_inference(False) utils._validate_dynamic_axes(dynamic_axes, model, None, None) @@ -811,11 +814,11 @@ def forward(self, x): model = torch.jit.script(MyModule()) x = torch.randn(10, 3, 128, 128) example_outputs = model(x) - f = io.BytesIO() _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) graph, _, __ = self._model_to_graph(model, (x,), do_constant_folding=True, example_outputs=example_outputs, operator_export_type=OperatorExportTypes.ONNX, + training=torch.onnx.TrainingMode.TRAINING, input_names=['x'], dynamic_axes={'x': [0, 1, 2, 3]}) graph_input_params = [param.debugName() for param in graph.inputs()] @@ -836,7 +839,6 @@ def forward(self, x): x = torch.tensor([1, 2]) verify(MyModel(), x, backend, do_constant_folding=False) - @skipIfUnsupportedOpsetVersion([14]) def test_fuse_conv_bn(self): class Fuse(torch.nn.Module): def __init__(self): @@ -858,11 +860,11 @@ def forward(self, x): assert len(list(graph.nodes())) == 1 - @skipIfUnsupportedOpsetVersion([14]) def test_fuse_resnet18(self): model = torchvision.models.resnet18(pretrained=True) x = torch.randn(2, 3, 224, 224, requires_grad=True) graph, _, __ = self._model_to_graph(model, (x, ), + training=TrainingMode.EVAL, input_names=['x'], dynamic_axes={'x': [0, 1, 2, 3]}) for node in graph.nodes(): @@ -882,7 +884,6 @@ def __init__(self): def forward(self, x, y): return f(x, y) - model = MyModule() input_1 = torch.tensor(11) input_2 = torch.tensor(12) _set_opset_version(self.opset_version) diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.cpp b/torch/csrc/jit/passes/onnx/eval_peephole.cpp index 18dea16cb97ae..05afb69ef0f23 100644 --- a/torch/csrc/jit/passes/onnx/eval_peephole.cpp +++ b/torch/csrc/jit/passes/onnx/eval_peephole.cpp @@ -47,14 +47,20 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) { fuseConvBatchNorm(child_block, valsToParamsMap); } if (it->kind() == onnx::Conv) { - if (it->output()->uses().size() != 1) { + auto oldConv = *it; + if (oldConv->outputs().at(0)->uses().size() != 1) { continue; } - auto bnNode = it->output()->uses()[0].user; + auto bnNode = oldConv->outputs().at(0)->uses()[0].user; if (bnNode->kind() != onnx::BatchNormalization) { continue; } - auto oldConv = *it; + + if (oldConv->outputs().size() != + bnNode->outputs().size()) { // BN layer is not in eval mode + continue; + } + auto epsilon = bnNode->f(attr::epsilon); auto convInputVals = getValues(oldConv, valsToParamsMap); if (convInputVals.size() < 1 || @@ -109,11 +115,8 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) { convB = bnB; } - Node* newConv = - b->owningGraph()->create(onnx::Conv, bnNode->outputs().size()); - for (size_t i = 0; i < newConv->outputs().size(); ++i) { - newConv->outputs()[i]->copyMetadata(bnNode->outputs()[i]); - } + Node* newConv = b->owningGraph()->create(onnx::Conv, 1); + newConv->outputs().at(0)->copyMetadata(bnNode->outputs().at(0)); newConv->copyAttributes(*oldConv); newConv->insertBefore(bnNode); @@ -131,9 +134,7 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) { newConvB->inferTypeFrom(convB); newConv->addInput(newConvB); - bnNode->replaceAllUsesWith(newConv); - bnNode->removeAllInputs(); - it->removeAllInputs(); + bnNode->outputs().at(0)->replaceAllUsesWith(newConv->outputs().at(0)); bnNode->destroy(); it.destroyCurrent(); } diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 66f276ce42f55..13bc4800a6700 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -755,19 +755,23 @@ def _avgpool_helper(tuple_fn, padding, kernel_size, stride, divisor_override, na padding = tuple(tuple_fn(padding)) return padding -def assert_training_mode(op_mode, op_name): + +def check_training_mode(op_train_mode, op_name): global _training_mode - op_mode = True if op_mode == 1 else False - if op_mode != _training_mode: - op_mode = "training " if op_mode else "inference" + op_train_mode = True if op_train_mode == 1 else False + if _training_mode is not None and op_train_mode != _training_mode: + op_mode = "training " if op_train_mode else "inference" training_mode = "training " if _training_mode else "inference" # setting the model mode could result in op_mode != _training_mode # if the model is a FuncModule. In this case we warn the user of - # the state and export depending on training_mode + # the state and export depending on op_mode + # This is to support use-cases of fixing certain layer weights + # in training. warnings.warn("ONNX export mode is set to " + training_mode + " mode, but operator " + op_name + " is set to " + - op_mode + " mode. The model will be exported in " + - training_mode + ", as specified by the export mode.") + op_mode + " mode. The operators will be exported in " + + op_mode + ", as specified by the functional operator.") + def _flatten_helper(g, input, start_dim, end_dim, dim): input_size = g.op("Shape", input) diff --git a/torch/onnx/symbolic_opset12.py b/torch/onnx/symbolic_opset12.py index 8e989cccf9867..d8f954148a1ee 100644 --- a/torch/onnx/symbolic_opset12.py +++ b/torch/onnx/symbolic_opset12.py @@ -4,6 +4,7 @@ from torch.onnx.utils import _add_block, _add_input_to_block, _add_output_to_block from sys import maxsize from torch.onnx.symbolic_opset9 import permute, _reshape_from_tensor +import warnings # EDITING THIS FILE? READ THIS FIRST! @@ -25,11 +26,12 @@ def outer(g, input, other): @parse_args("v", "f", "i") def dropout(g, input, p, train): - sym_help.assert_training_mode(train, "dropout") + sym_help.check_training_mode(train, "dropout") # in eval mode, dropout is non-op - if the node's train param is set to False, dropout is non-op - if not sym_help._training_mode: + if not train: return input - + warnings.warn("Dropout is a training op and should not be exported in inference mode. " + "For inference, make sure to call eval() on the model and to export it with param training=False.") p = g.op("Constant", value_t=torch.tensor(p)) t = g.op("Constant", value_t=torch.tensor(True)) r, _ = g.op("Dropout", input, p, t, outputs=2) diff --git a/torch/onnx/symbolic_opset14.py b/torch/onnx/symbolic_opset14.py index 443aac97c9404..d4775b553da8d 100644 --- a/torch/onnx/symbolic_opset14.py +++ b/torch/onnx/symbolic_opset14.py @@ -38,7 +38,7 @@ def reshape(g, self, shape): @parse_args("v", "v", "v", "v", "v", "i", "f", "f", "i") def batch_norm(g, input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled): - sym_help.assert_training_mode(training, "batch_norm") + sym_help.check_training_mode(training, "batch_norm") weight, bias, running_mean, running_var = sym_help._batchnorm_helper(g, input, weight, bias, running_mean, running_var) out = g.op("BatchNormalization", input, weight, bias, running_mean, running_var, epsilon_f=eps, diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 616c2fcaff5b8..993284a292a96 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -1343,13 +1343,13 @@ def conv_transpose3d(g, input, weight, bias, stride, padding, output_padding, gr @parse_args("v", "v", "v", "v", "v", "i", "f", "f", "i") def batch_norm(g, input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled): - sym_help.assert_training_mode(training, "batch_norm") + sym_help.check_training_mode(training, "batch_norm") weight, bias, running_mean, running_var = sym_help._batchnorm_helper(g, input, weight, bias, running_mean, running_var) out = g.op("BatchNormalization", input, weight, bias, running_mean, running_var, epsilon_f=eps, momentum_f=1 - momentum, - outputs=1 if not sym_help._training_mode else 5) - if not sym_help._training_mode: + outputs=1 if not training else 5) + if not training: return out else: res, new_running_mean, new_running_var, saved_mean, saved_var = out @@ -1624,12 +1624,12 @@ def exp(g, self): @parse_args("v", "f", "i") def dropout(g, input, p, train): - sym_help.assert_training_mode(train, "dropout") + sym_help.check_training_mode(train, "dropout") # in eval mode, dropout is non-op - if the node's train param is set to False, dropout is non-op - if not sym_help._training_mode: + if not train: return input warnings.warn("Dropout is a training op and should not be exported in inference mode. " - "Make sure to call eval() on the model, and to export it with param training=False.") + "For inference, make sure to call eval() on the model and to export it with param training=False.") r, _ = g.op("Dropout", input, ratio_f=p, outputs=2) return r From 877e6f2be3e78258247fb969577cb86be392e90c Mon Sep 17 00:00:00 2001 From: Charles David Hernandez Date: Wed, 18 Aug 2021 13:30:35 -0700 Subject: [PATCH 039/530] Bugfix for fuse qconfig comparison (#63384) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63384 In some cases the changes to qconfig on module would cause the fusions to fail. This bugfix solves that problem by adding a qconfig_function_comparison that compares the functions within the qconfig rather than the modules the qconfigs are on. The comparison looks at the partial object within QConfig.activation/weight.p and compares args, keywords and func. This is necessary to do mannually because partial doesn't have __eq__ implemented and so == reverts to is. Test Plan: python test/test_quantization.py TestFuseFx.test_problematic_fuse_example Imported from OSS Reviewed By: supriyar, ejguan Differential Revision: D30386264 fbshipit-source-id: 51e358c021c39d6f48dc12ad2a82b2838677b9de --- test/quantization/fx/test_quantize_fx.py | 32 ++++++++++++++++++++++++ torch/quantization/fx/prepare.py | 4 +-- torch/quantization/qconfig.py | 17 +++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 2f5f7c4a27f6e..bf15a06831bac 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -314,6 +314,38 @@ def test_qconfig_fused_module(self): self.checkGraphModuleNodes(quantized, expected_node_list=node_list) + def test_problematic_fuse_example(self): + class LinearRelu(nn.Sequential): + def __init__(self): + super().__init__( + nn.Linear(5, 5), + nn.ReLU(), + ) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.lin_relu = LinearRelu() + self.linear = nn.Linear(5, 5) + + def forward(self, x): + x = self.lin_relu(x) + x = self.linear(x) + return x + + model = M().eval() + # these qconfigs somehow fail equality where default_qconfig does not + qconfig_dict = { + "": None, + "object_type": [ + (torch.nn.Linear, get_default_qconfig('fbgemm')), + (torch.nn.ReLU, get_default_qconfig('fbgemm')), + ], + } + m = prepare_fx(model, qconfig_dict) + + self.checkGraphModuleNodes(m, expected_node=ns.call_module(torch.nn.intrinsic.modules.fused.LinearReLU)) + def test_fuse_custom_config_dict_validity(self): r""" Verifies that if a user passes an invalid key or makes a typo when diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py index 873d11acaa82e..23d1d40bb543b 100644 --- a/torch/quantization/fx/prepare.py +++ b/torch/quantization/fx/prepare.py @@ -15,7 +15,7 @@ ) from torch.fx.node import Argument -from ..qconfig import QConfigAny +from ..qconfig import QConfigAny, qconfig_function_equality from .qconfig_utils import ( convert_dict_to_ordered_dict, generate_qconfig_map, @@ -195,7 +195,7 @@ def update_qconfig_for_fusion( # Raise an error if the modules in the fused module have # different qconfigs specified in the qconfig_dict for op in ops: - if object_type_dict.get(op, None) != fused_qconfig: + if not qconfig_function_equality(object_type_dict.get(op, None), fused_qconfig): raise LookupError("During fusion, we need to specify the same " + f"qconfigs for both modules in {module_type}.") diff --git a/torch/quantization/qconfig.py b/torch/quantization/qconfig.py index 15eb174f021b9..01d67ddcbd8b1 100644 --- a/torch/quantization/qconfig.py +++ b/torch/quantization/qconfig.py @@ -209,3 +209,20 @@ def configure_constructor_to_put_obs_on_module_device(original_constructor): return QConfig(activation, weight) else: return QConfigDynamic(activation, weight) + + +def qconfig_function_equality(q1: QConfigAny, q2: QConfigAny): + # functools.partial has no __eq__ operator defined so '==' defaults to 'is' + def compare_partial(p1, p2): + same = p1.func == p2.func + same = same and p1.args == p2.args + return same and p1.keywords == p2.keywords + + if q1 is None or q2 is None: + return q1 == q2 + else: + assert q1 is not None and q2 is not None + try: + return compare_partial(q1.activation.p, q2.activation.p) and compare_partial(q1.weight.p, q2.weight.p) + except AttributeError: + return q1 == q2 From 9253dc1e5819a5638e4c60b0721fe14258bbae55 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Wed, 18 Aug 2021 13:33:36 -0700 Subject: [PATCH 040/530] Fix segmentation fault due to access to destroyed CudaIPCGlobalEntities instance (#56141) Summary: There is an instance of the static destruction order fiasco where cuda_ipc_global_entities may be accessed after it is destroyed. See https://github.com/pytorch/pytorch/issues/51961 This change uses a flag and avoids accesses to the destroyed class when it is set to false. Fixes https://github.com/pytorch/pytorch/issues/51961 This removes the function to clear shared_blocks introduced by https://github.com/pytorch/pytorch/issues/53080 which had multiple issues: Unprotected access to a shared structure and modification of the vector which is being cleared by the destructors of the objects contained. I.e. what happened was: - `CudaIPCSentDataLimbo_.clear_shared_blocks();` is called from the destructor of CudaIPCGlobalEntities as of your PR - This deletes instances of `CudaIPCSentData` which hold `at::DataPtr` created by `GetNewRefCountedSentData` - This means `CudaIPCSentDataDelete` is called with still active pointers - Hence `CudaIPCSentDataLimbo_.add` is called adding a new value to `shared_blocks_` Pull Request resolved: https://github.com/pytorch/pytorch/pull/56141 Reviewed By: ejguan Differential Revision: D30397279 Pulled By: VitalyFedyunin fbshipit-source-id: ce4b8b90fa1c90d275e5eca93ba84321cbc6140a --- torch/csrc/CudaIPCTypes.cpp | 39 ++++++++++++++++++++++++++----------- torch/csrc/CudaIPCTypes.h | 5 +---- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/torch/csrc/CudaIPCTypes.cpp b/torch/csrc/CudaIPCTypes.cpp index 6b42ca078024a..9033d445081ea 100644 --- a/torch/csrc/CudaIPCTypes.cpp +++ b/torch/csrc/CudaIPCTypes.cpp @@ -19,24 +19,27 @@ void warnProducerTerminatedBeforeSharedTensorsReleased() { } struct CudaIPCGlobalEntities { + // This class is used as a singleton (see cuda_ipc_global_entities) + // This variable is used to track its lifetime to avoid accessing it + // after it was destroyed which would lead to segmentation faults + // Note that a trvial type is used which doesn't suffer from construction + // and destruction order issues + static bool alive; + std::mutex ref_counters_mutex_; std::atomic sync_events_used_{0}; std::map> ref_counters_files_; std::shared_ptr next_available_ref_counters_file_; CudaIPCSentDataLimbo CudaIPCSentDataLimbo_; - CudaIPCGlobalEntities() = default; + CudaIPCGlobalEntities() { alive = true; } ~CudaIPCGlobalEntities() { CudaIPCSentDataLimbo_.collect(); - // Clear shared blocks to avoid releasing shared blocks after - // ~CudaIPCGlobalEntities is done since circular references causes the - // destructor of ~CudaIPCSentData to access the cuda_ipc_global_entities - // again. - CudaIPCSentDataLimbo_.clear_shared_blocks(); safe_clean_current_file(); if (next_available_ref_counters_file_) { warnProducerTerminatedBeforeSharedTensorsReleased(); } + alive = false; } void safe_clean_current_file() { std::lock_guard lock(ref_counters_mutex_); @@ -48,19 +51,16 @@ struct CudaIPCGlobalEntities { } }; +bool CudaIPCGlobalEntities::alive = false; CudaIPCGlobalEntities cuda_ipc_global_entities; CudaIPCSentDataLimbo::~CudaIPCSentDataLimbo() { collect(); - if (shared_blocks_.size() > 0) { + if (size() > 0) { warnProducerTerminatedBeforeSharedTensorsReleased(); } } -void CudaIPCSentDataLimbo::clear_shared_blocks() { - shared_blocks_.clear(); -} - bool CudaIPCSentDataLimbo::collect() { bool freed_memory = false; std::vector> reset_blocks; @@ -99,9 +99,17 @@ void CudaIPCSentDataLimbo::add(std::unique_ptr shared_block) { shared_blocks_.push_back(std::move(shared_block)); } +uint64_t CudaIPCSentDataLimbo::size() { + std::lock_guard lock(limbo_mutex_); + return shared_blocks_.size(); +} + void CudaIPCSentDataDelete(void* ptr) { std::unique_ptr sent_data( static_cast(ptr)); + if(!CudaIPCGlobalEntities::alive) { + return; + } if (sent_data->counter_value() > 0) { cuda_ipc_global_entities.CudaIPCSentDataLimbo_.add(std::move(sent_data)); } @@ -109,6 +117,9 @@ void CudaIPCSentDataDelete(void* ptr) { } void ReturnRefCounter(const std::string& handle, uint64_t offset /* unused */) { + if(!CudaIPCGlobalEntities::alive) { + return; + } std::lock_guard lock( cuda_ipc_global_entities.ref_counters_mutex_); auto& map = cuda_ipc_global_entities.ref_counters_files_; @@ -180,6 +191,9 @@ CudaIPCSentData::~CudaIPCSentData() { if (event_sync_required_) { at::cuda::CUDAGuard device_guard(device_.index()); cudaEventDestroy(event_); + if(!CudaIPCGlobalEntities::alive) { + return; + } cuda_ipc_global_entities.sync_events_used_ --; } } catch (...) { /* No throw */ @@ -226,6 +240,9 @@ at::DataPtr GetNewRefCountedSentData(void* data, at::Device device) { } bool CudaIPCCollect() { + if(!CudaIPCGlobalEntities::alive) { + return true; + } bool freed_memory = cuda_ipc_global_entities.CudaIPCSentDataLimbo_.collect(); if (cuda_ipc_global_entities.CudaIPCSentDataLimbo_.size() == 0) { cuda_ipc_global_entities.safe_clean_current_file(); diff --git a/torch/csrc/CudaIPCTypes.h b/torch/csrc/CudaIPCTypes.h index 63e1d1d416a5a..ab9ede006916d 100644 --- a/torch/csrc/CudaIPCTypes.h +++ b/torch/csrc/CudaIPCTypes.h @@ -63,11 +63,8 @@ constexpr int64_t CUDA_IPC_MAXIMUM_EVENTS_TO_USE = 1000; struct CudaIPCSentDataLimbo final { ~CudaIPCSentDataLimbo(); bool collect(); - void clear_shared_blocks(); void add(std::unique_ptr shared_block); - uint64_t size() { - return shared_blocks_.size(); - } + uint64_t size(); private: // TODO: Can be changed to FIFO in order to avoid full traverse on every From feba6806c9b7a5e13ffd5839f8bc6ddc2e016a26 Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Wed, 18 Aug 2021 13:43:54 -0700 Subject: [PATCH 041/530] clarify that `torch.finfo.tiny` is the smallest normal number (#63241) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63241 This is a common source of confusion, but it matches the NumPy behavior. Fixes #44010 Fixes #59526 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30307646 Pulled By: dagitses fbshipit-source-id: d848140ba267560387d83f3e7acba8c3cdc53d82 --- docs/source/type_info.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/type_info.rst b/docs/source/type_info.rst index fe8eaa1769adf..0647cca544c0f 100644 --- a/docs/source/type_info.rst +++ b/docs/source/type_info.rst @@ -26,13 +26,18 @@ bits int The number of bits occupied by the type. eps float The smallest representable number such that ``1.0 + eps != 1.0``. max float The largest representable number. min float The smallest representable number (typically ``-max``). -tiny float The smallest positive representable number. +tiny float The smallest positive normal number. See notes. resolution float The approximate decimal resolution of this type, i.e., ``10**-precision``. ========== ===== ======================================== .. note:: The constructor of :class:`torch.finfo` can be called without argument, in which case the class is created for the pytorch default dtype (as returned by :func:`torch.get_default_dtype`). +.. note:: + `tiny` returns the smallest *normal* number, but there are smaller + subnormal numbers. See https://en.wikipedia.org/wiki/Denormal_number + for more information. + .. _iinfo-doc: From 8bdd5424173cd08ddafd77cb45d38c7540ae72d6 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Wed, 18 Aug 2021 14:46:25 -0700 Subject: [PATCH 042/530] [TensorExpr] Add debug logging to LoopNest::computeInline. (#63196) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63196 Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D30292778 Pulled By: ZolotukhinM fbshipit-source-id: d8a111b75466a9354f6d048119cc6f814c9d5abb --- torch/csrc/jit/tensorexpr/loopnest.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index ea6f09349e444..e9bc76c6e8791 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -558,12 +559,20 @@ class FunctionInliner : public IRMutator { } // Add a mapping for each function parameter to it's source name. inline_mapping_[func_callee_arg] = func_caller_param; + GRAPH_DEBUG( + "ComputeInline: Inline mapping: ", + std::to_string(func_callee_arg), + " -> ", + std::to_string(func_caller_param)); index_vars.push_back(func_callee_arg); } // Call the actual replacement. ExprPtr body = producer_->value(); + GRAPH_DEBUG("ComputeInline: Before rewriting body: ", std::to_string(body)); ExprPtr result = Expr::clone(body)->accept_mutator(this); + GRAPH_DEBUG( + "ComputeInline: After rewriting body: ", std::to_string(result)); // Remove the mappings we created for this function parameters. for (auto v : index_vars) { @@ -575,6 +584,7 @@ class FunctionInliner : public IRMutator { } } } + GRAPH_DEBUG("ComputeInline: Inline mapping: erasing", std::to_string(v)); inline_mapping_.erase(v); } return result; @@ -617,6 +627,8 @@ class FunctionInliner : public IRMutator { const std::string& name = buf_->name_hint(); VarPtr new_var = alloc(name, v->dtype()); random_bindings_[alloc(new_var, v)] = index_vars_; + GRAPH_DEBUG( + "ComputeInline: created random bindings for ", std::to_string(new_var)); return new_var; } @@ -731,6 +743,7 @@ bool LoopNest::computeInline(BufPtr b) { TORCH_INTERNAL_ASSERT(relevant_store); + GRAPH_DEBUG("ComputeInline: Def: ", std::to_string(relevant_store)); FunctionInliner inliner(relevant_store, output_bufs_); root_stmt_ = root_stmt_->accept_mutator(&inliner); From 7fdba4564af4c21727811aeaf3d58703d189f76d Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Wed, 18 Aug 2021 14:46:25 -0700 Subject: [PATCH 043/530] [TensorExpr] IRSimplifier: sort terms in polynomials, terms, minterms, maxterms. (#63197) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63197 This solves non-determinism from using hash values in sort methods. Changes in tests are mostly mechanical. Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D30292776 Pulled By: ZolotukhinM fbshipit-source-id: 74f57b53c3afc9d4be45715fd74781271373e055 --- test/cpp/tensorexpr/test_cuda.cpp | 20 +- test/cpp/tensorexpr/test_loopnest.cpp | 47 +-- test/cpp/tensorexpr/test_reductions.cpp | 8 +- test/cpp/tensorexpr/test_registerizer.cpp | 76 ++--- test/cpp/tensorexpr/test_simplify.cpp | 322 +++++++------------- test/cpp/tensorexpr/test_utils.h | 5 + torch/csrc/jit/tensorexpr/ir_simplifier.cpp | 99 +++++- torch/csrc/jit/tensorexpr/ir_simplifier.h | 40 +-- 8 files changed, 288 insertions(+), 329 deletions(-) diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp index 3ca6e0d9f5c3a..e36e17ad432f9 100644 --- a/test/cpp/tensorexpr/test_cuda.cpp +++ b/test/cpp/tensorexpr/test_cuda.cpp @@ -1575,10 +1575,10 @@ TEST(Cuda, MaskMultiDim_CUDA) { const std::string& verification_pattern = R"IR( # CHECK-NOT: if ( -# CHECK: C[100 * blockIdx.x + threadIdx.x] = +# CHECK: C[threadIdx.x + 100 * blockIdx.x] = # CHECK: __syncthreads(); # CHECK: if (threadIdx.x<50 -# CHECK: D[50 * blockIdx.x + threadIdx.x] =)IR"; +# CHECK: D[threadIdx.x + 50 * blockIdx.x] =)IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -1705,10 +1705,10 @@ TEST(Cuda, MaskMultiDimSymbolic_CUDA) { const std::string& verification_pattern = R"IR( # CHECK: if (threadIdx.x loops = {outer_for, inner_for}; ForPtr flattened = nullptr; @@ -3420,7 +3431,7 @@ TEST(LoopNest, FlattenSimpleLoopNest3D) { auto for1 = For::make(k, 0, 7, for_body); auto for2 = For::make(j, 0, 5, for1); auto for3 = For::make(i, 0, 10, for2); - Block::make({for3}); + auto parent_block = Block::make({for3}); std::vector loops = {for3, for2, for1}; ForPtr flattened = nullptr; @@ -3463,7 +3474,7 @@ TEST(LoopNest, FlattenLoopNestAfterNormalize) { auto for_body = Block::make({Store::make(a_buf, {i - 2, j - 3}, i * j)}); auto inner_for = For::make(j, 3, 15, for_body); auto outer_for = For::make(i, 2, 10, inner_for); - Block::make({outer_for}); + auto parent_block = Block::make({outer_for}); std::vector loops = {outer_for, inner_for}; ForPtr flattened = nullptr; @@ -3712,7 +3723,7 @@ TEST(LoopNest, CacheReadsSimple) { #CHECK: A_local[j_1] = A[ #CHECK: } #CHECK: for (int j_2 -#CHECK: B[10 * i_1 + j_2] = A_local[j_2]; +#CHECK: B[j_2 + 10 * i_1] = A_local[j_2]; #CHECK: } #CHECK: } #CHECK: for (int i_2 @@ -3769,7 +3780,7 @@ TEST(LoopNest, CacheReadsOuter) { checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[21, 11] #CHECK: A_local[j_1 + 11 * i_1] = -#CHECK: B[10 * i_2 + j_2] = (A_local[(j_2 + 11 * i_2) + 12]) + (A_local[j_2 + 11 * i_2]); +#CHECK: B[j_2 + 10 * i_2] = (A_local[j_2 + 11 * i_2]) + (A_local[(j_2 + 11 * i_2) + 12]); )IR"); std::vector b_data(200, 0); @@ -3816,7 +3827,7 @@ TEST(LoopNest, CacheReadsInternal) { checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[2, 11] #CHECK: A_local[j_1 + 11 * i_2] = -#CHECK: B[10 * i_1 + j_2] = (A_local[j_2 + 12]) + (A_local[j_2]); +#CHECK: B[j_2 + 10 * i_1] = (A_local[j_2 + 12]) + (A_local[j_2]); )IR"); std::vector b_data(200, 0); @@ -3863,8 +3874,8 @@ TEST(LoopNest, CacheReadsInner) { checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[5, 2] -#CHECK: A_local[2 * i_2 + j_2] = -#CHECK: B[10 * i_1 + j_1] = (A_local[1]) + (A_local[8]); +#CHECK: A_local[j_2 + 2 * i_2] = +#CHECK: B[j_1 + 10 * i_1] = (A_local[1]) + (A_local[8]); )IR"); std::vector b_data(200, 0); @@ -3914,7 +3925,7 @@ TEST(LoopNest, CacheWritesSimple) { #CHECK: for (int j = 0; j < 64 #CHECK: A_local[j] = i * j; #CHECK: for (int j_1 = 0; j_1 < 64 -#CHECK: A[64 * i + j_1] = A_local[ +#CHECK: A[j_1 + 64 * i] = A_local[ #CHECK: Free(A_local); #CHECK-NOT: A_local )IR"); diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index bd71a4fd8da14..0d033e0bd8a1f 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -1578,8 +1578,8 @@ TEST(Reductions, ReductionCacheBodyAccess) { #CHECK: Allocate(scale_local); // dtype=float, dims=[1, 32, 12] #CHECK: for (int j = 0; j < 32; j++) { #CHECK: for (int k = 0; k < 12; k++) { -#CHECK: scale_local[k + 12 * j] = scale[(k + 384 * l1) + 12 * j]; -#CHECK: sum[l1] = (sum[l1]) + (scale_local[12 * n1_1 + m1_1]); +#CHECK: scale_local[k + 12 * j] = scale[(k + 12 * j) + 384 * l1]; +#CHECK: sum[l1] = (sum[l1]) + (scale_local[m1_1 + 12 * n1_1]); #CHECK: scale_1[l] = (b[l]) * (sum[l]); #CHECK: Free(scale_local); )IR"; @@ -1667,7 +1667,7 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { const std::string& expected_ir = R"IR( #CHECK: Allocate(sum_local); // dtype=float, dims=[4] -#CHECK: sum[l1_inner + 4 * l1_outer] = (sum[l1_inner + 4 * l1_outer]) + (scale[((12 * n1_1 + 384 * l1_inner) + m1_1) + 1536 * l1_outer]); +#CHECK: sum[l1_inner + 4 * l1_outer] = (sum[l1_inner + 4 * l1_outer]) + (scale[((m1_1 + 12 * n1_1) + 1536 * l1_outer) + 384 * l1_inner]); #CHECK: for (int i = 0; i < 4 #CHECK: sum_local[i] = sum[i + 4 * l_outer]; #CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]); @@ -1716,7 +1716,7 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { const std::string& expected_ir = R"IR( #CHECK: Allocate(sum_local); // dtype=float, dims=[4] -#CHECK: sum[l1] = (sum[l1]) + (scale[(12 * n1_1 + m1_1) + 384 * l1]); +#CHECK: sum[l1] = (sum[l1]) + (scale[(m1_1 + 12 * n1_1) + 384 * l1]); #CHECK: for (int i = 0; i < 4 #CHECK: sum_local[i] = sum[i + 4 * l_outer]; #CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]); diff --git a/test/cpp/tensorexpr/test_registerizer.cpp b/test/cpp/tensorexpr/test_registerizer.cpp index a0ac095db757f..98a53058a1a65 100644 --- a/test/cpp/tensorexpr/test_registerizer.cpp +++ b/test/cpp/tensorexpr/test_registerizer.cpp @@ -192,8 +192,8 @@ TEST(Registerizer, RegisterizerLoopInternal) { R"IR( # CHECK: for (int x = 0; x < 10; x++) # CHECK: int A_1 = A[x]; -# CHECK: A_1 = x + A_1; -# CHECK: A_1 = x + A_1; +# CHECK: A_1 = A_1 + x; +# CHECK: A_1 = A_1 + x; # CHECK: A[x] = A_1; # CHECK: })IR"; @@ -273,12 +273,12 @@ TEST(Registerizer, RegisterizerLoopInternalRepeated) { * int A_1 = A[1]; * int A_2 = A[0]; * for (int x = 0; x < 10; x++) { - * A_2 = x + A_1; - * A_2 = x + A_1; + * A_2 = A_1 + x; + * A_2 = A_1 + x; * } * for (int x = 0; x < 10; x++) { - * A_2 = x + A_1; - * A_2 = x + A_1; + * A_2 = A_1 + x; + * A_2 = A_1 + x; * } * A[0] = A_2; */ @@ -291,12 +291,12 @@ TEST(Registerizer, RegisterizerLoopInternalRepeated) { # CHECK: int A_1 = A[1]; # CHECK: int A_2 = A[0]; # CHECK: for (int x = 0; x < 10; x++) -# CHECK: A_2 = x + A_1; -# CHECK: A_2 = x + A_1; +# CHECK: A_2 = A_1 + x; +# CHECK: A_2 = A_1 + x; # CHECK: } # CHECK: for (int x = 0; x < 10; x++) -# CHECK: A_2 = x + A_1; -# CHECK: A_2 = x + A_1; +# CHECK: A_2 = A_1 + x; +# CHECK: A_2 = A_1 + x; # CHECK: } # CHECK-NOT: A[1] # CHECK: A[0] = A_2; @@ -357,7 +357,7 @@ TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapOther) { BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); VarHandle y("y", kInt); - StmtPtr stmt = Block::make( + StmtPtr stmt = IRSimplifier::simplify(Block::make( {For::make( x, 0, @@ -373,7 +373,7 @@ TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapOther) { {Store::make(a, {0}, Add::make(x, Load::make(a, {y}))), Store::make(a, {0}, Add::make(x, Load::make(a, {y})))})) - }); + })); /* * for (int x = 0; x < 10; x++) { @@ -2044,7 +2044,7 @@ TEST(Registerizer, RegisterizerPartialAfter) { /* * int A_1 = 0; * for (int x = 0; x < 10; x++) { - * A_1 = x + A_1; + * A_1 = A_1 + x; * } * A[0] = A_1; * for (int x = 1; x < 10; x++) { @@ -2059,7 +2059,7 @@ TEST(Registerizer, RegisterizerPartialAfter) { R"IR( # CHECK: int A_1 = 0; # CHECK: for ( -# CHECK: A_1 = x + A_1; +# CHECK: A_1 = A_1 + x; # CHECK: } # CHECK: A[0] = A_1; # CHECK: for ( @@ -2104,7 +2104,7 @@ TEST(Registerizer, RegisterizerPartialBefore) { * } * int A_1 = 0; * for (int x = 0; x < 10; x++) { - * A_1 = x + A_1; + * A_1 = A_1 + x; * } * A[0] = A_1; */ @@ -2120,7 +2120,7 @@ TEST(Registerizer, RegisterizerPartialBefore) { # CHECK: } # CHECK: int A_1 = 0; # CHECK: for ( -# CHECK: A_1 = x + A_1; +# CHECK: A_1 = A_1 + x; # CHECK: } # CHECK: A[0] = A_1;)IR"; @@ -2161,7 +2161,7 @@ TEST(Registerizer, RegisterizerPartialInside) { /* * int A_1 = 2; * for (int x1 = 0; x1 < 10; x1++) { - * A_1 = x1 + A_1; + * A_1 = A_1 + x1; * } * A[0] = A_1; * for (int x2 = 1; x2 < 10; x2++) { @@ -2169,7 +2169,7 @@ TEST(Registerizer, RegisterizerPartialInside) { * } * int A_2 = A[0]; * for (int x3 = 0; x3 < 10; x3++) { - * A_2 = x3 + A_2; + * A_2 = A_2 + x3; * } * A[0] = A_2; */ @@ -2181,7 +2181,7 @@ TEST(Registerizer, RegisterizerPartialInside) { R"IR( # CHECK: int A_1 = 2; # CHECK: for ( -# CHECK: A_1 = x1 + A_1; +# CHECK: A_1 = A_1 + x1; # CHECK: } # CHECK: A[0] = A_1; # CHECK: for ( @@ -2189,7 +2189,7 @@ TEST(Registerizer, RegisterizerPartialInside) { # CHECK: } # CHECK: int A_2 = A[0]; # CHECK: for ( -# CHECK: A_2 = x3 + A_2; +# CHECK: A_2 = A_2 + x3; # CHECK: } # CHECK: A[0] = A_2;)IR"; @@ -2232,7 +2232,7 @@ TEST(Registerizer, RegisterizerPartialCondition) { /* * int A_1 = 2; * for (int x = 0; x < 10; x++) { - * A_1 = x + A_1; + * A_1 = A_1 + x; * } * A[0] = A_1; * if (x<5 ? 1 : 0) { @@ -2240,7 +2240,7 @@ TEST(Registerizer, RegisterizerPartialCondition) { * } * int A_2 = A[0]; * for (int x = 0; x < 10; x++) { - * A_2 = x + A_2; + * A_2 = A_2 + x; * } * A[0] = A_2; */ @@ -2252,7 +2252,7 @@ TEST(Registerizer, RegisterizerPartialCondition) { R"IR( # CHECK: int A_1 = 2; # CHECK: for ( -# CHECK: A_1 = x + A_1; +# CHECK: A_1 = A_1 + x; # CHECK: } # CHECK: A[0] = A_1; # CHECK: if ( @@ -2260,7 +2260,7 @@ TEST(Registerizer, RegisterizerPartialCondition) { # CHECK: } # CHECK: int A_2 = A[0]; # CHECK: for ( -# CHECK: A_2 = x + A_2; +# CHECK: A_2 = A_2 + x; # CHECK: } # CHECK: A[0] = A_2;)IR"; @@ -2937,7 +2937,7 @@ TEST(Registerizer, RegisterizerNestedLoopSimple) { * for (int y = 0; y < 10; y++) { * int A_1 = A[y]; * for (int x = 0; x < 10; x++) { - * A_1 = x + A_1; + * A_1 = A_1 + x; * } * A[y] = A_1; * } @@ -2951,7 +2951,7 @@ TEST(Registerizer, RegisterizerNestedLoopSimple) { # CHECK: for (int y # CHECK: int A_1 = A[y]; # CHECK: for (int x -# CHECK: A_1 = x + A_1; +# CHECK: A_1 = A_1 + x; # CHECK: } # CHECK: A[y] = A_1; # CHECK: })IR"; @@ -3366,13 +3366,13 @@ TEST(Registerizer, RegisterizerLoopLetVar) { BufHandle a("A", {10}, kInt); VarHandle x("x", kInt); VarHandle y("y", kInt); - StmtPtr stmt = Block::make({For::make( + StmtPtr stmt = IRSimplifier::simplify(Block::make({For::make( x, 0, 10, Block::make( {Let::make(y, 30), - Store::make(a, {y}, Add::make(x, Load::make(a, {y})))}))}); + Store::make(a, {y}, Add::make(x, Load::make(a, {y})))}))})); /* * for (int x = 0; x < 10; x++) { @@ -3422,7 +3422,7 @@ TEST(Registerizer, RegisterizerLoopLetVarOuter) { * int y = 30; * int A_1 = A[y]; * for (int x = 0; x < 10; x++) { - * A_1 = x + A_1; + * A_1 = A_1 + x; * } * A[y] = A_1; */ @@ -3435,7 +3435,7 @@ TEST(Registerizer, RegisterizerLoopLetVarOuter) { # CHECK: int y = 30; # CHECK: int A_1 = A[y]; # CHECK: for (int x -# CHECK: A_1 = x + A_1; +# CHECK: A_1 = A_1 + x; # CHECK: A[y] = A_1;)IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -3516,7 +3516,7 @@ TEST(Registerizer, RegisterizerMultiDimPartial) { * int A_1 = A[0, 1, 4]; * int A_2 = A[0, 2, 2]; * for (int x = 0; x < 10; x++) { - * A_2 = x + A_1; + * A_2 = A_1 + x; * } * A[0, 2, 2] = A_2; */ @@ -3530,7 +3530,7 @@ TEST(Registerizer, RegisterizerMultiDimPartial) { # CHECK: int A_1 = A[0, 1, 4]; # CHECK: int A_2 = A[0, 2, 2]; # CHECK: for ( -# CHECK: A_2 = x + A_1; +# CHECK: A_2 = A_1 + x; # CHECK: A[0, 2, 2] = A_2;)IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -3599,7 +3599,7 @@ TEST(Registerizer, RegisterizerMultiDimPartialOverlap) { * A[0, 1, 2] = 0; * int A_1 = A[y, 2, 4]; * for (int x = 0; x < 10; x++) { - * A[0, x, 2] = x + A_1; + * A[0, x, 2] = A_1 + x; * } */ @@ -3611,7 +3611,7 @@ TEST(Registerizer, RegisterizerMultiDimPartialOverlap) { # CHECK: A[0, 1, 2] = 0; # CHECK: int A_1 = A[y, 2, 4]; # CHECK: for ( -# CHECK: A[0, x, 2] = x + A_1; +# CHECK: A[0, x, 2] = A_1 + x; # CHECK: })IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -3736,12 +3736,12 @@ TEST(Registerizer, RegisterizerMultiDim3DReduction2) { /* * for (int x = 0; x < 10; x++) { - * int C_1 = C[x]; * int A_1 = A[x]; + * int C_1 = C[x]; * for (int y = 0; y < 10; y++) { * int B_1 = B[y]; * for (int z = 0; z < 10; z++) { - * C_1 = C_1 + A_1 * B_1; + * C_1 = A_1 * B_1 + C_1; * } * } * C[x] = C_1; @@ -3754,12 +3754,12 @@ TEST(Registerizer, RegisterizerMultiDim3DReduction2) { const std::string& verification_pattern = R"IR( # CHECK: for (int x -# CHECK: int C_1 = C[x]; # CHECK: int A_1 = A[x]; +# CHECK: int C_1 = C[x]; # CHECK: for (int y # CHECK: int B_1 = B[y]; # CHECK: for (int z -# CHECK: C_1 = C_1 + A_1 * B_1; +# CHECK: C_1 = A_1 * B_1 + C_1; # CHECK: } # CHECK: } # CHECK: C[x] = C_1; diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index a08d4ca974fd1..c25ae4f68a1fc 100644 --- a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -649,12 +649,12 @@ TEST(Simplify, SimplifyMultiVar) { ASSERT_NE(lhs, nullptr); VarPtr varX = to(lhs->rhs()); ASSERT_NE(varX, nullptr); - ASSERT_EQ(varX->name_hint(), "y"); + ASSERT_EQ(varX->name_hint(), "x"); MulPtr rhs = to(root->rhs()); ASSERT_NE(rhs, nullptr); VarPtr varY = to(rhs->rhs()); ASSERT_NE(varY, nullptr); - ASSERT_EQ(varY->name_hint(), "x"); + ASSERT_EQ(varY->name_hint(), "y"); } // x + 2 + y => x + y + 2 @@ -698,8 +698,8 @@ TEST(Simplify, SimplifyAdds) { IS_NODE_WITH_NAME(Mul, simplified.node(), root); IS_IMM_WITH_VAL(Int, root->lhs(), 2); IS_NODE_WITH_NAME(Add, root->rhs(), add); - IS_VAR_WITH_NAME(add->lhs(), "y"); - IS_VAR_WITH_NAME(add->rhs(), "x"); + IS_VAR_WITH_NAME(add->lhs(), "x"); + IS_VAR_WITH_NAME(add->rhs(), "y"); } { @@ -770,11 +770,11 @@ TEST(Simplify, SimplifyMuls) { IS_NODE_WITH_NAME(Mul, simplified.node(), mul); IS_NODE_WITH_NAME(Add, mul->lhs(), lhs); - IS_VAR_WITH_NAME(lhs->lhs(), "y"); - IS_VAR_WITH_NAME(lhs->rhs(), "x"); + IS_VAR_WITH_NAME(lhs->lhs(), "x"); + IS_VAR_WITH_NAME(lhs->rhs(), "y"); IS_NODE_WITH_NAME(Add, mul->rhs(), rhs); - IS_VAR_WITH_NAME(rhs->lhs(), "y"); - IS_VAR_WITH_NAME(rhs->rhs(), "x"); + IS_VAR_WITH_NAME(rhs->lhs(), "x"); + IS_VAR_WITH_NAME(rhs->rhs(), "y"); } { @@ -867,8 +867,8 @@ TEST(Simplify, SimplifyMuls) { ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Mul, simplified.node(), mul); IS_NODE_WITH_NAME(Add, mul->lhs(), lhs); - IS_VAR_WITH_NAME(lhs->lhs(), "y"); - IS_VAR_WITH_NAME(lhs->rhs(), "x"); + IS_VAR_WITH_NAME(lhs->lhs(), "x"); + IS_VAR_WITH_NAME(lhs->rhs(), "y"); IS_NODE_WITH_NAME(Sub, mul->rhs(), rhs); IS_VAR_WITH_NAME(rhs->lhs(), "x"); IS_VAR_WITH_NAME(rhs->rhs(), "y"); @@ -1654,14 +1654,14 @@ TEST(Simplify, SimplifyMultiOp) { } { - // (x + y) - (x * y) => x + y - (x * y) - ExprHandle body = (x + y) - (x * y); + // (x + y) - x * y => (x + y) - x * y + ExprHandle body = (x + y) - x * y; ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Sub, simplified.node(), sub); IS_NODE_WITH_NAME(Add, sub->lhs(), add); IS_NODE_WITH_NAME(Mul, sub->rhs(), mul); - IS_VAR_WITH_NAME(add->lhs(), "y"); - IS_VAR_WITH_NAME(add->rhs(), "x"); + IS_VAR_WITH_NAME(add->lhs(), "x"); + IS_VAR_WITH_NAME(add->rhs(), "y"); IS_VAR_WITH_NAME(mul->lhs(), "x"); IS_VAR_WITH_NAME(mul->rhs(), "y"); } @@ -1709,19 +1709,19 @@ TEST(Simplify, SimplifyManyOps) { VarHandle y("y", kInt); { - // x + y + x + x + y + y + x + y + x = 5 * x + 4 * y + // x + y + x + x + y + y + x + y + x = 4 * y + 5 * x ExprHandle body = x + y + x + x + y + y + x + y + x; ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Add, simplified.node(), add); IS_NODE_WITH_NAME(Mul, add->lhs(), lhs); - IS_IMM_WITH_VAL(Int, lhs->lhs(), 5); - IS_VAR_WITH_NAME(lhs->rhs(), "x"); + IS_IMM_WITH_VAL(Int, lhs->lhs(), 4); + IS_VAR_WITH_NAME(lhs->rhs(), "y"); IS_NODE_WITH_NAME(Mul, add->rhs(), rhs); - IS_IMM_WITH_VAL(Int, rhs->lhs(), 4); - IS_VAR_WITH_NAME(rhs->rhs(), "y"); + IS_IMM_WITH_VAL(Int, rhs->lhs(), 5); + IS_VAR_WITH_NAME(rhs->rhs(), "x"); } { @@ -1765,8 +1765,8 @@ TEST(Simplify, SimplifyFactorization) { IS_IMM_WITH_VAL(Int, mul->lhs(), 2); IS_NODE_WITH_NAME(Add, mul->rhs(), add); - IS_VAR_WITH_NAME(add->lhs(), "y"); - IS_VAR_WITH_NAME(add->rhs(), "x"); + IS_VAR_WITH_NAME(add->lhs(), "x"); + IS_VAR_WITH_NAME(add->rhs(), "y"); } { @@ -1794,12 +1794,12 @@ TEST(Simplify, SimplifyFactorization) { IS_NODE_WITH_NAME(Add, simplified.node(), add); IS_NODE_WITH_NAME(Mul, add->lhs(), lhs); - IS_IMM_WITH_VAL(Int, lhs->lhs(), 5); - IS_VAR_WITH_NAME(lhs->rhs(), "y"); + IS_IMM_WITH_VAL(Int, lhs->lhs(), 2); + IS_VAR_WITH_NAME(lhs->rhs(), "x"); IS_NODE_WITH_NAME(Mul, add->rhs(), rhs); - IS_IMM_WITH_VAL(Int, rhs->lhs(), 2); - IS_VAR_WITH_NAME(rhs->rhs(), "x"); + IS_IMM_WITH_VAL(Int, rhs->lhs(), 5); + IS_VAR_WITH_NAME(rhs->rhs(), "y"); } { @@ -1813,8 +1813,8 @@ TEST(Simplify, SimplifyFactorization) { IS_IMM_WITH_VAL(Int, mul->lhs(), 10); IS_NODE_WITH_NAME(Add, mul->rhs(), add); - IS_VAR_WITH_NAME(add->lhs(), "y"); - IS_VAR_WITH_NAME(add->rhs(), "x"); + IS_VAR_WITH_NAME(add->lhs(), "x"); + IS_VAR_WITH_NAME(add->rhs(), "y"); } { @@ -1863,18 +1863,12 @@ TEST(Simplify, SimplifyFactorization) { VarHandle g("g", kInt); VarHandle h("h", kInt); - ExprHandle body = ExprHandle(0) + (ExprHandle(1024) * a) + - (ExprHandle(-1) * b) + (ExprHandle(-1) * c) + (ExprHandle(1) * d) + - (ExprHandle(1) * e) + (ExprHandle(32) * f) + (ExprHandle(-1024) * g) + - (ExprHandle(-32) * h); + ExprHandle body = a * 1024 + 0 + b * (-1) + c * (-1) + d * 1 + e * 1 + + f * 32 + g * (-1024) + h * (-32); ExprHandle simplified = IRSimplifier::simplify(body); - - // We only check for the top level nodes here, since the main purpose - // here is ensure that this simplification completes. - IS_NODE_WITH_NAME(Sub, simplified.node(), sub); - IS_NODE_WITH_NAME(Mul, sub->rhs(), mul); - IS_IMM_WITH_VAL(Int, mul->lhs(), 1024); - IS_VAR_WITH_NAME(mul->rhs(), "g"); + checkExprIR( + simplified, + "((((((d + e) + 1024 * a) + 32 * f) - b) - c) - 1024 * g) - 32 * h"); } } @@ -1904,7 +1898,7 @@ TEST(Simplify, SimplifyFactorizeUneven) { IS_VAR_WITH_NAME(zmul->rhs(), "z"); } -// (x * y) + (2 * x) * (x + y) => 3 * (x * y) + 2 * (x * x) +// (x * y) + (2 * x) * (x + y) => 2 * (x * x) + 3 * (x * y) // This is kind of a placeholder test for variable factorization. TEST(Simplify, SimplifyDeeperTerms) { KernelScope kernel_scope; @@ -1916,16 +1910,16 @@ TEST(Simplify, SimplifyDeeperTerms) { IS_NODE_WITH_NAME(Add, simplified.node(), add); IS_NODE_WITH_NAME(Mul, add->lhs(), lhs); - IS_IMM_WITH_VAL(Int, lhs->lhs(), 3); - IS_NODE_WITH_NAME(Mul, lhs->rhs(), xyTerm); - IS_VAR_WITH_NAME(xyTerm->lhs(), "x"); - IS_VAR_WITH_NAME(xyTerm->rhs(), "y"); + IS_IMM_WITH_VAL(Int, lhs->lhs(), 2); + IS_NODE_WITH_NAME(Mul, lhs->rhs(), xxTerm); + IS_VAR_WITH_NAME(xxTerm->lhs(), "x"); + IS_VAR_WITH_NAME(xxTerm->rhs(), "x"); IS_NODE_WITH_NAME(Mul, add->rhs(), rhs); - IS_IMM_WITH_VAL(Int, rhs->lhs(), 2); - IS_NODE_WITH_NAME(Mul, rhs->rhs(), xxTerm); - IS_VAR_WITH_NAME(xxTerm->rhs(), "x"); - IS_VAR_WITH_NAME(xxTerm->rhs(), "x"); + IS_IMM_WITH_VAL(Int, rhs->lhs(), 3); + IS_NODE_WITH_NAME(Mul, rhs->rhs(), xyTerm); + IS_VAR_WITH_NAME(xyTerm->lhs(), "x"); + IS_VAR_WITH_NAME(xyTerm->rhs(), "y"); } // Tests the difference between two less trivial expressions. @@ -1987,15 +1981,15 @@ TEST(Simplify, SimplifyOpaqueTerms) { VarHandle y("y", kInt); { - // 2 * x/y * x - x/y * y => y * x/y + // 2 * x/y * y - x/y * y => x/y * y ExprHandle body = ((ExprHandle(2)) * (x / y) * y) - ((x / y) * y); ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Mul, simplified.node(), mul); - IS_VAR_WITH_NAME(mul->lhs(), "y"); - IS_NODE_WITH_NAME(Div, mul->rhs(), div); + IS_NODE_WITH_NAME(Div, mul->lhs(), div); IS_VAR_WITH_NAME(div->lhs(), "x"); IS_VAR_WITH_NAME(div->rhs(), "y"); + IS_VAR_WITH_NAME(mul->rhs(), "y"); } { @@ -2055,46 +2049,46 @@ TEST(Simplify, SimplifyNestedMax) { ExprHandle simplified = IRSimplifier::simplify(body); // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - IS_BINOP_W_VARS(Add, simplified.node(), add, "y", "x"); + IS_BINOP_W_VARS(Add, simplified.node(), add, "x", "y"); } { - // Max(x + y, Max(x + y, z)) => Max(y + x, z) + // Max(x + y, Max(x + y, z)) => Max(x + y, z) ExprHandle body = Max::make(x + y, Max::make(x + y, z, true), true); ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Max, simplified.node(), max); - IS_BINOP_W_VARS(Add, max->lhs(), add, "y", "x"); + IS_BINOP_W_VARS(Add, max->lhs(), add, "x", "y"); IS_VAR_WITH_NAME(max->rhs(), "z"); } { - // Max(x + y, Max(z, x + y)) => Max(y + x, z) + // Max(x + y, Max(z, x + y)) => Max(x + y, z) ExprHandle body = Max::make(x + y, Max::make(z, x + y, true), true); ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Max, simplified.node(), max); - IS_BINOP_W_VARS(Add, max->lhs(), add, "y", "x"); + IS_BINOP_W_VARS(Add, max->lhs(), add, "x", "y"); IS_VAR_WITH_NAME(max->rhs(), "z"); } { - // Max(Max(x + y, z), x + y) => Max(y + x, z) + // Max(Max(x + y, z), x + y) => Max(x + y, z) ExprHandle body = Max::make(Max::make(x + y, z, true), x + y, true); ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Max, simplified.node(), max); - IS_BINOP_W_VARS(Add, max->lhs(), add, "y", "x"); + IS_BINOP_W_VARS(Add, max->lhs(), add, "x", "y"); IS_VAR_WITH_NAME(max->rhs(), "z"); } { - // Max(Max(z, x + y), x + y) => Max(y + x, z) + // Max(Max(z, x + y), x + y) => Max(x + y, z) ExprHandle body = Max::make(Max::make(z, x + y, true), x + y, true); ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Max, simplified.node(), max); - IS_BINOP_W_VARS(Add, max->lhs(), add, "y", "x"); + IS_BINOP_W_VARS(Add, max->lhs(), add, "x", "y"); IS_VAR_WITH_NAME(max->rhs(), "z"); } @@ -2112,55 +2106,39 @@ TEST(Simplify, SimplifyNestedMax) { } { - // Max(Min(x, y), Min(x, z)) => Min(x, Max(y, z)) + // Max(Min(x, y), Min(x, z)) => Min(Max(y, z), x) ExprHandle body = Max::make(Min::make(x, y, true), Min::make(x, z, true), true); ExprHandle simplified = IRSimplifier::simplify(body); - - IS_NODE_WITH_NAME(Min, simplified.node(), min); - IS_VAR_WITH_NAME(min->lhs(), "x"); - IS_BINOP_W_VARS(Max, min->rhs(), max, "y", "z"); - ASSERT_TRUE(max->propagate_nans()); + checkExprIR(simplified, "Min(Max(y, z, 1), x, 1)"); } { - // Max(Min(x, y), Min(z, x)) => Min(x, Max(y, z)) + // Max(Min(x, y), Min(z, x)) => Min(Max(y, z), x) ExprHandle body = Max::make(Min::make(x, y, true), Min::make(z, x, true), true); ExprHandle simplified = IRSimplifier::simplify(body); - - IS_NODE_WITH_NAME(Min, simplified.node(), min); - IS_VAR_WITH_NAME(min->lhs(), "x"); - IS_BINOP_W_VARS(Max, min->rhs(), max, "y", "z"); - ASSERT_TRUE(max->propagate_nans()); + checkExprIR(simplified, "Min(Max(y, z, 1), x, 1)"); } { - // Max(Min(y, x), Min(x, z)) => Min(x, Max(y, z)) + // Max(Min(y, x), Min(x, z)) => Min(Max(y, z), x) ExprHandle body = Max::make(Min::make(y, x, true), Min::make(x, z, true), true); ExprHandle simplified = IRSimplifier::simplify(body); - - IS_NODE_WITH_NAME(Min, simplified.node(), min); - IS_VAR_WITH_NAME(min->lhs(), "x"); - IS_BINOP_W_VARS(Max, min->rhs(), max, "y", "z"); - ASSERT_TRUE(max->propagate_nans()); + checkExprIR(simplified, "Min(Max(y, z, 1), x, 1)"); } { - // Max(Min(y, x), Min(z, x)) => Min(x, Max(y, z)) + // Max(Min(y, x), Min(z, x)) => Min(Max(y, z), x) ExprHandle body = Max::make(Min::make(y, x, true), Min::make(z, x, true), true); ExprHandle simplified = IRSimplifier::simplify(body); - - IS_NODE_WITH_NAME(Min, simplified.node(), min); - IS_VAR_WITH_NAME(min->lhs(), "x"); - IS_BINOP_W_VARS(Max, min->rhs(), max, "y", "z"); - ASSERT_TRUE(max->propagate_nans()); + checkExprIR(simplified, "Min(Max(y, z, 1), x, 1)"); } { - // Max(Min(y, x), Min(z, x)) => Max(Min(x, z), Min(x, y)) + // Max(Min(y, x), Min(z, x)) => Max(Min(x, y), Min(x, z)) // When all the ops in the pattern do not have the same propagate_nans, // it should not be simplified. ExprHandle body = @@ -2168,10 +2146,10 @@ TEST(Simplify, SimplifyNestedMax) { ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Max, simplified.node(), max); - IS_BINOP_W_VARS(Min, max->lhs(), min1, "x", "z"); - ASSERT_FALSE(min1->propagate_nans()); - IS_BINOP_W_VARS(Min, max->rhs(), min2, "x", "y"); - ASSERT_TRUE(min2->propagate_nans()); + IS_BINOP_W_VARS(Min, max->lhs(), min1, "x", "y"); + ASSERT_TRUE(min1->propagate_nans()); + IS_BINOP_W_VARS(Min, max->rhs(), min2, "x", "z"); + ASSERT_FALSE(min2->propagate_nans()); ASSERT_TRUE(max->propagate_nans()); } @@ -2304,18 +2282,7 @@ TEST(Simplify, SimplifyNestedMax) { 8, false); ExprHandle simplified = IRSimplifier::simplify(body); - - IS_NODE_WITH_NAME(Max, simplified.node(), max1); - IS_NODE_WITH_NAME(Max, max1->lhs(), max2); - IS_VAR_WITH_NAME(max2->lhs(), "x"); - IS_NODE_WITH_NAME(Max, max2->rhs(), max3); - IS_BINOP_W_CONST(Max, max3->lhs(), max4, "z", 5); - ASSERT_TRUE(max4->propagate_nans()); - IS_VAR_WITH_NAME(max3->rhs(), "y"); - ASSERT_FALSE(max3->propagate_nans()); - ASSERT_TRUE(max2->propagate_nans()); - IS_IMM_WITH_VAL(Int, max1->rhs(), 8); - ASSERT_FALSE(max1->propagate_nans()); + checkExprIR(simplified, "Max(Max(Max(Max(z, 5, 1), y, 0), x, 1), 8, 0)"); } { @@ -2359,46 +2326,46 @@ TEST(Simplify, SimplifyNestedMin) { ExprHandle simplified = IRSimplifier::simplify(body); // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - IS_BINOP_W_VARS(Add, simplified.node(), add, "y", "x"); + IS_BINOP_W_VARS(Add, simplified.node(), add, "x", "y"); } { - // Min(x + y, Min(x + y, z)) => Min(y + x, z) + // Min(x + y, Min(x + y, z)) => Min(x + y, z) ExprHandle body = Min::make(x + y, Min::make(x + y, z, true), true); ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Min, simplified.node(), min); - IS_BINOP_W_VARS(Add, min->lhs(), add, "y", "x"); + IS_BINOP_W_VARS(Add, min->lhs(), add, "x", "y"); IS_VAR_WITH_NAME(min->rhs(), "z"); } { - // Min(x + y, Min(z, x + y)) => Min(y + x, z) + // Min(x + y, Min(z, x + y)) => Min(x + y, z) ExprHandle body = Min::make(x + y, Min::make(z, x + y, true), true); ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Min, simplified.node(), min); - IS_BINOP_W_VARS(Add, min->lhs(), add, "y", "x"); + IS_BINOP_W_VARS(Add, min->lhs(), add, "x", "y"); IS_VAR_WITH_NAME(min->rhs(), "z"); } { - // Min(Min(x + y, z), x + y) => Min(y + x, z) + // Min(Min(x + y, z), x + y) => Min(x + y, z) ExprHandle body = Min::make(Min::make(x + y, z, true), x + y, true); ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Min, simplified.node(), min); - IS_BINOP_W_VARS(Add, min->lhs(), add, "y", "x"); + IS_BINOP_W_VARS(Add, min->lhs(), add, "x", "y"); IS_VAR_WITH_NAME(min->rhs(), "z"); } { - // Min(Min(z, x + y), x + y) => Min(y + x, z) + // Min(Min(z, x + y), x + y) => Min(x + y, z) ExprHandle body = Min::make(Min::make(z, x + y, true), x + y, true); ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Min, simplified.node(), min); - IS_BINOP_W_VARS(Add, min->lhs(), add, "y", "x"); + IS_BINOP_W_VARS(Add, min->lhs(), add, "x", "y"); IS_VAR_WITH_NAME(min->rhs(), "z"); } @@ -2416,55 +2383,39 @@ TEST(Simplify, SimplifyNestedMin) { } { - // Min(Max(x, y), Max(x, z)) => Max(x, Min(y, z)) + // Min(Max(x, y), Max(x, z)) => Max(Min(y, z), x) ExprHandle body = Min::make(Max::make(x, y, true), Max::make(x, z, true), true); ExprHandle simplified = IRSimplifier::simplify(body); - - IS_NODE_WITH_NAME(Max, simplified.node(), max); - IS_VAR_WITH_NAME(max->lhs(), "x"); - IS_BINOP_W_VARS(Min, max->rhs(), min, "y", "z"); - ASSERT_TRUE(min->propagate_nans()); + checkExprIR(simplified, "Max(Min(y, z, 1), x, 1)"); } { - // Min(Max(x, y), Max(z, x)) => Max(x, Min(y, z)) + // Min(Max(x, y), Max(z, x)) => Max(Min(y, z), x) ExprHandle body = Min::make(Max::make(x, y, true), Max::make(z, x, true), true); ExprHandle simplified = IRSimplifier::simplify(body); - - IS_NODE_WITH_NAME(Max, simplified.node(), max); - IS_VAR_WITH_NAME(max->lhs(), "x"); - IS_BINOP_W_VARS(Min, max->rhs(), min, "y", "z"); - ASSERT_TRUE(min->propagate_nans()); + checkExprIR(simplified, "Max(Min(y, z, 1), x, 1)"); } { - // Min(Max(y, x), Max(x, z)) => Max(x, Min(y, z)) + // Min(Max(y, x), Max(x, z)) => Max(Min(y, z), x) ExprHandle body = Min::make(Max::make(y, x, true), Max::make(x, z, true), true); ExprHandle simplified = IRSimplifier::simplify(body); - - IS_NODE_WITH_NAME(Max, simplified.node(), max); - IS_VAR_WITH_NAME(max->lhs(), "x"); - IS_BINOP_W_VARS(Min, max->rhs(), min, "y", "z"); - ASSERT_TRUE(min->propagate_nans()); + checkExprIR(simplified, "Max(Min(y, z, 1), x, 1)"); } { - // Min(Max(y, x), Max(z, x)) => Max(x, Min(y, z)) + // Min(Max(y, x), Max(z, x)) => Max(Min(y, z), x) ExprHandle body = Min::make(Max::make(y, x, true), Max::make(z, x, true), true); ExprHandle simplified = IRSimplifier::simplify(body); - - IS_NODE_WITH_NAME(Max, simplified.node(), max); - IS_VAR_WITH_NAME(max->lhs(), "x"); - IS_BINOP_W_VARS(Min, max->rhs(), min, "y", "z"); - ASSERT_TRUE(min->propagate_nans()); + checkExprIR(simplified, "Max(Min(y, z, 1), x, 1)"); } { - // Min(Max(y, x), Max(z, x)) => Min(Max(x, z), Max(x, y)) + // Min(Max(y, x), Max(z, x)) => Min(Max(x, y), Max(x, z)) // When all the ops in the pattern do not have the same propagate_nans, // it should not be simplified. ExprHandle body = @@ -2472,10 +2423,10 @@ TEST(Simplify, SimplifyNestedMin) { ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Min, simplified.node(), min); - IS_BINOP_W_VARS(Max, min->lhs(), max1, "x", "z"); - ASSERT_FALSE(max1->propagate_nans()); - IS_BINOP_W_VARS(Max, min->rhs(), max2, "x", "y"); - ASSERT_TRUE(max2->propagate_nans()); + IS_BINOP_W_VARS(Max, min->lhs(), max1, "x", "y"); + ASSERT_TRUE(max1->propagate_nans()); + IS_BINOP_W_VARS(Max, min->rhs(), max2, "x", "z"); + ASSERT_FALSE(max2->propagate_nans()); ASSERT_TRUE(min->propagate_nans()); } @@ -2600,7 +2551,7 @@ TEST(Simplify, SimplifyNestedMin) { } { - // Min(Min(Min(Min(z, 5), y), x), 8) => Min(Min(x, Min(Min(z, 5), y)), 8) + // Min(Min(Min(Min(z, 5), y), x), 8) => Min(Min(Min(Min(z, 5), y), x), 8) // Do not simplify when all the Min ops do not have the same // propagate_nans. ExprHandle body = Min::make( @@ -2608,18 +2559,7 @@ TEST(Simplify, SimplifyNestedMin) { 8, false); ExprHandle simplified = IRSimplifier::simplify(body); - - IS_NODE_WITH_NAME(Min, simplified.node(), min1); - IS_NODE_WITH_NAME(Min, min1->lhs(), min2); - IS_VAR_WITH_NAME(min2->lhs(), "x"); - IS_NODE_WITH_NAME(Min, min2->rhs(), min3); - IS_BINOP_W_CONST(Min, min3->lhs(), min4, "z", 5); - ASSERT_TRUE(min4->propagate_nans()); - IS_VAR_WITH_NAME(min3->rhs(), "y"); - ASSERT_FALSE(min3->propagate_nans()); - ASSERT_TRUE(min2->propagate_nans()); - IS_IMM_WITH_VAL(Int, min1->rhs(), 8); - ASSERT_FALSE(min1->propagate_nans()); + checkExprIR(simplified, "Min(Min(Min(Min(z, 5, 1), y, 0), x, 1), 8, 0)"); } { @@ -2922,16 +2862,7 @@ TEST(Simplify, SimplifyRoundModPattern) { VarHandle z("z", kInt); ExprHandle body = ((x / y) * y) + (x % z); ExprHandle simplified = IRSimplifier::simplify(body); - - IS_NODE_WITH_NAME(Add, simplified.node(), add); - IS_NODE_WITH_NAME(Mul, add->lhs(), roundMul); - IS_VAR_WITH_NAME(roundMul->lhs(), "y"); - IS_NODE_WITH_NAME(Div, roundMul->rhs(), roundDiv); - IS_VAR_WITH_NAME(roundDiv->lhs(), "x"); - IS_VAR_WITH_NAME(roundDiv->rhs(), "y"); - IS_NODE_WITH_NAME(Mod, add->rhs(), mod); - IS_VAR_WITH_NAME(mod->lhs(), "x"); - IS_VAR_WITH_NAME(mod->rhs(), "z"); + checkExprIR(simplified, "(x / y) * y + x % z"); } { @@ -2941,15 +2872,7 @@ TEST(Simplify, SimplifyRoundModPattern) { VarHandle z("z", kInt); ExprHandle body = (y * (x / z)) + (x % y); ExprHandle simplified = IRSimplifier::simplify(body); - IS_NODE_WITH_NAME(Add, simplified.node(), add); - IS_NODE_WITH_NAME(Mul, add->lhs(), roundMul); - IS_VAR_WITH_NAME(roundMul->lhs(), "y"); - IS_NODE_WITH_NAME(Div, roundMul->rhs(), roundDiv); - IS_VAR_WITH_NAME(roundDiv->lhs(), "x"); - IS_VAR_WITH_NAME(roundDiv->rhs(), "z"); - IS_NODE_WITH_NAME(Mod, add->rhs(), mod); - IS_VAR_WITH_NAME(mod->lhs(), "x"); - IS_VAR_WITH_NAME(mod->rhs(), "y"); + checkExprIR(simplified, "x % y + (x / z) * y"); } { @@ -2959,15 +2882,7 @@ TEST(Simplify, SimplifyRoundModPattern) { VarHandle z("z", kInt); ExprHandle body = ((x / y) * z) + (x % y); ExprHandle simplified = IRSimplifier::simplify(body); - IS_NODE_WITH_NAME(Add, simplified.node(), add); - IS_NODE_WITH_NAME(Mul, add->lhs(), roundMul); - IS_VAR_WITH_NAME(roundMul->lhs(), "z"); - IS_NODE_WITH_NAME(Div, roundMul->rhs(), roundDiv); - IS_VAR_WITH_NAME(roundDiv->lhs(), "x"); - IS_VAR_WITH_NAME(roundDiv->rhs(), "y"); - IS_NODE_WITH_NAME(Mod, add->rhs(), mod); - IS_VAR_WITH_NAME(mod->lhs(), "x"); - IS_VAR_WITH_NAME(mod->rhs(), "y"); + checkExprIR(simplified, "x % y + (x / y) * z"); } } @@ -3036,20 +2951,20 @@ TEST(Simplify, SimplifyRoundModPatternMultivar) { { // Multivar. - // (x/8) * 8 + (y/5)*5 + x%8 + y%5 => y + x. + // (x/8) * 8 + (y/5)*5 + x%8 + y%5 => x + y. VarHandle x("x", kInt); VarHandle y("y", kInt); ExprHandle body = (x / ExprHandle(8) * ExprHandle(8)) + (y / ExprHandle(5) * ExprHandle(5)) + (x % 8) + (y % 5); ExprHandle simplified = IRSimplifier::simplify(body); IS_NODE_WITH_NAME(Add, simplified.node(), add); - IS_VAR_WITH_NAME(add->lhs(), "y"); - IS_VAR_WITH_NAME(add->rhs(), "x"); + IS_VAR_WITH_NAME(add->lhs(), "x"); + IS_VAR_WITH_NAME(add->rhs(), "y"); } { // Find the right var. - // (y/8) * 8 x%8 + y%8 + z%8 => z%8 + x%8 + y + // (y/8) * 8 x%8 + y%8 + z%8 => x%8 + y + z%8 VarHandle x("x", kInt); VarHandle y("y", kInt); VarHandle z("z", kInt); @@ -3075,16 +2990,9 @@ TEST(Simplify, SimplifyRoundModPatternMultivar) { VarHandle y("y", kInt); VarHandle z("z", kInt); - ExprHandle body = x + (z + ExprHandle(512) * y) % ExprHandle(16) + - ExprHandle(16) * ((z + ExprHandle(512) * y) / ExprHandle(16)); + ExprHandle body = x + (z + y * 512) % 16 + ((z + y * 512) / 16 * 16); ExprHandle simplified = IRSimplifier::simplify(body); - IS_NODE_WITH_NAME(Add, simplified.node(), add); - IS_VAR_WITH_NAME(add->rhs(), "x"); - IS_NODE_WITH_NAME(Add, add->lhs(), add2); - IS_VAR_WITH_NAME(add2->lhs(), "z"); - IS_NODE_WITH_NAME(Mul, add2->rhs(), mul); - IS_IMM_WITH_VAL(Int, mul->lhs(), 512); - IS_VAR_WITH_NAME(mul->rhs(), "y"); + checkExprIR(simplified, "x + (z + 512 * y)"); } } @@ -3135,13 +3043,7 @@ TEST(Simplify, SimplifyModRoundModPattern) { VarHandle k("k", kInt); ExprHandle body = (k * t / x % y) * x + k * t % x; ExprHandle simplified = IRSimplifier::simplify(body); - IS_NODE_WITH_NAME(Mod, simplified.node(), mod); - IS_NODE_WITH_NAME(Mul, mod->lhs(), mul1); - IS_VAR_WITH_NAME(mul1->lhs(), "t"); - IS_VAR_WITH_NAME(mul1->rhs(), "k"); - IS_NODE_WITH_NAME(Mul, mod->rhs(), mul2); - IS_VAR_WITH_NAME(mul2->lhs(), "x"); - IS_VAR_WITH_NAME(mul2->rhs(), "y"); + checkExprIR(simplified, "(k * t) % (x * y)"); } { @@ -3259,11 +3161,7 @@ TEST(Simplify, SimplifyModRoundModPatternMultivar) { VarHandle t("t", kInt); ExprHandle body = (t / 7 % 9) * 7 + t % 7 + t; ExprHandle simplified = IRSimplifier::simplify(body); - IS_NODE_WITH_NAME(Add, simplified.node(), add); - IS_NODE_WITH_NAME(Mod, add->rhs(), mod); - IS_VAR_WITH_NAME(mod->lhs(), "t"); - IS_IMM_WITH_VAL(Int, mod->rhs(), 63); - IS_VAR_WITH_NAME(add->lhs(), "t"); + checkExprIR(simplified, "t % 63 + t"); } { @@ -3306,19 +3204,7 @@ TEST(Simplify, SimplifyModRoundModPatternMultivar) { VarHandle k("k", kInt); ExprHandle body = (t / x % y) * x + t % x + (t / k / x % y) * x + t / k % x; ExprHandle simplified = IRSimplifier::simplify(body); - IS_NODE_WITH_NAME(Add, simplified.node(), add); - IS_NODE_WITH_NAME(Mod, add->lhs(), mod); - IS_VAR_WITH_NAME(mod->lhs(), "t"); - IS_NODE_WITH_NAME(Mul, mod->rhs(), mul); - IS_VAR_WITH_NAME(mul->lhs(), "x"); - IS_VAR_WITH_NAME(mul->rhs(), "y"); - IS_NODE_WITH_NAME(Mod, add->rhs(), mod2); - IS_NODE_WITH_NAME(Div, mod2->lhs(), div); - IS_VAR_WITH_NAME(div->lhs(), "t"); - IS_VAR_WITH_NAME(div->rhs(), "k"); - IS_NODE_WITH_NAME(Mul, mod2->rhs(), mul2); - IS_VAR_WITH_NAME(mul2->lhs(), "x"); - IS_VAR_WITH_NAME(mul2->rhs(), "y"); + checkExprIR(simplified, "(t / k) % (x * y) + t % (x * y)"); } { @@ -3971,7 +3857,7 @@ TEST(Simplify, SimplifyForWontLoseLoopOptions) { BufHandle c("C", {4}, kInt); VarHandle i("i", kInt); LoopOptions options; - options.set_gpu_block_index(12); + options.set_gpu_block_index(LoopOptions::IDX_W); auto body = For::make(i, 0, 1, Store::make(c, {i}, Load::make(a, {i})), options); StmtPtr simplified = IRSimplifier::simplify(body); diff --git a/test/cpp/tensorexpr/test_utils.h b/test/cpp/tensorexpr/test_utils.h index 01b92a7832a40..065e513c1a645 100644 --- a/test/cpp/tensorexpr/test_utils.h +++ b/test/cpp/tensorexpr/test_utils.h @@ -4,6 +4,7 @@ #include #include +#include #include namespace torch { @@ -69,5 +70,9 @@ using namespace torch::jit::tensorexpr; ASSERT_EQ(node_->op_type(), kRand); \ } +void checkIR(StmtPtr s, const std::string& pattern); +void checkExprIR(ExprPtr e, const std::string& pattern); +void checkExprIR(const ExprHandle& e, const std::string& pattern); + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp index 3d849fec6d9db..cb731d2525e71 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp @@ -35,8 +35,15 @@ void Term::sort() { if (dtype().is_floating_point()) { throw std::logic_error("reordering FP ops"); } + std::unordered_map str_repr_cache; std::sort(variables_.begin(), variables_.end(), [&](ExprPtr a, ExprPtr b) { - return hasher_.hash(a) < hasher_.hash(b); + if (!str_repr_cache.count(a)) { + str_repr_cache[a] = std::to_string(a); + } + if (!str_repr_cache.count(b)) { + str_repr_cache[b] = std::to_string(b); + } + return str_repr_cache.at(a) < str_repr_cache.at(b); }); } @@ -52,8 +59,15 @@ void Polynomial::sort() { if (dtype().is_floating_point()) { throw std::logic_error("reordering FP ops"); } + std::unordered_map str_repr_cache; std::sort(variables_.begin(), variables_.end(), [&](ExprPtr a, ExprPtr b) { - return hasher_.hash(a) < hasher_.hash(b); + if (!str_repr_cache.count(a)) { + str_repr_cache[a] = std::to_string(a); + } + if (!str_repr_cache.count(b)) { + str_repr_cache[b] = std::to_string(b); + } + return str_repr_cache.at(a) < str_repr_cache.at(b); }); } @@ -66,6 +80,18 @@ void MaxTerm::uniquefy() { return hasher_.hash(a) == hasher_.hash(b); }); variables_.resize(std::distance(variables_.begin(), it)); + + // Once we removed duplicates, sort terms alphabetically for stability. + std::unordered_map str_repr_cache; + std::sort(variables_.begin(), variables_.end(), [&](ExprPtr a, ExprPtr b) { + if (!str_repr_cache.count(a)) { + str_repr_cache[a] = std::to_string(a); + } + if (!str_repr_cache.count(b)) { + str_repr_cache[b] = std::to_string(b); + } + return str_repr_cache.at(a) < str_repr_cache.at(b); + }); } void MinTerm::uniquefy() { @@ -77,6 +103,18 @@ void MinTerm::uniquefy() { return hasher_.hash(a) == hasher_.hash(b); }); variables_.resize(std::distance(variables_.begin(), it)); + + // Once we removed duplicates, sort terms alphabetically for stability. + std::unordered_map str_repr_cache; + std::sort(variables_.begin(), variables_.end(), [&](ExprPtr a, ExprPtr b) { + if (!str_repr_cache.count(a)) { + str_repr_cache[a] = std::to_string(a); + } + if (!str_repr_cache.count(b)) { + str_repr_cache[b] = std::to_string(b); + } + return str_repr_cache.at(a) < str_repr_cache.at(b); + }); } // Handles optimization cases for Broadcast/Ramp +/- Broadcast/Ramp @@ -2076,8 +2114,20 @@ ExprPtr TermExpander::mutate(PolynomialPtr v) { std::vector addTerms; std::vector subTerms; + auto vars = v->variables(); + std::unordered_map str_repr_cache; + std::sort(vars.begin(), vars.end(), [&](ExprPtr a, ExprPtr b) { + if (!str_repr_cache.count(a)) { + str_repr_cache[a] = std::to_string(a); + } + if (!str_repr_cache.count(b)) { + str_repr_cache[b] = std::to_string(b); + } + return str_repr_cache.at(a) < str_repr_cache.at(b); + }); + // partition the terms into a list to add and list to subtract. - for (auto node : v->variables()) { + for (auto node : vars) { if (immediateIsNegative(node->scalar())) { subTerms.push_back(node); } else if (!immediateEquals(node->scalar(), 0)) { @@ -2822,6 +2872,49 @@ bool exprEquals(ExprPtr A, ExprPtr B) { } } +ExprPtr IRSimplifier::simplify(ExprPtr e) { + GRAPH_DEBUG("(Simplifier) Original: ", std::to_string(e)); + SimplifierUnderContext ctxsimplifier; + e = e->accept_mutator(&ctxsimplifier); + + PolynomialTransformer simplifier; + e = e->accept_mutator(&simplifier); + + // There may be terms left in the IR, expand them. + TermExpander expander(&simplifier); + e = e->accept_mutator(&expander); + // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) + if (!expander.check_safe()) { + throw malformed_input("eliminated null Allocation without free"); + } + + GRAPH_DEBUG("(Simplifier) Simplified: ", std::to_string(e)); + return e; +} + +StmtPtr IRSimplifier::simplify(StmtPtr s) { + GRAPH_DEBUG("(Simplifier) Original: ", std::to_string(s)); + SimplifierUnderContext ctxsimplifier; + s = s->accept_mutator(&ctxsimplifier); + + PolynomialTransformer simplifier; + s = s->accept_mutator(&simplifier); + if (s == nullptr) { + GRAPH_DEBUG("(Simplifier) Simplified: NULL"); + return nullptr; + } + + // There may be terms left in the IR, expand them. + TermExpander expander(&simplifier); + s = s->accept_mutator(&expander); + if (!expander.check_safe()) { + throw malformed_input("eliminated null Allocation without free"); + } + + GRAPH_DEBUG("(Simplifier) Simplified: ", std::to_string(s)); + return s; +} + } // namespace tensorexpr } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.h b/torch/csrc/jit/tensorexpr/ir_simplifier.h index 6281b77349b37..87c476242e8de 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.h +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.h @@ -596,47 +596,11 @@ class TORCH_API TermExpander : public PolynomialBase { class TORCH_API IRSimplifier { public: - static ExprPtr simplify(ExprPtr e) { - SimplifierUnderContext ctxsimplifier; - e = e->accept_mutator(&ctxsimplifier); - - PolynomialTransformer simplifier; - e = e->accept_mutator(&simplifier); - - // There may be terms left in the IR, expand them. - TermExpander expander(&simplifier); - e = e->accept_mutator(&expander); - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - if (!expander.check_safe()) { - throw malformed_input("eliminated null Allocation without free"); - } - - return e; - } - + static StmtPtr simplify(StmtPtr s); + static ExprPtr simplify(ExprPtr e); static ExprHandle simplify(const ExprHandle& e) { return ExprHandle(simplify(e.node())); } - - static StmtPtr simplify(StmtPtr s) { - SimplifierUnderContext ctxsimplifier; - s = s->accept_mutator(&ctxsimplifier); - - PolynomialTransformer simplifier; - s = s->accept_mutator(&simplifier); - if (s == nullptr) { - return nullptr; - } - - // There may be terms left in the IR, expand them. - TermExpander expander(&simplifier); - s = s->accept_mutator(&expander); - if (!expander.check_safe()) { - throw malformed_input("eliminated null Allocation without free"); - } - - return s; - } }; // Flattens the buf and performs the simplifier on the flattened dims. From 9bbf80969ece148ca5da2107ef9ad26a99891738 Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Wed, 18 Aug 2021 14:47:19 -0700 Subject: [PATCH 044/530] [PyTorch] Avoid using std::regex for device string parsing in Device.cpp (#63464) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63464 This was previously committed as D30281388 (https://github.com/pytorch/pytorch/commit/4d6f98ecada2d85b2474b023838debad4305316d), but was reverted due to t98478641. jnkwok1 confirmed that this change was not the root cause, so trying to land it again. Currently, `std::regex` is used for parsing device strings. This is undesirable for a few reasons. 1. Increases binary size 2. Slows down model loading 3. Potentially uses more memory at runtime 4. Takes marginally longer time to build code that uses std::regex v/s not using std::regex This change avoids the use of `std::regex` for parsing the device string since we don't need to. ghstack-source-id: 136006963 ghstack-source-id: 136081898 Test Plan: ### AI Bench Runs **Before this change:** 1. Model Load time: [252ms](https://www.internalfb.com/intern/aibench/details/332471502816548) 2. Model unload time: 3.5ms **After this change:** 1. Model Load time: [240ms](https://www.internalfb.com/intern/aibench/details/652195589031318), which is an approx 5% reduction for the current model. I suspect percentage wise, it will be larger for smaller models since this is a fixed cost reduction. 2. Model unload time: 3.3ms (probably too small to be meaningfully impactful to an end user). ### BSB Results ``` D30281388 (https://github.com/pytorch/pytorch/commit/4d6f98ecada2d85b2474b023838debad4305316d)-V1 (https://www.internalfb.com/intern/diff/D30281388 (https://github.com/pytorch/pytorch/commit/4d6f98ecada2d85b2474b023838debad4305316d)/?dest_number=135713848) messenger-pika-optimized-device: Succeeded Change in Download Size for arm64 + 3x assets variation: -7.1 KiB Change in Uncompressed Size for arm64 + 3x assets variation: -17.6 KiB Mbex Comparison: https://our.intern.facebook.com/intern/mbex/bsb:551399955987465@base/bsb:551399955987465@diff/ ``` Reviewed By: raziel, pavithranrao Differential Revision: D30388269 fbshipit-source-id: 10942e7aa56f9ea47aa479a8f50187f2ce2899bf --- c10/core/Device.cpp | 108 +++++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 36 deletions(-) diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp index ee6f1b473fe08..2709c29ce8460 100644 --- a/c10/core/Device.cpp +++ b/c10/core/Device.cpp @@ -4,28 +4,13 @@ #include #include +#include #include #include -#include #include #include #include -// Check if compiler has working std::regex implementation -// -// Test below is adapted from https://stackoverflow.com/a/41186162 -#if defined(_MSVC_LANG) && _MSVC_LANG >= 201103L -// Compiler has working regex. MSVC has erroneous __cplusplus. -#elif __cplusplus >= 201103L && \ - (!defined(__GLIBCXX__) || (__cplusplus >= 201402L) || \ - (defined(_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT) || \ - defined(_GLIBCXX_REGEX_STATE_LIMIT) || \ - (defined(_GLIBCXX_RELEASE) && _GLIBCXX_RELEASE > 4))) -// Compiler has working regex. -#else -static_assert(false, "Compiler does not have proper regex support."); -#endif - namespace c10 { namespace { DeviceType parse_type(const std::string& device_string) { @@ -65,33 +50,84 @@ DeviceType parse_type(const std::string& device_string) { "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ", device_string); } +enum DeviceStringParsingState { START, INDEX_START, INDEX_REST, ERROR }; + } // namespace Device::Device(const std::string& device_string) : Device(Type::CPU) { TORCH_CHECK(!device_string.empty(), "Device string must not be empty"); - // We assume gcc 5+, so we can use proper regex. - static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?"); - std::smatch match; - TORCH_CHECK( - std::regex_match(device_string, match, regex), - "Invalid device string: '", - device_string, - "'"); - type_ = parse_type(match[1].str()); - if (match[2].matched) { - try { - index_ = c10::stoi(match[2].str()); - } catch (const std::exception&) { - TORCH_CHECK( - false, - "Could not parse device index '", - match[2].str(), - "' in device string '", - device_string, - "'"); + std::string device_name, device_index_str; + DeviceStringParsingState pstate = DeviceStringParsingState::START; + + // The code below tries to match the string in the variable + // device_string against the regular expression: + // ([a-zA-Z_]+)(?::([1-9]\\d*|0))? + for (size_t i = 0; + pstate != DeviceStringParsingState::ERROR && i < device_string.size(); + ++i) { + const char ch = device_string.at(i); + switch (pstate) { + case DeviceStringParsingState::START: + if (ch != ':') { + if (isalpha(ch) || ch == '_') { + device_name.push_back(ch); + } else { + pstate = DeviceStringParsingState::ERROR; + } + } else { + pstate = DeviceStringParsingState::INDEX_START; + } + break; + + case DeviceStringParsingState::INDEX_START: + if (isdigit(ch)) { + device_index_str.push_back(ch); + pstate = DeviceStringParsingState::INDEX_REST; + } else { + pstate = DeviceStringParsingState::ERROR; + } + break; + + case DeviceStringParsingState::INDEX_REST: + if (device_index_str.at(0) == '0') { + pstate = DeviceStringParsingState::ERROR; + break; + } + if (isdigit(ch)) { + device_index_str.push_back(ch); + } else { + pstate = DeviceStringParsingState::ERROR; + } + break; + + case DeviceStringParsingState::ERROR: + // Execution won't reach here. + break; + } + } + + const bool has_error = device_name.empty() || + pstate == DeviceStringParsingState::ERROR || + (pstate == DeviceStringParsingState::INDEX_START && + device_index_str.empty()); + + TORCH_CHECK(!has_error, "Invalid device string: '", device_string, "'"); + + try { + if (!device_index_str.empty()) { + index_ = c10::stoi(device_index_str); } + } catch (const std::exception&) { + TORCH_CHECK( + false, + "Could not parse device index '", + device_index_str, + "' in device string '", + device_string, + "'"); } + type_ = parse_type(device_name); validate(); } From 139413078fe2f2b7cf451943461e7c76038446a4 Mon Sep 17 00:00:00 2001 From: Mostafa Elhoushi Date: Wed, 18 Aug 2021 14:47:40 -0700 Subject: [PATCH 045/530] [FX] make ASTReriter patch wrapped functions properly (#62987) Summary: reference the same global namespace (instead of copying it) in ASTRewriter to patch wrapped functions properly Fixes #{62071} Pull Request resolved: https://github.com/pytorch/pytorch/pull/62987 Test Plan: To test it you may write this snippet and ensure the results are as shown in the comments: ``` import torch import torch.fx torch.fx.wrap def to_be_wrapped(x): return torch.relu(x) class Foo(torch.nn.Module): def forward(self, x): return to_be_wrapped(x) traced = torch.fx.symbolic_trace(Foo()) print(traced.graph) """ graph(): %x : [#users=1] = placeholder[target=x] %to_be_wrapped : [#users=1] = call_function[target=__main__.to_be_wrapped](args = (%x,), kwargs = {}) return to_be_wrapped """ from torch.fx.experimental.rewriter import RewritingTracer rt = RewritingTracer() graph = rt.trace(Foo()) print(graph) """ ### AFTER FIX (CORRECT): graph(): %x : [#users=1] = placeholder[target=x] %to_be_wrapped : [#users=1] = call_function[target=__main__.to_be_wrapped](args = (%x,), kwargs = {}) return to_be_wrapped ### BEFORE FIX (WRONG): graph(): %x : [#users=1] = placeholder[target=x] %relu : [#users=1] = call_function[target=torch.relu](args = (%x,), kwargs = {}) return relu """ ``` Reviewed By: ansley Differential Revision: D30396176 Pulled By: mostafaelhoushi fbshipit-source-id: f61eddf32e9ef42b5f5c3ce21d559945214ee833 --- test/test_fx.py | 90 +++++++++++++++++++++++++++++++ torch/fx/experimental/rewriter.py | 18 ++++++- 2 files changed, 107 insertions(+), 1 deletion(-) diff --git a/test/test_fx.py b/test/test_fx.py index 1708634653a64..e39469d0a0676 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -2335,6 +2335,96 @@ def forward(self, x: torch.Tensor): traced.graph.lint() + def test_ast_rewriter_wrap(self): + self.assertEqual(3 + 4 + 5, a_lifted_leaf((3, 4), 5)) + + def to_trace(y): + return ( + a_lifted_leaf((4, y), 3) + + a_lifted_leaf((3, 4), 5) + + a_lifted_leaf((y, y), y) + ) + + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(to_trace) + traced = GraphModule(ast_rewriter.root, graph, "gm") + + self.assertIn("a_lifted_leaf", traced.code) + self.assertEqual(27, traced(2)) + self.assertIs(a_lifted_leaf, real_a_lifed_leaf) + + def test_ast_rewriter_wrap_fn_directly(self): + self.assertEqual(3 + 4 + 5, a_lifted_leaf2((3, 4), 5)) + + def to_trace(y): + return ( + a_lifted_leaf2((4, y), 3) + + a_lifted_leaf2((3, 4), 5) + + a_lifted_leaf2((y, y), y) + ) + + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(to_trace) + traced = GraphModule(ast_rewriter.root, graph, "gm") + + self.assertIn("a_lifted_leaf2", traced.code) + self.assertEqual(27, traced(2)) + self.assertIs(a_lifted_leaf2, real_a_lifed_leaf2) + + def test_ast_rewriter_wrapped_via_decorator(self): + class F(torch.nn.Module): + def forward(self, x): + return wrapped_via_decorator(x) + + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(F()) + traced = GraphModule(ast_rewriter.root, graph, "gm") + + self.assertIn("wrapped_via_decorator", traced.code) + self.assertEqual(traced(0), 1) + self.assertIs(wrapped_via_decorator, real_wrapped_via_decorator) + self.assertFalse(hasattr(wrapped_via_decorator, "__fx_already_patched")) + + def test_ast_rewriter_wrapped_via_decorator_and_transformed(self): + self.assertEqual(wrapped_via_decorator(0), 1) + + def to_trace(y): + return wrapped_via_decorator(y) + + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(to_trace) + traced = GraphModule(ast_rewriter.root, graph, "gm") + + self.assertIn("wrapped_via_decorator", traced.code) + self.assertEqual(traced(0), 1) + self.assertIs(wrapped_via_decorator, real_wrapped_via_decorator) + self.assertFalse(hasattr(wrapped_via_decorator, "__fx_already_patched")) + + transformed = torch.fx.Transformer(traced).transform() + self.assertIn("wrapped_via_decorator", transformed.code) + self.assertEqual(transformed(0), 1) + self.assertIs(wrapped_via_decorator, real_wrapped_via_decorator) + self.assertFalse(hasattr(wrapped_via_decorator, "__fx_already_patched")) + + def test_ast_rewriter_wrap_with_submodule(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.batchnorm1d = torch.nn.BatchNorm1d(2, affine=False) + + def forward(self, x: torch.Tensor): + return wrapped_with_submodule(x, self.batchnorm1d) + + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(M()) + traced = GraphModule(ast_rewriter.root, graph, "gm") + + self.assertIn("wrapped_with_submodule", traced.code) + + input = torch.rand(3, 2) + ref_batchnorm1d = torch.nn.BatchNorm1d(2, affine=False) + self.assertEqual(ref_batchnorm1d(input), traced(input)) + def test_submodule_manipulation_API(self): class C(torch.nn.Module): def __init__(self): diff --git a/torch/fx/experimental/rewriter.py b/torch/fx/experimental/rewriter.py index b3f71d5de6cd2..de08ebaa69880 100644 --- a/torch/fx/experimental/rewriter.py +++ b/torch/fx/experimental/rewriter.py @@ -2,6 +2,7 @@ import inspect import textwrap import copy +import functools from types import FunctionType from typing import cast, Union, Callable, Dict, Optional, Any from torch.fx._symbolic_trace import Tracer @@ -41,8 +42,23 @@ def rewrite(self, fn: FunctionType): assert len(new_keys) == 1 fn_compiled = globals_dict[new_keys[0]] + # return the compiled function with the original globals + def change_func_globals(f, globals): + """Based on https://stackoverflow.com/a/13503277/2988730 (@unutbu)""" + # __globals__ is a private member of the function class + # so we have to copy the function, f, all of its member, except f.__globals__ + g = FunctionType( + f.__code__, + globals, + name=f.__name__, + argdefs=f.__defaults__, + closure=f.__closure__, + ) + g = functools.update_wrapper(g, f) + g.__kwdefaults__ = copy.copy(f.__kwdefaults__) + return g # Return the correct FunctionType object - return fn_compiled + return change_func_globals(fn_compiled, globals=fn.__globals__) def visit_Assert(self, node): """ From 779a3d47b0c057211374a9c4128fbdf78acee4c9 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Wed, 18 Aug 2021 14:56:51 -0700 Subject: [PATCH 046/530] [Static Runtime] Benchmark reports native nodes (#63346) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63346 We have seen that we can get significant perf wins essentially for free by implementing native ops for ops that we cannot write out variants for (e.g. TupleUnpack D30306955 (https://github.com/pytorch/pytorch/commit/078b8004a62a51f75e1fbd8d08eea359af6bb1d7), append D30326461 (https://github.com/pytorch/pytorch/commit/9d9e7a8d7294834ddad957ddb1f4cd5a0e741e55)). Therefore, whether or not SR is using a native implementation is valuable information. By capturing this in the benchmarking suite, we can hopefully avoid wasting time profiling/manually inspecting `native_ops.cpp` Reviewed By: hlu1 Differential Revision: D30346752 fbshipit-source-id: 205b090513b6a5a6ce4cb92f75ab0395b15d08f9 --- torch/csrc/jit/runtime/static/impl.cpp | 10 +++++++--- torch/csrc/jit/runtime/static/impl.h | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index a0c3bac2bbc83..1ee69a642384f 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -897,10 +897,12 @@ void StaticRuntime::benchmark( std::cout << std::setw(15) << ms << " ms. " << std::setw(10) << results.percent_per_node_type[kind] << "%. " << kind << " (" << results.instances_per_node_type[kind] << " nodes"; - if (results.out_nodes.count(kind) == 0) { - std::cout << ")" << std::endl; - } else { + if (results.out_nodes.count(kind)) { std::cout << ", out variant)" << std::endl; + } else if (results.native_nodes.count(kind)) { + std::cout << ", native)" << std::endl; + } else { + std::cout << ")" << std::endl; } } std::cout << std::setw(15) << results.total_time << " ms. in Total" @@ -1136,6 +1138,8 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops( if (nodes_[i].has_out_variant()) { results.out_nodes.insert(kind); results.out_nodes_count++; + } else if (nodes_[i].has_native()) { + results.native_nodes.insert(kind); } results.total_time += results.time_per_node[i]; } diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index cc36df037b02d..b16cfefbc0b60 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -231,6 +231,7 @@ class TORCH_API StaticRuntime { std::unordered_map percent_per_node_type; std::unordered_map instances_per_node_type; std::unordered_set out_nodes; + std::unordered_set native_nodes; }; IndividualMetrics benchmark_individual_ops( @@ -410,6 +411,10 @@ class TORCH_API ProcessedNode { return static_cast(fn_); } + bool has_native() const { + return static_cast(native_fn_); + } + bool verify_outputs_not_overlapping_with_immutable_inputs() const; private: From 15eec8e1d1ea5b3354bc305f1afe0c01a64ea748 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Wed, 18 Aug 2021 15:02:05 -0700 Subject: [PATCH 047/530] using PR number instead of IN_PULL_REQUEST (#63360) Summary: PR numbers should be available on GHA after this. This fixes some target determinator not working issue discovered when manually running: https://github.com/pytorch/pytorch/issues/63412. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63360 Reviewed By: malfet, zhouzhuojie, seemethere Differential Revision: D30374615 Pulled By: walterddr fbshipit-source-id: eee8d8bb7aa4308a6a50cfdcd4423a96d846777f --- .circleci/config.yml | 9 +++++++++ .../job-specs/pytorch-job-specs.yml | 9 +++++++++ .github/templates/linux_ci_workflow.yml.j2 | 2 ++ .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 2 ++ .../generated-linux-bionic-py3.8-gcc9-coverage.yml | 2 ++ .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 2 ++ .../generated-linux-xenial-cuda11.1-py3.6-gcc7.yml | 2 ++ .../generated-linux-xenial-py3.6-gcc5.4.yml | 2 ++ ...ed-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml | 2 ++ .jenkins/pytorch/test.sh | 13 ++++++------- 10 files changed, 38 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3a64240fcf8bb..cb3e148e2e162 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -644,6 +644,15 @@ jobs: set -ex export SCRIBE_GRAPHQL_ACCESS_TOKEN="${SCRIBE_GRAPHQL_ACCESS_TOKEN}" export JOB_BASE_NAME="$CIRCLE_JOB" + # temporary fix for https://github.com/pytorch/pytorch/issues/60746 + if [ -z "$CIRCLE_PR_NUMBER" ]; then + if [[ $CIRCLE_BRANCH =~ .*pull.* ]]; then + export PR_NUMBER="$(echo $CIRCLE_BRANCH | sed 's/[^0-9]//g')" + export CIRCLE_PR_NUMBER="$PR_NUMBER" + fi + else + export PR_NUMBER="$CIRCLE_PR_NUMBER" + fi ${PARALLEL_FLAGS} cd workspace EOL diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index bcc02edd5f0b9..422e44e8a606d 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -174,6 +174,15 @@ jobs: set -ex export SCRIBE_GRAPHQL_ACCESS_TOKEN="${SCRIBE_GRAPHQL_ACCESS_TOKEN}" export JOB_BASE_NAME="$CIRCLE_JOB" + # temporary fix for https://github.com/pytorch/pytorch/issues/60746 + if [ -z "$CIRCLE_PR_NUMBER" ]; then + if [[ $CIRCLE_BRANCH =~ .*pull.* ]]; then + export PR_NUMBER="$(echo $CIRCLE_BRANCH | sed 's/[^0-9]//g')" + export CIRCLE_PR_NUMBER="$PR_NUMBER" + fi + else + export PR_NUMBER="$CIRCLE_PR_NUMBER" + fi ${PARALLEL_FLAGS} cd workspace EOL diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index ec39ef6f5f260..bceeba51f20bc 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -353,6 +353,7 @@ jobs: - name: Test PyTorch env: BUILD_ENVIRONMENT: !{{ build_environment }}-${{ matrix.config }} + PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh @@ -368,6 +369,7 @@ jobs: docker run \ ${GPU_FLAG:-} \ -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e GITHUB_ACTIONS \ -e IN_CI \ diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 7aa572d83321b..1e1aec057c7d4 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -321,6 +321,7 @@ jobs: - name: Test PyTorch env: BUILD_ENVIRONMENT: linux-bionic-cuda10.2-py3.9-gcc7-${{ matrix.config }} + PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh @@ -336,6 +337,7 @@ jobs: docker run \ ${GPU_FLAG:-} \ -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e GITHUB_ACTIONS \ -e IN_CI \ diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index eda7568a809dd..28180e3e98727 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -321,6 +321,7 @@ jobs: - name: Test PyTorch env: BUILD_ENVIRONMENT: linux-bionic-py3.8-gcc9-coverage-${{ matrix.config }} + PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh @@ -336,6 +337,7 @@ jobs: docker run \ ${GPU_FLAG:-} \ -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e GITHUB_ACTIONS \ -e IN_CI \ diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index c50cac76a2c3e..ddb1522962dff 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -321,6 +321,7 @@ jobs: - name: Test PyTorch env: BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7-${{ matrix.config }} + PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh @@ -336,6 +337,7 @@ jobs: docker run \ ${GPU_FLAG:-} \ -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e GITHUB_ACTIONS \ -e IN_CI \ diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml index cf2395e9ca829..fb6d83a0f2432 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -321,6 +321,7 @@ jobs: - name: Test PyTorch env: BUILD_ENVIRONMENT: linux-xenial-cuda11.1-py3.6-gcc7-${{ matrix.config }} + PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh @@ -336,6 +337,7 @@ jobs: docker run \ ${GPU_FLAG:-} \ -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e GITHUB_ACTIONS \ -e IN_CI \ diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index dd3cb50cfc903..eabc42408fa91 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -321,6 +321,7 @@ jobs: - name: Test PyTorch env: BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc5.4-${{ matrix.config }} + PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh @@ -336,6 +337,7 @@ jobs: docker run \ ${GPU_FLAG:-} \ -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e GITHUB_ACTIONS \ -e IN_CI \ diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml index dcbd19d661eb1..47ac9f73d422f 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -319,6 +319,7 @@ jobs: - name: Test PyTorch env: BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.3-py3.6-gcc7-${{ matrix.config }} + PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh @@ -334,6 +335,7 @@ jobs: docker run \ ${GPU_FLAG:-} \ -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e GITHUB_ACTIONS \ -e IN_CI \ diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 93de6fbf68969..124fd7c8cdb3e 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -26,11 +26,7 @@ echo "Testing pytorch" export LANG=C.UTF-8 -# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second -# CIRCLE_PULL_REQUEST comes from CircleCI -# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs -# see https://github.com/pytorch/pytorch/issues/60111 -IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-} +PR_NUMBER=${PR_NUMBER:-${CIRCLE_PR_NUMBER:-}} if [[ "$BUILD_ENVIRONMENT" == *-slow-* || $TEST_CONFIG == 'slow' ]]; then export PYTORCH_TEST_WITH_SLOW=1 @@ -64,7 +60,7 @@ else export PYTORCH_TEST_SKIP_NOARCH=1 fi -if [[ -n "$IN_PULL_REQUEST" ]] && [[ -z "$CI_MASTER" || "$CI_MASTER" == "false" ]]; then +if [[ -n "$PR_NUMBER" ]] && [[ -z "$CI_MASTER" || "$CI_MASTER" == "false" ]]; then # skip expensive checks when on PR and CI_MASTER flag is not set export PYTORCH_TEST_SKIP_CUDA_MEM_LEAK_CHECK=1 else @@ -146,7 +142,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX512-* || $TEST_CONFIG == 'nogpu_NO_AVX export ATEN_CPU_CAPABILITY=avx2 fi -if [ -n "$IN_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then +# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs +# see https://github.com/pytorch/pytorch/issues/60111 +# change it back to PR_NUMBER when issue is fixed. +if [ -n "$CIRCLE_PR_NUMBER" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then DETERMINE_FROM=$(mktemp) file_diff_from_base "$DETERMINE_FROM" fi From 1f4e019d8e8b00d004ca02d17cede0c7aec9f92d Mon Sep 17 00:00:00 2001 From: Sangbaek Park Date: Wed, 18 Aug 2021 15:50:33 -0700 Subject: [PATCH 048/530] [Vulkan] Fix incorrect input range for Hardshrink tests (#63515) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63515 Fixed inappropriate input range for Hardshrink tests: The range -10 ~ +10 for input tensors is more proper when we use the test set of lambda {-4.2, -1.0, -0.42, 0.0, 0.42, 1.0, 4.2, 42.42}. ghstack-source-id: 136141416 Test Plan: ```build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 //xplat/caffe2:pt_vulkan_api_test_binAndroid\#android-arm64 --show-output adb push buck-out/gen/xplat/caffe2/pt_vulkan_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_api_test adb shell "/data/local/tmp/vulkan_api_test" ``` Note that the test can fail sporadically due to the precision loss by FP16(Vulkan)/FP32(CPU). This issue will be handled separately after some design discussions. Reviewed By: SS-JIA Differential Revision: D30389646 fbshipit-source-id: 7224bd8ba4e4972f5fc147df8a0cb84808f8c62e --- aten/src/ATen/test/vulkan_api_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 474aa36c40cca..2873d3c0584c8 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -942,7 +942,7 @@ TEST(VulkanAPITest, hardshrink) { } for (const auto lambd_value : {-4.2, -1.0, -0.42, 0.0, 0.42, 1.0, 4.2, 42.42}) { - const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + const auto in_cpu = (at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) - 0.5) * 20; // between -10 and +10 const auto in_vulkan = in_cpu.vulkan(); const auto out_cpu = at::hardshrink(in_cpu, lambd_value); @@ -964,7 +964,7 @@ TEST(VulkanAPITest, hardshrink_) { } for (const auto lambd_value : {-4.2, -1.0, -0.42, 0.0, 0.42, 1.0, 4.2, 42.42}) { - const auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + const auto cpu = (at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) - 0.5) * 20; // between -10 and +10 const auto vulkan = cpu.vulkan(); cpu.hardshrink(lambd_value); From 4dcc2197ced0e12f20f296e2e12baad5fad94b0e Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 18 Aug 2021 16:08:48 -0700 Subject: [PATCH 049/530] [fix] tensor_split : non-contiguous indices tensor (#63390) Summary: Fixes https://github.com/pytorch/pytorch/issues/63281 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63390 Reviewed By: ejguan Differential Revision: D30362649 Pulled By: mruberry fbshipit-source-id: 3ea3ad02199e4345beb0b580d056babd56112309 --- aten/src/ATen/native/TensorShape.cpp | 8 +++++++- torch/testing/_internal/common_methods_invocations.py | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index e915078249171..2545ec4c1e035 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -609,7 +609,13 @@ std::vector tensor_split(const Tensor& self, const Tensor& tensor_indice return self.tensor_split(sections, dim); } else { auto indices_data = tensor_indices_or_sections.data_ptr(); - std::vector indices(indices_data, indices_data + tensor_indices_or_sections.numel()); + auto stride = tensor_indices_or_sections.stride(0); + auto numel = tensor_indices_or_sections.numel(); + std::vector indices(numel); + for (size_t offset = 0; offset < numel; offset++) { + // indices tensor could be non-contiguous + indices[offset] = *(indices_data + offset * stride); + } return self.tensor_split(indices, dim); } } diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 5d55f0ec64291..7e57d5d693ec7 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -859,6 +859,7 @@ def sample_inputs_tensor_split(op_info, device, dtype, requires_grad, **kwargs): (torch.tensor([1, 2, 3]),), (torch.tensor(1),), (torch.tensor([1, 2, 3]), 1), + (torch.tensor([1, 4, 2, 5, 3, 6])[::2], 1), # Cases with list of indices. ((2, 4),), ((2, 4), 1), @@ -7590,6 +7591,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): active_if=(IS_MACOS or IS_WINDOWS)), )), OpInfo('tensor_split', + ref=np.array_split, dtypes=all_types_and_complex_and(torch.bool), dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), From d9547b9bb29ca5ba926b3707c5c8313ee65792b2 Mon Sep 17 00:00:00 2001 From: Amy He Date: Wed, 18 Aug 2021 16:23:48 -0700 Subject: [PATCH 050/530] Nnapi Delegation: Quick improvements (#63489) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63489 A few quick improvements to the Android NNAPI Delegate, some of which were discussed here https://github.com/pytorch/pytorch/pull/62272: 1) `throw std::exception` replaced with `TORCH_CHECK` to reduce runtime size (nnapi_backend_lib.cpp) 2) weights processing moved from compile to preprocess step, since it can be done AOT (nnapi_backend_lib.cpp & nnapi_backend_preprocess.cpp) 3) `ser_model_` and `shape_compute_module_` member variables removed, since they are never used after `init()`, so they are not needed (nnapi_backend_lib.cpp) Test Plan: Unit tests: `python test/test_jit.py TestNnapiBackend` Run SparkAR segmentation with delegated NNAPI as done here D30259033 (can use `jf download GAekdAwsyGKXhggFALN4LnSBTzcubsIXAAAz --file "v303-nnd-mod.ptl"` to get a preprocessed model from these changes) Imported from OSS Reviewed By: raziel, iseeyuan Differential Revision: D30398880 fbshipit-source-id: b6872e1e9ccd583622b80659da00c83fdd82580e --- .../jit/backends/nnapi/nnapi_backend_lib.cpp | 31 ++++++------------- .../nnapi/nnapi_backend_preprocess.cpp | 3 ++ 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp b/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp index 0533b7d85175f..7d9dc18c12589 100644 --- a/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp +++ b/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp @@ -31,19 +31,8 @@ class NnapiBackend : public PyTorchBackendInterface { c10::impl::GenericDict compile( c10::IValue processed, c10::impl::GenericDict method_compile_spec) override { - auto dict = processed.toGenericDict(); - - // Prepare weights - auto weights = dict.at("weights").toTensorList(); - for (int i = 0; i < weights.size(); i++) { - weights.set(i, weights.get(i).contiguous()); - } - dict.insert("weights", weights); - - // Save ser_model to member variable - ser_model_ = dict.at("ser_model").toTensor(); - // Wrap procesed in dictionary: {"forward": processed} + auto dict = processed.toGenericDict(); c10::Dict handles( c10::StringType::get(), c10::AnyType::get()); handles.insert("forward", dict); @@ -86,8 +75,7 @@ class NnapiBackend : public PyTorchBackendInterface { fixed_inputs.push_back( tensorInp.get(i).permute({0, 2, 3, 1}).contiguous()); } else { - throw std::exception(); - std::cerr << "Invalid mem_fmt" << std::endl; + TORCH_CHECK(false, "Invalid mem_fmt"); } } @@ -103,9 +91,8 @@ class NnapiBackend : public PyTorchBackendInterface { // TODO: See if it's possible to use those directly. if (fmt == 1) { outputs.set(i, outputs.get(i).permute({0, 3, 1, 2})); - } else if (fmt != 0) { - throw std::exception(); - std::cerr << "Invalid mem_fmt" << std::endl; + } else { + TORCH_CHECK(fmt == 0, "Invalid mem_fmt"); } } @@ -117,8 +104,6 @@ class NnapiBackend : public PyTorchBackendInterface { // and cannot be passed through the handles dictionary std::unique_ptr comp_; c10::List out_templates_; - at::Tensor ser_model_; - mobile::Module shape_compute_module_; // Runs once per model initialization // Cannot be moved to compile(), because init() requires actual inputs @@ -126,19 +111,21 @@ class NnapiBackend : public PyTorchBackendInterface { TORCH_CHECK(comp_ == nullptr); auto dict = handle.toGenericDict(); + // Get ser_model + auto ser_model = dict.at("ser_model").toTensor(); // Load shape computation module std::stringstream ss; auto shape_ptr = dict.at("shape_compute_module").toString(); ss.str(*shape_ptr); - shape_compute_module_ = _load_for_mobile(ss); + auto shape_compute_module = _load_for_mobile(ss); out_templates_ = - shape_compute_module_.run_method("prepare", ser_model_, inputs) + shape_compute_module.run_method("prepare", ser_model, inputs) .toTensorList(); // Create and initialize NnapiComilation object comp_ = std::make_unique(); auto weights = dict.at("weights").toTensorVector(); - comp_->init(ser_model_, weights); + comp_->init(ser_model, weights); } }; diff --git a/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp index 2f68536b64107..be0dbe18d90d0 100644 --- a/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp +++ b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp @@ -96,6 +96,9 @@ c10::IValue preprocess( // transform Python lists to C++ c10::List c10::List weights( py::cast>(nnapi_processed[2])); + for (int i = 0; i < weights.size(); i++) { + weights.set(i, weights.get(i).contiguous()); + } c10::List inp_mem_fmts( py::cast>(nnapi_processed[3])); c10::List out_mem_fmts( From 9477211e7d609ce382c0e22d7721c14c36d083de Mon Sep 17 00:00:00 2001 From: John Clow Date: Wed, 18 Aug 2021 16:28:02 -0700 Subject: [PATCH 051/530] Hoisting common expressions out of If blocks (#59492) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/59492 Adding code to find common expressions from the two subblocks of an if operation and hoist them before the if block. This also allows Dead Code Elimination to then eliminate some if blocks. Also eliminated some dead code in the codebase. Test Plan: python test_jit.py TestIfHoisting Imported from OSS Reviewed By: ngimel Differential Revision: D29399533 fbshipit-source-id: 9336b9dc48c02c38862f98f98cd72fc1767a1802 --- test/jit/test_if_hoisting.py | 213 ++++++++++++++++++ test/quantization/jit/test_quantize_jit.py | 3 +- test/test_jit.py | 1 + tools/build_variables.bzl | 1 + torch/_C/__init__.pyi.in | 1 + torch/csrc/jit/ir/node_hashing.cpp | 14 ++ .../jit/passes/common_expression_hoisting.cpp | 153 +++++++++++++ .../jit/passes/common_expression_hoisting.h | 10 + .../jit/passes/symbolic_shape_analysis.cpp | 1 + torch/csrc/jit/python/init.cpp | 6 + torch/csrc/jit/runtime/graph_executor.cpp | 9 +- .../runtime/profiling_graph_executor_impl.cpp | 111 +-------- 12 files changed, 418 insertions(+), 105 deletions(-) create mode 100644 test/jit/test_if_hoisting.py create mode 100644 torch/csrc/jit/passes/common_expression_hoisting.cpp create mode 100644 torch/csrc/jit/passes/common_expression_hoisting.h diff --git a/test/jit/test_if_hoisting.py b/test/jit/test_if_hoisting.py new file mode 100644 index 0000000000000..c8fd4a4bab349 --- /dev/null +++ b/test/jit/test_if_hoisting.py @@ -0,0 +1,213 @@ + +import torch +from torch.testing import FileCheck +from torch.testing._internal.jit_utils import JitTestCase + +if __name__ == "__main__": + raise RuntimeError( + "This test file is not meant to be run directly, use:\n\n" + "\tpython test/test_jit.py TESTNAME\n\n" + "instead." + ) + + +class TestIfHoisting(JitTestCase): + def test_if_hoist_basic(self): + def fn(x: bool, y: int): + if x: + z = y + 3 + else: + z = y + 3 + return z + + + fn_script = torch.jit.script(fn) + op_graph = fn_script.graph + self.run_pass("common_expression_hoisting", op_graph) + self.run_pass("dce", op_graph) + FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) + FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) + self.assertEqual(fn(True, 1), fn_script(True, 1)) + + def test_if_hoist_transposed_expr(self): + """ + Making sure that we can properly eliminate + an expression even if it is not at the start + of a block + """ + def fn(x: bool, y: int): + if x: + a = y + 3 + b = y * 2 + else: + b = y * 2 + a = y + 3 + return a, b + + fn_script = torch.jit.script(fn) + op_graph = fn_script.graph + self.run_pass("common_expression_hoisting", op_graph) + self.run_pass("dce", op_graph) + + FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) + FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) + + self.assertEqual(fn(True, 1), fn_script(True, 1)) + self.assertEqual(fn(False, 5), fn_script(False, 5)) + + def test_if_hoist_swapped_expr(self): + """ + Making sure that the if statement + doesn't get fully eliminated here + """ + def fn(x: bool, y: int): + if x: + a = y + 3 + b = y * 2 + else: + a = y * 2 + b = y + 3 + return a, b + + fn_script = torch.jit.script(fn) + op_graph = fn_script.graph + self.run_pass("common_expression_hoisting", op_graph) + self.run_pass("dce", op_graph) + + FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) + FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) + + self.assertEqual(fn(True, 1), fn_script(True, 1)) + self.assertEqual(fn(False, 5), fn_script(False, 5)) + + def test_if_hoist_reused_var(self): + """ + Making sure that cases where the python variable is reused + is handled correctly + """ + def fn(x: bool, y: int): + b = 6 + if x: + a = y + 3 + a = y * 2 + else: + a = y * 2 + b = y + 3 + return a, b + + fn_script = torch.jit.script(fn) + op_graph = fn_script.graph + self.run_pass("common_expression_hoisting", op_graph) + self.run_pass("dce", op_graph) + + FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) + FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) + FileCheck().check_count("aten::mul", 1, exactly=True).run(op_graph) + + self.assertEqual(fn(True, 1), fn_script(True, 1)) + self.assertEqual(fn(False, 5), fn_script(False, 5)) + + def test_no_hoist(self): + """ + Nothing should happen here, expressions are different + """ + def fn(x: bool, y: int, z: int): + if x: + a = y + 3 + else: + a = z + 3 + return a + + fn_script = torch.jit.script(fn) + op_graph = fn_script.graph + self.run_pass("common_expression_hoisting", op_graph) + self.run_pass("dce", op_graph) + + FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) + FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) + + self.assertEqual(fn(True, 1, 3), fn_script(True, 1, 3)) + self.assertEqual(fn(False, 5, 10), fn_script(False, 5, 10)) + + def test_mutate_before(self): + """ + Make sure that if there is a mutation before the common + op, the hoist doesn't happen + """ + def fn(x: bool, y: torch.Tensor): + if x: + y.add_(8) + a = y + 3 + else: + a = y + 3 + return a + + fn_script = torch.jit.script(fn) + op_graph = fn_script.graph + self.run_pass("common_expression_hoisting", op_graph) + self.run_pass("dce", op_graph) + + FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) + FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) + FileCheck().check_count("aten::add_", 1, exactly=True).run(op_graph) + + t1 = torch.Tensor([1]) + t2 = torch.Tensor([5, 6]) + self.assertEqual(fn(True, t1), fn_script(True, t1)) + self.assertEqual(fn(False, t2), fn_script(False, t2)) + + def test_mutate_after(self): + """ + Check that the hoist can happen properly, and + that the output is still correct. + """ + def fn(x: bool, y: torch.Tensor): + if x: + b = 1 + a = y + 3 + y.add_(8) + else: + b = 2 + a = y + 3 + c = b + a + return a + + fn_script = torch.jit.script(fn) + op_graph = fn_script.graph + self.run_pass("common_expression_hoisting", op_graph) + self.run_pass("dce", op_graph) + + FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) + FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) + + t1 = torch.Tensor([1]) + t2 = torch.Tensor([5, 6]) + self.assertEqual(fn(True, t1.clone()), fn_script(True, t1.clone())) + self.assertEqual(fn(False, t2.clone()), fn_script(False, t2.clone())) + + def test_multiple_hoists(self): + """ + test that hoists that depend on other hoists are done correctly + """ + def fn(x: bool, y: torch.Tensor): + if x: + a = y + 3 + b = a + y + else: + a = y + 3 + b = a + y + c = b * 2 + return c + + fn_script = torch.jit.script(fn) + op_graph = fn_script.graph + self.run_pass("common_expression_hoisting", op_graph) + self.run_pass("dce", op_graph) + + FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) + FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) + + t1 = torch.Tensor([1]) + t2 = torch.Tensor([5, 6]) + self.assertEqual(fn(True, t1), fn_script(True, t1)) + self.assertEqual(fn(False, t2), fn_script(False, t2)) diff --git a/test/quantization/jit/test_quantize_jit.py b/test/quantization/jit/test_quantize_jit.py index 14bb31cf07f1a..5fde8e2cc533d 100644 --- a/test/quantization/jit/test_quantize_jit.py +++ b/test/quantization/jit/test_quantize_jit.py @@ -1214,6 +1214,7 @@ class Res(torch.nn.Module): def __init__(self): super(Res, self).__init__() self.conv = torch.nn.Conv2d(3, 3, 1).float() + self.conv2 = torch.nn.Conv2d(3, 3, 1).float() self.use_skip = True def forward(self, x: torch.Tensor, cond: bool) -> torch.Tensor: @@ -1222,7 +1223,7 @@ def forward(self, x: torch.Tensor, cond: bool) -> torch.Tensor: if self.use_skip: return self.conv(x) else: - return self.conv(x) + return self.conv2(x) class M(torch.nn.Module): def __init__(self): diff --git a/test/test_jit.py b/test/test_jit.py index 99df960da5dc4..6cf1d8e7d5c6e 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -23,6 +23,7 @@ from jit.test_builtins import TestBuiltins, TestTensorBuiltins # noqa: F401 from jit.test_ignore_context_manager import TestIgnoreContextManager # noqa: F401 from jit.test_symbolic_shape_analysis import TestSymbolicShapeAnalysis # noqa: F401 +from jit.test_if_hoisting import TestIfHoisting # noqa: F401 from jit.test_unsupported_ops import TestUnsupportedOps # noqa: F401 from jit.test_freezing import TestFreezing, TestFrozenOptimizations, TestMKLDNNReinplacing # noqa: F401 from jit.test_peephole import TestPeephole # noqa: F401 diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 89697b4428ca1..2e71bedb35db4 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -191,6 +191,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/passes/clear_profiling.cpp", "torch/csrc/jit/passes/clear_undefinedness.cpp", "torch/csrc/jit/passes/common_subexpression_elimination.cpp", + "torch/csrc/jit/passes/common_expression_hoisting.cpp", "torch/csrc/jit/passes/concat_opt.cpp", "torch/csrc/jit/passes/constant_pooling.cpp", "torch/csrc/jit/passes/constant_propagation.cpp", diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index b683a60615dc5..30885d3107176 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -204,6 +204,7 @@ def _jit_pass_inline(Graph) -> None: ... def _jit_pass_constant_propagation(Graph) -> None: ... def _jit_pass_propagate_shapes_on_graph(Graph) -> None: ... def _jit_erase_non_input_shape_information(Graph) -> None: ... +def _jit_pass_common_expression_hoisting(Graph) -> None: ... def _jit_get_schemas_for_operator(name :str) -> List[FunctionSchema]: ... def _jit_check_alias_annotation(g: Graph, args: Tuple[Any, ...], unqualified_op_name: str): ... def _jit_can_fuse_on_cpu() -> _bool: ... diff --git a/torch/csrc/jit/ir/node_hashing.cpp b/torch/csrc/jit/ir/node_hashing.cpp index 3fd4974ed421b..9a876d062d2fd 100644 --- a/torch/csrc/jit/ir/node_hashing.cpp +++ b/torch/csrc/jit/ir/node_hashing.cpp @@ -204,6 +204,8 @@ bool attributesEqualCSE(const Node* lhs, const Node* rhs) { } // anonymous namespace +// Makes a hash that hashes the input Value, the output type +// as well as the node attributes size_t HashNode::operator()(const Node* k) const { AT_ASSERT(k != nullptr); size_t constant_hash = 0; @@ -231,6 +233,8 @@ size_t HashNode::operator()(const Node* k) const { constant_hash); }; +// Checks that two nodes have the same inputs, output types +// and node attributes. bool EqualNode::operator()(const Node* lhs, const Node* rhs) const { if (lhs == nullptr && rhs == nullptr) return true; @@ -261,6 +265,16 @@ bool EqualNode::operator()(const Node* lhs, const Node* rhs) const { if (!attributesEqualCSE(lhs, rhs)) return false; + // Check if the blocks contained in a op are the same + if (lhs->blocks().size() != rhs->blocks().size()) { + return false; + } + for (size_t i = 0; i < lhs->blocks().size(); ++i) { + if (lhs->blocks()[i] != rhs->blocks()[i]) { + return false; + } + } + return true; }; diff --git a/torch/csrc/jit/passes/common_expression_hoisting.cpp b/torch/csrc/jit/passes/common_expression_hoisting.cpp new file mode 100644 index 0000000000000..ab2b9d41afa8b --- /dev/null +++ b/torch/csrc/jit/passes/common_expression_hoisting.cpp @@ -0,0 +1,153 @@ +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace torch { +namespace jit { +namespace { + +struct CommonExpressionHoister { + CommonExpressionHoister(std::shared_ptr graph) + : graph_(std::move(graph)) {} + + bool run() { + HoistCommonExpression(graph_->block()); + return changed_; + } + + void HoistFromIfNode(Node* if_node) { + Block* true_block = if_node->blocks()[0]; + Block* false_block = if_node->blocks()[1]; + // find common statements in the two subblocks + + auto true_block_nodes = std::unordered_set( + true_block->nodes().begin(), true_block->nodes().end()); + for (auto it = false_block->nodes().begin(); + it != false_block->nodes().end();) { + Node* false_b_node = *it; + // node may be moved to a different block so advance iterator now + ++it; + + auto matching_elem = true_block_nodes.find(false_b_node); + if (matching_elem == true_block_nodes.end()) { + continue; + } + Node* true_b_node = *matching_elem; + + // Check if a move to the front of the block is valid + // If both of the moves are valid, then we know we can move the item out + // of the if blocks entirely. + AliasDb& aliasDb = getOrCreateAliasDb(); + bool true_moveable = aliasDb.couldMoveAfterTopologically( + true_b_node, true_block->nodes().front()); + bool false_moveable = aliasDb.couldMoveAfterTopologically( + false_b_node, false_block->nodes().front()); + + if (!true_moveable || !false_moveable) { + continue; + } + + // Get all the uses of the output to delete and reinsert them + // as the input would change, the HashNode value would also change. + std::unordered_set true_b_uses; + for (Value* true_out : true_b_node->outputs()) { + for (Use true_use : true_out->uses()) { + if (true_use.user->owningBlock() == true_block) { + // Make sure we are not accidentally adding stuff from subblocks + true_b_uses.insert(true_use.user); + } + } + } + for (Node* uses_node : true_b_uses) { + true_block_nodes.erase(uses_node); + } + + // Now hoist the statement out of the block + changed_ = true; + false_b_node->moveBefore(if_node); + + true_b_node->replaceAllUsesWith(false_b_node); + + true_block_nodes.erase(true_b_node); + true_block_nodes.insert(true_b_uses.cbegin(), true_b_uses.cend()); + true_b_node->destroy(); + } + } + + void EliminateUnnecessaryIfOutputs(Node* if_node) { + Block* true_block = if_node->blocks()[0]; + Block* false_block = if_node->blocks()[1]; + + // fix up the if block outputs + for (size_t i = 0; i < true_block->outputs().size();) { + // Need to check both sides match to eliminate common if block outputs + Value* true_block_output = true_block->outputs().at(i); + Value* false_block_output = false_block->outputs().at(i); + if (true_block_output != false_block_output) { + i++; + continue; + } + + // We have a matching output, and can remove it from the block itself + if_node->outputs().at(i)->replaceAllUsesWith(true_block_output); + if_node->eraseOutput(i); + true_block->eraseOutput(i); + false_block->eraseOutput(i); + changed_ = true; + } + + // No need to test here if the IF block should be eliminated. + // The DCE pass will determine that for us. + } + + void HoistCommonExpression(Block* block) { + for (auto it = block->nodes().begin(); it != block->nodes().end();) { + Node* node = *it; + ++it; + + for (auto sub_block : node->blocks()) { + HoistCommonExpression(sub_block); + } + + if (node->kind() == prim::If) { + HoistFromIfNode(node); + EliminateUnnecessaryIfOutputs(node); + } + } + } + + AliasDb& getOrCreateAliasDb() { + if (!alias_db_) { + alias_db_ = std::make_unique(graph_); + } + + return *alias_db_; + } + + private: + std::unique_ptr alias_db_; + std::shared_ptr graph_; + bool changed_ = false; +}; +} // anonymous namespace +bool HoistCommonExpression(const std::shared_ptr& graph) { + // This moves common subexpressions from the two sides of an + // if block out of the if block. + + GRAPH_DUMP("Before CEH", graph); + CommonExpressionHoister ceh(graph); + bool changed = ceh.run(); + if (changed) { + GRAPH_DUMP("After CEH Changes", graph); + } + return changed; +} +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/passes/common_expression_hoisting.h b/torch/csrc/jit/passes/common_expression_hoisting.h new file mode 100644 index 0000000000000..2aad158eea8f8 --- /dev/null +++ b/torch/csrc/jit/passes/common_expression_hoisting.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +namespace torch { +namespace jit { + +TORCH_API bool HoistCommonExpression(const std::shared_ptr& graph); +} +} // namespace torch diff --git a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp index f74a91176cfe8..10edfb4c496ef 100644 --- a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp +++ b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 5fca575593551..d582035d6e95e 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -282,6 +283,11 @@ void initJITBindings(PyObject* module) { [](std::shared_ptr& g) { return EliminateCommonSubexpression(g); // overload resolution }) + .def( + "_jit_pass_common_expression_hoisting", + [](std::shared_ptr& g) { + return HoistCommonExpression(g); // overload resolution + }) .def( "_jit_pass_fuse_quantized_add_relu", [](std::shared_ptr& g) { diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp index 476882650a1dd..bb5f272080601 100644 --- a/torch/csrc/jit/runtime/graph_executor.cpp +++ b/torch/csrc/jit/runtime/graph_executor.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -918,7 +919,7 @@ void runOptimization( "After EliminateDeadCode, before EliminateCommonSubexpression\n", *graph); EliminateCommonSubexpression(graph); GRAPH_DEBUG( - "After EliminateCommonSubexpression, before PeepholeOptimize\n", *graph); + "After EliminateCommonSubexpression , before PeepholeOptimize\n", *graph); PeepholeOptimize(graph); GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph); @@ -949,8 +950,10 @@ void runOptimization( EliminateCommonSubexpression(graph); GRAPH_DEBUG( - "After EliminateCommonSubexpression, before CheckInplace\n", *graph); - + "After EliminateCommonSubexpression, before HoistCommonExpression\n", + *graph); + HoistCommonExpression(graph); + GRAPH_DEBUG("After HoistCommonExpression, before CheckInplace\n", *graph); CheckInplace(graph); GRAPH_DEBUG("After CheckInplace (end of runOptimization)", *graph); } diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp index b099db17931b0..40d94a4a205fe 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -332,112 +333,16 @@ void runPreAutodiffPassPipeline(std::shared_ptr& graph) { EliminateCommonSubexpression(graph); GRAPH_DEBUG( - "After EliminateCommonSubexpression, before CheckInplace\n", *graph); - + "After EliminateCommonSubexpression, before HoistCommonExpression\n", + *graph); + HoistCommonExpression(graph); + GRAPH_DEBUG("After HoistCommonExpression, before CheckInplace\n", *graph); CheckInplace(graph); } GRAPH_DEBUG( "After CheckInplace (end of runPreAutodiffPassPipeline)\n", *graph); } -void runDiffGraphPasses(std::shared_ptr& graph) { - GRAPH_DEBUG( - "Before EliminateDeadCode (beginning of runDiffGraphPasses)\n", *graph); - // runOptimization: - { - // Basic graph preprocessing to eliminate noise. - EliminateDeadCode(graph); - GRAPH_DEBUG( - "After EliminateDeadCode, before EliminateCommonSubexpression\n", - *graph); - EliminateCommonSubexpression(graph); - GRAPH_DEBUG( - "After EliminateCommonSubexpression, before PeepholeOptimize\n", - *graph); - - PeepholeOptimize(graph); - GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph); - ConstantPropagation(graph); - GRAPH_DEBUG("After ConstantPropagation, before ConstantPooling\n", *graph); - ConstantPooling(graph); - GRAPH_DEBUG("After ConstantPooling, before UnrollLoops\n", *graph); - - UnrollLoops(graph); - GRAPH_DEBUG("After UnrollLoops, before RemoveListMutation\n", *graph); - // run again with unrolled loops - RemoveListMutation(graph); - GRAPH_DEBUG("After RemoveListMutation, before PeepholeOptimize\n", *graph); - PeepholeOptimize(graph); - GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph); - ConstantPropagation(graph); - GRAPH_DEBUG( - "After ConstantPropagation, before EliminateCommonSubexpression\n", - *graph); - - EliminateCommonSubexpression(graph); - GRAPH_DEBUG( - "After EliminateCommonSubexpression, before CheckInplace\n", *graph); - - CheckInplace(graph); - } - GRAPH_DEBUG("After CheckInplace, before customPrePasses\n", *graph); - - // runNondiffOptimization - { - // Run custom passes that different backends can register. - for (const auto& passPair : getCustomPrePasses()) { - passPair.first(graph); - } - GRAPH_DEBUG("After customPrePasses, before LowerSimpleTuples\n", *graph); - - // TupleConstruct / TupleUnpack pairs can still be present at this point - // and must be removed for fusion. - LowerSimpleTuples(graph); - GRAPH_DEBUG("After LowerSimpleTuples\n", *graph); - - if (tensorExprFuserEnabled()) { - // Remove prim::profile nodes and embed the profile info directly in the - // IR in value types. We're doing such transformation as optimizations - // that try to merge/fuse nodes in the graph (e.g. BatchMM and GraphFuser) - // work worse in the presence of intermittent prim::profile nodes. - // Optimizations relying on the type info are also responsible for - // inserting proper type checks. Once we're done with these optimizations - // we will wipe the tensor type information from the IR, so that it's not - // accidentally used by any other pass. - RemoveProfileNodesAndSpecializeTypes(graph); - GRAPH_DEBUG( - "After RemoveProfileNodesAndSpecializeTypes, before BatchMM\n", - *graph); - // Rewrite subgraphs with many MMs into expressions that batch them. - BatchMM(graph); - GRAPH_DEBUG("After BatchMM, before Fusion\n", *graph); - - FuseTensorExprs(graph, getFusionGroupInlining() ? 2 : 1); - GRAPH_DEBUG( - "After Fusion, before RemoveTensorTypeSpecializations\n", *graph); - - // Wipe tensor type info from the IR - RemoveTensorTypeSpecializations(graph); - GRAPH_DEBUG( - "After RemoveTensorTypeSpecializations, before customPostPasses\n", - *graph); - } else { - // Rewrite subgraphs with many MMs into expressions that batch them. - BatchMM(graph); - GRAPH_DEBUG("After BatchMM, before Fusion\n", *graph); - - FuseGraph(graph, true); - GRAPH_DEBUG("After Fusion, before customPostPasses\n", *graph); - } - - // Run custom post-fusion passes - for (const auto& passPair : getCustomPostPasses()) { - passPair.first(graph); - } - } - GRAPH_DEBUG("After customPostPasses (end of runDiffGraphPasses)\n", *graph); -} - void runNoGradOptimizations(std::shared_ptr& graph) { GRAPH_DEBUG( "After customPostPasses (beginning of runNoGradOptimizations)\n", *graph); @@ -593,7 +498,11 @@ void ProfilingGraphExecutorImpl::runProfilingInsensitiveOptimizations( DecomposeOps(graph); GRAPH_DEBUG("After DecomposeOps, before ConstantPropagation\n", *graph); ConstantPropagation(graph); - GRAPH_DEBUG("After ConstantPropagation, before EliminateDeadCode\n", *graph); + GRAPH_DEBUG( + "After ConstantPropagation, before HoistCommonExpression\n", *graph); + HoistCommonExpression(graph); + GRAPH_DEBUG( + "After EliminateCommonSubexpression, before ElimiateDeadCode\n", *graph); EliminateDeadCode(graph); GRAPH_DEBUG( "After EliminateDeadCode, before EliminateCommonSubexpression\n", *graph); From 0d437fe6d0ef17648072eb586484a4a5a080b094 Mon Sep 17 00:00:00 2001 From: Yinbin Ma Date: Wed, 18 Aug 2021 20:52:17 -0700 Subject: [PATCH 052/530] BF16 allreduce hook (#63260) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63260 Add BF16 all-reduce communication hook. Skip if CUDA version < 11 or NCCL version < 2.9.7. Reviewed By: SciPioneer Differential Revision: D30238317 fbshipit-source-id: bad35bf7d43f10f1c40997a282b831b61ef592bb --- test/distributed/test_c10d_nccl.py | 67 ++++++++++++++++- torch/csrc/distributed/c10d/NCCLUtils.hpp | 9 +++ .../distributed/c10d/ProcessGroupNCCL.cpp | 2 +- .../algorithms/ddp_comm_hooks/__init__.py | 3 + .../ddp_comm_hooks/default_hooks.py | 72 +++++++++++++++++++ torch/nn/parallel/distributed.py | 13 ++++ 6 files changed, 162 insertions(+), 4 deletions(-) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index f7f6681b43a76..9efebc94a9288 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -44,6 +44,7 @@ run_tests, retry_on_connect_failures, TEST_WITH_DEV_DBG_ASAN, + TEST_WITH_ROCM, TEST_WITH_TSAN, sandcastle_skip, sandcastle_skip_if, @@ -69,6 +70,11 @@ ) sys.exit(0) +# bfloat16 is only supported by CUDA 11+ +BFLOAT16_AVAILABLE = ( + torch.cuda.is_available() + and torch.version.cuda is not None + and int(torch.version.cuda.split('.')[0]) >= 11) class RendezvousEnvTest(TestCase): @retry_on_connect_failures @@ -1559,15 +1565,23 @@ def allreduce_hook( def _test_default_ddp_comm_hooks_nccl(self, gradient_as_bucket_view=False): """ - This unit test verifies whether default Python DDP communication hooks ALLREDUCE and FP16_COMPRESS - can give the same result with the case of no hook registered. + This unit test verifies whether default Python DDP communication hooks ALLREDUCE, FP16_COMPRESS + and BF16_COMPRESS, can give the same result with the case of no hook registered. """ store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) # For these default DDP comm hooks, the only state is process group. state = process_group - for hook in [default.allreduce_hook, default.fp16_compress_hook]: + hook_options = [default.allreduce_hook, default.fp16_compress_hook] + if ( + not TEST_WITH_ROCM + and BFLOAT16_AVAILABLE + and c10d.is_nccl_available() + and torch.cuda.nccl.version() >= (2, 9, 7) + ): + hook_options.append(default.bf16_compress_hook) + for hook in hook_options: # Get GPU model with the hook registered. # The first arg 'process_group' is used for initializing the test environment, # so it cannot be replaced by 'state', although they have the same value. @@ -1603,6 +1617,31 @@ def _test_fp16_compress_wrapper(self, gradient_as_bucket_view=False): # check whether the grads are equal to what DDP without hook would return. self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + def _test_bf16_compress_wrapper(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether wrapping the ALLREDUCE and POWER_SGD hooks with + the BF16_WRAPPER can give the same result as when there is no hook registered. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + powerSGD_state = powerSGD.PowerSGDState(process_group=process_group) + + hook_args = [ + (powerSGD.powerSGD_hook, powerSGD_state), + (default.allreduce_hook, process_group), + ] + + for hook, state in hook_args: + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, + default.bf16_compress_wrapper(hook), + gradient_as_bucket_view, + state, + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + def _test_hook_then_optimizer( self, functional_optim_cls, @@ -1709,6 +1748,17 @@ def test_default_ddp_comm_hooks_nccl(self): def test_fp16_compress_wrapper_nccl(self): self._test_fp16_compress_wrapper() + @requires_nccl() + @requires_nccl_version((2, 9, 7), "Need NCCL 2.9.7+ for BF16_COMPRESS") + @sandcastle_skip_if( + not BFLOAT16_AVAILABLE, + "BFloat16 is only supported by CUDA 11+", + ) + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_bf16_compress_wrapper_nccl(self): + self._test_bf16_compress_wrapper() + @requires_nccl() @skip_if_lt_x_gpu(2) def test_hook_then_sgd_nccl(self): @@ -1808,6 +1858,17 @@ def test_default_ddp_comm_hooks_nccl_is_view(self): def test_fp16_compress_wrapper_is_view(self): self._test_fp16_compress_wrapper(gradient_as_bucket_view=True) + @requires_nccl() + @requires_nccl_version((2, 9, 7), "Need NCCL 2.9.7+ for BF16_COMPRESS") + @sandcastle_skip_if( + not BFLOAT16_AVAILABLE, + "BFloat16 is only supported by CUDA 11+", + ) + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_bf16_compress_wrapper_is_view(self): + self._test_bf16_compress_wrapper(gradient_as_bucket_view=True) + @requires_nccl() @skip_if_lt_x_gpu(2) def test_builtin_ddp_comm_hooks_nccl_grad_is_view(self): diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index e3ee14da0f542..bd50bba3606b9 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -50,6 +50,15 @@ const inline char* getNcclErrorDetailStr(ncclResult_t error) { #define ENABLE_NCCL_P2P_SUPPORT #endif +// NCCL BFloat16 is enabled only for CUDA 11+ and NCCL versions 2.9.7+ +#if (defined(__CUDA_BF16_TYPES_EXIST__) && \ + defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && \ + (defined(NCCL_MINOR) && ((NCCL_MINOR > 9) || \ + ((NCCL_MINOR == 9) && defined(NCCL_PATCH) && (NCCL_PATCH >= 7))))) || \ + (defined(__HIP_PLATFORM_HCC__) && (TORCH_HIP_VERSION >= 301)) +#define ENABLE_NCCL_BF16_DATATYPE +#endif + // Macro to throw on a non-successful NCCL return value. #define C10D_NCCL_CHECK(cmd) \ do { \ diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 3c7041a2dd691..911963b76cd7c 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -63,7 +63,7 @@ std::map ncclDataType = { {at::kLong, ncclInt64}, {at::kHalf, ncclHalf}, {at::kBool, ncclUint8}, -#if defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION >= 301 +#if defined(ENABLE_NCCL_BF16_DATATYPE) {at::kBFloat16, ncclBfloat16}, #endif }; diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py index 35ddf316e91c5..c3f3b066ee478 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py @@ -46,6 +46,9 @@ class DDPCommHookType(Enum): FP16_COMPRESS = partial( _ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook ) + BF16_COMPRESS = partial( + _ddp_comm_hook_wrapper, comm_hook=default.bf16_compress_hook + ) QUANTIZE_PER_TENSOR = partial( _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook ) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py b/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py index 0642deace3565..d11e39b23f6f0 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py @@ -69,6 +69,41 @@ def decompress(fut): return fut.then(decompress) +# TODO: create an internal helper function and extract the duplicate code in FP16_compress and BF16_compress. +def bf16_compress_hook( + process_group: dist.ProcessGroup, bucket: dist.GradBucket +) -> torch.futures.Future[torch.Tensor]: + """ + Warning: This API is experimental, and it requires NCCL version later than 2.9.6. + + This DDP communication hook implements a simple gradient compression + approach that casts ``GradBucket`` tensor to half-precision + `Brain floating point format `_ (``torch.bfloat16``) + and then divides it by the process group size. + It allreduces those ``bfloat16`` gradient tensors. Once compressed gradient + tensors are allreduced, the chained callback ``decompress`` casts it back to the input data type (such as ``float32``). + + Example:: + >>> ddp_model.register_comm_hook(process_group, bf16_compress_hook) + """ + group_to_use = process_group if process_group is not None else dist.group.WORLD + world_size = group_to_use.size() + + compressed_tensor = bucket.buffer().to(torch.bfloat16).div_(world_size) + + fut = dist.all_reduce( + compressed_tensor, group=group_to_use, async_op=True + ).get_future() + + def decompress(fut): + decompressed_tensor = bucket.buffer() + # Decompress in place to reduce the peak memory. + # See: https://github.com/pytorch/pytorch/issues/45968 + decompressed_tensor.copy_(fut.value()[0]) + return decompressed_tensor + + return fut.then(decompress) + class _OptimizerHookState(object): """ @@ -160,3 +195,40 @@ def decompress(fut): return fut.then(decompress) return fp16_compress_wrapper_hook + +def bf16_compress_wrapper( + hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]] +) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]: + """ + Warning: This API is experimental, and it requires NCCL version later than 2.9.6. + + This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision + `Brain floating point format `_ (``torch.bfloat16``), + and casts the resulting tensor of the given hook back to the input data type, such as ``float32``. + + Therefore, ``bf16_compress_hook`` is equivalent to ``bf16_compress_wrapper(allreduce_hook)``. + + Example:: + >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10) + >>> ddp_model.register_comm_hook(state, bf16_compress_wrapper(powerSGD_hook)) + """ + + def bf16_compress_wrapper_hook( + hook_state, bucket: dist.GradBucket + ) -> torch.futures.Future[torch.Tensor]: + # Cast bucket tensor to BF16. + bucket.set_buffer(bucket.buffer().to(torch.bfloat16)) + + fut = hook(hook_state, bucket) + + def decompress(fut): + decompressed_tensor = bucket.buffer() + # Decompress in place to reduce the peak memory. + # See: https://github.com/pytorch/pytorch/issues/45968 + decompressed_tensor.copy_(fut.value()) + return decompressed_tensor + + # Decompress after hook has run. + return fut.then(decompress) + + return bf16_compress_wrapper_hook diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index ed5c3656203ee..60d21431dc5bf 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -1408,6 +1408,19 @@ def _check_comm_hook(self, hook): "Communication hook: return annotation should be torch.futures.Future[torch.Tensor].", ) + if ( + hook.__name__ in ["bf16_compress_hook", "bf16_compress_wrapper_hook"] + and + ( + torch.version.cuda is None + or int(torch.version.cuda.split('.')[0]) < 11 + or not dist.is_available() + or not dist.is_nccl_available() + or torch.cuda.nccl.version() < (2, 9, 7) + ) + ): + self._log_and_throw(TypeError, "BF16 all reduce communication hook required CUDA 11+ and NCCL 2.9.7+.") + @property def _distributed_rank(self): return dist.get_rank(self.process_group) From 2544664e5470cd5012bd88d07fb573aa31ca8c60 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 18 Aug 2021 20:56:25 -0700 Subject: [PATCH 053/530] Beef up comment in AccumulateType (#63503) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63503 Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D30403160 Pulled By: ezyang fbshipit-source-id: 6cb24418152d9fb146f86b6f973ec50f1a397a58 --- aten/src/ATen/AccumulateType.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/AccumulateType.h b/aten/src/ATen/AccumulateType.h index 09c8cdb6c095a..4270ec021dbc7 100644 --- a/aten/src/ATen/AccumulateType.h +++ b/aten/src/ATen/AccumulateType.h @@ -6,7 +6,38 @@ // Defines the accumulation type for a scalar type. // Example: -// using accscalar_t = acc_type; +// using accscalar_t = acc_type; +// +// Accumulation types are an important concept in numeric computing +// because you frequently want to perform intermediate computations +// at a higher precision than the input and output precision, to avoid +// compounding internal rounding errors. Accumulation is the most +// well-known intermediate computation (it is of great importance for +// sum reduction and matrix multiply, for example), but in PyTorch +// acc_type ends up getting used for all sorts of other intermediate +// computations, so it perhaps would be more accurately (ahem) called an +// "accurate" type. acc_type is especially important for reduced +// precision operations like float16 and bfloat16, where relatively +// benign looking inputs can easily end up overflowing/underflowing. +// +// acc_type is parametrized by whether or not you are running on CUDA +// or not, because on CUDA double precision operations are expensive +// and so by default, we don't actually want to use double as an +// acc_type on CUDA. A lot of things are typed out below, but +// basically, the table is generated by a few rules: +// +// If bool: +// Use 'bool' as acc_type. +// If floating point: +// If CUDA, use 'float' as acc_type (unless scalar_t is double), +// otherwise (CPU) use 'double' +// If integral: +// Use 'int64_t' as acc_type +// +// You're not forced to use this template; if you happen to know +// something specific about your use case, you can specify your own +// desired behavior. This template, however, will give you a reasonable +// default that will work for all dtypes supported in PyTorch. #if defined(__CUDACC__) #include From 773c8b6440eee7d903f3d3a80d1fa866370b1ba2 Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Wed, 18 Aug 2021 21:39:18 -0700 Subject: [PATCH 054/530] support optional comparisons with different but comparable types (#62890) Summary: Fixes https://github.com/pytorch/pytorch/issues/62565 Pull Request resolved: https://github.com/pytorch/pytorch/pull/62890 Reviewed By: ejguan Differential Revision: D30396008 Pulled By: dagitses fbshipit-source-id: fca02207509f882973d54484f89c4d116505fc66 --- c10/test/util/optional_test.cpp | 8 +++++- c10/util/Optional.h | 48 ++++++++++++++++----------------- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/c10/test/util/optional_test.cpp b/c10/test/util/optional_test.cpp index cac325f9188ab..ac976b4b16f79 100644 --- a/c10/test/util/optional_test.cpp +++ b/c10/test/util/optional_test.cpp @@ -146,10 +146,16 @@ TEST(OptionalTest, Nullopt) { using CmpTestTypes = testing::Types< // between two optionals std::pair, c10::optional>, + // between an optional and a value std::pair, int>, // between a value and an optional - std::pair>>; + std::pair>, + + // between an optional and a differently typed value + std::pair, long>, + // between a differently typed value and an optional + std::pair>>; template class CmpTest : public testing::Test {}; TYPED_TEST_CASE(CmpTest, CmpTestTypes); diff --git a/c10/util/Optional.h b/c10/util/Optional.h index 5e0684bb7d2f5..7044c798d2de4 100644 --- a/c10/util/Optional.h +++ b/c10/util/Optional.h @@ -1049,63 +1049,63 @@ constexpr bool operator>=(nullopt_t, const optional& x) noexcept { } // 20.5.10, Comparison with T -template -constexpr bool operator==(const optional& x, const T& v) { +template +constexpr bool operator==(const optional& x, const U& v) { return bool(x) ? *x == v : false; } -template -constexpr bool operator==(const T& v, const optional& x) { +template +constexpr bool operator==(const U& v, const optional& x) { return bool(x) ? v == *x : false; } -template -constexpr bool operator!=(const optional& x, const T& v) { +template +constexpr bool operator!=(const optional& x, const U& v) { return bool(x) ? *x != v : true; } -template -constexpr bool operator!=(const T& v, const optional& x) { +template +constexpr bool operator!=(const U& v, const optional& x) { return bool(x) ? v != *x : true; } -template -constexpr bool operator<(const optional& x, const T& v) { +template +constexpr bool operator<(const optional& x, const U& v) { return bool(x) ? *x < v : true; } -template -constexpr bool operator>(const T& v, const optional& x) { +template +constexpr bool operator>(const U& v, const optional& x) { return bool(x) ? v > *x : true; } -template -constexpr bool operator>(const optional& x, const T& v) { +template +constexpr bool operator>(const optional& x, const U& v) { return bool(x) ? *x > v : false; } -template -constexpr bool operator<(const T& v, const optional& x) { +template +constexpr bool operator<(const U& v, const optional& x) { return bool(x) ? v < *x : false; } -template -constexpr bool operator>=(const optional& x, const T& v) { +template +constexpr bool operator>=(const optional& x, const U& v) { return bool(x) ? *x >= v : false; } -template -constexpr bool operator<=(const T& v, const optional& x) { +template +constexpr bool operator<=(const U& v, const optional& x) { return bool(x) ? v <= *x : false; } -template -constexpr bool operator<=(const optional& x, const T& v) { +template +constexpr bool operator<=(const optional& x, const U& v) { return bool(x) ? *x <= v : true; } -template -constexpr bool operator>=(const T& v, const optional& x) { +template +constexpr bool operator>=(const U& v, const optional& x) { return bool(x) ? v >= *x : true; } From 1d62fb8a63a4c69f791f623e913ee1eabb56c344 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Wed, 18 Aug 2021 22:56:47 -0700 Subject: [PATCH 055/530] [TensorExpr] Speedup ExternalCall.ComputeInterop test by reducing tensor sizes. (#63526) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63526 Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D30411410 Pulled By: ZolotukhinM fbshipit-source-id: d9a99afac14d2238b5100c98ae9ed4467f9f05ea --- test/cpp/tensorexpr/test_external_calls.cpp | 22 ++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp index 9ae99ca5d3b2f..24ddfbf095ab3 100644 --- a/test/cpp/tensorexpr/test_external_calls.cpp +++ b/test/cpp/tensorexpr/test_external_calls.cpp @@ -563,12 +563,12 @@ TEST(ExternalCall, ComputeInterop) { // use Tensors built with Compute API. KernelScope kernel_scope; - BufHandle ConvResultBuf("ConvResult", {1, 16, 112, 112}, kFloat); - BufHandle MatmulResultBuf("MatmulResult", {1, 16, 112, 112}, kFloat); + BufHandle ConvResultBuf("ConvResult", {1, 16, 32, 32}, kFloat); + BufHandle MatmulResultBuf("MatmulResult", {1, 16, 32, 32}, kFloat); Tensor* Input = Compute( "Input", - {{1, "n"}, {16, "c"}, {112, "h"}, {112, "w"}}, + {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, @@ -597,7 +597,7 @@ TEST(ExternalCall, ComputeInterop) { {})); Tensor* Result = Compute( "Result", - {{1, "n"}, {16, "c"}, {112, "h"}, {112, "w"}}, + {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, @@ -619,18 +619,18 @@ TEST(ExternalCall, ComputeInterop) { .layout(at::kStrided) .device(at::kCPU) .requires_grad(false); - at::Tensor input = at::ones({1, 16, 112, 112}, options) * 5.f; + at::Tensor input = at::ones({1, 16, 32, 32}, options) * 5.f; at::Tensor weight = at::ones({16, 16, 1, 1}, options) * 6.f; at::Tensor t = at::conv2d(input, weight); at::Tensor t2 = at::matmul(t, t); at::Tensor ref = t + t2; at::Tensor nnc_result; - std::vector input_buf(1 * 16 * 112 * 112, 5.f); + std::vector input_buf(1 * 16 * 32 * 32, 5.f); std::vector weight_buf(16 * 16 * 1 * 1, 6.f); - std::vector conv_result_buf(1 * 16 * 112 * 112, -1.f); - std::vector matmul_result_buf(1 * 16 * 112 * 112, -1.f); - std::vector result_buf(1 * 16 * 112 * 112, -1.f); + std::vector conv_result_buf(1 * 16 * 32 * 32, -1.f); + std::vector matmul_result_buf(1 * 16 * 32 * 32, -1.f); + std::vector result_buf(1 * 16 * 32 * 32, -1.f); #ifdef TORCH_ENABLE_LLVM LLVMCodeGen llvm_codegen( @@ -638,7 +638,7 @@ TEST(ExternalCall, ComputeInterop) { llvm_codegen.call( {input_buf, weight_buf, conv_result_buf, matmul_result_buf, result_buf}); - nnc_result = at::from_blob(result_buf.data(), {1, 16, 112, 112}, options); + nnc_result = at::from_blob(result_buf.data(), {1, 16, 32, 32}, options); ASSERT_TRUE(at::allclose(nnc_result, ref)); #endif @@ -647,7 +647,7 @@ TEST(ExternalCall, ComputeInterop) { ir_eval.call( {input_buf, weight_buf, conv_result_buf, matmul_result_buf, result_buf}); - nnc_result = at::from_blob(result_buf.data(), {1, 16, 112, 112}, options); + nnc_result = at::from_blob(result_buf.data(), {1, 16, 32, 32}, options); ASSERT_TRUE(at::allclose(nnc_result, ref)); } From 6e00b31b15ba9a09b6aa71b0da1ba200be482011 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Wed, 18 Aug 2021 22:56:47 -0700 Subject: [PATCH 056/530] [TensorExpr] Make CacheReplacer and IndexFlattener mutate stmts/exprs inplace. (#63527) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63527 Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D30411411 Pulled By: ZolotukhinM fbshipit-source-id: efb14ee57b36537fa4fefa89bdd6bafe7151c012 --- test/cpp/tensorexpr/test_loopnest.cpp | 2 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 66 +++++++++++++--------- torch/csrc/jit/tensorexpr/registerizer.cpp | 6 +- 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index f2ae208ca7fed..4a2a1d07db12e 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -4017,7 +4017,7 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) { // Will eliminate the write to g, but not f since it used by the producer of // h. - LoopNest loop(stmt, {h.node()}); + LoopNest loop(Stmt::clone(stmt), {h.node()}); loop.eliminateDeadStores(); checkIR(loop.root_stmt(), R"IR( diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index e9bc76c6e8791..2256369e2e9e2 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -109,12 +109,13 @@ class IndexFlattener : public IRMutator { ExprPtr value = v->value(); ExprPtr new_value = value->accept_mutator(this); if (v->indices().size() == 1 && value == new_value) { - return (StmtPtr)v; + return v; } - return alloc( - v->buf(), - std::vector({flatten_index(v->buf()->dims(), v->indices())}), - new_value); + std::vector indices = { + flatten_index(v->buf()->dims(), v->indices())}; + v->set_indices(indices); + v->set_value(new_value); + return v; } }; @@ -2575,8 +2576,9 @@ class CacheReplacer : public IRMutator { ExprPtr sub = IRSimplifier::simplify(alloc(index, offset)); newIndices.push_back(sub); } - - return alloc(cache_, newIndices); + v->set_buf(cache_); + v->set_indices(newIndices); + return v; } StmtPtr mutate(StorePtr v) override { @@ -2596,8 +2598,10 @@ class CacheReplacer : public IRMutator { ExprPtr sub = IRSimplifier::simplify(alloc(index, offset)); newIndices.push_back(sub); } - - return alloc(cache_, newIndices, newValue); + v->set_buf(cache_); + v->set_indices(newIndices); + v->set_value(newValue); + return v; } BufPtr buf_; @@ -2669,21 +2673,13 @@ LoopNest::AccessResult LoopNest::cacheAccesses( // Replace acceses to the producer in the consumer with the cache. CacheReplacer replacer(producer, tmp_buf, info.start); - // TODO: Can we reuse 'consumer' below without cloning? - StmtPtr new_consumer = - IRSimplifier::simplify(Stmt::clone(consumer)->accept_mutator(&replacer)); + consumer->accept_mutator(&replacer); // replace the old consumer with the replaced consumer. - BlockPtr consumer_block = nullptr; + BlockPtr consumer_block = to(consumer); + BlockPtr parent_block = to(consumer->get_parent()); // if the consumer is a block, we should mutate it in place. - if ((consumer_block = to(consumer))) { - consumer_block->clear(); - consumer_block->append_stmt(new_consumer); - } else { - consumer_block = to(consumer->get_parent()); - assert(consumer_block); - consumer_block->replace_stmt(consumer, new_consumer); - } + bool is_block = consumer_block != nullptr; // If there's a reduction and we are operating on the reduce axis, we need to // initialize the cache with 0s. Also, we can't just write the result straight @@ -2715,7 +2711,11 @@ LoopNest::AccessResult LoopNest::cacheAccesses( alloc(new_loop_vars[i], alloc(0), tmp_dims[i], tmp_init); } - consumer_block->insert_stmt_before(tmp_init, new_consumer); + if (is_block) { + consumer_block->prepend_stmt(tmp_init); + } else { + parent_block->insert_stmt_before(tmp_init, consumer); + } // Reduce back to the original buffer: StmtPtr tmp_store = alloc( @@ -2732,9 +2732,13 @@ LoopNest::AccessResult LoopNest::cacheAccesses( new_loop_vars[i], alloc(0), tmp_dims[i], tmp_store); } - consumer_block->insert_stmt_after(tmp_store, new_consumer); + if (is_block) { + consumer_block->append_stmt(tmp_store); + } else { + parent_block->insert_stmt_after(tmp_store, consumer); + } - return std::make_pair(tmp_buf, new_consumer); + return std::make_pair(tmp_buf, consumer); } if (hasReads) { @@ -2747,7 +2751,11 @@ LoopNest::AccessResult LoopNest::cacheAccesses( new_loop_vars[i], alloc(0), tmp_dims[i], tmp_store); } - consumer_block->insert_stmt_before(tmp_store, new_consumer); + if (is_block) { + consumer_block->prepend_stmt(tmp_store); + } else { + parent_block->insert_stmt_before(tmp_store, consumer); + } } if (hasWrites) { @@ -2760,10 +2768,14 @@ LoopNest::AccessResult LoopNest::cacheAccesses( new_loop_vars[i], alloc(0), tmp_dims[i], tmp_store); } - consumer_block->insert_stmt_after(tmp_store, new_consumer); + if (is_block) { + consumer_block->append_stmt(tmp_store); + } else { + parent_block->insert_stmt_after(tmp_store, consumer); + } } - return std::make_pair(tmp_buf, new_consumer); + return std::make_pair(tmp_buf, consumer); } /* diff --git a/torch/csrc/jit/tensorexpr/registerizer.cpp b/torch/csrc/jit/tensorexpr/registerizer.cpp index 07aee209e6e53..bc26581970383 100644 --- a/torch/csrc/jit/tensorexpr/registerizer.cpp +++ b/torch/csrc/jit/tensorexpr/registerizer.cpp @@ -668,8 +668,10 @@ StmtPtr RegisterizerReplacer::mutate(StorePtr v) { ExprPtr new_val = v->value()->accept_mutator(this); - return alloc( - info->replacement().var_wrapper, std::vector({}), new_val); + v->set_value(new_val); + v->set_buf(info->replacement().var_wrapper); + v->set_indices({}); + return v; } StmtPtr RegisterizerReplacer::mutate(BlockPtr v) { From 531262fe2eee7e2b4464f5eceec1877dd57e2deb Mon Sep 17 00:00:00 2001 From: Nikolay Korovaiko Date: Wed, 18 Aug 2021 22:59:40 -0700 Subject: [PATCH 057/530] layernorm (#63436) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63436 use MKLDNN layernorm use mkldnn version 2 address Elias feedback fix build CI errors Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D30388825 Pulled By: Krovatkin fbshipit-source-id: fb909bfbf53cb8567a43aac40f51c491daeec908 --- aten/src/ATen/native/mkldnn/Normalization.cpp | 51 ++++++++++++++++++ aten/src/ATen/native/mkldnn/Utils.h | 7 +++ test/jit/test_freezing.py | 26 ++++++++- .../csrc/jit/passes/frozen_ops_to_mkldnn.cpp | 54 +++++++++++++++++++ 4 files changed, 137 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/mkldnn/Normalization.cpp b/aten/src/ATen/native/mkldnn/Normalization.cpp index 9836f3560d038..3a151828f236c 100644 --- a/aten/src/ATen/native/mkldnn/Normalization.cpp +++ b/aten/src/ATen/native/mkldnn/Normalization.cpp @@ -25,6 +25,13 @@ std::tuple mkldnn_batch_norm_backward( TORCH_CHECK(false, "mkldnn_batch_norm_backward: ATen not compiled with MKLDNN support"); } +std::tuple mkldnn_layer_norm_last_index_weight_bias_f32( + const Tensor& input, + IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias, + double eps) { + TORCH_CHECK(false, "mkldnn_layer_norm_last_index_weight_bias_f32: ATen not compiled with MKLDNN support"); +} + } // namespace native } // namespace at @@ -32,10 +39,54 @@ std::tuple mkldnn_batch_norm_backward( #include #include +#include +#include namespace at { namespace native { +std::tuple mkldnn_layer_norm_last_index_weight_bias_f32( + const Tensor& input, + IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias, + double eps) { + + TORCH_INTERNAL_ASSERT(normalized_shape.size() == 1, "only accept shapes with the last dimension"); + TORCH_INTERNAL_ASSERT(input.scalar_type() == at::kFloat); + auto M_N = at::native::_check_layer_norm_inputs(input, normalized_shape, weight, bias); + auto M = M_N.first; + + auto mean = empty_mkldnn( + {M}, + input.scalar_type(), + input.options().layout_opt(), + input.options().device_opt(), + input.options().pinned_memory_opt()); + auto rstd = empty_mkldnn( + {M}, + input.scalar_type(), + input.options().layout_opt(), + input.options().device_opt(), + input.options().pinned_memory_opt()); + + auto mean_it = at::native::itensor_from_mkldnn(mean); + auto rstd_it = at::native::itensor_from_mkldnn(rstd); + + auto input_it = at::native::itensor_from_mkldnn(input); + auto weight_it = at::native::itensor_from_mkldnn(weight); + auto bias_it = at::native::itensor_from_mkldnn(bias); + + auto out_it = ideep::tensor(input_it.get_desc()); + ideep::layer_normalization_forward::compute(input_it, weight_it, bias_it, out_it, mean_it, rstd_it, static_cast(eps)); + + auto dst = at::native::new_with_itensor_mkldnn( + std::move(out_it), + optTypeMetaToScalarType(input.options().dtype_opt()), + input.options().device_opt()); + + return std::make_tuple(dst, mean, rstd); +} + + std::tuple mkldnn_batch_norm( const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h index f2e4e8f9056df..60a7d457fe12a 100644 --- a/aten/src/ATen/native/mkldnn/Utils.h +++ b/aten/src/ATen/native/mkldnn/Utils.h @@ -1,11 +1,18 @@ #pragma once #include +#include #include #include + namespace at { namespace native { +std::tuple mkldnn_layer_norm_last_index_weight_bias_f32( + const Tensor& input, + IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias, + double eps); + std::vector pool_output_sizes( IntArrayRef input_size, IntArrayRef kernel_size, diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index 1e2037e59a0ba..5c70b2a1ec233 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -1927,6 +1927,30 @@ def test_pool3d_batchnorm(self): FileCheck().check("aten::to_dense").check_next("return").run(mod.graph) self.assertTrue(torch.allclose(sub_model(inp), mod(inp))) + @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled") + @skipIfNoTorchVision + def test_layernorm(self): + with set_default_dtype(torch.float): + model = torchvision.models.resnet18() + N, C, H, W, = 10, 3, 224, 224 + for param in ((model.conv1, [W // 2], torch.randn(N, C, H, W)), + (model.conv1, [H // 2, W // 2], torch.randn(N, C, H, W)), + (torch.nn.Linear(H, W), [W], torch.randn(N, C, W)),): + + for layernorm in (torch.nn.LayerNorm(param[1]), + torch.nn.LayerNorm(param[1], elementwise_affine=False)): + sub_model = torch.nn.Sequential(param[0], layernorm) + sub_model.eval() + mod = torch.jit.freeze(torch.jit.script(sub_model)) + self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph) + # if weight and bias are present and shape is the last dimension + # we should convert `aten::layer_norm` to `prim::MKLDNNLayerNorm` + if layernorm.elementwise_affine and len(param[1]) == 1: + FileCheck().check("prim::MKLDNNLayerNorm").check_count("aten::to_dense", 1, exactly=True).run(mod.graph) + else: + FileCheck().check_count("aten::to_dense", 1, exactly=True).run(mod.graph) + self.assertTrue(torch.allclose(sub_model(param[2]), mod(param[2]), 1e-04, 1e-04)) + @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled") @skipIfNoTorchVision def test_conv_hardswish(self): @@ -1940,6 +1964,7 @@ def __init__(self, min_val, max_val, **kwargs): def forward(self, x): return torch.clamp(x, self.min_val, self.max_val) + N, C, H, W, = 10, 3, 224, 224 activations = [ torch.nn.Hardswish(), torch.nn.Hardsigmoid(), @@ -1960,7 +1985,6 @@ def forward(self, x): sub_model = torch.nn.Sequential(model.conv1, activation) sub_model.eval() mod = torch.jit.freeze(torch.jit.script(sub_model)) - N, C, H, W, = 10, 3, 224, 224 inp = torch.randn(N, C, H, W) self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph) FileCheck().check_count("aten::to_dense", 1, exactly=True).run(mod.graph) diff --git a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp index e6faf90b6f2b6..3358b9db26808 100644 --- a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp +++ b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp @@ -1,9 +1,11 @@ #include #include #include +#include #include #include #include + #include #include #include @@ -33,6 +35,7 @@ #if AT_MKLDNN_ENABLED() #include #include +#include #include #include #endif @@ -271,6 +274,33 @@ Operation createUnaryOp( }; } +void MKLDNNLayerNormOp(Stack* stack) { + c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset); + + // enable_cudnn not used + pop(stack); + auto eps = pop(stack).toDouble(); + + Tensor bias{}; + Tensor weight{}; + auto bias_ival = pop(stack); + TORCH_INTERNAL_ASSERT(bias_ival.isTensor()); + bias = bias_ival.toTensor(); + + auto weight_ival = pop(stack); + TORCH_INTERNAL_ASSERT(weight_ival.isTensor()); + weight = weight_ival.toTensor(); + + auto shape = pop(stack).toIntVector(); + auto input = pop(stack).toTensor(); + + at::Tensor dst, mean, rstd; + std::tie(dst, mean, rstd) = + at::native::mkldnn_layer_norm_last_index_weight_bias_f32( + input, shape, weight, bias, eps); + push(stack, dst); +}; + Operation BroadOp(const Node* node) { return [](Stack* stack) { auto b = pop(stack).toTensor(); @@ -437,6 +467,13 @@ const RegisterOperators BroadOpReg({ AliasAnalysisKind::INTERNAL_SPECIAL_CASE), }); +const RegisterOperators MKLDNNLayerNormOpReg({ + torch::jit::Operator( + "prim::MKLDNNLayerNorm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor", + MKLDNNLayerNormOp, + AliasAnalysisKind::FROM_SCHEMA), +}); + Operation ConstantMKLDNNTensorOp(const Node* node) { const auto& t = node->t(attr::value); return [t](Stack* stack) { @@ -719,6 +756,13 @@ void ComputeSubgraphInMKLDNN(Node* subgraph_node) { continue; } + if (body_node->matches( + "aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor")) { + body_node->replaceWithNewSymbol(Symbol::prim("MKLDNNLayerNorm")); + body_node->destroy(); + continue; + } + if (body_node->kind() == aten::hardswish) { body_node->replaceWithNewSymbol(prim::MKLDNNHardSwish); body_node->destroy(); @@ -917,6 +961,16 @@ class MKLDNNSubgraphSlicer { return false; } } + + if (n->matches( + "aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor") && + n->namedInput("weight")->type() != NoneType::get() && + n->namedInput("bias")->type() != NoneType::get()) { + auto norm_shape = + constant_as>(n->namedInput("normalized_shape")); + return norm_shape.has_value() && norm_shape->size() == 1; + } + // unary ops we dont need to prove anything else than // the input is mkldnn supported switch (n->kind()) { From 5254e3adb8b2b14e71aee95fffe467a70fd2f93a Mon Sep 17 00:00:00 2001 From: Nikolay Korovaiko Date: Wed, 18 Aug 2021 22:59:40 -0700 Subject: [PATCH 058/530] layernom inplace (#63437) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63437 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D30388824 Pulled By: Krovatkin fbshipit-source-id: 852d19bf238544c5de177ed5854dcd01c7ae5572 --- aten/src/ATen/native/mkldnn/Normalization.cpp | 6 +-- aten/src/ATen/native/mkldnn/Utils.h | 2 +- test/jit/test_freezing.py | 37 +++++++++++++------ .../csrc/jit/passes/frozen_ops_to_mkldnn.cpp | 13 +++++-- 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/native/mkldnn/Normalization.cpp b/aten/src/ATen/native/mkldnn/Normalization.cpp index 3a151828f236c..f01bbb3d2b4bd 100644 --- a/aten/src/ATen/native/mkldnn/Normalization.cpp +++ b/aten/src/ATen/native/mkldnn/Normalization.cpp @@ -28,7 +28,7 @@ std::tuple mkldnn_batch_norm_backward( std::tuple mkldnn_layer_norm_last_index_weight_bias_f32( const Tensor& input, IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias, - double eps) { + double eps, bool inplace) { TORCH_CHECK(false, "mkldnn_layer_norm_last_index_weight_bias_f32: ATen not compiled with MKLDNN support"); } @@ -48,7 +48,7 @@ namespace native { std::tuple mkldnn_layer_norm_last_index_weight_bias_f32( const Tensor& input, IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias, - double eps) { + double eps, bool inplace) { TORCH_INTERNAL_ASSERT(normalized_shape.size() == 1, "only accept shapes with the last dimension"); TORCH_INTERNAL_ASSERT(input.scalar_type() == at::kFloat); @@ -75,7 +75,7 @@ std::tuple mkldnn_layer_norm_last_index_weight_bias_f32( auto weight_it = at::native::itensor_from_mkldnn(weight); auto bias_it = at::native::itensor_from_mkldnn(bias); - auto out_it = ideep::tensor(input_it.get_desc()); + auto out_it = inplace ? input_it : ideep::tensor(input_it.get_desc()); ideep::layer_normalization_forward::compute(input_it, weight_it, bias_it, out_it, mean_it, rstd_it, static_cast(eps)); auto dst = at::native::new_with_itensor_mkldnn( diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h index 60a7d457fe12a..abfafd5230e98 100644 --- a/aten/src/ATen/native/mkldnn/Utils.h +++ b/aten/src/ATen/native/mkldnn/Utils.h @@ -11,7 +11,7 @@ namespace at { namespace native { std::tuple mkldnn_layer_norm_last_index_weight_bias_f32( const Tensor& input, IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias, - double eps); + double eps, bool inplace = false); std::vector pool_output_sizes( IntArrayRef input_size, diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index 5c70b2a1ec233..8e07af06b70ea 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -1931,6 +1931,17 @@ def test_pool3d_batchnorm(self): @skipIfNoTorchVision def test_layernorm(self): with set_default_dtype(torch.float): + + class ResidualLayernorm(torch.nn.Module): + def __init__(self, op, layernorm, **kwargs): + super(ResidualLayernorm, self).__init__() + self.op = op + self.layernorm = layernorm + + def forward(self, x): + y = self.op(x) + return self.layernorm(y) + y + model = torchvision.models.resnet18() N, C, H, W, = 10, 3, 224, 224 for param in ((model.conv1, [W // 2], torch.randn(N, C, H, W)), @@ -1939,17 +1950,21 @@ def test_layernorm(self): for layernorm in (torch.nn.LayerNorm(param[1]), torch.nn.LayerNorm(param[1], elementwise_affine=False)): - sub_model = torch.nn.Sequential(param[0], layernorm) - sub_model.eval() - mod = torch.jit.freeze(torch.jit.script(sub_model)) - self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph) - # if weight and bias are present and shape is the last dimension - # we should convert `aten::layer_norm` to `prim::MKLDNNLayerNorm` - if layernorm.elementwise_affine and len(param[1]) == 1: - FileCheck().check("prim::MKLDNNLayerNorm").check_count("aten::to_dense", 1, exactly=True).run(mod.graph) - else: - FileCheck().check_count("aten::to_dense", 1, exactly=True).run(mod.graph) - self.assertTrue(torch.allclose(sub_model(param[2]), mod(param[2]), 1e-04, 1e-04)) + # to generate non inplace tests we extend the use of layernorm's input + for inplace in (True, False): + sub_model = torch.nn.Sequential(param[0], layernorm) if inplace else ResidualLayernorm(param[0], layernorm) + sub_model.eval() + mod = torch.jit.freeze(torch.jit.script(sub_model)) + self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph) + # if weight and bias are present and shape is the last dimension + # we should convert `aten::layer_norm` to `prim::MKLDNNLayerNorm` + if layernorm.elementwise_affine and len(param[1]) == 1: + inplace_suffix = "_" if inplace else "" + (FileCheck().check("prim::MKLDNNLayerNorm" + inplace_suffix). + check_count("aten::to_dense", 1, exactly=True).run(mod.graph)) + else: + FileCheck().check_count("aten::to_dense", 1, exactly=True).check("aten::layer_norm").run(mod.graph) + self.assertTrue(torch.allclose(sub_model(param[2]), mod(param[2]), 1e-04, 1e-04)) @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled") @skipIfNoTorchVision diff --git a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp index 3358b9db26808..6d218af06e34c 100644 --- a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp +++ b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp @@ -185,7 +185,8 @@ void InplaceMKLDNNSubgraph(std::shared_ptr graph) { if (k == aten::relu || k == aten::sigmoid || k == aten::dropout || k == prim::MKLDNNHardSwish || k == prim::MKLDNNHardSigmoid || k == prim::MKLDNNHardTanh || k == aten::tanh || - k == prim::MKLDNNClamp || k == Symbol::prim("MKLDNNScalarMul")) { + k == prim::MKLDNNClamp || k == Symbol::prim("MKLDNNScalarMul") || + k == Symbol::prim("MKLDNNLayerNorm")) { if (set_liveness[alias_mapping[node->inputs().at(0)]]->isAfter(node)) { continue; } @@ -274,7 +275,7 @@ Operation createUnaryOp( }; } -void MKLDNNLayerNormOp(Stack* stack) { +void MKLDNNLayerNormOp(Stack* stack, bool inplace) { c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset); // enable_cudnn not used @@ -297,7 +298,7 @@ void MKLDNNLayerNormOp(Stack* stack) { at::Tensor dst, mean, rstd; std::tie(dst, mean, rstd) = at::native::mkldnn_layer_norm_last_index_weight_bias_f32( - input, shape, weight, bias, eps); + input, shape, weight, bias, eps, inplace); push(stack, dst); }; @@ -470,7 +471,11 @@ const RegisterOperators BroadOpReg({ const RegisterOperators MKLDNNLayerNormOpReg({ torch::jit::Operator( "prim::MKLDNNLayerNorm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor", - MKLDNNLayerNormOp, + [](Stack* stack) { MKLDNNLayerNormOp(stack, false); }, + AliasAnalysisKind::FROM_SCHEMA), + torch::jit::Operator( + "prim::MKLDNNLayerNorm_(Tensor(a!) input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor(a!)", + [](Stack* stack) { MKLDNNLayerNormOp(stack, true); }, AliasAnalysisKind::FROM_SCHEMA), }); From 6bb68ba5075a9948e9f52246453e964749226098 Mon Sep 17 00:00:00 2001 From: Chen Lai Date: Thu, 19 Aug 2021 02:12:44 -0700 Subject: [PATCH 059/530] Fix interpreter debug logging message (#63499) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63499 https://github.com/pytorch/pytorch/pull/62418 combine the instruction and debug handle. This change fix the debugging message. ghstack-source-id: 136184053 Test Plan: Uncomment and it works Reviewed By: kimishpatel, raziel Differential Revision: D30390699 fbshipit-source-id: e32b7b297ad3b7d8bffebd025d15519083a244c4 --- torch/csrc/jit/mobile/interpreter.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp index 275b84beba97b..02e7c35792693 100644 --- a/torch/csrc/jit/mobile/interpreter.cpp +++ b/torch/csrc/jit/mobile/interpreter.cpp @@ -58,14 +58,15 @@ bool InterpreterState::run(Stack& stack) { Instruction inst = inst_with_handle.instruction; DebugHandle debug_handle = inst_with_handle.debug_handle; - // std::cout << "RUNNING " << pc << " " << code_->instructions_[pc]; - // if (inst.op == OP) { - // std::cout << ", " << code_->op_names_[inst.X].name; - // if (!code_->op_names_[inst.X].overload_name.empty()) { - // std::cout << "." << code_->op_names_[inst.X].overload_name; - // } - // } - // std::cout << std::endl; + // std::cout << "RUNNING " << pc << " " + // << code_->instructions_with_handles_[pc].instruction; + // if (inst.op == OP) { + // std::cout << ", " << code_->op_names_[inst.X].name; + // if (!code_->op_names_[inst.X].overload_name.empty()) { + // std::cout << "." << code_->op_names_[inst.X].overload_name; + // } + // } + // std::cout << std::endl; // TODO(iliacher): remove the workaround after RecordFunction is in // Dispatcher From ce61100923b3b423a8c0dbbda4d551d0c1e8c358 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Thu, 19 Aug 2021 06:19:20 -0700 Subject: [PATCH 060/530] Revert D29399533: Hoisting common expressions out of If blocks Test Plan: revert-hammer Differential Revision: D29399533 (https://github.com/pytorch/pytorch/commit/9477211e7d609ce382c0e22d7721c14c36d083de) Original commit changeset: 9336b9dc48c0 fbshipit-source-id: f081c7280203f40328bcbb0c03a7c6a007acedb7 --- test/jit/test_if_hoisting.py | 213 ------------------ test/quantization/jit/test_quantize_jit.py | 3 +- test/test_jit.py | 1 - tools/build_variables.bzl | 1 - torch/_C/__init__.pyi.in | 1 - torch/csrc/jit/ir/node_hashing.cpp | 14 -- .../jit/passes/common_expression_hoisting.cpp | 153 ------------- .../jit/passes/common_expression_hoisting.h | 10 - .../jit/passes/symbolic_shape_analysis.cpp | 1 - torch/csrc/jit/python/init.cpp | 6 - torch/csrc/jit/runtime/graph_executor.cpp | 9 +- .../runtime/profiling_graph_executor_impl.cpp | 111 ++++++++- 12 files changed, 105 insertions(+), 418 deletions(-) delete mode 100644 test/jit/test_if_hoisting.py delete mode 100644 torch/csrc/jit/passes/common_expression_hoisting.cpp delete mode 100644 torch/csrc/jit/passes/common_expression_hoisting.h diff --git a/test/jit/test_if_hoisting.py b/test/jit/test_if_hoisting.py deleted file mode 100644 index c8fd4a4bab349..0000000000000 --- a/test/jit/test_if_hoisting.py +++ /dev/null @@ -1,213 +0,0 @@ - -import torch -from torch.testing import FileCheck -from torch.testing._internal.jit_utils import JitTestCase - -if __name__ == "__main__": - raise RuntimeError( - "This test file is not meant to be run directly, use:\n\n" - "\tpython test/test_jit.py TESTNAME\n\n" - "instead." - ) - - -class TestIfHoisting(JitTestCase): - def test_if_hoist_basic(self): - def fn(x: bool, y: int): - if x: - z = y + 3 - else: - z = y + 3 - return z - - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - self.assertEqual(fn(True, 1), fn_script(True, 1)) - - def test_if_hoist_transposed_expr(self): - """ - Making sure that we can properly eliminate - an expression even if it is not at the start - of a block - """ - def fn(x: bool, y: int): - if x: - a = y + 3 - b = y * 2 - else: - b = y * 2 - a = y + 3 - return a, b - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1), fn_script(True, 1)) - self.assertEqual(fn(False, 5), fn_script(False, 5)) - - def test_if_hoist_swapped_expr(self): - """ - Making sure that the if statement - doesn't get fully eliminated here - """ - def fn(x: bool, y: int): - if x: - a = y + 3 - b = y * 2 - else: - a = y * 2 - b = y + 3 - return a, b - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1), fn_script(True, 1)) - self.assertEqual(fn(False, 5), fn_script(False, 5)) - - def test_if_hoist_reused_var(self): - """ - Making sure that cases where the python variable is reused - is handled correctly - """ - def fn(x: bool, y: int): - b = 6 - if x: - a = y + 3 - a = y * 2 - else: - a = y * 2 - b = y + 3 - return a, b - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::mul", 1, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1), fn_script(True, 1)) - self.assertEqual(fn(False, 5), fn_script(False, 5)) - - def test_no_hoist(self): - """ - Nothing should happen here, expressions are different - """ - def fn(x: bool, y: int, z: int): - if x: - a = y + 3 - else: - a = z + 3 - return a - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1, 3), fn_script(True, 1, 3)) - self.assertEqual(fn(False, 5, 10), fn_script(False, 5, 10)) - - def test_mutate_before(self): - """ - Make sure that if there is a mutation before the common - op, the hoist doesn't happen - """ - def fn(x: bool, y: torch.Tensor): - if x: - y.add_(8) - a = y + 3 - else: - a = y + 3 - return a - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - FileCheck().check_count("aten::add_", 1, exactly=True).run(op_graph) - - t1 = torch.Tensor([1]) - t2 = torch.Tensor([5, 6]) - self.assertEqual(fn(True, t1), fn_script(True, t1)) - self.assertEqual(fn(False, t2), fn_script(False, t2)) - - def test_mutate_after(self): - """ - Check that the hoist can happen properly, and - that the output is still correct. - """ - def fn(x: bool, y: torch.Tensor): - if x: - b = 1 - a = y + 3 - y.add_(8) - else: - b = 2 - a = y + 3 - c = b + a - return a - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - - t1 = torch.Tensor([1]) - t2 = torch.Tensor([5, 6]) - self.assertEqual(fn(True, t1.clone()), fn_script(True, t1.clone())) - self.assertEqual(fn(False, t2.clone()), fn_script(False, t2.clone())) - - def test_multiple_hoists(self): - """ - test that hoists that depend on other hoists are done correctly - """ - def fn(x: bool, y: torch.Tensor): - if x: - a = y + 3 - b = a + y - else: - a = y + 3 - b = a + y - c = b * 2 - return c - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - - t1 = torch.Tensor([1]) - t2 = torch.Tensor([5, 6]) - self.assertEqual(fn(True, t1), fn_script(True, t1)) - self.assertEqual(fn(False, t2), fn_script(False, t2)) diff --git a/test/quantization/jit/test_quantize_jit.py b/test/quantization/jit/test_quantize_jit.py index 5fde8e2cc533d..14bb31cf07f1a 100644 --- a/test/quantization/jit/test_quantize_jit.py +++ b/test/quantization/jit/test_quantize_jit.py @@ -1214,7 +1214,6 @@ class Res(torch.nn.Module): def __init__(self): super(Res, self).__init__() self.conv = torch.nn.Conv2d(3, 3, 1).float() - self.conv2 = torch.nn.Conv2d(3, 3, 1).float() self.use_skip = True def forward(self, x: torch.Tensor, cond: bool) -> torch.Tensor: @@ -1223,7 +1222,7 @@ def forward(self, x: torch.Tensor, cond: bool) -> torch.Tensor: if self.use_skip: return self.conv(x) else: - return self.conv2(x) + return self.conv(x) class M(torch.nn.Module): def __init__(self): diff --git a/test/test_jit.py b/test/test_jit.py index 6cf1d8e7d5c6e..99df960da5dc4 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -23,7 +23,6 @@ from jit.test_builtins import TestBuiltins, TestTensorBuiltins # noqa: F401 from jit.test_ignore_context_manager import TestIgnoreContextManager # noqa: F401 from jit.test_symbolic_shape_analysis import TestSymbolicShapeAnalysis # noqa: F401 -from jit.test_if_hoisting import TestIfHoisting # noqa: F401 from jit.test_unsupported_ops import TestUnsupportedOps # noqa: F401 from jit.test_freezing import TestFreezing, TestFrozenOptimizations, TestMKLDNNReinplacing # noqa: F401 from jit.test_peephole import TestPeephole # noqa: F401 diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 2e71bedb35db4..89697b4428ca1 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -191,7 +191,6 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/passes/clear_profiling.cpp", "torch/csrc/jit/passes/clear_undefinedness.cpp", "torch/csrc/jit/passes/common_subexpression_elimination.cpp", - "torch/csrc/jit/passes/common_expression_hoisting.cpp", "torch/csrc/jit/passes/concat_opt.cpp", "torch/csrc/jit/passes/constant_pooling.cpp", "torch/csrc/jit/passes/constant_propagation.cpp", diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 30885d3107176..b683a60615dc5 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -204,7 +204,6 @@ def _jit_pass_inline(Graph) -> None: ... def _jit_pass_constant_propagation(Graph) -> None: ... def _jit_pass_propagate_shapes_on_graph(Graph) -> None: ... def _jit_erase_non_input_shape_information(Graph) -> None: ... -def _jit_pass_common_expression_hoisting(Graph) -> None: ... def _jit_get_schemas_for_operator(name :str) -> List[FunctionSchema]: ... def _jit_check_alias_annotation(g: Graph, args: Tuple[Any, ...], unqualified_op_name: str): ... def _jit_can_fuse_on_cpu() -> _bool: ... diff --git a/torch/csrc/jit/ir/node_hashing.cpp b/torch/csrc/jit/ir/node_hashing.cpp index 9a876d062d2fd..3fd4974ed421b 100644 --- a/torch/csrc/jit/ir/node_hashing.cpp +++ b/torch/csrc/jit/ir/node_hashing.cpp @@ -204,8 +204,6 @@ bool attributesEqualCSE(const Node* lhs, const Node* rhs) { } // anonymous namespace -// Makes a hash that hashes the input Value, the output type -// as well as the node attributes size_t HashNode::operator()(const Node* k) const { AT_ASSERT(k != nullptr); size_t constant_hash = 0; @@ -233,8 +231,6 @@ size_t HashNode::operator()(const Node* k) const { constant_hash); }; -// Checks that two nodes have the same inputs, output types -// and node attributes. bool EqualNode::operator()(const Node* lhs, const Node* rhs) const { if (lhs == nullptr && rhs == nullptr) return true; @@ -265,16 +261,6 @@ bool EqualNode::operator()(const Node* lhs, const Node* rhs) const { if (!attributesEqualCSE(lhs, rhs)) return false; - // Check if the blocks contained in a op are the same - if (lhs->blocks().size() != rhs->blocks().size()) { - return false; - } - for (size_t i = 0; i < lhs->blocks().size(); ++i) { - if (lhs->blocks()[i] != rhs->blocks()[i]) { - return false; - } - } - return true; }; diff --git a/torch/csrc/jit/passes/common_expression_hoisting.cpp b/torch/csrc/jit/passes/common_expression_hoisting.cpp deleted file mode 100644 index ab2b9d41afa8b..0000000000000 --- a/torch/csrc/jit/passes/common_expression_hoisting.cpp +++ /dev/null @@ -1,153 +0,0 @@ -#include - -#include -#include -#include -#include - -#include -#include -#include - -namespace torch { -namespace jit { -namespace { - -struct CommonExpressionHoister { - CommonExpressionHoister(std::shared_ptr graph) - : graph_(std::move(graph)) {} - - bool run() { - HoistCommonExpression(graph_->block()); - return changed_; - } - - void HoistFromIfNode(Node* if_node) { - Block* true_block = if_node->blocks()[0]; - Block* false_block = if_node->blocks()[1]; - // find common statements in the two subblocks - - auto true_block_nodes = std::unordered_set( - true_block->nodes().begin(), true_block->nodes().end()); - for (auto it = false_block->nodes().begin(); - it != false_block->nodes().end();) { - Node* false_b_node = *it; - // node may be moved to a different block so advance iterator now - ++it; - - auto matching_elem = true_block_nodes.find(false_b_node); - if (matching_elem == true_block_nodes.end()) { - continue; - } - Node* true_b_node = *matching_elem; - - // Check if a move to the front of the block is valid - // If both of the moves are valid, then we know we can move the item out - // of the if blocks entirely. - AliasDb& aliasDb = getOrCreateAliasDb(); - bool true_moveable = aliasDb.couldMoveAfterTopologically( - true_b_node, true_block->nodes().front()); - bool false_moveable = aliasDb.couldMoveAfterTopologically( - false_b_node, false_block->nodes().front()); - - if (!true_moveable || !false_moveable) { - continue; - } - - // Get all the uses of the output to delete and reinsert them - // as the input would change, the HashNode value would also change. - std::unordered_set true_b_uses; - for (Value* true_out : true_b_node->outputs()) { - for (Use true_use : true_out->uses()) { - if (true_use.user->owningBlock() == true_block) { - // Make sure we are not accidentally adding stuff from subblocks - true_b_uses.insert(true_use.user); - } - } - } - for (Node* uses_node : true_b_uses) { - true_block_nodes.erase(uses_node); - } - - // Now hoist the statement out of the block - changed_ = true; - false_b_node->moveBefore(if_node); - - true_b_node->replaceAllUsesWith(false_b_node); - - true_block_nodes.erase(true_b_node); - true_block_nodes.insert(true_b_uses.cbegin(), true_b_uses.cend()); - true_b_node->destroy(); - } - } - - void EliminateUnnecessaryIfOutputs(Node* if_node) { - Block* true_block = if_node->blocks()[0]; - Block* false_block = if_node->blocks()[1]; - - // fix up the if block outputs - for (size_t i = 0; i < true_block->outputs().size();) { - // Need to check both sides match to eliminate common if block outputs - Value* true_block_output = true_block->outputs().at(i); - Value* false_block_output = false_block->outputs().at(i); - if (true_block_output != false_block_output) { - i++; - continue; - } - - // We have a matching output, and can remove it from the block itself - if_node->outputs().at(i)->replaceAllUsesWith(true_block_output); - if_node->eraseOutput(i); - true_block->eraseOutput(i); - false_block->eraseOutput(i); - changed_ = true; - } - - // No need to test here if the IF block should be eliminated. - // The DCE pass will determine that for us. - } - - void HoistCommonExpression(Block* block) { - for (auto it = block->nodes().begin(); it != block->nodes().end();) { - Node* node = *it; - ++it; - - for (auto sub_block : node->blocks()) { - HoistCommonExpression(sub_block); - } - - if (node->kind() == prim::If) { - HoistFromIfNode(node); - EliminateUnnecessaryIfOutputs(node); - } - } - } - - AliasDb& getOrCreateAliasDb() { - if (!alias_db_) { - alias_db_ = std::make_unique(graph_); - } - - return *alias_db_; - } - - private: - std::unique_ptr alias_db_; - std::shared_ptr graph_; - bool changed_ = false; -}; -} // anonymous namespace -bool HoistCommonExpression(const std::shared_ptr& graph) { - // This moves common subexpressions from the two sides of an - // if block out of the if block. - - GRAPH_DUMP("Before CEH", graph); - CommonExpressionHoister ceh(graph); - bool changed = ceh.run(); - if (changed) { - GRAPH_DUMP("After CEH Changes", graph); - } - return changed; -} -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/passes/common_expression_hoisting.h b/torch/csrc/jit/passes/common_expression_hoisting.h deleted file mode 100644 index 2aad158eea8f8..0000000000000 --- a/torch/csrc/jit/passes/common_expression_hoisting.h +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include - -namespace torch { -namespace jit { - -TORCH_API bool HoistCommonExpression(const std::shared_ptr& graph); -} -} // namespace torch diff --git a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp index 10edfb4c496ef..f74a91176cfe8 100644 --- a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp +++ b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index d582035d6e95e..5fca575593551 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -283,11 +282,6 @@ void initJITBindings(PyObject* module) { [](std::shared_ptr& g) { return EliminateCommonSubexpression(g); // overload resolution }) - .def( - "_jit_pass_common_expression_hoisting", - [](std::shared_ptr& g) { - return HoistCommonExpression(g); // overload resolution - }) .def( "_jit_pass_fuse_quantized_add_relu", [](std::shared_ptr& g) { diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp index bb5f272080601..476882650a1dd 100644 --- a/torch/csrc/jit/runtime/graph_executor.cpp +++ b/torch/csrc/jit/runtime/graph_executor.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -919,7 +918,7 @@ void runOptimization( "After EliminateDeadCode, before EliminateCommonSubexpression\n", *graph); EliminateCommonSubexpression(graph); GRAPH_DEBUG( - "After EliminateCommonSubexpression , before PeepholeOptimize\n", *graph); + "After EliminateCommonSubexpression, before PeepholeOptimize\n", *graph); PeepholeOptimize(graph); GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph); @@ -950,10 +949,8 @@ void runOptimization( EliminateCommonSubexpression(graph); GRAPH_DEBUG( - "After EliminateCommonSubexpression, before HoistCommonExpression\n", - *graph); - HoistCommonExpression(graph); - GRAPH_DEBUG("After HoistCommonExpression, before CheckInplace\n", *graph); + "After EliminateCommonSubexpression, before CheckInplace\n", *graph); + CheckInplace(graph); GRAPH_DEBUG("After CheckInplace (end of runOptimization)", *graph); } diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp index 40d94a4a205fe..b099db17931b0 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -333,16 +332,112 @@ void runPreAutodiffPassPipeline(std::shared_ptr& graph) { EliminateCommonSubexpression(graph); GRAPH_DEBUG( - "After EliminateCommonSubexpression, before HoistCommonExpression\n", - *graph); - HoistCommonExpression(graph); - GRAPH_DEBUG("After HoistCommonExpression, before CheckInplace\n", *graph); + "After EliminateCommonSubexpression, before CheckInplace\n", *graph); + CheckInplace(graph); } GRAPH_DEBUG( "After CheckInplace (end of runPreAutodiffPassPipeline)\n", *graph); } +void runDiffGraphPasses(std::shared_ptr& graph) { + GRAPH_DEBUG( + "Before EliminateDeadCode (beginning of runDiffGraphPasses)\n", *graph); + // runOptimization: + { + // Basic graph preprocessing to eliminate noise. + EliminateDeadCode(graph); + GRAPH_DEBUG( + "After EliminateDeadCode, before EliminateCommonSubexpression\n", + *graph); + EliminateCommonSubexpression(graph); + GRAPH_DEBUG( + "After EliminateCommonSubexpression, before PeepholeOptimize\n", + *graph); + + PeepholeOptimize(graph); + GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph); + ConstantPropagation(graph); + GRAPH_DEBUG("After ConstantPropagation, before ConstantPooling\n", *graph); + ConstantPooling(graph); + GRAPH_DEBUG("After ConstantPooling, before UnrollLoops\n", *graph); + + UnrollLoops(graph); + GRAPH_DEBUG("After UnrollLoops, before RemoveListMutation\n", *graph); + // run again with unrolled loops + RemoveListMutation(graph); + GRAPH_DEBUG("After RemoveListMutation, before PeepholeOptimize\n", *graph); + PeepholeOptimize(graph); + GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph); + ConstantPropagation(graph); + GRAPH_DEBUG( + "After ConstantPropagation, before EliminateCommonSubexpression\n", + *graph); + + EliminateCommonSubexpression(graph); + GRAPH_DEBUG( + "After EliminateCommonSubexpression, before CheckInplace\n", *graph); + + CheckInplace(graph); + } + GRAPH_DEBUG("After CheckInplace, before customPrePasses\n", *graph); + + // runNondiffOptimization + { + // Run custom passes that different backends can register. + for (const auto& passPair : getCustomPrePasses()) { + passPair.first(graph); + } + GRAPH_DEBUG("After customPrePasses, before LowerSimpleTuples\n", *graph); + + // TupleConstruct / TupleUnpack pairs can still be present at this point + // and must be removed for fusion. + LowerSimpleTuples(graph); + GRAPH_DEBUG("After LowerSimpleTuples\n", *graph); + + if (tensorExprFuserEnabled()) { + // Remove prim::profile nodes and embed the profile info directly in the + // IR in value types. We're doing such transformation as optimizations + // that try to merge/fuse nodes in the graph (e.g. BatchMM and GraphFuser) + // work worse in the presence of intermittent prim::profile nodes. + // Optimizations relying on the type info are also responsible for + // inserting proper type checks. Once we're done with these optimizations + // we will wipe the tensor type information from the IR, so that it's not + // accidentally used by any other pass. + RemoveProfileNodesAndSpecializeTypes(graph); + GRAPH_DEBUG( + "After RemoveProfileNodesAndSpecializeTypes, before BatchMM\n", + *graph); + // Rewrite subgraphs with many MMs into expressions that batch them. + BatchMM(graph); + GRAPH_DEBUG("After BatchMM, before Fusion\n", *graph); + + FuseTensorExprs(graph, getFusionGroupInlining() ? 2 : 1); + GRAPH_DEBUG( + "After Fusion, before RemoveTensorTypeSpecializations\n", *graph); + + // Wipe tensor type info from the IR + RemoveTensorTypeSpecializations(graph); + GRAPH_DEBUG( + "After RemoveTensorTypeSpecializations, before customPostPasses\n", + *graph); + } else { + // Rewrite subgraphs with many MMs into expressions that batch them. + BatchMM(graph); + GRAPH_DEBUG("After BatchMM, before Fusion\n", *graph); + + FuseGraph(graph, true); + GRAPH_DEBUG("After Fusion, before customPostPasses\n", *graph); + } + + // Run custom post-fusion passes + for (const auto& passPair : getCustomPostPasses()) { + passPair.first(graph); + } + } + GRAPH_DEBUG("After customPostPasses (end of runDiffGraphPasses)\n", *graph); +} + void runNoGradOptimizations(std::shared_ptr& graph) { GRAPH_DEBUG( "After customPostPasses (beginning of runNoGradOptimizations)\n", *graph); @@ -498,11 +593,7 @@ void ProfilingGraphExecutorImpl::runProfilingInsensitiveOptimizations( DecomposeOps(graph); GRAPH_DEBUG("After DecomposeOps, before ConstantPropagation\n", *graph); ConstantPropagation(graph); - GRAPH_DEBUG( - "After ConstantPropagation, before HoistCommonExpression\n", *graph); - HoistCommonExpression(graph); - GRAPH_DEBUG( - "After EliminateCommonSubexpression, before ElimiateDeadCode\n", *graph); + GRAPH_DEBUG("After ConstantPropagation, before EliminateDeadCode\n", *graph); EliminateDeadCode(graph); GRAPH_DEBUG( "After EliminateDeadCode, before EliminateCommonSubexpression\n", *graph); From 47a9e8ff320b638fcff0e25147e7f042740bf734 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Thu, 19 Aug 2021 06:37:44 -0700 Subject: [PATCH 061/530] [Static Runtime] Support __getitem__ for lists (#63398) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63398 This change provides a native `__getitem__` implementation for lists to avoid overhead associated with falling back to the JIT interpreter. Test Plan: Unit tests: `buck test //caffe2/benchmarks/static_runtime:static_runtime_cpptest` Reviewed By: hlu1 Differential Revision: D30368464 fbshipit-source-id: e0e0971508cd5d9bcf6025606993dc24ecbf6764 --- benchmarks/static_runtime/test_scripts.h | 18 ++++++++++-- .../static_runtime/test_static_runtime.cc | 21 ++++++++++---- torch/csrc/jit/runtime/static/native_ops.cpp | 29 ++++++++++++------- 3 files changed, 50 insertions(+), 18 deletions(-) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 6045a1c2f9772..8db8da2887799 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -632,24 +632,36 @@ const auto argmin_with_keep_dim_script = R"JIT( return torch.argmin(a, dim, True).clone() )JIT"; -const auto getitem_tensor_script = R"JIT( +const auto getitem_dict_tensor_script = R"JIT( def forward(self, key: Tensor): d = {key: 1} return d[key] )JIT"; -const auto getitem_int_script = R"JIT( +const auto getitem_dict_int_script = R"JIT( def forward(self, key: int): d = {key: 1} return d[key] )JIT"; -const auto getitem_str_script = R"JIT( +const auto getitem_dict_str_script = R"JIT( def forward(self, key: str): d = {key: 1} return d[key] )JIT"; +const auto getitem_list_int_script = R"JIT( + def forward(self, idx: int): + lst = [1, 2, 3] + return lst[idx] +)JIT"; + +const auto getitem_list_tensor_script = R"JIT( + def forward(self, tensor: Tensor, idx: int): + lst = [tensor, tensor] + return lst[idx] +)JIT"; + const auto transpose_script = R"JIT( def forward(self, a: Tensor, dim1: int, dim2: int): return torch.transpose(a, dim1, dim2).clone() diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 7af49d6c8fa63..14d613f074858 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1043,19 +1043,30 @@ TEST(StaticRuntime, IndividualOps_Argmin) { testStaticRuntime(argmin_with_keep_dim_script, args_a, args_b); } -TEST(StaticRuntime, IndividualOps_GetItem) { +TEST(StaticRuntime, IndividualOps_GetItem_Dict) { int int_key = 0; std::string str_key = "str"; // No need to test these multiple times, args are not tensors - testStaticRuntime(getitem_int_script, {int_key}); - testStaticRuntime(getitem_str_script, {str_key}); + testStaticRuntime(getitem_dict_int_script, {int_key}); + testStaticRuntime(getitem_dict_str_script, {str_key}); auto a = torch::tensor({1}); auto b = torch::tensor({1, 1}); - testStaticRuntime(getitem_tensor_script, {a}); - testStaticRuntime(getitem_tensor_script, {a}, {b}); + testStaticRuntime(getitem_dict_tensor_script, {a}); + testStaticRuntime(getitem_dict_tensor_script, {a}, {b}); +} + +TEST(StaticRuntime, IndividualOps_GetItem_List) { + testStaticRuntime(getitem_list_int_script, {1}); + testStaticRuntime(getitem_list_int_script, {-1}); + + auto a = torch::tensor({1}); + auto b = torch::tensor({1, 1}); + + testStaticRuntime(getitem_list_tensor_script, {a, 1}); + testStaticRuntime(getitem_list_tensor_script, {a, 1}, {b, -1}); } TEST(StaticRuntime, IndividualOps_Transpose) { diff --git a/torch/csrc/jit/runtime/static/native_ops.cpp b/torch/csrc/jit/runtime/static/native_ops.cpp index d84b1cd8b28d2..616ad87119ab3 100644 --- a/torch/csrc/jit/runtime/static/native_ops.cpp +++ b/torch/csrc/jit/runtime/static/native_ops.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include namespace torch { @@ -100,17 +101,25 @@ REGISTER_NATIVE_OPERATOR_FUNCTOR( if (n->inputs().size() != 2) { return nullptr; } - // TODO: make __getitem__ work for other container types - if (n->input(0)->type()->castRaw() == nullptr) { - return nullptr; + + if (n->input(0)->type()->castRaw()) { + return [](ProcessedNode* p_node) { + auto dict = p_node->Input(0).toGenericDict(); + auto key = p_node->Input(1); + auto value = dict.find(key); + TORCH_CHECK(value != dict.end(), "Key not in dict: ", key); + p_node->Output(0) = value->value(); + }; + } else if (n->input(0)->type()->castRaw()) { + return [](ProcessedNode* p_node) { + auto list = p_node->Input(0).toList(); + auto idx = p_node->Input(1).toInt(); + p_node->Output(0) = getItem(list, idx); + }; } - return [](ProcessedNode* p_node) { - auto dict = p_node->Input(0).toGenericDict(); - auto key = p_node->Input(1); - auto value = dict.find(key); - TORCH_CHECK(value != dict.end(), "Key not in dict: ", key); - p_node->Output(0) = value->value(); - }; + + // TODO(T98581096): make __getitem__ work for other container types + return nullptr; }); REGISTER_NATIVE_OPERATOR_FUNCTOR( From 2d5b19f62b636eb9924bcef43a23690c2b8018b4 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Thu, 19 Aug 2021 06:47:31 -0700 Subject: [PATCH 062/530] Update full backward hook doc with not-same-object note (#63245) Summary: Fixes https://github.com/pytorch/pytorch/issues/61446 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63245 Reviewed By: ejguan Differential Revision: D30352656 Pulled By: albanD fbshipit-source-id: 7000ecb54a80f2da968ec7600b98574b608578ae --- torch/nn/modules/module.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index f4ef4533de600..2376422117306 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -145,13 +145,6 @@ def register_module_full_backward_hook( This adds global state to the `nn.module` module and it is only intended for debugging/profiling purposes. - The current implementation will not have the presented behavior - for complex :class:`Module` that perform many operations. - In some failure cases, :attr:`grad_input` and :attr:`grad_output` will only - contain the gradients for a subset of the inputs and outputs. - For such :class:`Module`, you should use :func:`torch.Tensor.register_hook` - directly on a specific input or output to get the required gradients. - The hook will be called every time the gradients with respect to module inputs are computed. The hook should have the following signature:: @@ -165,6 +158,10 @@ def register_module_full_backward_hook( in :attr:`grad_input` and :attr:`grad_output` will be ``None`` for all non-Tensor arguments. + For technical reasons, when this hook is applied to a Module, its forward function will + receive a view of each Tensor passed to the Module. Similarly the caller will receive a view + of each Tensor returned by the Module's forward function. + Global hooks are called before hooks registered with `register_backward_hook` Returns: @@ -907,6 +904,10 @@ def register_full_backward_hook( in :attr:`grad_input` and :attr:`grad_output` will be ``None`` for all non-Tensor arguments. + For technical reasons, when this hook is applied to a Module, its forward function will + receive a view of each Tensor passed to the Module. Similarly the caller will receive a view + of each Tensor returned by the Module's forward function. + .. warning :: Modifying inputs or outputs inplace is not allowed when using backward hooks and will raise an error. From e7c4988b526afa73b8bae4b18daaa9a86dbfbff2 Mon Sep 17 00:00:00 2001 From: Ilqar Ramazanli Date: Thu, 19 Aug 2021 07:15:16 -0700 Subject: [PATCH 063/530] To fix the chainability at epoch zero for some schedulers (#63457) Summary: It has been discussed in the https://github.com/pytorch/pytorch/pull/60836#issuecomment-899084092 that we have observed an obstacle to chain some type of learning rate schedulers. In particular we observed * some of the learning rate schedulers returns initial learning rates at epoch 0 as ``` return self.base_lrs` ``` * This can be a problem when two schedulers called as chained as ``` scheduler1.step() scheduler2.step() ``` in particular, we completely ignore the effect of scheduler1 at epoch 0. This could not be an issue if at epoch 0, scheduler1 was ineffective as in many schedulers, however for schedulers as WarmUp Schedulers, where at epoch 0 schedulers multiplicative value is smaller than 1 this could lead to undesired behaviors. The following code snippet illustrates the problem better ## Reproducing the bug ```python import torch from torch.nn import Parameter from torch.optim import SGD from torch.optim.lr_scheduler import WarmUpLR, ExponentialLR model = [Parameter(torch.randn(2, 2, requires_grad=True))] optimizer = SGD(model, 1.0) scheduler1 = WarmUpLR(optimizer, warmup_factor=0.1, warmup_iters=5, warmup_method="constant") scheduler2 = ExponentialLR(optimizer, gamma=0.9) for epoch in range(10): print(epoch, scheduler2.get_last_lr()[0]) optimizer.step() scheduler1.step() scheduler2.step() ``` ### Current Result ``` 0 1.0 1 0.9 2 0.81 3 0.7290000000000001 4 0.6561000000000001 5 5.904900000000001 6 5.314410000000001 7 4.782969000000001 8 4.304672100000001 9 3.874204890000001 ``` ### Expected Result ``` 0 1.0 1 0.9 2 0.81 3 0.7290000000000001 4 0.6561000000000001 5 0.5904900000000001 6 0.5314410000000001 7 0.4782969000000001 8 0.4304672100000001 9 0.3874204890000001 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/63457 Reviewed By: datumbox Differential Revision: D30424160 Pulled By: iramazanli fbshipit-source-id: 3e15af8d278c872cd6f53406b55f4d3ce5002867 --- test/test_optim.py | 12 ++++++------ torch/optim/lr_scheduler.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/test/test_optim.py b/test/test_optim.py index 20b8e5c443de5..01ec43bbea883 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -440,8 +440,8 @@ def test_adam(self): ) self._test_basic_cases( lambda weight, bias: optimizer([weight, bias], lr=1e-3, amsgrad=True), - [lambda opt: ExponentialLR(opt, gamma=0.9), - lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant")] + [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant"), + lambda opt: ExponentialLR(opt, gamma=0.9)] ) self._test_basic_cases( lambda weight, bias: optimizer([weight, bias], lr=1e-3, amsgrad=True), @@ -1294,8 +1294,8 @@ def test_compound_exp_and_linear_warmup_lr(self): for i in range(iters): single_targets[i] *= factor + i / iters * (1 - factor) targets = [single_targets, [x * epochs for x in single_targets]] - schedulers[0] = ExponentialLR(self.opt, gamma=0.9) - schedulers[1] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear") + schedulers[0] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear") + schedulers[1] = ExponentialLR(self.opt, gamma=0.9) self._test(schedulers, targets, epochs) def test_compound_step_and_constant_warmup(self): @@ -1361,8 +1361,8 @@ def test_compound_cosanneal_and_linear_warmup_lr(self): for i in range(iters): single_targets[i] *= factor + i / iters * (1 - factor) targets = [single_targets, [x * epochs for x in single_targets]] - schedulers[0] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min) - schedulers[1] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear") + schedulers[0] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear") + schedulers[1] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min) self._test(schedulers, targets, epochs) def test_compound_cosanneal_and_exp_lr(self): diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 78a8cfad0d637..657a35ad681b0 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -328,7 +328,7 @@ def get_lr(self): return [group['lr'] * lmbda(self.last_epoch) for lmbda, group in zip(self.lr_lambdas, self.optimizer.param_groups)] else: - return list(self.base_lrs) + return [group['lr'] for group in self.optimizer.param_groups] class StepLR(_LRScheduler): @@ -526,7 +526,7 @@ def get_lr(self): "please use `get_last_lr()`.", UserWarning) if self.last_epoch == 0: - return self.base_lrs + return [group['lr'] for group in self.optimizer.param_groups] return [group['lr'] * self.gamma for group in self.optimizer.param_groups] @@ -586,7 +586,7 @@ def get_lr(self): "please use `get_last_lr()`.", UserWarning) if self.last_epoch == 0: - return self.base_lrs + return [group['lr'] for group in self.optimizer.param_groups] elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0: return [group['lr'] + (base_lr - self.eta_min) * (1 - math.cos(math.pi / self.T_max)) / 2 From be9be9bfdd3be7cbd5b03027bb5597bc234bc79c Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Thu, 19 Aug 2021 07:49:43 -0700 Subject: [PATCH 064/530] add distributed/_sharded_tensor/test_sharded_tensor to ROCM_BLOCKLIST (#63508) Summary: Fixes current ROCm CI test2 brokenness until tensorpipe is fully supported by ROCm. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63508 Reviewed By: ejguan Differential Revision: D30406450 Pulled By: walterddr fbshipit-source-id: c07509271d5d33901f3eaf7ffb916dc3626e1f9a --- test/run_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/run_test.py b/test/run_test.py index e40f580bbe9e7..e043bcd0ad152 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -245,6 +245,7 @@ 'distributed/rpc/test_faulty_agent', 'distributed/rpc/test_tensorpipe_agent', 'distributed/rpc/cuda/test_tensorpipe_agent', + 'distributed/_sharded_tensor/test_sharded_tensor', 'test_determination', 'test_multiprocessing', 'test_jit_legacy', From f596aa8b77d6c57dd82f33a45926fad95ab2a21e Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Thu, 19 Aug 2021 08:28:55 -0700 Subject: [PATCH 065/530] Poisson zero rate (#61511) Summary: This PR fixes https://github.com/pytorch/pytorch/issues/53485 by allowing zero rates for the Poisson distribution. This implementation is consistent with `scipy.stats.poisson` which admits zero rates. In addition to addressing the aforementioned issue, this PR makes two supporting changes: 1. add a `nonnegative` constraint to enforce non-negative rates for the Poisson distribution. 2. adjust the evaluation of the gradient of `xlogy` such that it is well defined for `x == 0 and y == 0`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61511 Reviewed By: ejguan Differential Revision: D30352917 Pulled By: albanD fbshipit-source-id: f3d33da58360e80d75eb83519f199b93232a2a2d --- test/distributions/test_distributions.py | 26 ++++++++++++++++++---- tools/autograd/derivatives.yaml | 4 ++-- torch/distributions/constraint_registry.py | 2 ++ torch/distributions/constraints.py | 1 + torch/distributions/poisson.py | 4 ++-- 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index 85e4dbacd4b6a..319b55795addb 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -387,6 +387,12 @@ def is_all_nan(tensor): }, { 'rate': 0.2, + }, + { + 'rate': torch.tensor([0.0], requires_grad=True), + }, + { + 'rate': 0.0, } ]), Example(RelaxedBernoulli, [ @@ -667,7 +673,7 @@ def is_all_nan(tensor): ]), Example(Poisson, [ { - 'rate': torch.tensor([0.0], requires_grad=True), + 'rate': torch.tensor([-0.1], requires_grad=True), }, { 'rate': -1.0, @@ -1315,17 +1321,29 @@ def test_poisson_shape(self): def test_poisson_log_prob(self): rate = torch.randn(2, 3).abs().requires_grad_() rate_1d = torch.randn(1).abs().requires_grad_() + rate_zero = torch.zeros([], requires_grad=True) - def ref_log_prob(idx, x, log_prob): - l = rate.view(-1)[idx].detach() + def ref_log_prob(ref_rate, idx, x, log_prob): + l = ref_rate.view(-1)[idx].detach() expected = scipy.stats.poisson.logpmf(x, l) self.assertEqual(log_prob, expected, atol=1e-3, rtol=0) set_rng_seed(0) - self._check_log_prob(Poisson(rate), ref_log_prob) + self._check_log_prob(Poisson(rate), lambda *args: ref_log_prob(rate, *args)) + self._check_log_prob(Poisson(rate_zero), lambda *args: ref_log_prob(rate_zero, *args)) self._gradcheck_log_prob(Poisson, (rate,)) self._gradcheck_log_prob(Poisson, (rate_1d,)) + # We cannot check gradients automatically for zero rates because the finite difference + # approximation enters the forbidden parameter space. We instead compare with the + # theoretical results. + dist = Poisson(rate_zero) + s = dist.sample() + dist.log_prob(s).backward() + torch.testing.assert_allclose(rate_zero.grad, -1.0) + dist.log_prob(torch.ones_like(rate_zero)).backward() + torch.testing.assert_allclose(rate_zero.grad, torch.inf) + @unittest.skipIf(IS_MACOS, "See https://github.com/pytorch/pytorch/issues/60347") @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_poisson_sample(self): diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index b52b69018e177..49e574a1651ba 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -801,11 +801,11 @@ - name: xlogy.Tensor(Tensor self, Tensor other) -> Tensor self: grad * at::xlogy((self != 0), other) - other: grad * self / other + other: grad * at::where(other.isnan() | (self != 0), self / other, zeros_like(other)) result: self_t * at::xlogy((self_p != 0), other_p) + other_t * self_p / other_p - name: xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor - other: grad * self / other + other: grad * at::where(other.isnan() | (!self.equal(0)), self / other, zeros_like(other)) result: auto_element_wise - name: xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor diff --git a/torch/distributions/constraint_registry.py b/torch/distributions/constraint_registry.py index cbe987e72c798..c03f0ad02d2c6 100644 --- a/torch/distributions/constraint_registry.py +++ b/torch/distributions/constraint_registry.py @@ -173,7 +173,9 @@ def _transform_to_independent(constraint): @biject_to.register(constraints.positive) +@biject_to.register(constraints.nonnegative) @transform_to.register(constraints.positive) +@transform_to.register(constraints.nonnegative) def _transform_to_positive(constraint): return transforms.ExpTransform() diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index 99808b6b80beb..5eed19afd09ec 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -545,6 +545,7 @@ def check(self, value): real = _Real() real_vector = independent(real, 1) positive = _GreaterThan(0.) +nonnegative = _GreaterThanEq(0.) greater_than = _GreaterThan greater_than_eq = _GreaterThanEq less_than = _LessThan diff --git a/torch/distributions/poisson.py b/torch/distributions/poisson.py index 954ed6e0d3206..9adb641d7fcee 100644 --- a/torch/distributions/poisson.py +++ b/torch/distributions/poisson.py @@ -24,7 +24,7 @@ class Poisson(ExponentialFamily): Args: rate (Number, Tensor): the rate parameter """ - arg_constraints = {'rate': constraints.positive} + arg_constraints = {'rate': constraints.nonnegative} support = constraints.nonnegative_integer @property @@ -60,7 +60,7 @@ def log_prob(self, value): if self._validate_args: self._validate_sample(value) rate, value = broadcast_all(self.rate, value) - return (rate.log() * value) - rate - (value + 1).lgamma() + return value.xlogy(rate) - rate - (value + 1).lgamma() @property def _natural_params(self): From e1334512a3aa0f8f8a3a0a59cb868355a33b6233 Mon Sep 17 00:00:00 2001 From: anjali411 Date: Thu, 19 Aug 2021 08:41:08 -0700 Subject: [PATCH 066/530] Add fastpath for dot and vdot when the inputs have conj bit set to True (#62915) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62915 As much as 45% and 20% perf improvement on CUDA and CPU respectively. consistent improvement in perf for all cases -- see perf numbers in comments below Test Plan: Imported from OSS Reviewed By: heitorschueroff Differential Revision: D30404006 Pulled By: anjali411 fbshipit-source-id: 565940da28c7761d993cf43346932c24292e8a4d --- aten/src/ATen/ConjugateFallback.cpp | 4 +++ aten/src/ATen/native/Blas.cpp | 27 ++++++++++++++++--- aten/src/ATen/native/cuda/Blas.cpp | 23 +++++++++++++++- .../_internal/common_methods_invocations.py | 20 ++++++++++---- 4 files changed, 65 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/ConjugateFallback.cpp b/aten/src/ATen/ConjugateFallback.cpp index 3ae9859f2d618..a64ef4950940b 100644 --- a/aten/src/ATen/ConjugateFallback.cpp +++ b/aten/src/ATen/ConjugateFallback.cpp @@ -56,6 +56,10 @@ TORCH_LIBRARY_IMPL(aten, Conjugate, m) { m.impl("view", torch::CppFunction::makeFallthrough()); m.impl("_unsafe_view", torch::CppFunction::makeFallthrough()); m.impl("reshape", torch::CppFunction::makeFallthrough()); + m.impl("dot", torch::CppFunction::makeFallthrough()); + m.impl("vdot", torch::CppFunction::makeFallthrough()); + m.impl("dot.out", torch::CppFunction::makeFallthrough()); + m.impl("vdot.out", torch::CppFunction::makeFallthrough()); } } // namespace at diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index ab522ac21ea92..114de632a384a 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -133,8 +133,19 @@ inline void dot_check(const Tensor& self, const Tensor& other) { } Tensor dot(const Tensor &self, const Tensor &other){ - at::NoNamesGuard guard; + if (self.is_complex()) { + if (self.is_conj()) { + if (other.is_conj()) { + return (at::native::dot(self.conj(), other.conj())).conj(); + } else { + return at::native::vdot(self.conj(), other); + } + } else if (other.is_conj()) { + return at::native::vdot(other.conj(), self); + } + } + at::NoNamesGuard guard; dot_check(self, other); return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, self.scalar_type(), "dot", [&] { @@ -145,15 +156,25 @@ Tensor dot(const Tensor &self, const Tensor &other){ } Tensor vdot(const Tensor &self, const Tensor &other){ - at::NoNamesGuard guard; - // Dispatch to `dot` for real dtypes. if (!self.is_complex()){ return at::dot(self, other); } + if (self.is_conj()) { + if (other.is_conj()) { + return at::native::vdot(other.conj(), self.conj()); + } else { + return at::native::dot(self.conj(), other); + } + } else if (other.is_conj()) { + return (at::native::dot(self, other.conj())).conj(); + } + + at::NoNamesGuard guard; // For complex dtypes. dot_check(self, other); + return AT_DISPATCH_COMPLEX_TYPES(self.scalar_type(), "vdot", [&] { Tensor result = at::empty({}, self.options()); result.fill_(vdot_impl(self.numel(), self.data_ptr(), self.stride(0), other.data_ptr(), other.stride(0))); diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index b0fe0ac7a05b6..b4479101c59c9 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -353,8 +353,19 @@ inline void dot_check(const Tensor& self, const Tensor& other) { } // anonymous namespace Tensor dot_cuda(const Tensor& self, const Tensor& other) { - at::NoNamesGuard guard; + if (self.is_complex()) { + if (self.is_conj()) { + if (other.is_conj()) { + return (dot_cuda(self.conj(), other.conj())).conj(); + } else { + return vdot_cuda(self.conj(), other); + } + } else if (other.is_conj()) { + return vdot_cuda(other.conj(), self); + } + } + at::NoNamesGuard guard; dot_check(self, other); const int n = static_cast(self.numel()); @@ -391,6 +402,16 @@ Tensor vdot_cuda(const Tensor& self, const Tensor& other) { return dot_cuda(self, other); } + if (self.is_conj()) { + if (other.is_conj()) { + return vdot_cuda(other.conj(), self.conj()); + } else { + return dot_cuda(self.conj(), other); + } + } else if (other.is_conj()) { + return (dot_cuda(self, other.conj())).conj(); + } + at::NoNamesGuard guard; dot_check(self, other); diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 7e57d5d693ec7..873d91c0e1293 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1377,14 +1377,24 @@ def sample_inputs_bmm(self, device, dtype, requires_grad, **kwargs): ) def sample_inputs_dot_vdot(self, device, dtype, requires_grad, **kwargs): - return ( - SampleInput( + sample_inputs = [] + sample_inputs.append(SampleInput( + make_tensor((S, ), device, dtype, low=None, high=None, requires_grad=requires_grad), + args=( + make_tensor((S, ), device, dtype, low=None, high=None, requires_grad=requires_grad), + ) + )) + if dtype.is_complex: + # dot/vdot for (conj(input), conj(arg_tensor)) and (conj(input), arg_tensor) + # is tested in test_conj_view (which tests operations with only conjugated input tensor + # -- not conjugated arg tensors) + sample_inputs.append(SampleInput( make_tensor((S, ), device, dtype, low=None, high=None, requires_grad=requires_grad), args=( - make_tensor((S, ), device, dtype, low=None, high=None, requires_grad=requires_grad), + torch.conj(make_tensor((S, ), device, dtype, low=None, high=None, requires_grad=requires_grad)), ) - ), - ) + )) + return sample_inputs def sample_inputs_addmv(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) From 79693bb86a3f601a5c0d3da52d99acec95bb48c1 Mon Sep 17 00:00:00 2001 From: David Esiobu Date: Thu, 19 Aug 2021 09:15:34 -0700 Subject: [PATCH 067/530] Use linecache.lazycache to cache generated code. (#63453) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63453 Instead of patching linecache.getlines, use linecache.lazycache and parts of the loader protocol described in PEP-302 Test Plan: python3 test/test_fx.py Imported from OSS Reviewed By: suo Differential Revision: D30388176 fbshipit-source-id: 92933711ecf3a21a07e1d6b0d1185ab0efd8341c --- torch/fx/graph_module.py | 70 +++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index 85479f069d53d..b87aeaaa78a03 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -15,28 +15,60 @@ import os import warnings -# normal exec loses the source code, however we can patch -# the linecache module to still recover it. -# using exec_with_source will add it to our local cache +# Normal exec loses the source code, however we can work with +# the linecache module to recover it. +# Using exec_with_source will add it to our local cache # and then tools like TorchScript will be able to get source info. -_next_id = 0 +class EvalCacheLoader(object): + def __init__(self): + self.eval_cache = {} + self.next_id = 0 + + def cache(self, src: str, globals: Dict[str, Any]): + """Store the source in a private cache, and add a lazy entry in linecache + that allows the source to be retrieved by 'filename'. + + Args: + src (str): The module source to cache + globals (dict): The module globals + + Returns: + str: The cache key (and dummy filename) generated for src. + """ + + key = self._get_key() + self.eval_cache[key] = src + + # Don't mutate globals so that this loader is only used + # to populate linecache, and doesn't interact with other modules + # that might check `__loader__` + globals_copy = globals.copy() + globals_copy['__file__'] = key + globals_copy['__name__'] = key + globals_copy['__loader__'] = self + linecache.lazycache(key, globals_copy) + + return key + + # Part of the loader protocol (PEP 302) + # linecache will use this method when trying to find source code + def get_source(self, module_name) -> Optional[str]: + if module_name in self.eval_cache: + return self.eval_cache[module_name] + return None + + def _get_key(self): + key = f'.{self.next_id}' + self.next_id += 1 + return key + +_loader = EvalCacheLoader() + + def exec_with_source(src: str, globals: Dict[str, Any]): - global _next_id - key = f'' - _next_id += 1 - _eval_cache[key] = [line + '\n' for line in src.splitlines()] + key = _loader.cache(src, globals) exec(compile(src, key, 'exec'), globals) -# patch linecache so that any code we exec using exec_with_source -# works with inspect -_eval_cache : Dict[str, List[str]] = {} -_orig_getlines = linecache.getlines -def patched_getline(*args, **kwargs): - if args[0] in _eval_cache: - return _eval_cache[args[0]] - return _orig_getlines(*args, **kwargs) -linecache.getlines = patched_getline - def _forward_from_src(src: str, globals: Dict[str, Any]): # avoid mutating the passed in dict @@ -539,7 +571,7 @@ def generate_error_message(frame_summary: traceback.FrameSummary) -> str: # auxiliary variables (for readability) err_lineno = frame_summary.lineno err_line_len = len(frame_summary.line) - all_src_lines = _eval_cache[frame_summary.filename] + all_src_lines = linecache.getlines(frame_summary.filename) # constituent substrings of the error message tb_repr = traceback.format_exc() From e7831fe5de574e4ce542e02a6c56f57cc0493bf9 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 19 Aug 2021 09:49:12 -0700 Subject: [PATCH 068/530] [PyTorch] Test IValue move/copy/assign/swap more (#54717) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/54717 Hit more tags in these tests ghstack-source-id: 136140508 Test Plan: buck test //caffe2/aten:ivalue_test Reviewed By: anjali411 Differential Revision: D27339736 fbshipit-source-id: 610c8e92846bb70ba725ab117440326ab50af5ce --- aten/src/ATen/test/ivalue_test.cpp | 113 +++++++++++++++++++++++++---- 1 file changed, 100 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/test/ivalue_test.cpp b/aten/src/ATen/test/ivalue_test.cpp index 915e267347170..3ae18390f8f6e 100644 --- a/aten/src/ATen/test/ivalue_test.cpp +++ b/aten/src/ATen/test/ivalue_test.cpp @@ -90,6 +90,18 @@ TEST(IValueTest, Basic) { ASSERT_EQ(complex_tuple.toTuple()->elements()[1], foo1); } +TEST(IValueTest, BasicStorage) { + at::Storage emptyStorage; + at::Storage nonemptyStorage(at::rand({3, 4}).storage()); + IValue ivEmpty(emptyStorage); + IValue ivNonempty(nonemptyStorage); + + ASSERT_TRUE(ivEmpty.isStorage()); + ASSERT_TRUE(ivNonempty.isStorage()); + ASSERT_EQ(emptyStorage.unsafeGetStorageImpl(), ivEmpty.toStorage().unsafeGetStorageImpl()); + ASSERT_EQ(nonemptyStorage.unsafeGetStorageImpl(), ivNonempty.toStorage().unsafeGetStorageImpl()); +} + TEST(IValueTest, ComplexDict) { typedef c10::complex c_type; c10::Dict m; @@ -102,21 +114,70 @@ TEST(IValueTest, ComplexDict) { ASSERT_EQ(m_.at(num1), 2 * num1); ASSERT_EQ(m_.at(num2), 2 * num2); } -static std::array makeSampleIValues() { - return { at::rand({3, 4}), "hello", 42, true, 1.5 }; -} -static std::array makeMoreSampleIValues() { - return { at::rand({3, 4}), "goodbye", 23, false, 0.5 }; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) +static std::array makeSampleIValues() { + return { + IValue(), + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + at::rand({3, 4}), + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + at::rand({3, 4}).storage(), + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + 1.5, + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + c10::complex(2.5, -0.5), + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + 42, + true, + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + std::make_tuple(23, "hello"), + "hello", + c10::make_intrusive(), + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + c10::List({1, 2, 3}), + c10::Dict(), + c10::make_intrusive(FloatType::get()), + c10::Device(c10::DeviceType::CPU, 0), + c10::Stream(c10::Stream::DEFAULT, c10::Device(c10::DeviceType::CPU, 0)), + c10::make_intrusive(c10::StrongTypePtr(nullptr, ClassType::create("class1", {})), 1), + }; } +// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) +static std::array makeMoreSampleIValues() { + return { + IValue(), + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + at::rand({3, 4}), + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + at::rand({3, 4}).storage(), + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + 2.5, + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + c10::complex(2.7, -0.3), + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + 43, + false, + std::make_tuple(1, "goodbye"), + "goodbye", + c10::make_intrusive(), + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + c10::List({4, 5, 6}), + c10::Dict(), + c10::make_intrusive(IntType::get()), + c10::Device(c10::DeviceType::CUDA, 2), + c10::Stream(c10::Stream::DEFAULT, c10::Device(c10::DeviceType::CUDA, 1)), + c10::make_intrusive(c10::StrongTypePtr(nullptr, ClassType::create("class2", {})), 2), + };} + // IValue::operator== doesn't seem to work on Tensors. #define EXPECT_IVALUE_EQ(a, b) \ EXPECT_EQ((a).isTensor(), (b).isTensor()); \ if ((a).isTensor()) { \ - EXPECT_TRUE(a.toTensor().equal(b.toTensor())); \ + EXPECT_TRUE((a).toTensor().equal((b).toTensor())); \ } else { \ - EXPECT_EQ(a, b); \ + EXPECT_EQ((a), (b)); \ } TEST(IValueTest, Swap) { @@ -580,13 +641,31 @@ TEST(IValueTest, IdentityComparisonAndHashing) { ASSERT_EQ(sampleIValues.size(), moreSampleIValues.size()); for (int ii = 0; ii < sampleIValues.size(); ++ii) { - // Constant strings will have the same pointer value. - if (sampleIValues[ii].isPtrType() && !sampleIValues[ii].isString()) { - EXPECT_NE(sampleIValues[ii].hash(), sampleIValues2[ii].hash()); - } else { - EXPECT_EQ(sampleIValues[ii].hash(), sampleIValues2[ii].hash()); + if (sampleIValues[ii].isComplexDouble() || + sampleIValues[ii].isBlob() || + sampleIValues[ii].isList() || + sampleIValues[ii].isFuture() || + sampleIValues[ii].isStream() || + sampleIValues[ii].isObject() || + sampleIValues[ii].isGenericDict()) { + // Not hashable. + continue; + } + // Tuples may or may not have the same hash across instantiations. + if (!sampleIValues[ii].isTuple()) { + // Constant strings will have the same pointer value. + if (sampleIValues[ii].isPtrType() && !sampleIValues[ii].isString()) { + EXPECT_NE(sampleIValues[ii].hash(), sampleIValues2[ii].hash()) + << " at index " << ii; + } else { + EXPECT_EQ(sampleIValues[ii].hash(), sampleIValues2[ii].hash()) + << " at index " << ii; + } + } + if (!sampleIValues[ii].isNone() && !moreSampleIValues[ii].isNone()) { + EXPECT_NE(sampleIValues[ii].hash(), moreSampleIValues[ii].hash()) + << " at index " << ii; } - EXPECT_NE(sampleIValues[ii].hash(), moreSampleIValues[ii].hash()); } } @@ -656,5 +735,13 @@ TEST(IValueTest, ScalarBool) { EXPECT_TRUE(actual.toBool()); } +TEST(IValueTest, ToWeakAndBack) { + auto sampleInputs = makeSampleIValues(); + for (const auto& sample: sampleInputs) { + WeakIValue weak(sample); + EXPECT_IVALUE_EQ(sample, weak.lock()); + } +} + // TODO(gmagogsfm): Add type conversion test? } // namespace c10 From 885e312ce05b51294b27737347f736d632d8d8e6 Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Thu, 19 Aug 2021 10:16:26 -0700 Subject: [PATCH 069/530] Add permute021 fx2trt converter (#63238) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63238 Reviewed By: yinghai Differential Revision: D30295373 fbshipit-source-id: 2a189fe485edaa978fd03e4b8d8582edb34ec648 --- torch/fx/experimental/fx2trt/fx2trt.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py index 9879fd7e0952e..0e7cc24c18be5 100644 --- a/torch/fx/experimental/fx2trt/fx2trt.py +++ b/torch/fx/experimental/fx2trt/fx2trt.py @@ -302,8 +302,9 @@ def validate_conversion(self): missing_converter.add(f"{node.op} {node.target}") elif node.op == "call_module": submod = self.fetch_attr(node.target) - if not CONVERTERS.get(type(submod)): - missing_converter.add(f"{node.op} {type(submod)}") + submod_type = getattr(submod, "_base_class_origin", type(submod)) + if not CONVERTERS.get(submod_type): + missing_converter.add(f"{node.op} {submod_type}") return missing_converter @@ -373,12 +374,11 @@ def placeholder(self, target, args, kwargs): def call_module(self, target, args, kwargs): assert isinstance(target, str) submod = self.fetch_attr(target) - converter = CONVERTERS.get(type(submod)) + submod_type = getattr(submod, "_base_class_origin", type(submod)) + converter = CONVERTERS.get(submod_type) if not converter: - raise RuntimeError( - f"Conversion of module of type {type(submod)} not currently supported!" - ) + raise RuntimeError(f'Conversion of module of type {submod_type} not currently supported!') return converter(self.network, submod, args, kwargs, self._cur_node_name) From 3aa4521fe8151fd9e072a79b44d34b725e00550e Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Thu, 19 Aug 2021 10:16:26 -0700 Subject: [PATCH 070/530] [hpc] use fx2trt for exploration track (#63535) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63535 Reviewed By: yinghai, jianyuh Differential Revision: D30272810 fbshipit-source-id: 61f3edf2a2282cd8c268a92acf92feb05a6ae3e1 --- .../fx2trt/converters/acc_ops_converters.py | 124 +++++++++++++++++- torch/fx/experimental/fx_acc/acc_ops.py | 16 +++ 2 files changed, 133 insertions(+), 7 deletions(-) diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index 88a74fe9e32c0..e7fcb94475b3e 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -842,6 +842,77 @@ def acc_ops_reshape(network, target, args, kwargs, name): layer.name = name return layer.get_output(0) +@tensorrt_converter(acc_ops.slice_tensor) +def acc_ops_slice_tensor(network, target, args, kwargs, name): + input_val = kwargs["input"] + + if not isinstance(input_val, trt.tensorrt.ITensor): + raise RuntimeError(f"slice_tensor received input {input_val} that is not part " + "of the TensorRT region!") + + dims = kwargs["dims"] + if network.has_implicit_batch_dimension: + if not len(dims): + raise RuntimeError("dim argument cannot be empty!") + if any([dim == 0 for dim in dims]): + raise RuntimeError( + f"We do not support slice_tensor at batch dim when it's implicit, got {dims}!" + ) + dims = [d - 1 for d in dims] + else: + raise RuntimeError("We don't support slice_tensor with explicit batch dimension yet!") + + start = [0] * len(input_val.shape) + stride = [1] * len(start) + output_shape = list(input_val.shape) + starts = kwargs["starts"] + stops = kwargs["stops"] + steps = kwargs["steps"] + + for i, dim in enumerate(dims): + start[dim] = starts[i] + stride[dim] = steps[i] + output_shape[dim] = (stops[i] - start[i]) // steps[i] + + layer = network.add_slice(input_val, start=start, shape=output_shape, stride=stride) + layer.name = name + return layer.get_output(0) + +@tensorrt_converter(acc_ops.split) +def acc_ops_split(network, target, args, kwargs, name): + input_val = kwargs["input"] + + if not isinstance(input_val, trt.tensorrt.ITensor): + raise RuntimeError(f"split received input {input_val} that is not part " + "of the TensorRT region!") + + dim = kwargs["dim"] + if network.has_implicit_batch_dimension: + assert dim != 0, "Can't split on batch dim when it's implicit!" + dim -= 1 + else: + raise RuntimeError("We don't support split with explicit batch dimension yet!") + + split_size = kwargs["split_size"] + start = [0] * len(input_val.shape) + stride = [1] * len(start) + offset = 0 + num_splits = (input_val.shape[dim] + split_size - 1) // split_size + if num_splits < 1: + raise RuntimeError(f"Invalid split: {input_val.shape[dim]} wuth split_size={split_size}") + + max_offset = input_val.shape[dim] + # add slice layers + output = [] + for i in range(num_splits): + shape = list(input_val.shape) + shape[dim] = min(split_size, max_offset - offset) + start[dim] = offset + layer = network.add_slice(input_val, start=start, shape=shape, stride=stride) + offset += split_size + layer.name = f"{name}_{i}" + output.append(layer.get_output(0)) + return output @tensorrt_converter(acc_ops.linear) def acc_ops_linear(network, target, args, kwargs, name): @@ -859,13 +930,42 @@ def acc_ops_linear(network, target, args, kwargs, name): "dim for linear and it can't be the last dim." ) - # add matrix multiply and add - weight = get_trt_tensor(network, kwargs["weight"], f"{name}_linear_weight", squeeze_vector=False) - output = add_matrix_multiply_layer(network, input_val, weight, f"{name}_linear_mm", transpose_other=True) - if kwargs["bias"] is not None: - return add_binary_elementwise_layer(network, output, kwargs["bias"], trt.ElementWiseOperation.SUM, f"{name}_linear_add") + weight = kwargs["weight"] + + # For quantization, weight here would be a trt tensor because it goes through + # quant + dequant. In this case, we need to use matmul + add because fully_connected + # can't take non-constant weight. + # TODO: Need to benchmark the performance of lowering linear as fully_connected versus + # lowering as matmul + add. TensorRT documentation suggests to always lower it as + # matmul + add but we found in some cases this results in performance regression compared + # with lowering to fully_connected layer. + if isinstance(weight, torch.Tensor): + layer = network.add_shuffle(input_val) + layer.reshape_dims = tuple(input_val.shape) + (1, 1) + layer.name = f"{name}_pre_shuffle" + + # add fully connected + layer = network.add_fully_connected( + input=layer.get_output(0), + num_outputs=kwargs["weight"].shape[0], + kernel=to_numpy(kwargs["weight"]), + bias=to_numpy(kwargs["bias"]), + ) + layer.name = f"{name}_linear" + + # reshape back + layer = network.add_shuffle(layer.get_output(0)) + layer.reshape_dims = tuple(input_val.shape[:-1]) + (kwargs["weight"].shape[0],) + layer.name = f"{name}_post_shuffle" + + return layer.get_output(0) else: - return output + # add matrix multiply and add + output = add_matrix_multiply_layer(network, input_val, weight, f"{name}_linear_mm", transpose_other=True) + if kwargs["bias"] is not None: + return add_binary_elementwise_layer(network, output, kwargs["bias"], trt.ElementWiseOperation.SUM, f"{name}_linear_add") + else: + return output def add_clamp(network, input, val, op): @@ -910,6 +1010,16 @@ def acc_ops_clamp(network, target, args, kwargs, name): return input_val +@tensorrt_converter(acc_ops.tuple_construct) +def acc_ops_tuple_construct(network, target, args, kwargs, name): + return kwargs["tensors"] + + +@tensorrt_converter(acc_ops.contiguous) +def acc_ops_contiguous(network, target, args, kwargs, name): + return kwargs["input"] + + @tensorrt_converter(acc_ops.getitem) def acc_ops_getitem(network, target, args, kwargs, name): input_val = kwargs["input"] @@ -951,7 +1061,7 @@ def slice_to_trt_params(py_slice, dim_size): batch_subscript = slices[0] if batch_subscript != slice(None, None, None): raise RuntimeError( - f"Can't subscript batch dimension when it's implicit. Got {slices}" + f"{name}: Can't subscript batch dimension when it's implicit. Got {slices}" ) # Remove batch_dim subscript diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py index 9b2c7f95e0000..7c9520660ef77 100644 --- a/torch/fx/experimental/fx_acc/acc_ops.py +++ b/torch/fx/experimental/fx_acc/acc_ops.py @@ -162,6 +162,7 @@ def add(*, input, other): return input + other +@register_acc_op_mapping(op_and_target=("call_method", "unsqueeze")) @register_acc_op_mapping(op_and_target=("call_function", torch.unsqueeze)) @register_acc_op def unsqueeze(*, input, dim): @@ -222,6 +223,12 @@ def transpose(*, input, dim0, dim1): return torch.transpose(**locals()) +@register_acc_op_mapping(op_and_target=("call_method", "contiguous")) +@register_acc_op +def contiguous(*, input): + return input.contiguous() + + @register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.softmax)) @register_acc_op def softmax(*, input, dim, dtype): @@ -873,6 +880,15 @@ def slice_tensor(*, input, dims, starts, stops, steps): ("length", "length"), ], ) +@register_custom_acc_mapper_fn( + op_and_target=("call_method", "narrow"), + arg_replacement_tuples=[ + ("input", "input"), + ("dim", "dim"), + ("start", "start"), + ("length", "length"), + ], +) def custom_narrow_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node: kwargs = { "input": node.kwargs["input"], From e030b813569c0fa89d527b8fe31c5e5fd31753d8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 19 Aug 2021 10:37:31 -0700 Subject: [PATCH 071/530] [easy] Fix missing move in TupleType::createNamed (#61572) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61572 ghstack-source-id: 136161829 Test Plan: CI Reviewed By: SplitInfinity Differential Revision: D29672872 fbshipit-source-id: d8ba2d54f7914dbeb3fc52aa21dd77025951c4b5 --- aten/src/ATen/core/type.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 4214f4d3e1f6f..6bfba7b6d181a 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -802,7 +802,7 @@ TupleTypePtr TupleType::createNamed(const c10::optional& qua auto schema = std::make_shared( /*name=*/qualName.value_or(c10::QualifiedName()).name(), /*overload_name=*/std::string(""), - /*arguments=*/arguments, + /*arguments=*/std::move(arguments), /*returns=*/std::vector{}); return std::shared_ptr(new TupleType( field_types, qualName, schema)); // NOLINT(modernize-make-shared) From bd8608cd5cdaa69384b11f1253b4bcc822032a51 Mon Sep 17 00:00:00 2001 From: driazati Date: Thu, 19 Aug 2021 10:38:41 -0700 Subject: [PATCH 072/530] Use CMake for breakpad (#63186) Summary: We currently build breakpad from [this fork](https://github.com/driazati/breakpad) to include extra logic to restore signal handlers that were previously present. With some [new additions](https://github.com/google/breakpad/compare/main...driazati:main) this fork now includes a CMake based build, so we can add breakpad as a proper dependency rather than rely on including it in Docker images as a system library which is error prone (we have a bunch of images) and hard to extend to MacOS / Windows. This also includes some changes to the crash handling code to support MacOS / Windows in a similar way to Linux. ```python import torch # On Windows this writes crashes to C:\Users\\AppData\pytorch_crashes # On MacOS/Linux this writes crashes to /tmp/pytorch_crashes torch.utils._crash_handler.enable_minidumps() # Easy way to cause a segfault and trigger the handler torch.bincount(input=torch.tensor([9223372036854775807])) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/63186 Reviewed By: malfet, seemethere Differential Revision: D30318404 Pulled By: driazati fbshipit-source-id: 0d7daf3701cfaba5451cc529a0730272ab1eb1dc --- .gitmodules | 3 + CMakeLists.txt | 5 ++ caffe2/CMakeLists.txt | 25 ++----- cmake/Dependencies.cmake | 4 ++ cmake/Summary.cmake | 1 + test/test_cpp_extensions_jit.py | 65 +++++++++++------- test/test_utils.py | 5 +- third_party/breakpad | 1 + torch/csrc/utils/crash_handler.cpp | 87 ++++++++++++++++++++----- torch/csrc/utils/crash_handler.h | 10 ++- torch/testing/_internal/common_utils.py | 23 +++++-- torch/utils/_crash_handler.py | 5 +- 12 files changed, 162 insertions(+), 72 deletions(-) create mode 160000 third_party/breakpad diff --git a/.gitmodules b/.gitmodules index 6836ccb49c881..a7cc437f43840 100644 --- a/.gitmodules +++ b/.gitmodules @@ -139,3 +139,6 @@ [submodule "third_party/pocketfft"] path = third_party/pocketfft url = https://github.com/mreineck/pocketfft +[submodule "third_party/breakpad"] + path = third_party/breakpad + url = https://github.com/driazati/breakpad.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 188f35a9981e0..4d1653ffaded3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -202,6 +202,7 @@ cmake_dependent_option( "USE_CUDNN" OFF) option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) option(USE_KINETO "Use Kineto profiling library" ON) +option(USE_BREAKPAD "Use breakpad crash dump library" ON) option(USE_CUPTI_SO "Use CUPTI as a shared library" OFF) option(USE_FAKELOWP "Use FakeLowp operators" OFF) option(USE_FFMPEG "Use ffmpeg" OFF) @@ -264,6 +265,10 @@ if(NOT DEFINED USE_VULKAN) "ANDROID" OFF) endif() +if(IOS) + set(USE_BREAKPAD OFF) +endif() + option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON) option(USE_LITE_INTERPRETER_PROFILER "Enable " ON) option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 619455421f282..523fea8181cf8 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1042,27 +1042,10 @@ if(USE_TBB) target_link_libraries(torch_cpu PUBLIC TBB::tbb) endif() - -if(LINUX) - find_library(BREAKPAD_LIB breakpad_client - PATHS /usr/local/lib/) - find_path(BREAKPAD_INCLUDE_DIR breakpad - PATHS /usr/local/include/) - - if(BREAKPAD_LIB AND BREAKPAD_INCLUDE_DIR) - message(STATUS "found breakpad library") - target_link_libraries(torch_cpu PRIVATE ${BREAKPAD_LIB}) - target_compile_definitions(torch_cpu PRIVATE ADD_BREAKPAD_SIGNAL_HANDLER) - target_include_directories(torch_cpu PRIVATE ${BREAKPAD_INCLUDE_DIR}/breakpad) - else() - if(BREAKPAD_INCLUDE_DIR) - message(STATUS "breakpad_client library not found") - elseif(BREAKPAD_LIB) - message(STATUS "breakpad include path not found") - else() - message(STATUS "breakpad_client library and include path not found") - endif() - endif() +if(USE_BREAKPAD) + target_compile_definitions(torch_cpu PRIVATE ADD_BREAKPAD_SIGNAL_HANDLER) + target_include_directories(torch_cpu PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../third_party ${CMAKE_CURRENT_LIST_DIR}/../third_party/breakpad/src) + target_link_libraries(torch_cpu PRIVATE breakpad) endif() target_include_directories(torch_cpu PRIVATE ${ATen_CPU_INCLUDE}) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 2c19dae96c909..3e37c3538f6fd 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1881,6 +1881,10 @@ set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) +if(USE_BREAKPAD) + add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/breakpad) +endif() + # ---[ Kineto # edge profiler depends on KinetoProfiler but it only does cpu # profiling. Thus we dont need USE_CUDA/USE_ROCM diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 4de2d79cb9757..afc63b18f5f07 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -178,6 +178,7 @@ function(caffe2_print_configuration_summary) message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}") endif() message(STATUS " USE_DEPLOY : ${USE_DEPLOY}") + message(STATUS " USE_BREAKPAD : ${USE_BREAKPAD}") message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}") message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}") endfunction() diff --git a/test/test_cpp_extensions_jit.py b/test/test_cpp_extensions_jit.py index 073835277e678..89d9af10e0d35 100644 --- a/test/test_cpp_extensions_jit.py +++ b/test/test_cpp_extensions_jit.py @@ -869,11 +869,29 @@ def test_custom_compound_op_autograd(self): gradcheck(torch.ops.my.add, [a, b], eps=1e-2) - @unittest.skipIf(not has_breakpad(), "Breakpad library must be present on system for crash handler") - @unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler") - def test_crash_handler(self): - def run_test(stderr_file, destination): - # Code to enable dumps and trigger a segfault + @staticmethod + def _crash_handler_test_process(stderr_file, destination): + # Code to enable dumps and trigger a segfault + if sys.platform == "win32": + destination = destination.replace("\\", "\\\\") + csrc = textwrap.dedent(f""" + #include + #include + #include + #include + #include + + int fail() {{ + std::wstring_convert> converter; + std::string narrow("{destination}"); + std::wstring wide = converter.from_bytes(narrow); + torch::crash_handler::enable_minidumps(wide.c_str()); + + volatile int* bad = nullptr; + return *bad; + }} + """) + else: csrc = textwrap.dedent(f""" #include @@ -885,29 +903,32 @@ def run_test(stderr_file, destination): }} """) - # Some special stuff to overwrite stderr for a C++ extension - # Copied from: https://stackoverflow.com/questions/8804893/redirect-stdout-from-python-for-c-calls - sys.stdout.flush() - newstdout = os.dup(2) - devnull = os.open(stderr_file, os.O_WRONLY) - os.dup2(devnull, 2) - os.close(devnull) - sys.stdout = os.fdopen(newstdout, 'w') - - module = torch.utils.cpp_extension.load_inline( - name="segfault", - cpp_sources=csrc, - functions=["fail"], - ) - module.fail() + # Some special stuff to overwrite stderr for a C++ extension + # Copied from: https://stackoverflow.com/questions/8804893/redirect-stdout-from-python-for-c-calls + sys.stdout.flush() + newstdout = os.dup(2) + devnull = os.open(stderr_file, os.O_WRONLY) + os.dup2(devnull, 2) + os.close(devnull) + sys.stdout = os.fdopen(newstdout, 'w') + module = torch.utils.cpp_extension.load_inline( + name="segfault", + cpp_sources=csrc, + functions=["fail"], + ) + module.fail() - with tempfile.TemporaryDirectory() as temp_dir, tempfile.NamedTemporaryFile() as stderr: + @unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler") + @unittest.skipIf(not has_breakpad(), "Built without breakpad") + def test_crash_handler(self): + with tempfile.TemporaryDirectory() as temp_dir, tempfile.NamedTemporaryFile(delete=not sys.platform == "win32") as stderr: # Use multiprocessing to spin up a separate process to make catching # the segfault easier - p = Process(target=run_test, args=(stderr.name, temp_dir)) + p = Process(target=self._crash_handler_test_process, args=(stderr.name, temp_dir)) p.start() p.join() + with open(stderr.name) as f: result = f.read().strip() diff --git a/test/test_utils.py b/test/test_utils.py index d0f8d10d9fbd4..6f9432e0e6392 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -19,7 +19,7 @@ import torch.hub as hub from torch.autograd._functions.utils import check_onnx_broadcast from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings -from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE, IS_WINDOWS, has_breakpad +from torch.testing._internal.common_utils import has_breakpad, load_tests, retry, IS_SANDCASTLE, IS_WINDOWS, TEST_WITH_ASAN from urllib.error import URLError # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for @@ -739,7 +739,8 @@ def forward(self, x): class TestCrashHandler(TestCase): - @unittest.skipIf(not has_breakpad(), "Crash handler lib was not linked in") + @unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler") + @unittest.skipIf(not has_breakpad(), "Built without breakpad") def test_python_exception_writing(self): with tempfile.TemporaryDirectory() as temp_dir: torch.utils._crash_handler.enable_minidumps(temp_dir) diff --git a/third_party/breakpad b/third_party/breakpad new file mode 160000 index 0000000000000..469a80ee54947 --- /dev/null +++ b/third_party/breakpad @@ -0,0 +1 @@ +Subproject commit 469a80ee54947ad8d000d33a615f1a199165a711 diff --git a/torch/csrc/utils/crash_handler.cpp b/torch/csrc/utils/crash_handler.cpp index 2de22be0d2e86..8fb318b265a83 100644 --- a/torch/csrc/utils/crash_handler.cpp +++ b/torch/csrc/utils/crash_handler.cpp @@ -3,8 +3,16 @@ #include #ifdef ADD_BREAKPAD_SIGNAL_HANDLER -#include +#ifdef __linux__ +#include #include +#elif __APPLE__ +#include +#elif _WIN32 +#include +#else +#error unsupported platform +#endif #endif #include @@ -16,9 +24,10 @@ namespace crash_handler { #ifdef ADD_BREAKPAD_SIGNAL_HANDLER static std::unique_ptr handler; // NOLINT -static std::string minidump_directory; // NOLINT +static STRING_TYPE minidump_directory; // NOLINT static bool enabled_for_exceptions = false; // NOLINT +#if __linux__ bool dump_callback( const google_breakpad::MinidumpDescriptor& descriptor, void* context, @@ -28,10 +37,45 @@ bool dump_callback( } return succeeded; } +#elif __APPLE__ -void enable_minidumps(const std::string& dir) { +bool dump_callback( + const char* dump_dir, + const char* minidump_id, + void* context, + bool succeeded) { + if (succeeded) { + std::cerr << "Wrote minidump to " << dump_dir << "/" << minidump_id + << ".dmp" << std::endl; + } + return succeeded; +} +#elif _WIN32 +bool dump_callback( + const wchar_t* dump_path, + const wchar_t* minidump_id, + void* context, + EXCEPTION_POINTERS* exinfo, + MDRawAssertionInfo* assertion, + bool succeeded) { + if (succeeded) { + // Printing with wcerr inserts spaces between all the characters for some + // reason. If someone figures that out then we can get rid of the std::string + // conversions here. + std::wstring dump_path_ws(dump_path); + std::string dump_path_string(dump_path_ws.begin(), dump_path_ws.end()); + std::wstring minidump_id_ws(minidump_id); + std::string minidump_id_string(minidump_id_ws.begin(), minidump_id_ws.end()); + std::cerr << "Wrote minidump to " << dump_path_string << "\\" << minidump_id_string << ".dmp" << std::endl; + } + return succeeded; +} +#endif + +void enable_minidumps(const STRING_TYPE& dir) { minidump_directory = dir; - // The constructor here registers the actual signal handler +// The constructor here registers the actual signal handler +#ifdef __linux__ handler = std::make_unique( google_breakpad::MinidumpDescriptor(minidump_directory), nullptr, @@ -39,13 +83,30 @@ void enable_minidumps(const std::string& dir) { nullptr, true, -1); +#elif __APPLE__ + handler = std::make_unique( + /*dump_path=*/minidump_directory.c_str(), + /*filter=*/nullptr, + /*callback=*/dump_callback, + /*callback_context=*/nullptr, + /*install_handler=*/true, + /*port_name=*/nullptr); +#elif _WIN32 + handler = std::make_unique( + /*dump_path=*/minidump_directory.c_str(), + /*filter=*/nullptr, + /*callback=*/dump_callback, + /*callback_context=*/nullptr, + /*handler_types=*/ + google_breakpad::ExceptionHandler::HandlerType::HANDLER_ALL); +#endif } void disable_minidumps() { handler.reset(); } -const std::string& get_minidump_directory() { +const STRING_TYPE& get_minidump_directory() { if (handler == nullptr) { AT_ERROR( "Minidump handler is uninintialized, make sure to call enable_minidumps first"); @@ -78,18 +139,16 @@ void enable_minidumps_on_exceptions() { #else // On unspported systems we can't do anything, so stub out everything. -void enable_minidumps(const std::string& dir) { - AT_ERROR( - "Minidump collection is currently only implemented for Linux platforms"); +void enable_minidumps(const STRING_TYPE& dir) { + AT_ERROR("Compiled without minidump support"); } void disable_minidumps() { // Purposefully do nothing } -const std::string& get_minidump_directory() { - AT_ERROR( - "Minidump collection is currently only implemented for Linux platforms"); +const STRING_TYPE& get_minidump_directory() { + AT_ERROR("Compiled without minidump support"); } bool is_enabled_on_exceptions() { @@ -97,13 +156,11 @@ bool is_enabled_on_exceptions() { } void write_minidump() { - AT_ERROR( - "Minidump collection is currently only implemented for Linux platforms"); + AT_ERROR("Compiled without minidump support"); } void enable_minidumps_on_exceptions() { - AT_ERROR( - "Minidump collection is currently only implemented for Linux platforms"); + AT_ERROR("Compiled without minidump support"); } #endif diff --git a/torch/csrc/utils/crash_handler.h b/torch/csrc/utils/crash_handler.h index 5fe0503b2ed00..dc11945195372 100644 --- a/torch/csrc/utils/crash_handler.h +++ b/torch/csrc/utils/crash_handler.h @@ -5,10 +5,16 @@ namespace torch { namespace crash_handler { +#ifdef _WIN32 +typedef std::wstring STRING_TYPE; +#else +typedef std::string STRING_TYPE; +#endif + // Set up a handler that writes minidumps to 'dir' on signals. This is not // necessary to call unless you want to change 'dir' to something other than // the default '/tmp/pytorch_crashes'. -TORCH_API void enable_minidumps(const std::string& dir); +TORCH_API void enable_minidumps(const STRING_TYPE& dir); // Enable minidumps when passing exceptions up to Python. By default these don't // do anything special, but it can be useful to write out a minidump on @@ -19,7 +25,7 @@ TORCH_API void enable_minidumps_on_exceptions(); TORCH_API void disable_minidumps(); // Get the directory that minidumps will be written to -TORCH_API const std::string& get_minidump_directory(); +TORCH_API const STRING_TYPE& get_minidump_directory(); // These are TORCH_API'ed since they are used from libtorch_python.so TORCH_API bool is_enabled_on_exceptions(); diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index fed9a005a55c5..a16056cd55cf7 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -2533,13 +2533,6 @@ def disable_gc(): else: yield -def has_breakpad() -> bool: - # If not on a special build, check that the library was actually linked in - try: - torch._C._get_minidump_directory() # type: ignore[attr-defined] - return True - except RuntimeError as e: - return False def find_library_location(lib_name: str) -> Path: # return the shared library file in the installed folder if exist, @@ -2590,6 +2583,22 @@ def get_tensors_from(args, kwargs): return set([arg for arg in args if isinstance(arg, Tensor)] + [v for v in kwargs.values() if isinstance(v, Tensor)]) + +def has_breakpad(): + # We always build with breakpad in CI + if IS_IN_CI: + return True + + # If not on a special build, check that the library was actually linked in + try: + torch._C._get_minidump_directory() # type: ignore[attr-defined] + return True + except RuntimeError as e: + if "Minidump handler is uninintialized" in str(e): + return True + return False + + def sandcastle_skip_if(condition, reason): """ Similar to unittest.skipIf, however in the sandcastle environment it just diff --git a/torch/utils/_crash_handler.py b/torch/utils/_crash_handler.py index 3d736c3f85ce0..84b345229bde9 100644 --- a/torch/utils/_crash_handler.py +++ b/torch/utils/_crash_handler.py @@ -5,11 +5,10 @@ import torch DEFAULT_MINIDUMP_DIR = "/tmp/pytorch_crashes" +if sys.platform == "win32": + DEFAULT_MINIDUMP_DIR = str(pathlib.Path.home() / "AppData" / "pytorch_crashes") def enable_minidumps(directory=DEFAULT_MINIDUMP_DIR): - if sys.platform != "linux": - raise RuntimeError("Minidump collection is currently only implemented for Linux platforms") - if directory == DEFAULT_MINIDUMP_DIR: pathlib.Path(directory).mkdir(parents=True, exist_ok=True) elif not os.path.exists(directory): From 535d44141b6a3d99eddfa241c4dfb6fc4aed7cab Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Thu, 19 Aug 2021 11:21:26 -0700 Subject: [PATCH 073/530] [7/N] Remove fork tests for RPC. (#63443) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63443 After https://github.com/pytorch/pytorch/pull/63442, all distributed tests can run with opt-asan. As a result, we can now remove all of our fork based tests. This is the first PR in a stack, which first removes fork based tests from RPC. ghstack-source-id: 136177744 Test Plan: waitforbuildbot Reviewed By: lw Differential Revision: D30384905 fbshipit-source-id: 86d438aebaa6cb02ae2a966fea244849849a1889 --- .../rpc/cuda/test_tensorpipe_agent.py | 2 - test/distributed/rpc/test_faulty_agent.py | 2 - test/distributed/rpc/test_tensorpipe_agent.py | 2 - .../_internal/distributed/rpc_utils.py | 74 +++++-------------- 4 files changed, 18 insertions(+), 62 deletions(-) diff --git a/test/distributed/rpc/cuda/test_tensorpipe_agent.py b/test/distributed/rpc/cuda/test_tensorpipe_agent.py index 5647434f6f53e..7cb35f9f73aa1 100644 --- a/test/distributed/rpc/cuda/test_tensorpipe_agent.py +++ b/test/distributed/rpc/cuda/test_tensorpipe_agent.py @@ -15,7 +15,6 @@ from torch.testing._internal.distributed.rpc_utils import ( GENERIC_CUDA_TESTS, TENSORPIPE_CUDA_TESTS, - MultiProcess, generate_tests, ) @@ -25,7 +24,6 @@ "TensorPipe", TensorPipeRpcAgentTestFixture, GENERIC_CUDA_TESTS + TENSORPIPE_CUDA_TESTS, - MultiProcess.SPAWN, __name__, ) ) diff --git a/test/distributed/rpc/test_faulty_agent.py b/test/distributed/rpc/test_faulty_agent.py index 7c26643ab6b60..cb889115be8a1 100644 --- a/test/distributed/rpc/test_faulty_agent.py +++ b/test/distributed/rpc/test_faulty_agent.py @@ -15,7 +15,6 @@ ) from torch.testing._internal.distributed.rpc_utils import ( FAULTY_AGENT_TESTS, - MultiProcess, generate_tests, ) @@ -28,7 +27,6 @@ "Faulty", FaultyRpcAgentTestFixture, FAULTY_AGENT_TESTS, - MultiProcess.SPAWN, __name__, ) ) diff --git a/test/distributed/rpc/test_tensorpipe_agent.py b/test/distributed/rpc/test_tensorpipe_agent.py index 32b0e1c69357a..b741bc443c460 100644 --- a/test/distributed/rpc/test_tensorpipe_agent.py +++ b/test/distributed/rpc/test_tensorpipe_agent.py @@ -16,7 +16,6 @@ from torch.testing._internal.distributed.rpc_utils import ( GENERIC_TESTS, TENSORPIPE_TESTS, - MultiProcess, generate_tests, ) @@ -29,7 +28,6 @@ "TensorPipe", TensorPipeRpcAgentTestFixture, GENERIC_TESTS + TENSORPIPE_TESTS, - MultiProcess.SPAWN, __name__, ) ) diff --git a/torch/testing/_internal/distributed/rpc_utils.py b/torch/testing/_internal/distributed/rpc_utils.py index b5cf9f73548c1..dd11c8dc450e0 100644 --- a/torch/testing/_internal/distributed/rpc_utils.py +++ b/torch/testing/_internal/distributed/rpc_utils.py @@ -2,13 +2,11 @@ import os import sys import unittest -from enum import Flag, auto from typing import Dict, List, Type from torch.testing._internal.common_distributed import MultiProcessTestCase from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, - TEST_WITH_TSAN, find_free_port, IS_SANDCASTLE, ) @@ -75,25 +73,12 @@ def _check_and_unset_tcp_init(): # The tests for the RPC module need to cover multiple possible combinations: # - different aspects of the API, each one having its own suite of tests; # - different agents (ProcessGroup, TensorPipe, ...); -# - and subprocesses launched with either fork or spawn. # To avoid a combinatorial explosion in code size, and to prevent forgetting to # add a combination, these are generated automatically by the code in this file. -# Here, we collect all the test suites that we need to cover and the two multi- -# processing methods. We then have one separate file for each agent, from which +# Here, we collect all the test suites that we need to cover. +# We then have one separate file for each agent, from which # we call the generate_tests function of this file, passing to it a fixture for -# the agent, which then gets mixed-in with each test suite and each mp method. - - -@unittest.skipIf(TEST_WITH_TSAN, "TSAN and fork() is broken") -class ForkHelper(MultiProcessTestCase): - def setUp(self): - super().setUp() - _check_and_set_tcp_init() - self._fork_processes() - - def tearDown(self): - _check_and_unset_tcp_init() - super().tearDown() +# the agent, which then gets mixed-in with each test suite. @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN, "Skip ASAN as torch + multiprocessing spawn have known issues" @@ -109,17 +94,6 @@ def tearDown(self): super().tearDown() -class MultiProcess(Flag): - FORK = auto() - SPAWN = auto() - - -MP_HELPERS_AND_SUFFIXES = { - MultiProcess.FORK: (ForkHelper, "WithFork"), - MultiProcess.SPAWN: (SpawnHelper, "WithSpawn"), -} - - # This list contains test suites that are agent-agnostic and that only verify # compliance with the generic RPC interface specification. These tests should # *not* make use of implementation details of a specific agent (options, @@ -175,7 +149,6 @@ def generate_tests( prefix: str, mixin: Type[RpcAgentTestFixture], tests: List[Type[RpcAgentTestFixture]], - mp_type_filter: MultiProcess, module_name: str, ) -> Dict[str, Type[RpcAgentTestFixture]]: """Mix in the classes needed to autogenerate the tests based on the params. @@ -183,36 +156,25 @@ def generate_tests( Takes a series of test suites, each written against a "generic" agent (i.e., derived from the abstract RpcAgentTestFixture class), as the `tests` args. Takes a concrete subclass of RpcAgentTestFixture, which specializes it for a - certain agent, as the `mixin` arg. Produces all combinations of them, and of - the multiprocessing start methods (fork or spawn), possibly filtered using - the `mp_type_filter`. Returns a dictionary of class names to class type + certain agent, as the `mixin` arg. Produces all combinations of them. + Returns a dictionary of class names to class type objects which can be inserted into the global namespace of the calling - module. The name of each test will be a concatenation of the `prefix` arg, - the original name of the test suite, and a suffix of either `WithFork` or - `WithSpawn`. The `module_name` should be the name of the calling module so + module. The name of each test will be a concatenation of the `prefix` arg + and the original name of the test suite. + The `module_name` should be the name of the calling module so that the classes can be fixed to make it look like they belong to it, which is necessary for pickling to work on them. """ ret: Dict[str, Type[RpcAgentTestFixture]] = {} for test_class in tests: - for mp_type in MultiProcess: - if mp_type & mp_type_filter: - mp_helper, suffix = MP_HELPERS_AND_SUFFIXES[mp_type] - if IS_SANDCASTLE: - if mp_helper == SpawnHelper and TEST_WITH_DEV_DBG_ASAN: - print( - f'Skipping test {test_class} on sandcastle for the following reason: ' - 'Skip dev-asan as torch + multiprocessing spawn have known issues', file=sys.stderr) - continue - elif mp_helper == ForkHelper and TEST_WITH_TSAN: - print( - f'Skipping test {test_class} on sandcastle for the following reason: ' - 'TSAN and fork() is broken' - ) - continue - - name = f"{prefix}{test_class.__name__}{suffix}" - class_ = type(name, (test_class, mixin, mp_helper), dict()) - class_.__module__ = module_name - ret[name] = class_ + if IS_SANDCASTLE and TEST_WITH_DEV_DBG_ASAN: + print( + f'Skipping test {test_class} on sandcastle for the following reason: ' + 'Skip dev-asan as torch + multiprocessing spawn have known issues', file=sys.stderr) + continue + + name = f"{prefix}{test_class.__name__}" + class_ = type(name, (test_class, mixin, SpawnHelper), dict()) + class_.__module__ = module_name + ret[name] = class_ return ret From 0c3904d18061ea31c9fe1bded5893ffb07f0a4b5 Mon Sep 17 00:00:00 2001 From: Yusuo Hu Date: Thu, 19 Aug 2021 12:37:58 -0700 Subject: [PATCH 074/530] [BF16] Add a missing thread local specifier to autocast_gpu_dtype (#63416) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63416 Fix a missing thread local specifier introduced by recent PR https://github.com/pytorch/pytorch/pull/61002 Test Plan: Unit Tests Reviewed By: ngimel Differential Revision: D30376154 fbshipit-source-id: c70d37ec85c3eba88eb87f766f1c4e7aeff8eaf9 --- aten/src/ATen/autocast_mode.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 97ec9ec69dbeb..1ac5ad1c88ba6 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -59,7 +59,7 @@ thread_local int nesting = 0; thread_local at::ScalarType autocast_cpu_dtype = at::kBFloat16; // autocast_gpu_dtype is the lower_precision_fp used by AutocastGPU. -at::ScalarType autocast_gpu_dtype = at::kHalf; +thread_local at::ScalarType autocast_gpu_dtype = at::kHalf; } void clear_cache() { From d986d4bf6354bc02d5e68eaaea60b02234a4449f Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Thu, 19 Aug 2021 12:40:37 -0700 Subject: [PATCH 075/530] [special] use __all__ to hide internal imports (#63135) Summary: Reference: https://github.com/pytorch/pytorch/issues/50345 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63135 Reviewed By: ngimel Differential Revision: D30364287 Pulled By: mruberry fbshipit-source-id: 20078668943fafa45ce09610634b1d2c424b1922 --- torch/special/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/torch/special/__init__.py b/torch/special/__init__.py index 1f3b3fc5dc899..2fea9c6cb1b04 100644 --- a/torch/special/__init__.py +++ b/torch/special/__init__.py @@ -1,9 +1,12 @@ -import sys - import torch from torch._C import _add_docstr, _special # type: ignore[attr-defined] from torch._torch_docs import common_args, multi_dim_common +__all__ = ['entr', 'psi', 'digamma', 'gammaln', 'polygamma', 'erf', 'erfc', 'erfinv', + 'erfcx', 'logit', 'logsumexp', 'expit', 'exp2', 'expm1', 'xlog1py', 'xlogy', + 'i0', 'i0e', 'i1', 'i1e', 'ndtr', 'ndtri', 'log1p', 'sinc', 'round', 'log_softmax', + 'zeta', 'multigammaln'] + Tensor = torch.Tensor entr = _add_docstr(_special.special_entr, From efd70b7ce6b2d3c000494ce5e527198637db5bc9 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Thu, 19 Aug 2021 12:41:42 -0700 Subject: [PATCH 076/530] Modernizes add and mul documentation (#63309) Summary: Fixes https://github.com/pytorch/pytorch/issues/39329. The documentation for torch.add and torch.mul was sorely out of date and even included deprecated references. This PR modernizes their descriptions consistent with torch.sub. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63309 Reviewed By: ngimel Differential Revision: D30338004 Pulled By: mruberry fbshipit-source-id: ee1c2a8106af8341253cafb0003b06e8f652624d --- torch/_torch_docs.py | 104 ++++++++++++------------------------------- 1 file changed, 29 insertions(+), 75 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index deee91dc5fa7b..a4f3bdaef7df7 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -214,25 +214,26 @@ def merge_dicts(*dicts): """.format(**common_args)) add_docstr(torch.add, r""" -add(input, other, *, out=None) -> Tensor +add(input, other, *, alpha=1, out=None) -> Tensor -Adds the scalar :attr:`other` to each element of the input :attr:`input` -and returns a new resulting tensor. +Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`. .. math:: - \text{{out}} = \text{{input}} + \text{{other}} + \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i +""" + r""" -If :attr:`input` is of type FloatTensor or DoubleTensor, :attr:`other` must be -a real number, otherwise it should be an integer. +Supports :ref:`broadcasting to a common shape `, +:ref:`type promotion `, and integer, float, and complex inputs. Args: {input} - other (Number): the number to be added to each element of :attr:`input` + other (Tensor or Number): the tensor or number to add to input. Keyword arguments: + alpha (Number): the multiplier for :attr:`other`. {out} -Example:: +Examples:: >>> a = torch.randn(4) >>> a @@ -240,42 +241,16 @@ def merge_dicts(*dicts): >>> torch.add(a, 20) tensor([ 20.0202, 21.0985, 21.3506, 19.3944]) -.. function:: add(input, other, *, alpha=1, out=None) -> Tensor - :noindex: - -Each element of the tensor :attr:`other` is multiplied by the scalar -:attr:`alpha` and added to each element of the tensor :attr:`input`. -The resulting tensor is returned. - -The shapes of :attr:`input` and :attr:`other` must be -:ref:`broadcastable `. - -.. math:: - \text{{out}} = \text{{input}} + \text{{alpha}} \times \text{{other}} - -If :attr:`other` is of type FloatTensor or DoubleTensor, :attr:`alpha` must be -a real number, otherwise it should be an integer. - -Args: - input (Tensor): the first input tensor - other (Tensor): the second input tensor - -Keyword args: - alpha (Number): the scalar multiplier for :attr:`other` - {out} - -Example:: - - >>> a = torch.randn(4) - >>> a - tensor([-0.9732, -0.3497, 0.6245, 0.4022]) - >>> b = torch.randn(4, 1) + >>> b = torch.randn(4) >>> b + tensor([-0.9732, -0.3497, 0.6245, 0.4022]) + >>> c = torch.randn(4, 1) + >>> c tensor([[ 0.3743], [-1.7724], [-0.5811], [-0.8017]]) - >>> torch.add(a, b, alpha=10) + >>> torch.add(b, c, alpha=10) tensor([[ 2.7695, 3.3930, 4.3672, 4.1450], [-18.6971, -18.0736, -17.0994, -17.3216], [ -6.7845, -6.1610, -5.1868, -5.4090], @@ -6640,23 +6615,24 @@ def merge_dicts(*dicts): add_docstr(torch.mul, r""" mul(input, other, *, out=None) -> Tensor -Multiplies each element of the input :attr:`input` with the scalar -:attr:`other` and returns a new resulting tensor. +Multiplies :attr:`input` by :attr:`other`. + .. math:: - \text{out}_i = \text{other} \times \text{input}_i + \text{out}_i = \text{input}_i \times \text{other}_i """ + r""" -If :attr:`input` is of type `FloatTensor` or `DoubleTensor`, :attr:`other` -should be a real number, otherwise it should be an integer + +Supports :ref:`broadcasting to a common shape `, +:ref:`type promotion `, and integer, float, and complex inputs. Args: {input} - other (Number): the number to be multiplied to each element of :attr:`input` + other (Tensor or Number) - the tensor or number to multiply input by. Keyword args: {out} -Example:: +Examples:: >>> a = torch.randn(3) >>> a @@ -6664,38 +6640,16 @@ def merge_dicts(*dicts): >>> torch.mul(a, 100) tensor([ 20.1494, -42.5491, 260.8663]) -.. function:: mul(input, other, *, out=None) -> Tensor - :noindex: - -Each element of the tensor :attr:`input` is multiplied by the corresponding -element of the Tensor :attr:`other`. The resulting tensor is returned. - -The shapes of :attr:`input` and :attr:`other` must be -:ref:`broadcastable `. - -.. math:: - \text{{out}}_i = \text{{input}}_i \times \text{{other}}_i -""".format(**common_args) + r""" - -Args: - input (Tensor): the first multiplicand tensor - other (Tensor): the second multiplicand tensor - -Keyword args: - {out} - -Example:: - - >>> a = torch.randn(4, 1) - >>> a + >>> b = torch.randn(4, 1) + >>> b tensor([[ 1.1207], [-0.3137], [ 0.0700], [ 0.8378]]) - >>> b = torch.randn(1, 4) - >>> b + >>> c = torch.randn(1, 4) + >>> c tensor([[ 0.5146, 0.1216, -0.5244, 2.2382]]) - >>> torch.mul(a, b) + >>> torch.mul(b, c) tensor([[ 0.5767, 0.1363, -0.5877, 2.5083], [-0.1614, -0.0382, 0.1645, -0.7021], [ 0.0360, 0.0085, -0.0367, 0.1567], @@ -8977,10 +8931,10 @@ def merge_dicts(*dicts): Args: {input} - other (Tensor or Scalar): the tensor or scalar to subtract from :attr:`input` + other (Tensor or Number): the tensor or number to subtract from :attr:`input`. Keyword args: - alpha (Scalar): the scalar multiplier for :attr:`other` + alpha (Number): the multiplier for :attr:`other`. {out} Example:: From 99203580a9e2bb468a9d814f36b6c6a4c13fbed4 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 19 Aug 2021 12:45:32 -0700 Subject: [PATCH 077/530] Updates internal `assert_allclose` callsites in favor of `assert_close` (#61841) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61841 Redo of #60863. Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D30408145 Pulled By: mruberry fbshipit-source-id: 0b34ebc7f23ba38ecd89640b61d8aca59b7eab58 --- benchmarks/cpp/tensorexpr/bench_ops.py | 4 +- docs/source/jit.rst | 2 +- test/mobile/test_bytecode.py | 6 +- test/mobile/test_lite_script_module.py | 18 ++--- test/quantization/core/test_quantized_op.py | 23 +++---- .../jit/test_deprecated_jit_quant.py | 22 +++--- test/test_fx.py | 6 +- test/test_fx_experimental.py | 10 +-- test/test_jit.py | 26 +++---- test/test_jit_fuser_te.py | 16 ++--- test/test_mobile_optimizer.py | 18 ++--- test/test_nn.py | 69 +++++++++---------- test/test_pruning_op.py | 2 +- test/test_reductions.py | 40 ++++++----- test/test_static_runtime.py | 26 +++---- test/test_tensorexpr.py | 8 +-- test/test_tensorexpr_pybind.py | 8 +-- test/test_throughput_benchmark.py | 3 +- test/test_torch.py | 18 ++--- test/test_xnnpack_integration.py | 30 ++++---- .../fx2trt/example/fx2trt_example.py | 2 +- torch/jit/_trace.py | 9 +-- torch/testing/_core.py | 41 ----------- torch/testing/_deprecated.py | 61 +++++++++++++++- .../testing/_internal/common_quantization.py | 6 +- .../_internal/distributed/distributed_test.py | 17 ++--- 26 files changed, 246 insertions(+), 245 deletions(-) diff --git a/benchmarks/cpp/tensorexpr/bench_ops.py b/benchmarks/cpp/tensorexpr/bench_ops.py index ca40e5d3c7459..12d766ae74862 100644 --- a/benchmarks/cpp/tensorexpr/bench_ops.py +++ b/benchmarks/cpp/tensorexpr/bench_ops.py @@ -59,7 +59,7 @@ def hardswish(x): traced(x) # Validate result. - torch.testing.assert_allclose(op(x), traced(x)) + torch.testing.assert_close(op(x), traced(x)) # Benchmark. bench_iters = 100 @@ -94,7 +94,7 @@ def test_batch_norm(): traced(x, y, z) # Validate result. - torch.testing.assert_allclose(op(x, y, z), traced(x, y, z)) + torch.testing.assert_close(op(x, y, z), traced(x, y, z)) # Benchmark. bench_iters = 100 diff --git a/docs/source/jit.rst b/docs/source/jit.rst index eeb0d2a2c4ac3..f791c1c687153 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -475,7 +475,7 @@ In this case, data-dependent control flow like this can be captured using #print(str(scripted_fn.graph).strip()) for input_tuple in [inputs] + check_inputs: - torch.testing.assert_allclose(fn(*input_tuple), scripted_fn(*input_tuple)) + torch.testing.assert_close(fn(*input_tuple), scripted_fn(*input_tuple)) .. testoutput:: :hide: diff --git a/test/mobile/test_bytecode.py b/test/mobile/test_bytecode.py index 5511e6a63b085..95baa86d5763e 100644 --- a/test/mobile/test_bytecode.py +++ b/test/mobile/test_bytecode.py @@ -228,7 +228,7 @@ def test_bytecode_values_for_all_backport_functions(self): # # Load model and run forward method # mobile_module = _load_for_lite_interpreter(str(tmp_input_model_path)) # mobile_module_result = mobile_module(module_input) - # torch.testing.assert_allclose(mobile_module_result, expected_mobile_module_result) + # torch.testing.assert_close(mobile_module_result, expected_mobile_module_result) # current_to_version -= 1 # # Check backport failure case @@ -270,7 +270,7 @@ def test_backport_bytecode_from_file_to_file(self): module_input = 1 mobile_module_result = mobile_module(module_input) expected_mobile_module_result = 3 * torch.ones([2, 4], dtype=torch.float64) - torch.testing.assert_allclose(mobile_module_result, expected_mobile_module_result) + torch.testing.assert_close(mobile_module_result, expected_mobile_module_result) shutil.rmtree(tmpdirname) # Check just the _backport_for_mobile_to_buffer mechanism but not the function implementations @@ -296,7 +296,7 @@ def test_backport_bytecode_from_file_to_buffer(self): module_input = 1 mobile_module_result = mobile_module(module_input) expected_mobile_module_result = 3 * torch.ones([2, 4], dtype=torch.float64) - torch.testing.assert_allclose(mobile_module_result, expected_mobile_module_result) + torch.testing.assert_close(mobile_module_result, expected_mobile_module_result) def test_get_model_ops_and_info(self): diff --git a/test/mobile/test_lite_script_module.py b/test/mobile/test_lite_script_module.py index 369371fd3279c..a86669ec574b7 100644 --- a/test/mobile/test_lite_script_module.py +++ b/test/mobile/test_lite_script_module.py @@ -48,13 +48,13 @@ def forward(self, x): mobile_module = _load_for_lite_interpreter(buffer) mobile_module_result = mobile_module(input) - torch.testing.assert_allclose(script_module_result, mobile_module_result) + torch.testing.assert_close(script_module_result, mobile_module_result) mobile_module_forward_result = mobile_module.forward(input) - torch.testing.assert_allclose(script_module_result, mobile_module_forward_result) + torch.testing.assert_close(script_module_result, mobile_module_forward_result) mobile_module_run_method_result = mobile_module.run_method("forward", input) - torch.testing.assert_allclose(script_module_result, mobile_module_run_method_result) + torch.testing.assert_close(script_module_result, mobile_module_run_method_result) def test_save_mobile_module_with_debug_info_with_trace(self): class A(torch.nn.Module): @@ -117,13 +117,13 @@ def forward(self, x): mobile_module = _load_for_lite_interpreter(buffer) mobile_module_result = mobile_module(input) - torch.testing.assert_allclose(script_module_result, mobile_module_result) + torch.testing.assert_close(script_module_result, mobile_module_result) mobile_module_forward_result = mobile_module.forward(input) - torch.testing.assert_allclose(script_module_result, mobile_module_forward_result) + torch.testing.assert_close(script_module_result, mobile_module_forward_result) mobile_module_run_method_result = mobile_module.run_method("forward", input) - torch.testing.assert_allclose(script_module_result, mobile_module_run_method_result) + torch.testing.assert_close(script_module_result, mobile_module_run_method_result) def test_find_and_run_method(self): class MyTestModule(torch.nn.Module): @@ -154,7 +154,7 @@ def forward(self, arg): bundled_inputs = mobile_module.run_method("get_all_bundled_inputs") mobile_module_result = mobile_module.forward(*bundled_inputs[0]) - torch.testing.assert_allclose(script_module_result, mobile_module_result) + torch.testing.assert_close(script_module_result, mobile_module_result) def test_method_calls_with_optional_arg(self): class A(torch.nn.Module): @@ -183,7 +183,7 @@ def forward(self, x, one: int = 1): input = torch.tensor([5]) script_module_forward_result = script_module.forward(input) mobile_module_forward_result = mobile_module.forward(input) - torch.testing.assert_allclose( + torch.testing.assert_close( script_module_forward_result, mobile_module_forward_result ) @@ -198,7 +198,7 @@ def forward(self, x, one: int = 1): # now both match again mobile_module_forward_result = mobile_module.forward(input, 2) - torch.testing.assert_allclose( + torch.testing.assert_close( script_module_forward_result, mobile_module_forward_result ) diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index d0a2dea45e8e3..6c94586d3101e 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -1617,8 +1617,8 @@ def test_qtopk(self, X, k, dim, largest, sorted): quantized_out = torch.topk(qX, k, dim=dim, largest=largest, sorted=sorted) assert(len(unquantized_out) == len(quantized_out)) - torch.testing.assert_allclose(quantized_out[0].dequantize(), unquantized_out[0]) - torch.testing.assert_allclose(quantized_out[1], unquantized_out[1]) + torch.testing.assert_close(quantized_out[0].dequantize(), unquantized_out[0]) + torch.testing.assert_close(quantized_out[1], unquantized_out[1]) @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=4, max_dims=4, min_side=1, max_side=10), @@ -1643,8 +1643,8 @@ def test_qtopk_nhwc(self, X, k, dim, largest, sorted): quantized_out = torch.topk(qX, k, dim=dim, largest=largest, sorted=sorted) assert(len(unquantized_out) == len(quantized_out)) - torch.testing.assert_allclose(quantized_out[0].dequantize(), unquantized_out[0]) - torch.testing.assert_allclose(quantized_out[1], unquantized_out[1]) + torch.testing.assert_close(quantized_out[0].dequantize(), unquantized_out[0]) + torch.testing.assert_close(quantized_out[1], unquantized_out[1]) """Tests quantize concatenation (both fused and not).""" @@ -1846,7 +1846,7 @@ def test_cat_nhwc(self, X, relu): else: out = torch.ops.quantized.cat([qX, qY], dim=1, scale=scale, zero_point=zero_point) - torch.testing.assert_allclose(out.dequantize(), ref.dequantize()) + torch.testing.assert_close(out.dequantize(), ref.dequantize()) self.assertNotEqual(out.stride(), sorted(out.stride())) @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=1, max_dims=5, @@ -3400,8 +3400,7 @@ def get_reference_result( num_embeddings, embedding_dim, include_last_offset, weights, per_sample_weights, indices, offsets) - torch.testing.assert_allclose(reference_result, result, atol=atol, - rtol=rtol) + torch.testing.assert_close(reference_result, result, atol=atol, rtol=rtol) if bit_rate == 8 or bit_rate == 4: @@ -3424,7 +3423,7 @@ def get_reference_result( per_sample_weights=per_sample_weights, compressed_indices_mapping=torch.tensor(mapping_table), include_last_offset=include_last_offset) - torch.testing.assert_allclose(reference_result, result, atol=atol, rtol=rtol) + torch.testing.assert_close(reference_result, result, atol=atol, rtol=rtol) @@ -3510,7 +3509,7 @@ def test_embedding_byte(self, num_embeddings, embedding_dim): qresult = quant_op(packed_weight, indices, pruned_weights=False) ref = torch.embedding(weights, indices, padding_idx=-1, scale_grad_by_freq=False, sparse=False) - torch.testing.assert_allclose(ref, qresult, atol=0.005, rtol=1e-3) + torch.testing.assert_close(ref, qresult, atol=0.005, rtol=1e-3) def test_embedding_2d_indices(self): @@ -3533,7 +3532,7 @@ def test_embedding_2d_indices(self): qweight = torch.quantize_per_channel(weights, qparams[0], qparams[1], axis=0, dtype=torch.quint8) packed_weight = prepack_op(qweight) qresult = quant_op(packed_weight, indices, pruned_weights=False) - torch.testing.assert_allclose(ref, qresult, atol=0.05, rtol=1e-3) + torch.testing.assert_close(ref, qresult, atol=0.05, rtol=1e-3) def test_embedding_bag_2d_indices(self): """ @@ -3555,7 +3554,7 @@ def test_embedding_bag_2d_indices(self): pt_prepack_op = torch.ops.quantized.embedding_bag_byte_prepack q_weights = pt_prepack_op(weights) qresult = pt_op(q_weights, indices, mode=0, pruned_weights=False) - torch.testing.assert_allclose(result, qresult, atol=0.05, rtol=1e-3) + torch.testing.assert_close(result, qresult, atol=0.05, rtol=1e-3) # Test TorchBind based embedding_bag operator obs = PerChannelMinMaxObserver(dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0) @@ -3569,7 +3568,7 @@ def test_embedding_bag_2d_indices(self): packed_weight = torch.ops.quantized.embedding_bag_prepack(qweight) qresult = torch.ops.quantized.embedding_bag_byte(packed_weight, indices, mode=0) - torch.testing.assert_allclose(result, qresult, atol=0.05, rtol=1e-3) + torch.testing.assert_close(result, qresult, atol=0.05, rtol=1e-3) class TestQuantizedConv(TestCase): diff --git a/test/quantization/jit/test_deprecated_jit_quant.py b/test/quantization/jit/test_deprecated_jit_quant.py index 662ead35bcf01..68ddb5c346a49 100644 --- a/test/quantization/jit/test_deprecated_jit_quant.py +++ b/test/quantization/jit/test_deprecated_jit_quant.py @@ -99,7 +99,7 @@ def forward(self, x: torch.Tensor, hiddens: torch.Tensor) -> torch.Tensor: self.assertEqual(len(outs), len(ref_outs)) for out, ref_out in zip(outs, ref_outs): - torch.testing.assert_allclose(out, ref_out) + torch.testing.assert_close(out, ref_out) @skipIfNoFBGEMM def test_rnn_quantized(self): @@ -165,32 +165,32 @@ def test_rnn_quantized(self): # Compare int8 quantized to unquantized output_int8, final_hiddens_int8 = cell_int8(x, hiddens) - torch.testing.assert_allclose(output_int8, ref_out) + torch.testing.assert_close(output_int8, ref_out) for out, ref in zip(final_hiddens_int8, ref_hid): - torch.testing.assert_allclose(out, ref) + torch.testing.assert_close(out, ref) # Compare fp16 quantized to unquantized output_fp16, final_hiddens_fp16 = cell_fp16(x, hiddens) - torch.testing.assert_allclose(output_fp16, ref_out) + torch.testing.assert_close(output_fp16, ref_out) for out, ref in zip(final_hiddens_fp16, ref_hid): - torch.testing.assert_allclose(out, ref) + torch.testing.assert_close(out, ref) def compare_quantized_unquantized(ScriptWrapper, cell): wrapper = ScriptWrapper(cell) # Compare quantize scripted module to unquantized script_out, script_hid = wrapper(x, hiddens) - torch.testing.assert_allclose(script_out, ref_out) + torch.testing.assert_close(script_out, ref_out) for out, ref in zip(script_hid, ref_hid): - torch.testing.assert_allclose(out, ref) + torch.testing.assert_close(out, ref) # Compare export/import to unquantized export_import_wrapper = self.getExportImportCopyWithPacking(wrapper) ei_out, ei_hid = export_import_wrapper(x, hiddens) - torch.testing.assert_allclose(ei_out, ref_out) + torch.testing.assert_close(ei_out, ref_out) for out, ref in zip(ei_hid, ref_hid): - torch.testing.assert_allclose(out, ref) + torch.testing.assert_close(out, ref) if isinstance(cell, torch.jit.quantized.QuantizedGRU): class ScriptWrapper(torch.jit.ScriptModule): @@ -252,8 +252,8 @@ def forward(self, x): fb_fp16 = self.getExportImportCopyWithPacking(traced_fp16) y_fp16 = fb_fp16(value) - torch.testing.assert_allclose(y_int8, y_ref, rtol=0.0001, atol=1e-3) - torch.testing.assert_allclose(y_fp16, y_ref, rtol=0.0001, atol=1e-3) + torch.testing.assert_close(y_int8, y_ref, rtol=0.0001, atol=1e-3) + torch.testing.assert_close(y_fp16, y_ref, rtol=0.0001, atol=1e-3) @skipIfNoFBGEMM def test_erase_class_tensor_shapes(self): diff --git a/test/test_fx.py b/test/test_fx.py index e39469d0a0676..c55e97dc7da84 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -593,17 +593,17 @@ def __init__(self, interpreter): x = torch.rand(3, 4) ref_out = msm(x) test_out = lowered(x) - torch.testing.assert_allclose(test_out, ref_out) + torch.testing.assert_close(test_out, ref_out) # Test TorchScript compilation scripted_lowered = torch.jit.script(lowered) script_out = scripted_lowered(x) - torch.testing.assert_allclose(script_out, ref_out) + torch.testing.assert_close(script_out, ref_out) # Test TorchScript ser/de import_copy = self.getExportImportCopy(scripted_lowered) imported_out = import_copy(x) - torch.testing.assert_allclose(imported_out, ref_out) + torch.testing.assert_close(imported_out, ref_out) def test_reserved_getattr(self): """Ensure that we do not name any nodes with a reserved builtin like `getattr`""" diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 00f3201452964..f000b0af59598 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -876,7 +876,7 @@ def forward(self, x, y): traced = symbolic_trace(WrapperMod()) normalized = NormalizeOperators(traced).transform() x, y = torch.randn(3, 4), torch.randn(3, 4) - torch.testing.assert_allclose(traced(x, y), normalized(x, y)) + torch.testing.assert_close(traced(x, y), normalized(x, y)) self.assertFalse( any(n.target in ops_to_test for n in normalized.graph.nodes) ) @@ -891,7 +891,7 @@ def forward(self, x): traced = symbolic_trace(WrapperMod()) normalized = NormalizeOperators(traced).transform() x = torch.randn(3, 4) - torch.testing.assert_allclose(traced(x), normalized(x)) + torch.testing.assert_close(traced(x), normalized(x)) self.assertFalse( any(n.target in ops_to_test for n in normalized.graph.nodes) ) @@ -1413,12 +1413,12 @@ def forward(self, x): with torch.no_grad(): model = Foo().eval() optimized_model = optimization.optimize_for_inference(model) - torch.testing.assert_allclose(model(inp), optimized_model(inp)) + torch.testing.assert_close(model(inp), optimized_model(inp)) optimized_model2 = optimization.optimize_for_inference( model, pass_config={"remove_dropout": False} ) - torch.testing.assert_allclose(model(inp), optimized_model2(inp)) + torch.testing.assert_close(model(inp), optimized_model2(inp)) @skipIfNoTorchVision @skipIfNoMkldnn @@ -1450,7 +1450,7 @@ def test_optimize_for_inference_cpu_torchvision(self): orig_out = model(inp) new_out = optimized_model(inp) - torch.testing.assert_allclose(orig_out, new_out) + torch.testing.assert_close(orig_out, new_out) class TestNormalizeOperators(JitTestCase): diff --git a/test/test_jit.py b/test/test_jit.py index 99df960da5dc4..2dd0d4764c46c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -497,7 +497,7 @@ def forward(self, a, b, c): FileCheck().check_not("aten::relu(") \ .check("aten::_add_relu(") \ .run(m.graph) - torch.testing.assert_allclose(orig_res, new_res) + torch.testing.assert_close(orig_res, new_res) # add, relu_ a = torch.rand((7, 11)) @@ -516,7 +516,7 @@ def forward(self, a, b, c): FileCheck().check_not("aten::relu_(") \ .check("aten::_add_relu(") \ .run(m.graph) - torch.testing.assert_allclose(orig_res, new_res) + torch.testing.assert_close(orig_res, new_res) class Madd_(torch.nn.Module): def __init__(self, relu_op): @@ -547,10 +547,10 @@ def forward(self, a, b): .check_not("aten::relu_(") \ .check("aten::_add_relu_(") \ .run(m.graph) - torch.testing.assert_allclose(orig_res, new_res) + torch.testing.assert_close(orig_res, new_res) # Since _add_relu_ does inplace mutation ensure # a_copy is modified - torch.testing.assert_allclose(orig_res, a_copy) + torch.testing.assert_close(orig_res, a_copy) class Madd_out(torch.nn.Module): def __init__(self, relu_op): @@ -585,10 +585,10 @@ def forward(self, a, b): .check_not("aten::relu_(") \ .check("aten::_add_relu(") \ .run(m.graph) - torch.testing.assert_allclose(orig_res, new_res) + torch.testing.assert_close(orig_res, new_res) # Since _add_relu_ with out=a does inplace mutation ensure # a_copy is modified - torch.testing.assert_allclose(orig_res, a_copy) + torch.testing.assert_close(orig_res, a_copy) @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "Simple executor doesn't have shape information") def test_peephole_optimize_shape_ops(self): @@ -8888,7 +8888,7 @@ def forward(self, x): def test_pack_unpack_state(self): sm = TestScript.DerivedStateModule() x = torch.rand(3, 4, dtype=torch.float) - torch.testing.assert_allclose(sm(x), x + torch.neg(torch.ones(3, 4, dtype=torch.float))) + torch.testing.assert_close(sm(x), x + torch.neg(torch.ones(3, 4, dtype=torch.float))) # Test save path self.assertFalse(sm.pack_called.item()) @@ -8899,11 +8899,11 @@ def test_pack_unpack_state(self): # ensure unpack was called after serialization so as to leave the module in an initialized state self.assertTrue(sm.unpack_called.item()) - torch.testing.assert_allclose(sm.derived, torch.neg(sm.param)) + torch.testing.assert_close(sm.derived, torch.neg(sm.param)) # Test load paths self.assertTrue(imported.unpack_called.item()) - torch.testing.assert_allclose(imported(x), x + torch.neg(torch.ones(3, 4, dtype=torch.float))) + torch.testing.assert_close(imported(x), x + torch.neg(torch.ones(3, 4, dtype=torch.float))) @unittest.skipIf(not TEST_MKL, "PyTorch is built without MKL support") def test_torch_functional(self): @@ -9101,11 +9101,11 @@ def forward(self, x): return self.submod(x + self.buf) m = Mod() - torch.testing.assert_allclose(m(torch.zeros(3, 4)), torch.ones(3, 4) * 6) + torch.testing.assert_close(m(torch.zeros(3, 4)), torch.ones(3, 4) * 6) m.apply(lambda s: s._pack()) - torch.testing.assert_allclose(m(torch.zeros(3, 4)), torch.zeros(3, 4)) + torch.testing.assert_close(m(torch.zeros(3, 4)), torch.zeros(3, 4)) m.apply(lambda s: s._unpack()) - torch.testing.assert_allclose(m(torch.zeros(3, 4)), torch.ones(3, 4) * 6) + torch.testing.assert_close(m(torch.zeros(3, 4)), torch.ones(3, 4) * 6) def test_torch_any(self): def fn(x): @@ -10958,7 +10958,7 @@ def forward(self, x): torch._C._jit_pass_remove_dropout(m._c) res = m(data) FileCheck().check_not("aten::dropout").run(str(m.graph)) - torch.testing.assert_allclose(ref_res, res, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_res, res, rtol=1e-2, atol=1e-3) def test_unfold_zero_dim(self): def fn(x): diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index ba47547256b75..64c26b7936b54 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1186,7 +1186,7 @@ def fn(input_v, mask): ref = fn(input_v, mask) try: t = torch.jit.trace(fn, (input_v, mask)) - torch.testing.assert_allclose(ref, t(input_v, mask)) + torch.testing.assert_close(ref, t(input_v, mask)) print(torch.jit.last_executed_optimized_graph()) self.assertLastGraphAllFused() except Exception as e: @@ -1287,7 +1287,7 @@ def apply(fn): continue try: t = torch.jit.trace(fn, (x,)) - torch.testing.assert_allclose(ref, t(x)) + torch.testing.assert_close(ref, t(x)) self.assertAllFused(t.graph_for(x)) except Exception as e: raise RuntimeError( @@ -1683,7 +1683,7 @@ def eager(t0, t1, t2, t3, t4): for _ in range(4): for pair in zip(script(*inputs), eager(*inputs)): test, ref = pair - torch.testing.assert_allclose(test, ref) + torch.testing.assert_close(test, ref) self.assertAllFused(script.graph_for(*inputs)) def test_sub_gt_and(self): @@ -1776,10 +1776,10 @@ def eager(x, y): one = torch.tensor([[1]]).to(dtype2) script = torch.jit.trace(eager, (x, zero)) for _ in range(3): - torch.testing.assert_allclose( + torch.testing.assert_close( script(x, zero), eager(x, zero)) - torch.testing.assert_allclose( + torch.testing.assert_close( script(x, one), eager(x, one)) self.assertAllFused(script.graph_for(x, one)) @@ -1824,7 +1824,7 @@ def _test_fwd_bwd(self, fn): xs -= 0.1 * xs.grad x.grad = None xs.grad = None - torch.testing.assert_allclose(y, ys) + torch.testing.assert_close(y, ys) def test_relu_fwd_bwd(self): def eager(x): @@ -1907,12 +1907,12 @@ def eager(x): for _ in range(3): script(x) - torch.testing.assert_allclose(eager(x), script(x)) + torch.testing.assert_close(eager(x), script(x)) # Now when an input hits the unrolled path, it will produce an # incorrectly-sized tensor, since size=1 has been burned in. x = torch.ones((8, 1)) - torch.testing.assert_allclose(eager(x), script(x)) + torch.testing.assert_close(eager(x), script(x)) works_list = [ '__radd__', diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py index 78ebb550d0227..19f07e2454488 100644 --- a/test/test_mobile_optimizer.py +++ b/test/test_mobile_optimizer.py @@ -119,7 +119,7 @@ def forward(self, x): .check_not("aten::relu(") \ .check_count("aten::_add_relu(", 1, exactly=True) \ .run(optimized_scripted_model.graph) - torch.testing.assert_allclose(initial_result, optimized_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(initial_result, optimized_result, rtol=1e-2, atol=1e-3) FileCheck().check_not("Tensor = aten::conv2d") \ .check_not("Tensor = prim::CallFunction") \ @@ -131,7 +131,7 @@ def forward(self, x): .check_not("aten::relu(") \ .check_count("aten::_add_relu(", 1, exactly=True) \ .run(optimized_scripted_model.foo.graph) - torch.testing.assert_allclose(initial_foo_result, optimized_foo_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(initial_foo_result, optimized_foo_result, rtol=1e-2, atol=1e-3) optimization_blocklist_no_prepack = {MobileOptimizerType.INSERT_FOLD_PREPACK_OPS} @@ -142,7 +142,7 @@ def forward(self, x): .check_not("prepacked::linear_clamp_run") \ .check_not("prepacked::conv2d_clamp_run") \ .run(optimized_scripted_model_no_prepack.graph) - torch.testing.assert_allclose(initial_result, optimized_result_no_prepack, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(initial_result, optimized_result_no_prepack, rtol=1e-2, atol=1e-3) bn_test_module = BNTestModule() @@ -157,14 +157,14 @@ def forward(self, x): bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blocklist_no_prepack) self.assertEqual(len(torch.jit.export_opnames(bn_fold_scripted_module)), 1) bn_input = torch.rand(1, 1, 6, 6) - torch.testing.assert_allclose(bn_scripted_module(bn_input), bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3) + torch.testing.assert_close(bn_scripted_module(bn_input), bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3) optimization_blocklist_no_fold_bn = {MobileOptimizerType.CONV_BN_FUSION} no_bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blocklist_no_fold_bn) FileCheck().check_count("aten::batch_norm", 1, exactly=True) \ .run(str(get_forward_graph(no_bn_fold_scripted_module._c))) bn_input = torch.rand(1, 1, 6, 6) - torch.testing.assert_allclose(bn_scripted_module(bn_input), no_bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3) + torch.testing.assert_close(bn_scripted_module(bn_input), no_bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3) class MyMobileOptimizedTagTest(torch.nn.Module): def __init__(self): @@ -231,7 +231,7 @@ def foo(self, x): FileCheck().check_not("dropout.__") \ .check_count("aten::_add_relu(", 1, exactly=True) \ .run(optimized_scripted_model.foo.graph) - torch.testing.assert_allclose(initial_result, optimized_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(initial_result, optimized_result, rtol=1e-2, atol=1e-3) class BNTestNoForwardModule(torch.nn.Module): def __init__(self): @@ -257,7 +257,7 @@ def foo(self, x): bn_fold_no_forward_scripted_module = optimize_for_mobile(bn_no_forward_scripted_module, preserved_methods=['foo']) self.assertEqual(len(torch.jit.export_opnames(bn_fold_no_forward_scripted_module)), 1) bn_input = torch.rand(1, 1, 6, 6) - torch.testing.assert_allclose( + torch.testing.assert_close( bn_no_forward_scripted_module.foo(bn_input), bn_fold_no_forward_scripted_module.foo(bn_input), rtol=1e-2, @@ -493,7 +493,7 @@ def _quant_script_and_optimize(model): data = torch.randn(4, 1, 4, 4) m_res = m(data) m_optim_res = m_optim(data) - torch.testing.assert_allclose(m_res, m_optim_res, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(m_res, m_optim_res, rtol=1e-2, atol=1e-3) # generic case @@ -507,7 +507,7 @@ def _quant_script_and_optimize(model): data = torch.randn(4, 1, 4, 4) m_res = m(data) m_optim_res = m_optim(data) - torch.testing.assert_allclose(m_res, m_optim_res, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(m_res, m_optim_res, rtol=1e-2, atol=1e-3) @unittest.skipUnless(HAS_TORCHVISION, "Needs torchvision") def test_mobilenet_optimize_for_mobile(self): diff --git a/test/test_nn.py b/test/test_nn.py index ccf6f6e933c10..d21e0477715db 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -4717,7 +4717,7 @@ def fc_op(X, W, b): packed_w_tensor = torch.fbgemm_pack_gemm_matrix_fp16(w_tensor) actual_output = torch.fbgemm_linear_fp16_weight(x_tensor, packed_w_tensor, b_tensor) expected_output = fc_op(X, W, b) - torch.testing.assert_allclose(expected_output, actual_output.cpu(), atol=1e-3, rtol=1e-3) + torch.testing.assert_close(torch.from_numpy(expected_output), actual_output.cpu(), atol=1e-3, rtol=1e-3) def test_embeddingbag_from_pretrained(self): a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) @@ -6797,8 +6797,7 @@ def perm_fn(x): encoder_input = torch.tensor([[[20., 30., 40., 50.]]]) result = model(encoder_input) ref_output = torch.tensor([[[2.249815, 0.131006, -0.702199, 0.177868]]]) - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output) + torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) # deterministic input encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], @@ -6806,8 +6805,7 @@ def perm_fn(x): result = model(encoder_input) ref_output = perm_fn(torch.tensor([[[2.264103, 0.121417, -0.696012, 0.159724]], [[2.264103, 0.121417, -0.696012, 0.159724]]])) - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output) + torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) # deterministic input encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], @@ -6831,8 +6829,7 @@ def perm_fn(x): [2.4237977, 0.03290575, -0.60561789, -0.05940082]], [[2.41383916, 0.02686345, -0.61256377, -0.06380707], [2.42000277, 0.03800944, -0.60824798, -0.04754947]]])) - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output) + torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) def test_transformerdecoderlayer(self): # this is a deterministic test for TransformerDecoderLayer @@ -7013,8 +7010,7 @@ def perm_fn(x): memory_input = torch.tensor([[[60., 70., 80., 90.]]]) result = model(decoder_input, memory_input) ref_output = torch.tensor([[[2.306435, 0.095946, -0.675796, 0.10687]]]) - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output) + torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) # deterministic input decoder_input = perm_fn(torch.tensor([[[9., 10., 11., 12.]], @@ -7023,8 +7019,7 @@ def perm_fn(x): result = model(decoder_input, memory_input) ref_output = perm_fn(torch.tensor([[[2.415448, 0.054389, -0.610932, -0.0156613]], [[2.415448, 0.054389, -0.610932, -0.0156613]]])) - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output) + torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) # deterministic input decoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], @@ -7034,8 +7029,7 @@ def perm_fn(x): result = model(decoder_input, memory_input) ref_output = perm_fn(torch.tensor([[[2.338531, 0.087709, -0.65776, 0.080646]], [[2.338531, 0.087709, -0.65776, 0.080646]]])) - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output) + torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) # deterministic input decoder_input = perm_fn(torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034], @@ -7061,8 +7055,7 @@ def perm_fn(x): [2.42216881, 0.03586554, -0.6067524, -0.05289126]], [[2.42205716, 0.03488046, -0.60683681, -0.05460596], [2.42240309, 0.0354595, -0.60659063, -0.05378816]]])) - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output) + torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) def test_transformerencoder(self): def get_a_test_layer(use_cuda, activation, batch_first=False): @@ -7130,13 +7123,13 @@ def perm_fn(x): [2.422901, 0.024187, -0.606178, -0.074929]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) # all 0 mask = torch.zeros([2, 5]).to(device) == 1 result = model(encoder_input, src_key_padding_mask=mask) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) mask[0, 1] = 1 mask[1, 3] = 1 mask[1, 4] = 1 @@ -7153,7 +7146,7 @@ def perm_fn(x): [2.4242, 0.024653, -0.605266, -0.074959]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) # test case 2, multiple layers no norm model = nn.TransformerEncoder(encoder_layer, 2).to(device) @@ -7170,7 +7163,7 @@ def perm_fn(x): [2.419075, 0.017449, -0.608722, -0.085014]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) model = nn.TransformerEncoder(encoder_layer, 6).to(device) result = model(encoder_input, src_key_padding_mask=mask) @@ -7186,7 +7179,7 @@ def perm_fn(x): [2.419101, 0.017453, -0.608704, -0.085025]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) # test case 3, multiple layers with norm # d_model = 4 @@ -7205,7 +7198,7 @@ def perm_fn(x): [1.695952, -0.357637, -0.893065, -0.445251]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) model = nn.TransformerEncoder(encoder_layer, 6, norm=norm).to(device) result = model(encoder_input, src_key_padding_mask=mask) @@ -7221,7 +7214,7 @@ def perm_fn(x): [1.695955, -0.357639, -0.893051, -0.445265]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) def test_transformerdecoder(self): @@ -7271,7 +7264,7 @@ def perm_fn(x): ref_output = torch.tensor( [[[2.314351, 0.094805, -0.671322, 0.101977]]]).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-3) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-3) # deterministic input decoder_input = perm_fn(torch.tensor([[[9., 10., 11., 12.]], @@ -7282,7 +7275,7 @@ def perm_fn(x): [[2.422245, 0.051716, -0.606338, -0.024756]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-4) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-4) # deterministic input decoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], @@ -7294,7 +7287,7 @@ def perm_fn(x): [[2.343536, 0.085561, -0.654954, 0.074991]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-4) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-4) # deterministic input decoder_input = perm_fn(torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034], @@ -7324,7 +7317,7 @@ def perm_fn(x): [2.432306, 0.028858, -0.599542, -0.072846]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) # key_padding_mask key_padding_mask = torch.zeros(2, 3).to(device) == 1 @@ -7338,7 +7331,7 @@ def perm_fn(x): [2.432306, 0.028858, -0.599542, -0.072846]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) # key_padding_mask key_padding_mask[0, 2] = 1 @@ -7354,7 +7347,7 @@ def perm_fn(x): [2.432659, 0.029244, -0.599294, -0.072382]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) # memory_key_padding_mask key_padding_mask = torch.zeros(2, 5).to(device) == 1 @@ -7368,7 +7361,7 @@ def perm_fn(x): [2.432306, 0.028858, -0.599542, -0.072846]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) # memory_key_padding_mask key_padding_mask[0, 4] = 1 @@ -7385,7 +7378,7 @@ def perm_fn(x): [2.433075, 0.028543, -0.598987, -0.073985]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) # multiple layers no norm model = nn.TransformerDecoder(decoder_layer, 2).to(device) @@ -7397,7 +7390,7 @@ def perm_fn(x): ref_output = torch.tensor( [[[2.31316, 0.0950293, -0.671995, 0.102802]]]).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-3) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-3) # multiple layers no norm model = nn.TransformerDecoder(decoder_layer, 6).to(device) @@ -7430,7 +7423,7 @@ def perm_fn(x): [2.43113, 0.0279516, -0.600376, -0.0736896]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) # multiple layers with norm # d_model = 4 @@ -7444,7 +7437,7 @@ def perm_fn(x): ref_output = torch.tensor( [[[1.66166, -0.326986, -1.01466, -0.320017]]]).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-3) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-3) # multiple layers with norm model = nn.TransformerDecoder(decoder_layer, 6, norm=norm).to(device) @@ -7477,7 +7470,7 @@ def perm_fn(x): [1.69571, -0.357363, -0.894154, -0.444196]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) # gelu activation test cases activation = "gelu" @@ -7495,7 +7488,7 @@ def perm_fn(x): result = model(decoder_input, memory_input) ref_output = torch.tensor([[[2.306435, 0.095946, -0.675796, 0.10687]]]).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-3) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-3) # deterministic input decoder_input = perm_fn(torch.tensor([[[9., 10., 11., 12.]], @@ -7505,7 +7498,7 @@ def perm_fn(x): ref_output = perm_fn(torch.tensor([[[2.415448, 0.054389, -0.610932, -0.0156613]], [[2.415448, 0.054389, -0.610932, -0.0156613]]])).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-4) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-4) # deterministic input decoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], @@ -7516,7 +7509,7 @@ def perm_fn(x): ref_output = perm_fn(torch.tensor([[[2.338531, 0.087709, -0.65776, 0.080646]], [[2.338531, 0.087709, -0.65776, 0.080646]]])).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-4) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-4) # deterministic input decoder_input = perm_fn(torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034], @@ -7546,7 +7539,7 @@ def perm_fn(x): [2.42240309, 0.0354595, -0.60659063, -0.05378816]]] )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) @unittest.skipIf(not (TEST_CUDNN and TEST_MULTIGPU), 'CUDNN or multi-gpu not available') def test_cudnn_rnn_dropout_states_device(self): diff --git a/test/test_pruning_op.py b/test/test_pruning_op.py index 28f31aeabd705..97a499b03ac15 100644 --- a/test/test_pruning_op.py +++ b/test/test_pruning_op.py @@ -50,7 +50,7 @@ def get_reference_result(embedding_weights, mask, indices_type): ref_pruned_weights, ref_compressed_indices_map = get_reference_result( embedding_weights, mask, indices_type) - torch.testing.assert_allclose(pt_pruned_weights, ref_pruned_weights) + torch.testing.assert_close(pt_pruned_weights, ref_pruned_weights) self.assertEqual(pt_compressed_indices_map, ref_compressed_indices_map) self.assertEqual(pt_compressed_indices_map.dtype, indices_type) diff --git a/test/test_reductions.py b/test/test_reductions.py index 42edfb3817ce1..f3f0d4c936451 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -2664,36 +2664,38 @@ def test_tensor_reduce_ops_empty(self, device): self.assertEqual(np_function(np_input, axis=-1), fn(master_input, dim=-1).cpu().numpy(), msg=error_msg, exact_dtype=False) - self.assertEqual(torch.empty((2, 0, 1), device=device), fn(master_input, dim=2, keepdim=True), msg=error_msg) + self.assertEqual(torch.empty((2, 0, 1), device=device), fn(master_input, dim=2, keepdim=True), + msg=error_msg) self.assertEqual(np_function(np_input, axis=2, keepdims=True), fn(master_input, dim=2, keepdim=True), msg=error_msg, exact_dtype=False) - self.assertEqual(torch.empty((2, 0, 1), device=device), fn(master_input, dim=-1, keepdim=True), msg=error_msg) + self.assertEqual(torch.empty((2, 0, 1), device=device), fn(master_input, dim=-1, keepdim=True), + msg=error_msg) self.assertEqual(np_function(np_input, axis=-1, keepdims=True), fn(master_input, dim=-1, keepdim=True), msg=error_msg, exact_dtype=False) - # Check if returned data is correct. - check_func = (torch.testing.assert_allclose if math.isnan(return_value) or math.isinf(return_value) else - self.assertEqual) - - check_func(torch.full((2, 4), return_value, device=device), fn(master_input, dim=1), msg=error_msg) - check_func(torch.full((2, 4), return_value, device=device), fn(master_input, dim=-2), msg=error_msg) - check_func(torch.full((2, 1, 4), return_value, device=device), fn(master_input, dim=1, keepdim=True), msg=error_msg) - check_func(torch.full((2, 1, 4), return_value, device=device), fn(master_input, dim=-2, keepdim=True), msg=error_msg) + self.assertEqual(torch.full((2, 4), return_value, device=device), fn(master_input, dim=1), msg=error_msg) + self.assertEqual(torch.full((2, 4), return_value, device=device), fn(master_input, dim=-2), msg=error_msg) + self.assertEqual(torch.full((2, 1, 4), return_value, device=device), fn(master_input, dim=1, keepdim=True), + msg=error_msg) + self.assertEqual(torch.full((2, 1, 4), return_value, device=device), fn(master_input, dim=-2, keepdim=True), + msg=error_msg) if name != 'logsumexp': # The scipy function does not work for reduction the zero dimension - check_func(np.float32(np_function(np_input, axis=1)), fn(master_input, dim=1).cpu().numpy(), msg=error_msg) - check_func(np.float32(np_function(np_input, axis=-2)), fn(master_input, dim=-2).cpu().numpy(), msg=error_msg) - check_func(np.float32(np_function(np_input, axis=1, keepdims=True)), - fn(master_input, dim=1, keepdim=True).cpu().numpy(), - msg=error_msg) - check_func(np.float32(np_function(np_input, axis=-2, keepdims=True)), - fn(master_input, dim=-2, keepdim=True).cpu().numpy(), - msg=error_msg) + self.assertEqual(np.float32(np_function(np_input, axis=1)), fn(master_input, dim=1).cpu().numpy(), + msg=error_msg) + self.assertEqual(np.float32(np_function(np_input, axis=-2)), fn(master_input, dim=-2).cpu().numpy(), + msg=error_msg) + self.assertEqual(np.float32(np_function(np_input, axis=1, keepdims=True)), + fn(master_input, dim=1, keepdim=True).cpu().numpy(), + msg=error_msg) + self.assertEqual(np.float32(np_function(np_input, axis=-2, keepdims=True)), + fn(master_input, dim=-2, keepdim=True).cpu().numpy(), + msg=error_msg) # logsumexp throws a type error when not specifying dim so test separately. - check_func(torch.full((), return_value, device=device), fn(master_input), msg=error_msg) + self.assertEqual(torch.full((), return_value, device=device), fn(master_input), msg=error_msg) else: self.assertRaises(TypeError, lambda: fn(master_input)) diff --git a/test/test_static_runtime.py b/test/test_static_runtime.py index 9b38a5a7e36a8..94043e2745626 100644 --- a/test/test_static_runtime.py +++ b/test/test_static_runtime.py @@ -186,10 +186,10 @@ def test_multihead_attention_layer(self): o_test_kw = attention_a(src, src, value=src, mask=src_mask) for a, b in zip(o_ref, o_test): - torch.testing.assert_allclose(a, b) + torch.testing.assert_close(a, b) for a, b in zip(o_ref, o_test_kw): - torch.testing.assert_allclose(a, b) + torch.testing.assert_close(a, b) def test_multihead_attention_layer_benchmark(self): HID_DIM = 256 @@ -228,20 +228,20 @@ def test_mlp(self): top_inp = torch.randn(2048, 100) # torch.Size([2048, 100]) ref_bot = bot_l(bot_inp) acc_bot = bot_l_acc(bot_inp)[0] - torch.testing.assert_allclose(acc_bot, ref_bot) + torch.testing.assert_close(acc_bot, ref_bot) ref_top = top_l(top_inp) acc_top = top_l_acc(top_inp)[0] - torch.testing.assert_allclose(acc_top, ref_top) + torch.testing.assert_close(acc_top, ref_top) for _ in range(5): with torch.no_grad(): bot_inp = torch.randn(2048, 512) # torch.Size([2048, 512]) top_inp = torch.randn(2048, 100) # torch.Size([2048, 100]) ref_bot = bot_l(bot_inp) acc_bot = bot_l_acc(bot_inp)[0] - torch.testing.assert_allclose(acc_bot, ref_bot) + torch.testing.assert_close(acc_bot, ref_bot) ref_top = top_l(top_inp) acc_top = top_l_acc(top_inp)[0] - torch.testing.assert_allclose(acc_top, ref_top) + torch.testing.assert_close(acc_top, ref_top) def test_trivial_graph(self): s = torch.full((2, 2), 2) @@ -249,7 +249,7 @@ def test_trivial_graph(self): o_ref = tg(s, s, s) tg_a = StaticModule(tg) o_test = tg_a(s, s, s)[0] - torch.testing.assert_allclose(o_ref, o_test) + torch.testing.assert_close(o_ref, o_test) def test_leaky_relu(self): s = torch.randn(5, 5) @@ -257,7 +257,7 @@ def test_leaky_relu(self): o_ref = tg(s) tg_a = StaticModule(tg) o_test = tg_a(s)[0] - torch.testing.assert_allclose(o_ref, o_test) + torch.testing.assert_close(o_ref, o_test) def test_attr(self): """ @@ -293,7 +293,7 @@ def test_attr(self): ms = torch.jit.script(m) sm = StaticModule(ms) output_sm = sm(input)[0] - torch.testing.assert_allclose(output_s, output_sm) + torch.testing.assert_close(output_s, output_sm) sm.benchmark([input], {}, 2, 2) sm.benchmark_individual_ops([input], {}, 2, 2) sm.benchmark([], {"x": input}, 2, 2) @@ -307,7 +307,7 @@ def test_fusion_trivial_graph(self): torch._C._fuse_to_static_module(tg.graph) assert "StaticSubgraph" in str(tg.graph) o_test = tg(s, s, s) - torch.testing.assert_allclose(o_ref, o_test) + torch.testing.assert_close(o_ref, o_test) @unittest.skip("Temporarily disabled") def test_fusion_multihead_attention_layer(self): @@ -332,7 +332,7 @@ def test_fusion_multihead_attention_layer(self): o_test = attention(src, src, src, src_mask) for a, b in zip(o_ref, o_test): - torch.testing.assert_allclose(a, b) + torch.testing.assert_close(a, b) @unittest.skip("Temporarily disabled") def test_fusion_loop(self): @@ -344,7 +344,7 @@ def test_fusion_loop(self): torch._C._fuse_to_static_module(lg.graph) assert "StaticSubgraph" in str(lg.graph) o_test = lg(a, b, c) - torch.testing.assert_allclose(o_ref, o_test) + torch.testing.assert_close(o_ref, o_test) @unittest.skip("Temporarily disabled") def test_fusion_outputs(self): @@ -357,7 +357,7 @@ def test_fusion_outputs(self): assert "StaticSubgraph" in str(og.graph) o_test = og(a, b, b, c) for i in o_ref.keys(): - torch.testing.assert_allclose(o_ref[i], o_test[i]) + torch.testing.assert_close(o_ref[i], o_test[i]) if __name__ == "__main__": diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 50145100abf8f..6353113a1ec4c 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -1468,7 +1468,7 @@ def getModule(script): am_s = getModule(True) ref = am(x, x, x) test = am_s(x, x, x) - torch.testing.assert_allclose(ref, test) + torch.testing.assert_close(ref, test) # Now do the aliasing am.a = am.b @@ -1477,7 +1477,7 @@ def getModule(script): am_s.a = am_s.b test = am_s(x, x, x) - torch.testing.assert_allclose(ref, test) + torch.testing.assert_close(ref, test) def test_alias_analysis_inputs(self): class AliasModule(nn.Module): @@ -1510,7 +1510,7 @@ def getModule(script): x = torch.randn(128, 128) test = am_s(x, x, x) - torch.testing.assert_allclose(ref, test) + torch.testing.assert_close(ref, test) def test_alias_analysis_input_and_module(self): class AliasModule(nn.Module): @@ -1545,7 +1545,7 @@ def getModule(script): am_s.b = x test = am_s(x, x, x) - torch.testing.assert_allclose(ref, test) + torch.testing.assert_close(ref, test) def test_multiple_outputs(self): for device in self.devices: diff --git a/test/test_tensorexpr_pybind.py b/test/test_tensorexpr_pybind.py index 4138b2f81dfda..d838892975c0c 100644 --- a/test/test_tensorexpr_pybind.py +++ b/test/test_tensorexpr_pybind.py @@ -44,7 +44,7 @@ def test_simple_sum(self): tB = torch.randn(n) tC = torch.empty(n) cg.call([tA, tB, tC]) - torch.testing.assert_allclose(tA + tB, tC) + torch.testing.assert_close(tA + tB, tC) def test_call_raw(self): with kernel_arena_scope(): @@ -55,7 +55,7 @@ def test_call_raw(self): tB = torch.randn(n, dtype=torch.float64) tC = torch.empty(n, dtype=torch.float64) cg.call_raw([tA.data_ptr(), tB.data_ptr(), tC.data_ptr()]) - torch.testing.assert_allclose(tA + tB, tC) + torch.testing.assert_close(tA + tB, tC) def test_external_calls(self): with kernel_arena_scope(): @@ -77,7 +77,7 @@ def test_external_calls(self): tB = torch.ones(4, 1) tC = torch.empty(1, 1) codegen.call([tA, tB, tC]) - torch.testing.assert_allclose(torch.matmul(tA, tB), tC) + torch.testing.assert_close(torch.matmul(tA, tB), tC) def test_dynamic_shape(self): with kernel_arena_scope(): @@ -103,7 +103,7 @@ def test_with_shape(n): tB = torch.randn(n, dtype=torch.double) tC = torch.empty(n, dtype=torch.double) cg.call([tA, tB, tC, n]) - torch.testing.assert_allclose(tA - tB, tC) + torch.testing.assert_close(tA - tB, tC) test_with_shape(8) test_with_shape(31) diff --git a/test/test_throughput_benchmark.py b/test/test_throughput_benchmark.py index 9d60344b5912b..139ca0c4cc559 100644 --- a/test/test_throughput_benchmark.py +++ b/test/test_throughput_benchmark.py @@ -1,7 +1,6 @@ import torch from torch.utils import ThroughputBenchmark -from torch.testing import assert_allclose from torch.testing._internal.common_utils import run_tests, TestCase, TemporaryFileName @@ -56,7 +55,7 @@ def linear_test(self, Module, profiler_output_path=""): # or just unpack the list of inputs module_result = module(*inputs[i]) bench_result = bench.run_once(*inputs[i]) - assert_allclose(bench_result, module_result) + torch.testing.assert_close(bench_result, module_result) stats = bench.benchmark( num_calling_threads=4, diff --git a/test/test_torch.py b/test/test_torch.py index 6766d50e6425d..515052ae5ad67 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1566,7 +1566,7 @@ def test_sobolengine_continuing(self, scramble: bool = False): n_half = len(ref_sample) // 2 _ = engine.draw(n=n_half) sample = engine.draw(n=n_half) - torch.testing.assert_allclose(sample, ref_sample[n_half:]) + torch.testing.assert_close(sample, ref_sample[n_half:]) def test_sobolengine_continuing_scrambled(self): self.test_sobolengine_continuing(scramble=True) @@ -1578,7 +1578,7 @@ def test_sobolengine_reset(self, scramble: bool = False): engine.reset() self.assertEqual(engine.num_generated, 0) sample = engine.draw(n=len(ref_sample)) - torch.testing.assert_allclose(sample, ref_sample) + torch.testing.assert_close(sample, ref_sample) def test_sobolengine_reset_scrambled(self): self.test_sobolengine_reset(scramble=True) @@ -1588,7 +1588,7 @@ def test_sobolengine_fast_forward(self, scramble: bool = False): engine = torch.quasirandom.SobolEngine(2, scramble=scramble, seed=123456) engine.fast_forward(4) sample = engine.draw(n=4) - torch.testing.assert_allclose(sample, ref_sample[4:]) + torch.testing.assert_close(sample, ref_sample[4:]) # alternate fast forwarding with sampling engine.reset() even_draws = [] @@ -1597,9 +1597,9 @@ def test_sobolengine_fast_forward(self, scramble: bool = False): even_draws.append(engine.draw()) else: engine.fast_forward(1) - torch.testing.assert_allclose( + torch.testing.assert_close( ref_sample[[i for i in range(8) if i % 2 == 0]], - np.concatenate(even_draws), + torch.from_numpy(np.concatenate(even_draws)), ) def test_sobolengine_fast_forward_scrambled(self): @@ -1609,13 +1609,13 @@ def test_sobolengine_distribution(self, scramble=False): d = 50 engine = torch.quasirandom.SobolEngine(d, scramble=scramble, seed=123456) sample = engine.draw(1024) - torch.testing.assert_allclose( + torch.testing.assert_close( torch.mean(sample, dim=0), torch.full((d,), 0.5), atol=2, rtol=2 ) - torch.testing.assert_allclose( + torch.testing.assert_close( np.percentile(sample, 25, axis=0), np.repeat(0.25, d), atol=2, rtol=2 ) - torch.testing.assert_allclose( + torch.testing.assert_close( np.percentile(sample, 75, axis=0), np.repeat(0.75, d), atol=2, rtol=2 ) @@ -2440,7 +2440,7 @@ def test_c10_layer_norm(self): actual_norm, actual_mean, actual_stdev = \ torch.ops._caffe2.LayerNorm(torch.tensor(X), torch.tensor( weight), torch.tensor(bias), 1, epsilon, True) - torch.testing.assert_allclose(expected_norm, actual_norm) + torch.testing.assert_close(expected_norm, actual_norm) def test_memory_format(self): def test_helper(x, memory_format): diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py index 4fa64e75eceb4..a0f8328ec660b 100644 --- a/test/test_xnnpack_integration.py +++ b/test/test_xnnpack_integration.py @@ -34,7 +34,7 @@ def test_linear(self, batch_size, data_shape, weight_output_dim, use_bias): ref_result = F.linear(input_data, weight, bias) packed_weight_bias = torch.ops.prepacked.linear_clamp_prepack(weight, bias) output_linearprepacked = torch.ops.prepacked.linear_clamp_run(input_data, packed_weight_bias) - torch.testing.assert_allclose(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3) @given(input_size=st.integers(2, 32), weight_output_dim=st.integers(2, 64), @@ -49,7 +49,7 @@ def test_linear_1d_input(self, input_size, weight_output_dim, use_bias): ref_result = F.linear(input_data, weight, bias) packed_weight_bias = torch.ops.prepacked.linear_clamp_prepack(weight, bias) output_linearprepacked = torch.ops.prepacked.linear_clamp_run(input_data, packed_weight_bias) - torch.testing.assert_allclose(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3) @given(batch_size=st.integers(0, 3), @@ -107,7 +107,7 @@ def test_conv2d(self, packed_weight_bias = torch.ops.prepacked.conv2d_clamp_prepack(weight, bias, strides, paddings, dilations, groups) xnnpack_result = torch.ops.prepacked.conv2d_clamp_run(input_data, packed_weight_bias) - torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) @given(batch_size=st.integers(1, 3), input_channels_per_group=st.integers(1, 32), @@ -174,7 +174,7 @@ def test_conv2d_transpose(self, output_paddings, dilations, groups) xnnpack_result = torch.ops.prepacked.conv2d_transpose_clamp_run(input_data, packed_weight_bias) - torch.testing.assert_allclose(ref_result.contiguous(), xnnpack_result.contiguous(), rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result.contiguous(), xnnpack_result.contiguous(), rtol=1e-2, atol=1e-3) @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." @@ -214,7 +214,7 @@ def forward(self, x): input_data = torch.rand(data_shape) ref_result = scripted_linear(input_data) output_linearprepacked = scripted_linear_clamp_prepacked(input_data) - torch.testing.assert_allclose(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3) # Serialize the modules and then deserialize input_data = torch.rand(data_shape) @@ -228,7 +228,7 @@ def forward(self, x): deserialized_linear_clamp_prepacked = torch.jit.load(buffer) ref_result = deserialized_linear(input_data) output_linearprepacked = deserialized_linear_clamp_prepacked(input_data) - torch.testing.assert_allclose(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3) @given(batch_size=st.integers(0, 3), input_channels_per_group=st.integers(1, 32), @@ -309,7 +309,7 @@ def forward(self, x): weight, bias, strides, paddings, dilations, groups)) ref_result = scripted_conv2d(input_data) xnnpack_result = scripted_conv2d_clamp_prepacked(input_data) - torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) # Serialize the modules and then deserialize input_data = torch.rand((batch_size, input_channels, height, width)) @@ -325,7 +325,7 @@ def forward(self, x): deserialized_conv2d_clamp_prepacked = torch.jit.load(buffer) ref_result = deserialized_conv2d(input_data) xnnpack_result = deserialized_conv2d_clamp_prepacked(input_data) - torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) @given(batch_size=st.integers(0, 3), input_channels_per_group=st.integers(1, 32), @@ -417,7 +417,7 @@ def forward(self, x): weight, bias, strides, paddings, output_paddings, dilations, groups)) ref_result = scripted_conv2d(input_data) xnnpack_result = scripted_conv2d_clamp_prepacked(input_data) - torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) # Serialize the modules and then deserialize input_data = torch.rand((batch_size, input_channels, height, width)) @@ -433,7 +433,7 @@ def forward(self, x): deserialized_conv2d_clamp_prepacked = torch.jit.load(buffer) ref_result = deserialized_conv2d(input_data) xnnpack_result = deserialized_conv2d_clamp_prepacked(input_data) - torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) @given(batch_size=st.integers(0, 3), input_channels_per_group=st.integers(1, 32), @@ -549,7 +549,7 @@ def forward(self, x): groups)) ref_result = scripted_m(input_data) xnnpack_result = scripted_m_prepacked(input_data) - torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) # Serialize the modules and then deserialize input_data = torch.rand((batch_size, input_channels, height, width)) @@ -564,7 +564,7 @@ def forward(self, x): deserialized_m_prepacked = torch.jit.load(buffer) ref_result = deserialized_m(input_data) xnnpack_result = deserialized_m_prepacked(input_data) - torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) @unittest.skipUnless(torch.backends.xnnpack.enabled, @@ -610,7 +610,7 @@ def validate_transformed_module( else: FileCheck().check_count(pattern, v, exactly=True).run(deserialized_scripted_model.graph) xnnpack_result = deserialized_scripted_model(input_data) - torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) def test_linear(self): data_shape = [2, 3, 32] @@ -965,7 +965,7 @@ def validate_transform_conv1d_to_conv2d( else: FileCheck().check_count(pattern, v, exactly=True).run(deserialized_scripted_model.graph) transformed_result = deserialized_scripted_model(input_data) - torch.testing.assert_allclose(ref_result, transformed_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, transformed_result, rtol=1e-2, atol=1e-3) optimized_buffer = io.BytesIO() torch.jit.save(optimized_scripted_model, optimized_buffer) @@ -980,7 +980,7 @@ def validate_transform_conv1d_to_conv2d( else: FileCheck().check_count(pattern, v, exactly=True).run(deserialized_optimized_scripted_model.graph) xnnpack_result = deserialized_optimized_scripted_model(input_data) - torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) def test_conv1d_basic(self): diff --git a/torch/fx/experimental/fx2trt/example/fx2trt_example.py b/torch/fx/experimental/fx2trt/example/fx2trt_example.py index fff539d3bbe99..76bf69a181ad6 100644 --- a/torch/fx/experimental/fx2trt/example/fx2trt_example.py +++ b/torch/fx/experimental/fx2trt/example/fx2trt_example.py @@ -236,7 +236,7 @@ def _find_culprit(self, mod, inputs): # Assert results are equal with the original model. rn18 = rn18.cuda() - torch.testing.assert_allclose(split_mod(x), rn18(x)) + torch.testing.assert_close(split_mod(x), rn18(x)) import time NITER = 100 diff --git a/torch/jit/_trace.py b/torch/jit/_trace.py index 069b73e847d5a..5a2f6e5e0c487 100644 --- a/torch/jit/_trace.py +++ b/torch/jit/_trace.py @@ -24,7 +24,7 @@ from torch.autograd import function from torch.nn import Module -from torch.testing._core import _get_default_tolerance +from torch.testing._asserts import _get_default_rtol_and_atol _flatten = torch._C._jit_flatten _unflatten = torch._C._jit_unflatten @@ -417,7 +417,7 @@ def graph_diagnostic_info(): check_tensor_val = n_check.t("value") try: - torch.testing.assert_allclose(mod_tensor_val, check_tensor_val) + torch.testing.assert_close(mod_tensor_val, check_tensor_val, equal_nan=True) except (RuntimeError, AssertionError) as e: if tensor_compare_errors is None: tensor_compare_errors = "" @@ -489,11 +489,12 @@ def compare_outputs(original, reference, match_what): orig = orig.to_dense() if ref.is_mkldnn: ref = ref.to_dense() - torch.testing.assert_allclose( + torch.testing.assert_close( orig.double(), ref.double(), rtol=check_tolerance, - atol=_get_default_tolerance(orig, ref)[1], + atol=_get_default_rtol_and_atol(orig, ref)[1], + equal_nan=True, ) except AssertionError as e: maybe_warn_nondeterministic() diff --git a/torch/testing/_core.py b/torch/testing/_core.py index 9a5fb0c643097..d9806150047c5 100644 --- a/torch/testing/_core.py +++ b/torch/testing/_core.py @@ -18,7 +18,6 @@ "all_types_and_complex", "all_types_and_complex_and", "all_types_and_half", - "assert_allclose", "complex_types", "empty_types", "floating_and_complex_types", @@ -246,30 +245,6 @@ def _helper(a, b, s) -> _compare_return_type: return _helper(a, b, " ") -def assert_allclose(actual, expected, rtol=None, atol=None, equal_nan=True, msg='') -> None: - if not isinstance(actual, torch.Tensor): - actual = torch.tensor(actual) - if not isinstance(expected, torch.Tensor): - expected = torch.tensor(expected, dtype=actual.dtype) - if expected.shape != actual.shape: - raise AssertionError("expected tensor shape {0} doesn't match with actual tensor " - "shape {1}!".format(expected.shape, actual.shape)) - if rtol is None or atol is None: - if rtol is not None or atol is not None: - raise ValueError("rtol and atol must both be specified or both be unspecified") - rtol, atol = _get_default_tolerance(actual, expected) - - result, debug_msg = _compare_tensors_internal(actual, expected, - rtol=rtol, atol=atol, - equal_nan=equal_nan) - - if result: - return - - if msg is None or msg == '': - msg = debug_msg - - raise AssertionError(msg) def make_non_contiguous(tensor: torch.Tensor) -> torch.Tensor: if tensor.numel() <= 1: # can't make non-contiguous @@ -406,19 +381,3 @@ def get_all_fp_dtypes(include_half=True, include_bfloat16=True) -> List[torch.dt def get_all_device_types() -> List[str]: return ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] - -# 'dtype': (rtol, atol) -_default_tolerances = { - 'float64': (1e-5, 1e-8), # NumPy default - 'float32': (1e-4, 1e-5), # This may need to be changed - 'float16': (1e-3, 1e-3), # This may need to be changed -} - - -def _get_default_tolerance(a, b=None) -> Tuple[float, float]: - if b is None: - dtype = str(a.dtype).split('.')[-1] # e.g. "float32" - return _default_tolerances.get(dtype, (0, 0)) - a_tol = _get_default_tolerance(a) - b_tol = _get_default_tolerance(b) - return (max(a_tol[0], b_tol[0]), max(a_tol[1], b_tol[1])) diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py index 7355aeea1a292..3cf7338bff889 100644 --- a/torch/testing/_deprecated.py +++ b/torch/testing/_deprecated.py @@ -5,17 +5,24 @@ import functools import warnings -from typing import Any, Callable +from typing import Any, Callable, Optional, Tuple import torch -__all__ = ["rand", "randn"] +__all__ = [ + "rand", + "randn", + "assert_allclose", +] def warn_deprecated(instructions: str) -> Callable: def outer_wrapper(fn: Callable) -> Callable: - msg = f"torch.testing.{fn.__name__} is deprecated and will be removed in the future. {instructions.strip()}" + msg = ( + f"torch.testing.{fn.__name__} is deprecated and will be removed in a future release. " + f"{instructions.strip()}" + ) @functools.wraps(fn) def inner_wrapper(*args: Any, **kwargs: Any) -> Any: @@ -29,3 +36,51 @@ def inner_wrapper(*args: Any, **kwargs: Any) -> Any: rand = warn_deprecated("Use torch.rand instead.")(torch.rand) randn = warn_deprecated("Use torch.randn instead.")(torch.randn) + + +_DTYPE_PRECISIONS = { + torch.float16: (1e-3, 1e-3), + torch.float32: (1e-4, 1e-5), + torch.float64: (1e-5, 1e-8), +} + + +def _get_default_rtol_and_atol(actual: torch.Tensor, expected: torch.Tensor) -> Tuple[float, float]: + actual_rtol, actual_atol = _DTYPE_PRECISIONS.get(actual.dtype, (0.0, 0.0)) + expected_rtol, expected_atol = _DTYPE_PRECISIONS.get(expected.dtype, (0.0, 0.0)) + return max(actual_rtol, expected_rtol), max(actual_atol, expected_atol) + + +# TODO: include the deprecation as soon as torch.testing.assert_close is stable +# @warn_deprecated( +# "Use torch.testing.assert_close instead. " +# "For detailed upgrade instructions see https://github.com/pytorch/pytorch/issues/61844." +# ) +def assert_allclose( + actual: Any, + expected: Any, + rtol: Optional[float] = None, + atol: Optional[float] = None, + equal_nan: bool = True, + msg: str = "", +) -> None: + if not isinstance(actual, torch.Tensor): + actual = torch.tensor(actual) + if not isinstance(expected, torch.Tensor): + expected = torch.tensor(expected, dtype=actual.dtype) + + if rtol is None and atol is None: + rtol, atol = _get_default_rtol_and_atol(actual, expected) + + torch.testing.assert_close( + actual, + expected, + rtol=rtol, + atol=atol, + equal_nan=equal_nan, + check_device=True, + check_dtype=False, + check_stride=False, + check_is_coalesced=False, + msg=msg or None, + ) diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index 2470b5392de11..6b2d1dd13a33d 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -975,12 +975,12 @@ def _compare_script_and_mobile(self, mobile_module_result = mobile_module(input) - torch.testing.assert_allclose(script_module_result, mobile_module_result) + torch.testing.assert_close(script_module_result, mobile_module_result) mobile_module_forward_result = mobile_module.forward(input) - torch.testing.assert_allclose(script_module_result, mobile_module_forward_result) + torch.testing.assert_close(script_module_result, mobile_module_forward_result) mobile_module_run_method_result = mobile_module.run_method("forward", input) - torch.testing.assert_allclose(script_module_result, mobile_module_run_method_result) + torch.testing.assert_close(script_module_result, mobile_module_run_method_result) except AssertionError as e: if retry == max_retry: raise e diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 2a126ab894a06..096b7182851c3 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -4119,20 +4119,13 @@ def _test_ddp_hook_parity(self, state, hook): grad_hook = net_with_hook.module.weight.grad avg_hook = grad_hook.clone() # Verify hook grad with expected. - # Cannot use exact match here due to a very small accuracy loss, - # e.g. 1e-05, for powerSGD hook case. - assert_func = ( - self.assertEqual - if hook == default.allreduce_hook - else torch.testing.assert_allclose - ) - assert_func( - avg_hook[0, 0], + self.assertEqual( + avg_hook[0, 0].item(), expected_grad, msg=f"Expected hook grad of {expected_grad} but got {avg_hook[0, 0]}", ) # Verify hook grad with vanilla allreduce - assert_func( + self.assertEqual( avg_hook[0, 0], avg[0, 0], msg=f"Expected hook grad to be close to allreduce {avg[0, 0]}, but got {avg_hook[0, 0]}", @@ -4937,8 +4930,8 @@ def test_DistributedDataParallel_SyncBatchNorm_Diff_Input_Sizes_Running_Value( model.module.running_mean, model.module.running_var, ) - torch.testing.assert_allclose(running_mean, all_input_var.mean(1)) - torch.testing.assert_allclose(running_var, all_input_var.var(1)) + torch.testing.assert_close(running_mean, all_input_var.mean(1)) + torch.testing.assert_close(running_var, all_input_var.var(1)) @sandcastle_skip_if( BACKEND != "nccl" and BACKEND != "gloo", From ce6fe50158c631f7f54b6df3ac91632cd41d48ea Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Thu, 19 Aug 2021 13:00:08 -0700 Subject: [PATCH 078/530] Revert embedding thrust->cub migration (#63451) Summary: Fixes https://github.com/pytorch/pytorch/issues/63427 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63451 Reviewed By: mruberry Differential Revision: D30398482 Pulled By: ngimel fbshipit-source-id: e153786d204215555a6571688eabae712facad7e --- aten/src/ATen/cuda/cub.cuh | 19 +---- aten/src/ATen/native/cuda/Embedding.cu | 85 ++++++++++++++----- .../native/cuda/EmbeddingBackwardKernel.cuh | 4 + aten/src/ATen/native/cuda/Indexing.cu | 3 + .../ATen/native/cuda/LegacyThrustHelpers.cu | 43 ---------- aten/src/ATen/native/cuda/Randperm.cu | 2 + aten/src/ATen/native/cuda/UniqueCub.cu | 13 ++- 7 files changed, 86 insertions(+), 83 deletions(-) diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 38e5852260f3a..62da28d34e8e5 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -3,7 +3,6 @@ #include #include #include -#include // include cub in a safe manner, see: // https://github.com/pytorch/pytorch/pull/55292 @@ -103,8 +102,6 @@ static inline void sort_keys( const key_t *keys_in, key_t *keys_out, int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8 ) { - TORCH_CHECK(n <= std::numeric_limits::max(), - "cub sort does not support sorting more than INT_MAX elements"); using key_t_ = typename detail::cuda_type::type; const key_t_ *keys_in_ = reinterpret_cast(keys_in); @@ -127,8 +124,6 @@ static inline void sort_pairs( const value_t *values_in, value_t *values_out, int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8 ) { - TORCH_CHECK(n <= std::numeric_limits::max(), - "cub sort does not support sorting more than INT_MAX elements"); using key_t_ = typename detail::cuda_type::type; auto allocator = c10::cuda::CUDACachingAllocator::get(); @@ -161,10 +156,6 @@ static inline void segmented_sort_pairs( OffsetIteratorT begin_offsets, OffsetIteratorT end_offsets, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8 ) { - TORCH_CHECK(num_elements <= std::numeric_limits::max(), - "cub sort does not support sorting more than INT_MAX elements"); - TORCH_CHECK(num_segments <= std::numeric_limits::max(), - "cub sort does not support sorting more than INT_MAX elements"); using key_t_ = typename detail::cuda_type::type; auto allocator = c10::cuda::CUDACachingAllocator::get(); @@ -314,12 +305,4 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT } } -template -inline void unique(InputIteratorT input, OutputIteratorT output, NumSelectedIteratorT num_selected_out, int64_t num_items) { - TORCH_CHECK(num_items <= std::numeric_limits::max(), - "cub unique does not support more than INT_MAX elements"); - CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceSelect::Unique, - input, output, num_selected_out, num_items, at::cuda::getCurrentCUDAStream()); -} - -}}} // namespace at::cuda::cub +}}} diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 100ffbd99388c..10a42b8914e62 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -7,9 +7,12 @@ #include #include +#include #include -#include +#include +#include +#include #include #include @@ -221,9 +224,6 @@ __global__ void renorm_kernel( } // anonymous namespace -template -void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); - Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) { @@ -272,16 +272,59 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice auto orig_indices = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); Tensor count; AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { - auto range = at::arange(num_indices, indices.options()); - int64_t nbits = cuda::cub::get_num_bits(num_weights); - cuda::cub::sort_pairs( - indices.data_ptr(), sorted_indices.data_ptr(), - range.data_ptr(), orig_indices.data_ptr(), - num_indices, false, 0, nbits); + using device_ptr = thrust::device_ptr; + + // Sort the inputs into sorted with the corresponding indices; we + // don't need a stable or multidimensional sort, so just use Thrust + // directly + { + sorted_indices.copy_(indices); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + // Fill sortedOrigIndices with sequential indices + auto count_iter = thrust::counting_iterator(0); + auto orig_data = device_ptr(orig_indices.data_ptr()); + thrust::copy(policy, count_iter, count_iter + num_indices, orig_data); + + // Sort; a stable sort is not required + auto sorted_data = device_ptr(sorted_indices.data_ptr()); + thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data, + LTOp()); + } if (scale_grad_by_freq) { count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - embedding_dense_backward_cuda_scan(sorted_indices, count); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + // Compute an increasing sequence per unique item in sortedIndices: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 1 2 3 1 2 1 1 2 + auto sorted_data = device_ptr(sorted_indices.data_ptr()); + auto count_data = device_ptr(count.data_ptr()); + thrust::inclusive_scan_by_key( + policy, + sorted_data, + sorted_data + num_indices, + thrust::make_constant_iterator(1), + count_data + ); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( + policy, + thrust::make_reverse_iterator(sorted_data + num_indices), + thrust::make_reverse_iterator(sorted_data), + thrust::make_reverse_iterator(count_data + num_indices), + thrust::make_reverse_iterator(count_data + num_indices), + thrust::equal_to(), + thrust::maximum() + ); } }); @@ -297,23 +340,23 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, checkSameGPU("embedding_renorm", self_arg, indices_arg); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_renorm_cuda_", [&] () { + using device_ptr = thrust::device_ptr; auto num_indices = indices.numel(); auto indices_contig = std::get<0>(indices.sort()).contiguous(); - auto unique_indices = at::empty(indices.numel(), indices.options()); - auto num_unique_indices = at::empty({}, indices.options().dtype(kLong)); + auto indices_data = device_ptr(indices_contig.data_ptr()); - cuda::cub::unique( - indices_contig.data_ptr(), - unique_indices.data_ptr(), - num_unique_indices.data_ptr(), - num_indices - ); + auto unique_indices = at::empty(indices.numel(), indices.options()); + auto unique_data = device_ptr(unique_indices.data_ptr()); + auto end = thrust::unique_copy(policy, indices_data, indices_data + num_indices, unique_data); + auto num_unique_indices = static_cast(end - unique_data); - dim3 grid = num_unique_indices.item(); - dim3 block = 128; + dim3 grid(num_unique_indices); + dim3 block(128); int dim = self.stride(0); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "embedding_backward", [&] { diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh index c79bf83cc8a6a..f06b850668591 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh @@ -10,6 +10,10 @@ #include #include +#include +#include +#include + #pragma once namespace at { diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 57654f2fb9b74..95ab33e512f02 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -218,6 +218,9 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List::max(), + "index_put of tensors larger than INT_MAX is not supported yet in pytorch"); + if (num_indices > 0 && sliceSize > 0) { const bool permuted = !src.is_contiguous(); auto src_ = permuted ? src.contiguous() : src; diff --git a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu index 446aa085a31d3..582dc9ebe0498 100644 --- a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu +++ b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu @@ -5,8 +5,6 @@ #include #include #include -#include -#include namespace at { namespace native { @@ -32,45 +30,4 @@ void index_put_with_sort_kernel_thrust_helper(Tensor &linearIndex, Tensor &orig_ thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data, LTOp()); } -template -void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) { - using device_ptr = thrust::device_ptr; - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); - auto policy = thrust::cuda::par(allocator).on(stream); - - auto num_indices = count.numel(); - - // Compute an increasing sequence per unique item in sortedIndices: - // sorted: 2 5 5 5 7 7 8 9 9 - // count: 1 1 2 3 1 2 1 1 2 - auto sorted_data = device_ptr(sorted_indices.data_ptr()); - auto count_data = device_ptr(count.data_ptr()); - thrust::inclusive_scan_by_key( - policy, - sorted_data, - sorted_data + num_indices, - thrust::make_constant_iterator(1), - count_data - ); - - // Take the maximum of each count per unique key in reverse: - // sorted: 2 5 5 5 7 7 8 9 9 - // count: 1 3 3 3 2 2 1 2 2 - thrust::inclusive_scan_by_key( - policy, - thrust::make_reverse_iterator(sorted_data + num_indices), - thrust::make_reverse_iterator(sorted_data), - thrust::make_reverse_iterator(count_data + num_indices), - thrust::make_reverse_iterator(count_data + num_indices), - thrust::equal_to(), - thrust::maximum() - ); -} - -template -void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); -template -void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); - }} diff --git a/aten/src/ATen/native/cuda/Randperm.cu b/aten/src/ATen/native/cuda/Randperm.cu index 56b8eb20faae6..4c5e16a1ceed0 100644 --- a/aten/src/ATen/native/cuda/Randperm.cu +++ b/aten/src/ATen/native/cuda/Randperm.cu @@ -47,6 +47,8 @@ template struct alignas(N) OpaqueType { char data[N]; }; Tensor& randperm_out_cuda(int64_t n, c10::optional generator, Tensor& result) { TORCH_CHECK(n >= 0, "n must be non-negative, got", n); + TORCH_CHECK(n <= std::numeric_limits::max(), + "randperm of tensors larger than INT_MAX is not supported yet in pytorch"); check_supported_max_int_with_precision(n, result); diff --git a/aten/src/ATen/native/cuda/UniqueCub.cu b/aten/src/ATen/native/cuda/UniqueCub.cu index eb31fd2f76bb8..1b9619b29812b 100644 --- a/aten/src/ATen/native/cuda/UniqueCub.cu +++ b/aten/src/ATen/native/cuda/UniqueCub.cu @@ -94,7 +94,13 @@ std::tuple compute_unique( Tensor length = at::empty({1}, options); int64_t num_out; if (!return_counts) { - cuda::cub::unique(data, data_out.data_ptr(), length.data_ptr(), num_inp); + CUB_WRAPPER( + cub::DeviceSelect::Unique, + data, + data_out.data_ptr(), + length.data_ptr(), + num_inp, + stream); num_out = length.item(); } else { counts.resize_(num_inp); @@ -129,6 +135,11 @@ std::tuple unique_cuda_template( auto options = self.options().dtype(kLong); int64_t num_inp = self.numel(); + TORCH_CHECK( + num_inp <= INT_MAX, + "num_inp ", + num_inp, + " is too big to be handled by cub"); Tensor sorted; Tensor self_c = self.contiguous(); if (consecutive) { From 6c3ebccc00dfca217094357a63e2be901ad7beb4 Mon Sep 17 00:00:00 2001 From: Charles David Hernandez Date: Thu, 19 Aug 2021 13:04:48 -0700 Subject: [PATCH 079/530] Updating the names of these functions (#63513) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63513 updating these names per Jerry's nits in the previous pr Test Plan: Imported from OSS Reviewed By: jerryzh168 Differential Revision: D30406710 fbshipit-source-id: a9f1577a2b8c4a93f5005e0f6278b7d7348d8b66 --- torch/quantization/fx/prepare.py | 4 ++-- torch/quantization/qconfig.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py index 23d1d40bb543b..29600b8797c52 100644 --- a/torch/quantization/fx/prepare.py +++ b/torch/quantization/fx/prepare.py @@ -15,7 +15,7 @@ ) from torch.fx.node import Argument -from ..qconfig import QConfigAny, qconfig_function_equality +from ..qconfig import QConfigAny, qconfig_equals from .qconfig_utils import ( convert_dict_to_ordered_dict, generate_qconfig_map, @@ -195,7 +195,7 @@ def update_qconfig_for_fusion( # Raise an error if the modules in the fused module have # different qconfigs specified in the qconfig_dict for op in ops: - if not qconfig_function_equality(object_type_dict.get(op, None), fused_qconfig): + if not qconfig_equals(object_type_dict.get(op, None), fused_qconfig): raise LookupError("During fusion, we need to specify the same " + f"qconfigs for both modules in {module_type}.") diff --git a/torch/quantization/qconfig.py b/torch/quantization/qconfig.py index 01d67ddcbd8b1..ae89b4a50b70a 100644 --- a/torch/quantization/qconfig.py +++ b/torch/quantization/qconfig.py @@ -211,9 +211,9 @@ def configure_constructor_to_put_obs_on_module_device(original_constructor): return QConfigDynamic(activation, weight) -def qconfig_function_equality(q1: QConfigAny, q2: QConfigAny): +def qconfig_equals(q1: QConfigAny, q2: QConfigAny): # functools.partial has no __eq__ operator defined so '==' defaults to 'is' - def compare_partial(p1, p2): + def partial_equals(p1, p2): same = p1.func == p2.func same = same and p1.args == p2.args return same and p1.keywords == p2.keywords @@ -223,6 +223,6 @@ def compare_partial(p1, p2): else: assert q1 is not None and q2 is not None try: - return compare_partial(q1.activation.p, q2.activation.p) and compare_partial(q1.weight.p, q2.weight.p) + return partial_equals(q1.activation.p, q2.activation.p) and partial_equals(q1.weight.p, q2.weight.p) except AttributeError: return q1 == q2 From 11a40ad915d4d3d8551588e303204810887fcf8d Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Thu, 19 Aug 2021 13:32:26 -0700 Subject: [PATCH 080/530] [Pytorch] Fix callstack pointer serialization bug (#63576) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63576 We serialize function name associated with InlinedCallStackPtr. This is derived via querying Function* stored in InlinedCallStack. However this is a raw pointer that is not gauranteed to be valid when we serialization happens. On the other hand we also store function name separately when constructing InlinedCallStack anyways. So this change just uniformly relies on function_name instead of Function* Test Plan: Internal build's asan failure + CI Reviewed By: larryliu0820 Differential Revision: D30427029 fbshipit-source-id: de9617482404785920ed2e67b72f38461590fba3 --- torch/csrc/jit/mobile/debug_info.cpp | 6 +----- .../callstack_debug_info_serialization.cpp | 12 ++++-------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/torch/csrc/jit/mobile/debug_info.cpp b/torch/csrc/jit/mobile/debug_info.cpp index 9c734f40a25a2..41ce3c6d46d52 100644 --- a/torch/csrc/jit/mobile/debug_info.cpp +++ b/torch/csrc/jit/mobile/debug_info.cpp @@ -49,11 +49,7 @@ std::pair, std::string> getStackTraceWithModuleHierarchy // Now add source range info to stack entries.emplace_back( StackEntry{prev_function_name, callstack_ptr->source_range()}); - if (callstack_ptr->function()) { - prev_function_name = callstack_ptr->function()->name(); - } else { - prev_function_name = callstack_ptr->function_name(); - } + prev_function_name = callstack_ptr->function_name(); // Function name appended here // It is renamed to prev_function_name because for StackEntry // it will be appended in the next iteration. This is the format diff --git a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp index c26c7e575c547..93da38ad768c5 100644 --- a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp +++ b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp @@ -47,15 +47,11 @@ c10::IValue InlinedCallStackSerializer::serialize( } else { elements.emplace_back(c10::IValue()); } - if (cs_ptr->function()) { - elements.emplace_back(cs_ptr->function()->name()); + auto fn_name = cs_ptr->function_name(); + if (!fn_name.empty()) { + elements.emplace_back(fn_name); } else { - auto fn_name = cs_ptr->function_name(); - if (!fn_name.empty()) { - elements.emplace_back(fn_name); - } else { - elements.emplace_back("FunctionName_UNKNOWN"); - } + elements.emplace_back("FunctionName_UNKNOWN"); } c10::IValue serialized_cs = c10::ivalue::Tuple::create(elements); serialized_inlined_callstack_[cs_ptr] = serialized_cs; From e5ab0d1013072c26586b369536bccac648843958 Mon Sep 17 00:00:00 2001 From: "Adam J. Stewart" Date: Thu, 19 Aug 2021 14:54:26 -0700 Subject: [PATCH 081/530] DataLoader: allow non-integer Samplers (#63500) Summary: Not entirely sure how to use TypeVar but if someone could give me a hint it would be appreciated. Also let me know if you want me to add tests so we can make sure non-integer samplers actually work. It seems like `test/test_dataloader.py` is the correct location but that's a big file. Fixes https://github.com/pytorch/pytorch/issues/63483 ejguan Pull Request resolved: https://github.com/pytorch/pytorch/pull/63500 Reviewed By: mruberry Differential Revision: D30403689 Pulled By: ejguan fbshipit-source-id: 464e09e5aad3215b94a29cc5e21cb4b10ec136e3 --- torch/utils/data/dataloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index c85296f8f807f..0f46ad283ea5a 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -160,8 +160,8 @@ class DataLoader(Generic[T_co]): __initialized = False def __init__(self, dataset: Dataset[T_co], batch_size: Optional[int] = 1, - shuffle: bool = False, sampler: Optional[Sampler[int]] = None, - batch_sampler: Optional[Sampler[Sequence[int]]] = None, + shuffle: bool = False, sampler: Optional[Sampler] = None, + batch_sampler: Optional[Sampler[Sequence]] = None, num_workers: int = 0, collate_fn: Optional[_collate_fn_t] = None, pin_memory: bool = False, drop_last: bool = False, timeout: float = 0, worker_init_fn: Optional[_worker_init_fn_t] = None, From ccca66597ab7079c39b744c2906171aa63e7db61 Mon Sep 17 00:00:00 2001 From: Sergei Vorobev Date: Thu, 19 Aug 2021 14:57:00 -0700 Subject: [PATCH 082/530] Replace hardcoded values in IndexKernel.cu (#63372) Summary: This is a small change that helps to maintain Cruise pytorch fork, since we use a different hardcoded value. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63372 Reviewed By: mruberry Differential Revision: D30396171 Pulled By: ejguan fbshipit-source-id: cc0023f58b5922d3d98c7283495e6dc8d35049b6 --- aten/src/ATen/native/cuda/IndexKernel.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index 5a28e79136e81..1aabb0b9a5041 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -69,9 +69,9 @@ void gpu_index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef return; } - auto sizes = at::detail::Array(0); - auto strides = at::detail::Array(0); - auto index_ptrs = at::detail::Array(nullptr); + auto sizes = at::detail::Array(0); + auto strides = at::detail::Array(0); + auto index_ptrs = at::detail::Array(nullptr); for (int i = 0; i < num_indices; i++) { sizes[i] = index_size[i]; strides[i] = index_stride[i]; From 71ab48ed3b99fa43b715cb2efa3e4b8da40e92cf Mon Sep 17 00:00:00 2001 From: Zeina Migeed Date: Thu, 19 Aug 2021 15:22:52 -0700 Subject: [PATCH 083/530] acc type inference (#63119) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63119 Test Plan: buck run mode/opt-clang caffe2/torch/fb/model_transform/experimental:fx_ir_lower_inline_cvr -- \ --action=lower_and_run \ --filename=inline_cvr_7x_dec_2020.model \ --print_glow_glog=True Reviewed By: jamesr66a, jfix71, ansley Differential Revision: D30235895 fbshipit-source-id: dab7f96e1799b99eeae0ee519cf0ddd636fddf2e --- .../experimental/graph_gradual_typechecker.py | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py index e3c1ce82d7a46..5ce53a7ff6896 100644 --- a/torch/fx/experimental/graph_gradual_typechecker.py +++ b/torch/fx/experimental/graph_gradual_typechecker.py @@ -63,8 +63,6 @@ def broadcast_types(t1, t2): (t1, t2) = TensorType(tuple(new_t1)), TensorType(tuple(new_t2)) - if not is_consistent(t1, t2): - raise TypeError return (t1, t2) else: @@ -521,7 +519,7 @@ def type_check_node(self, n: Node): return n.type elif n.op == 'get_attr': - t = self.traced.get_parameter(n.target) + t = get_parameter(self.traced, n.target) # type: ignore[arg-type] if isinstance(t.data, torch.Tensor): n.type = TensorType(t.data.shape) return n.type @@ -705,3 +703,37 @@ def refine_node(self, n: Node): else: pass + + +def get_parameter(traced, target: str): + """ + Returns the parameter given by ``target`` if it exists, + otherwise throws an error. + + See the docstring for ``get_submodule`` for a more detailed + explanation of this method's functionality as well as how to + correctly specify ``target``. + + Args: + target: The fully-qualified string name of the Parameter + to look for. (See ``get_submodule`` for how to specify a + fully-qualified string.) + + Returns: + torch.nn.Parameter: The Parameter referenced by ``target`` + + Raises: + AttributeError: If the target string references an invalid + path or resolves to something that is not an + ``nn.Parameter`` + """ + module_path, _, param_name = target.rpartition(".") + + mod: torch.nn.Module = traced.get_submodule(module_path) + + if not hasattr(mod, param_name): + raise AttributeError(mod._get_name() + " has no attribute `" + param_name + "`") + + param: torch.nn.Parameter = getattr(mod, param_name) + + return param From d0d27f697101e45cf3c8c8b3f762af4bb1396626 Mon Sep 17 00:00:00 2001 From: zhouzhuojie Date: Thu, 19 Aug 2021 15:37:10 -0700 Subject: [PATCH 084/530] Add concurrency group for more workflows (#63606) Summary: Fixes unnecessary duplicated workflows runs ![image](https://user-images.githubusercontent.com/658840/130146332-ecf54e49-3538-49c1-88de-b099f1c1e41f.png) Pull Request resolved: https://github.com/pytorch/pytorch/pull/63606 Reviewed By: malfet, mruberry Differential Revision: D30436889 Pulled By: zhouzhuojie fbshipit-source-id: aafbad1edc45e3ab9bceb00e8f3b4204f18e43d0 --- .github/workflows/add_annotations.yml | 6 ++++++ .github/workflows/auto_label.yml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/.github/workflows/add_annotations.yml b/.github/workflows/add_annotations.yml index 40c2677aaf80d..9bb3c1b46e7b4 100644 --- a/.github/workflows/add_annotations.yml +++ b/.github/workflows/add_annotations.yml @@ -7,6 +7,12 @@ on: workflows: - Lint + +concurrency: + group: add-annotations-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + + jobs: annotate: if: ${{ github.repository_owner == 'pytorch' }} diff --git a/.github/workflows/auto_label.yml b/.github/workflows/auto_label.yml index 24fc02eff1439..1616ea9c90b8a 100644 --- a/.github/workflows/auto_label.yml +++ b/.github/workflows/auto_label.yml @@ -6,6 +6,12 @@ on: pull_request_target: types: [edited, opened, synchronize, reopened] + +concurrency: + group: auto-label-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + + jobs: auto-label-rocm: if: ${{ github.repository_owner == 'pytorch' }} From f2bf0f229fdd0713064bc0fbd6dbc2063c71e2d4 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 19 Aug 2021 16:46:31 -0700 Subject: [PATCH 085/530] Revert D30359218: [pytorch][PR] [doc] pre-commit fix instructions Test Plan: revert-hammer Differential Revision: D30359218 (https://github.com/pytorch/pytorch/commit/4e1d84ae8fae49995c8966ccbe0f34360978492f) Original commit changeset: 61771babeac4 fbshipit-source-id: c2ac0a4a7463fafa03ad0b20bfb0701a8c1476c4 --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2d820a360dd7c..7d8659a8babff 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1151,7 +1151,7 @@ formatting and semantic checking of code. We provide a pre-commit git hook for performing these checks, before a commit is created: ```bash - ln -s tools/git-pre-commit .git/hooks/pre-commit + ln -s ../../tools/git-pre-commit .git/hooks/pre-commit ``` You'll need to install an appropriately configured flake8; see From 0b6cc8daf22f574d57d01156a879a33fa244306f Mon Sep 17 00:00:00 2001 From: Pavithran Ramachandran Date: Thu, 19 Aug 2021 18:39:50 -0700 Subject: [PATCH 086/530] [PyTorch][Edge] Support backtrace symbolication for Android builds (#63339) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63339 # Context https://fb.workplace.com/groups/pytorch.dev/permalink/900474523864362/?comment_id=901125403799274&reply_comment_id=905023386742809 ##### WHAT IS A STACK TRACE? A stack trace (also called stack backtrace or stack traceback) is a report of the active stack frames at a certain point in time during the execution of a program. Typically when an exception is thrown, one would expect to see the code (file:line) that threw the exception, and every intermediate frame up to and including the main function. We are enabling android stack trace to help debugging on android devices. Test Plan: ## Steps to test ``` buck build fbsource//xplat/caffe2/mode/aibench_pytorch_android -c pt.enable_qpl=0 -c pt.has_backtraces=1 fbsource//xplat/caffe2/fb/lite_predictor:lite_predictorAndroid#android-x86_64 one_world android emulator android-28 adb push ~/fbsource/buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictorAndroid#android-x86_64 /data/local/tmp cd /data/local/tmp ./lite_predictorAndroid#android-x86_64 ./lite_predictorAndroid#android-x86_64 --model ./detect.bc --input_dims "1,3,192,192" --input_type float --warmup 20 --iter 5 --report_pep true ``` ## See how model file is not found stack traces is: ### before ``` ./lite_predictorAndroid#android-x86_64 --model ./detect.bc --input_dims "1,3,192,192" --input_type float --warmup 20 --iter 5 --report_pep true Run with 2 threads Run with 2 threads Loading model... terminating with uncaught exception of type c10::Error: open file failed, file path: ./detect.bc Exception raised from RAIIFile at xplat/caffe2/caffe2/serialize/file_adapter.cc:13 (most recent call first): (no backtrace available) Aborted ``` ### after ``` 134|generic_x86_64:/data/local/tmp $ ./lite_predictorAndroid#android-x86_64 --model ./detect.bc --input_dims "1,3,192,192" --input_type float --warmup 20 --iter 5 --report_pep true Run with 2 threads Run with 2 threads Loading model... terminating with uncaught exception of type c10::Error: open file failed, file path: ./detect.bc Exception raised from RAIIFile at xplat/caffe2/caffe2/serialize/file_adapter.cc:13 (most recent call first): frame #0 c10::get_backtrace(unsigned long, unsigned long, bool)[0x59494274f10e] frame #1 [0x5949427b1eee] frame #2 [0x5949427b1eb2] frame #3 [0x5949427b1cdc] frame #4 std::__ndk1::function, std::__ndk1::allocator > ()>::operator()() const[0x5949427afc34] frame #5 c10::Error::Error(c10::SourceLocation, std::__ndk1::basic_string, std::__ndk1::allocator >)[0x5949427b05b1] frame #6 c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__ndk1::basic_string, std::__ndk1::allocator > const&)[0x5949427aca5f] frame #7 caffe2::serialize::FileAdapter::RAIIFile::RAIIFile(std::__ndk1::basic_string, std::__ndk1::allocator > const&)[0x5949426b37b2] frame #8 caffe2::serialize::FileAdapter::FileAdapter(std::__ndk1::basic_string, std::__ndk1::allocator > const&)[0x5949426b3903] frame #9 torch::jit::_load_for_mobile(std::__ndk1::basic_string, std::__ndk1::allocator > const&, c10::optional, std::__ndk1::unordered_map, std::__ndk1::allocator >, std::__ndk1::basic_string, std::__ndk1::allocator >, std::__ndk1::hash, std::__ndk1::allocator > >, std::__ndk1::equal_to, std::__ndk1::allocator > >, std::__ndk1::allocator, std::__ndk1::allocator > const, std::__ndk1::basic_string, std::__ndk1::allocator > > > >&)[0x5949422737bd] frame #10 torch::jit::_load_for_mobile(std::__ndk1::basic_string, std::__ndk1::allocator > const&, c10::optional)[0x594942273769] frame #11 benchmark(std::__ndk1::basic_string, std::__ndk1::allocator > const&, int, std::__ndk1::basic_string, std::__ndk1::allocator > const&, std::__ndk1::basic_string, std::__ndk1::allocator > const&, std::__ndk1::basic_string, std::__ndk1::allocator > const&, bool, int, int, int, bool, int, bool, int, double, bool, bool, bool, std::__ndk1::basic_string, std::__ndk1::allocator > const&)[0x59494189b21d] frame #12 main[0x594941882aff] frame #13 __libc_init[0x7b699d08578d] ``` ### what we get for os:linux ``` (base) [pavithran@devvm1803.vll0 /data/users/pavithran/fbsource] ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor --model ./detect.bc --input_dims "1,3,192,192" --input_type float --warmup 20 --iter 5 --report_pep true Run with 24 threads Run with 24 threads Loading model... terminate called after throwing an instance of 'c10::Error' what(): open file failed, file path: ./detect.bc Exception raised from RAIIFile at xplat/caffe2/caffe2/serialize/file_adapter.cc:13 (most recent call first): frame #0: ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor() [0x20cb7fe] frame #1: ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor() [0x20cb6c6] frame #2: std::function, std::allocator > ()>::operator()() const + 0x54 (0x20ca4e4 in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor) frame #3: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x57 (0x20ca9a7 in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor) frame #4: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string, std::allocator > const&) + 0x7a (0x20c823a in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor) frame #5: caffe2::serialize::FileAdapter::RAIIFile::RAIIFile(std::__cxx11::basic_string, std::allocator > const&) + 0x96 (0x206f3d6 in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor) frame #6: caffe2::serialize::FileAdapter::FileAdapter(std::__cxx11::basic_string, std::allocator > const&) + 0x42 (0x206f502 in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor) frame #7: torch::jit::_load_for_mobile(std::__cxx11::basic_string, std::allocator > const&, c10::optional, std::unordered_map, std::allocator >, std::__cxx11::basic_string, std::allocator >, std::hash, std::allocator > >, std::equal_to, std::allocator > >, std::allocator, std::allocator > const, std::__cxx11::basic_string, std::allocator > > > >&) + 0x30 (0x1be826c in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor) frame #8: torch::jit::_load_for_mobile(std::__cxx11::basic_string, std::allocator > const&, c10::optional) + 0x35 (0x1be8214 in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor) frame #9: benchmark(std::__cxx11::basic_string, std::allocator > const&, int, std::__cxx11::basic_string, std::allocator > const&, std::__cxx11::basic_string, std::allocator > const&, std::__cxx11::basic_string, std::allocator > const&, bool, int, int, int, bool, int, bool, int, double, bool, bool, bool, std::__cxx11::basic_string, std::allocator > const&) + 0x16d (0x12093ad in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor) frame #10: main + 0x25c (0x11f933c in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor) frame #11: __libc_start_main + 0x105 (0x7fc7b9f2ed95 in /usr/local/fbcode/platform009/lib/libc.so.6) frame #12: _start + 0x2a (0x11f902a in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor) Aborted (core dumped) ```` Reviewed By: dhruvbird Differential Revision: D30135947 fbshipit-source-id: f50c634ef4545843305cad4b4a14a8776b1aec76 --- c10/util/Backtrace.cpp | 69 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/c10/util/Backtrace.cpp b/c10/util/Backtrace.cpp index d978f32cd00e0..2c5e2e4cdca16 100644 --- a/c10/util/Backtrace.cpp +++ b/c10/util/Backtrace.cpp @@ -16,8 +16,13 @@ #if SUPPORTS_BACKTRACE #include +#ifdef C10_ANDROID +#include +#include +#else #include #endif +#endif #ifdef FBCODE_CAFFE2 #include @@ -25,6 +30,59 @@ namespace c10 { +#if SUPPORTS_BACKTRACE && defined(C10_ANDROID) + +struct AndroidBacktraceState { + std::vector buffer; +}; + +_Unwind_Reason_Code android_unwind_callback( + struct _Unwind_Context* context, + void* arg) { + AndroidBacktraceState* state = (AndroidBacktraceState*)arg; + uintptr_t pc = _Unwind_GetIP(context); + if (pc) { + state->buffer.emplace_back(reinterpret_cast(pc)); + } + return _URC_NO_REASON; +} + +void dump_stack( + std::ostream& os, + size_t frames_to_skip, + size_t maximum_number_of_frames) { + AndroidBacktraceState state; + + _Unwind_Backtrace(android_unwind_callback, &state); + + int idx = 0; + char* demangled = nullptr; + size_t length = 0; + + for (const void* addr : state.buffer) { + const char* symbol = ""; + + Dl_info info; + if (dladdr(addr, &info) && info.dli_sname) { + symbol = info.dli_sname; + } + + int status = 0; + demangled = __cxxabiv1::__cxa_demangle( + /*mangled_name*/ symbol, + /*output_buffer*/ demangled, + /*length*/ &length, + /*status*/ &status); + + os << " frame #" << idx++ << "\t" + << ((demangled != NULL && status == 0) ? demangled : symbol) << "[" + << addr << "]\t" << std::endl; + } + free(demangled); +} + +#endif /* SUPPORTS_BACKTRACE && defined(C10_ANDROID) */ + #if SUPPORTS_BACKTRACE namespace { @@ -42,6 +100,7 @@ struct FrameInformation { std::string object_file; }; +#ifndef C10_ANDROID bool is_python_frame(const FrameInformation& frame) { return frame.object_file == "python" || frame.object_file == "python3" || (frame.object_file.find("libpython") != std::string::npos); @@ -113,6 +172,7 @@ c10::optional parse_frame_information( frame.function_name = demangle(mangled_function_name.c_str()); return frame; } +#endif /* !defined(C10_ANDROID) */ } // anonymous namespace #elif defined(_MSC_VER) namespace { @@ -178,7 +238,7 @@ std::string get_backtrace( facebook::process::StackTrace st; return st.toString(); -#elif SUPPORTS_BACKTRACE +#elif SUPPORTS_BACKTRACE && !defined(C10_ANDROID) // We always skip this frame (backtrace). frames_to_skip += 1; @@ -249,6 +309,13 @@ std::string get_backtrace( } return stream.str(); + +#elif SUPPORTS_BACKTRACE && defined(C10_ANDROID) + + std::ostringstream oss; + dump_stack(oss, frames_to_skip, maximum_number_of_frames); + return oss.str().c_str(); + #elif defined(_MSC_VER) // !SUPPORTS_BACKTRACE // This backtrace retrieval is implemented on Windows via the Windows // API using `CaptureStackBackTrace`, `SymFromAddr` and From b99a299c6002354acef5c43eba5cd25b41e773c7 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 19 Aug 2021 18:52:33 -0700 Subject: [PATCH 087/530] [PyTorch] Remove unused dump() methods in vec headers (#63533) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63533 These methods don't seem to be used, and they use std::cout, which incurs a small code size overhead on platforms using libstdc++ due to std::__ioinit (see #61500). Seems like we can just delete them? ghstack-source-id: 136163409 Test Plan: CI Reviwers: #sentinel, dhruvbird Reviewed By: dskhudia Differential Revision: D30412269 fbshipit-source-id: 380b9aa2f9aabc4107188b6b209d2afc1769c0ee --- aten/src/ATen/cpu/vec/vec256/vec256_int.h | 6 ----- aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 25 ------------------- .../cpu/vec/vec256/vsx/vec256_qint32_vsx.h | 12 --------- .../cpu/vec/vec256/vsx/vec256_qint8_vsx.h | 9 ------- aten/src/ATen/cpu/vec/vec512/vec512_int.h | 6 ----- aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 25 ------------------- 6 files changed, 83 deletions(-) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index 86cf42556d192..ab8e1d0252fd4 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -237,12 +237,6 @@ class Vectorized : public Vectorizedi { std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); } } - void dump() const { - for (size_t i = 0; i < size(); ++i) { - std::cout << (int)((value_type*)&values)[i] << " "; - } - std::cout << std::endl; - } const int32_t& operator[](int idx) const = delete; int32_t& operator[](int idx) = delete; Vectorized abs() const { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index dc5e833127327..b247d46fff9bb 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -309,12 +309,6 @@ struct Vectorized : public Vectorizedqi { return _mm256_add_epi32(rounded, zero_point_v); } - void dump() const { - for (size_t i = 0; i < 8; ++i) { - std::cout << ((int32_t*)&vals)[i] << " "; - } - std::cout << std::endl; - } private: // Load from memory constructor Vectorized(const void* ptr) { @@ -537,12 +531,6 @@ struct Vectorized : public Vectorizedqi { return RequantizeAvx2(inp, multiplier_v, zero_point_v); } - void dump() const { - for (size_t i = 0; i < size(); ++i) { - std::cout << (int)((value_type*)&vals)[i] << " "; - } - std::cout << std::endl; - } private: // Load from memory constructor Vectorized(const void* ptr) { @@ -702,12 +690,6 @@ struct Vectorized : public Vectorizedqi { return RequantizeAvx2(inp, multiplier_v, zero_point_v); } - void dump() const { - for (size_t i = 0; i < size(); ++i) { - std::cout << (int)((value_type*)&vals)[i] << " "; - } - std::cout << std::endl; - } private: // Load from memory constructor @@ -792,13 +774,6 @@ struct VectorizedQuantizedConverter { return rv; } - void dump() const { - for (int i = 0; i < size(); ++i) { - std::cout << vals[i] << " "; - } - std::cout << std::endl; - } - protected: VectorizedQuantizedConverter() {} }; diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h index ed457b9adefc8..5b1622e825cb0 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h @@ -196,18 +196,6 @@ struct Vectorized { return {veci0, veci1}; } - void dump() const { - std::cout << _vec0[0] << " "; - std::cout << _vec0[1] << " "; - std::cout << _vec0[2] << " "; - std::cout << _vec0[3] << " "; - std::cout << _vec1[0] << " "; - std::cout << _vec1[1] << " "; - std::cout << _vec1[2] << " "; - std::cout << _vec1[3] << " "; - std::cout << std::endl; - } - DEFINE_MEMBER_OP(operator==, c10::qint32, vec_cmpeq) DEFINE_MEMBER_OP(operator!=, c10::qint32, vec_cmpne) DEFINE_MEMBER_OP(operator<, c10::qint32, vec_cmplt) diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h index f2a8446cd0ed9..82b2530b7ef3f 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h @@ -361,15 +361,6 @@ struct Vectorized { return {vec0, vec1}; } - void dump() const { - value_type vals[size()]; - store((void*)vals); - for (int i = 0; i < size(); ++i) { - std::cout << (int)(vals[i]) << " "; - } - std::cout << std::endl; - } - DEFINE_MEMBER_OP(operator==, c10::qint8, vec_cmpeq) DEFINE_MEMBER_OP(operator!=, c10::qint8, vec_cmpne) DEFINE_MEMBER_OP(operator<, c10::qint8, vec_cmplt) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h index cc866c065bfba..f28c14ed3f73f 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h @@ -270,12 +270,6 @@ class Vectorized : public Vectorizedi { std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); } } - void dump() const { - for (size_t i = 0; i < size(); ++i) { - std::cout << (int)((value_type*)&values)[i] << " "; - } - std::cout << std::endl; - } const int32_t& operator[](int idx) const = delete; int32_t& operator[](int idx) = delete; Vectorized abs() const { diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h index 5b5ac195f3caa..3a1eda8874f1a 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h @@ -321,12 +321,6 @@ struct Vectorized : public Vectorizedqi { return _mm512_add_epi32(rounded, zero_point_v); } - void dump() const { - for (size_t i = 0; i < 16; ++i) { - std::cout << ((int32_t*)&vals)[i] << " "; - } - std::cout << std::endl; - } private: // Load from memory constructor Vectorized(const void* ptr) { @@ -549,12 +543,6 @@ struct Vectorized : public Vectorizedqi { return RequantizeAvx512(inp, multiplier_v, zero_point_v); } - void dump() const { - for (size_t i = 0; i < size(); ++i) { - std::cout << (int)((value_type*)&vals)[i] << " "; - } - std::cout << std::endl; - } private: // Load from memory constructor Vectorized(const void* ptr) { @@ -714,12 +702,6 @@ struct Vectorized : public Vectorizedqi { return RequantizeAvx512(inp, multiplier_v, zero_point_v); } - void dump() const { - for (size_t i = 0; i < size(); ++i) { - std::cout << (int)((value_type*)&vals)[i] << " "; - } - std::cout << std::endl; - } private: // Load from memory constructor @@ -806,13 +788,6 @@ struct VectorizedQuantizedConverter { return rv; } - void dump() const { - for (int i = 0; i < size(); ++i) { - std::cout << vals[i] << " "; - } - std::cout << std::endl; - } - protected: VectorizedQuantizedConverter() {} }; From 0a66d5b3253fd2d2304f3897526db3c8fb139376 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 19 Aug 2021 18:52:33 -0700 Subject: [PATCH 088/530] [PyTorch] Remove unnecessary iostream includes in headers (#61500) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61500 libstdc++ defines a static variable called `std::__ioinit` in iostream that adds global constructor size overhead to each translation that includes iostream. To reduce the size overhead from that, we can often include ostream instead. ghstack-source-id: 136163529 Test Plan: buildsizebot some mobile apps Reviewed By: dhruvbird Differential Revision: D29648016 fbshipit-source-id: 9c3139712c71248513cc5032d21e77f3ecbae8fe --- aten/src/ATen/core/Formatting.cpp | 3 +++ aten/src/ATen/core/Formatting.h | 6 ++---- aten/src/ATen/core/Vitals.cpp | 1 + aten/src/ATen/core/Vitals.h | 2 +- aten/src/ATen/core/function_schema.cpp | 2 ++ aten/src/ATen/core/interned_strings_class.h | 2 -- aten/src/ATen/core/ivalue.cpp | 1 + aten/src/ATen/core/jit_type.h | 5 +++-- aten/src/ATen/cpu/vec/vec256/vec256.h | 2 +- aten/src/ATen/cpu/vec/vec256/vec256_int.h | 1 + aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 1 + aten/src/ATen/cpu/vml.h | 1 - aten/src/ATen/cudnn/Descriptors.cpp | 2 +- aten/src/ATen/miopen/Descriptors.cpp | 2 ++ .../cpu/qnnpack/include/pack_block_sparse.h | 21 +------------------ .../cpu/qnnpack/src/pack_block_sparse.cc | 21 +++++++++++++++++++ c10/core/DispatchKey.h | 2 +- c10/core/Layout.h | 2 +- c10/core/MemoryFormat.h | 2 +- c10/core/ScalarType.h | 2 +- c10/util/Bitset.h | 1 - c10/util/complex.h | 1 - c10/util/either.h | 1 - c10/util/typeid.h | 1 - caffe2/core/init.cc | 1 + caffe2/core/operator.cc | 12 +++++++++++ caffe2/core/operator.h | 11 +--------- caffe2/core/operator_schema.cc | 18 ++++++++++++++++ caffe2/core/operator_schema.h | 16 +------------- caffe2/operators/utility_ops.cc | 1 + 30 files changed, 79 insertions(+), 65 deletions(-) diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index baf1691bd1d53..dbbed6e3b0785 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -232,6 +232,9 @@ void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) } } +void print(const Tensor & t, int64_t linesize) { + print(std::cout,t,linesize); +} std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesize) { FormatGuard guard(stream); if(!tensor_.defined()) { diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h index 86ea603951613..55cfe7b3bdf7e 100644 --- a/aten/src/ATen/core/Formatting.h +++ b/aten/src/ATen/core/Formatting.h @@ -2,7 +2,7 @@ #include #include -#include +#include namespace c10 { @@ -18,9 +18,7 @@ TORCH_API std::ostream& print( static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { return print(out,t,80); } -static inline void print(const Tensor & t, int64_t linesize=80) { - print(std::cout,t,linesize); -} +TORCH_API void print(const Tensor & t, int64_t linesize=80); static inline std::ostream& operator<<(std::ostream & out, Scalar s) { if (s.isFloatingPoint()) { diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp index edff5211ea0f0..76fc652f9407e 100644 --- a/aten/src/ATen/core/Vitals.cpp +++ b/aten/src/ATen/core/Vitals.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace at { namespace vitals { diff --git a/aten/src/ATen/core/Vitals.h b/aten/src/ATen/core/Vitals.h index c64cf7e629210..48913c54185f3 100644 --- a/aten/src/ATen/core/Vitals.h +++ b/aten/src/ATen/core/Vitals.h @@ -1,8 +1,8 @@ #pragma once #include -#include #include #include +#include #include #include diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index cc6de61dccead..a4319f03132cc 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -1,5 +1,7 @@ #include +#include + namespace c10 { void FunctionSchema::dump() const { diff --git a/aten/src/ATen/core/interned_strings_class.h b/aten/src/ATen/core/interned_strings_class.h index 54303e0384d28..8bbf3294844a5 100644 --- a/aten/src/ATen/core/interned_strings_class.h +++ b/aten/src/ATen/core/interned_strings_class.h @@ -1,8 +1,6 @@ #include #include -#include #include -#include #include #include #include diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 6fab54ff9dd82..1404e01fa2434 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace c10 { bool _fastEqualsForContainer(const IValue& lhs, const IValue& rhs) { diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index d733fbd2da5b1..eee5acaccd655 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -9,10 +9,11 @@ #include #include -#include +#include #include +#include +#include #include -#include struct ClassType; namespace torch { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h index 0d13458bc4c1c..906d8a8653661 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256.h @@ -23,7 +23,7 @@ #include #include #include -#include +#include namespace at { namespace vec { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index ab8e1d0252fd4..5ee9919abca02 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace at { namespace vec { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index b247d46fff9bb..8cde485c90d7d 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -11,6 +11,7 @@ #include #include +#include // This file defines Vectorized<> for the quantized types. // diff --git a/aten/src/ATen/cpu/vml.h b/aten/src/ATen/cpu/vml.h index b9cc47f3fe73b..dbdef0b459928 100644 --- a/aten/src/ATen/cpu/vml.h +++ b/aten/src/ATen/cpu/vml.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #if AT_MKL_ENABLED() && !defined(__APPLE__) diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index 873431c1d96e5..f52280e9d2401 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -2,7 +2,7 @@ #include -#include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp index 6a6476706ac6f..38875191b448b 100644 --- a/aten/src/ATen/miopen/Descriptors.cpp +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -1,6 +1,8 @@ #include #include +#include + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/include/pack_block_sparse.h b/aten/src/ATen/native/quantized/cpu/qnnpack/include/pack_block_sparse.h index 0f329296bc18b..62fdef2cdf9b2 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/include/pack_block_sparse.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/include/pack_block_sparse.h @@ -8,7 +8,6 @@ #pragma once #include -#include #include #include #include @@ -33,25 +32,7 @@ typedef struct BCSRMatrix { #endif uint32_t col_block_size; // input features block size uint32_t row_block_size; // output features block size - void print() { - std::cout << "row block size:" << row_block_size << std::endl; - std::cout << "col block size:" << col_block_size << std::endl; - std::cout << "row ptr\n"; - for (const auto& t : row_values) { - std::cout << t << ", "; - } - std::cout << std::endl; - std::cout << "col indices\n"; - for (const auto& t : col_indices) { - std::cout << t << ", "; - } - std::cout << std::endl; - std::cout << "Actual values\n"; - for (const auto& t : values) { - std::cout << (uint32_t)t << ", "; - } - std::cout << std::endl; - } + void print() const; } BCSRMatrix; std::unique_ptr generateBlockCSRMatrix( diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/pack_block_sparse.cc b/aten/src/ATen/native/quantized/cpu/qnnpack/src/pack_block_sparse.cc index ca694df3aba45..6a6134023bfc8 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/pack_block_sparse.cc +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/pack_block_sparse.cc @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ #include +#include #include @@ -78,4 +79,24 @@ std::unique_ptr generateBlockCSRMatrix( bcsr_mat.col_block_size = col_block_size; return bcsr_mat_ptr; } + +void BCSRMatrix::print() const { + std::cout << "row block size:" << row_block_size << std::endl; + std::cout << "col block size:" << col_block_size << std::endl; + std::cout << "row ptr\n"; + for (const auto& t : row_values) { + std::cout << t << ", "; + } + std::cout << std::endl; + std::cout << "col indices\n"; + for (const auto& t : col_indices) { + std::cout << t << ", "; + } + std::cout << std::endl; + std::cout << "Actual values\n"; + for (const auto& t : values) { + std::cout << (uint32_t)t << ", "; + } + std::cout << std::endl; +} } // namsepace qnnpack diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h index 9f21838ddb4a3..5b20a1ca327df 100644 --- a/c10/core/DispatchKey.h +++ b/c10/core/DispatchKey.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include diff --git a/c10/core/Layout.h b/c10/core/Layout.h index 44168ebca4360..f37ceb18a835d 100644 --- a/c10/core/Layout.h +++ b/c10/core/Layout.h @@ -3,7 +3,7 @@ #include #include -#include +#include namespace c10 { enum class Layout : int8_t { Strided, Sparse, SparseCsr, Mkldnn, NumOptions }; diff --git a/c10/core/MemoryFormat.h b/c10/core/MemoryFormat.h index ba4e056e1e6c8..8cafde1b5c5e7 100644 --- a/c10/core/MemoryFormat.h +++ b/c10/core/MemoryFormat.h @@ -4,7 +4,7 @@ #include #include -#include +#include // Memory format is not the property of a Tensor. It is the way to tell an // operator how the result should be organized in memory and nothing more. That diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index d652db5a215c6..f7b07100365fa 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -12,7 +12,7 @@ #include #include -#include +#include namespace c10 { diff --git a/c10/util/Bitset.h b/c10/util/Bitset.h index 6f7c4b9a1d78b..bed04a438abea 100644 --- a/c10/util/Bitset.h +++ b/c10/util/Bitset.h @@ -3,7 +3,6 @@ #include #include #include -#include #if defined(_MSC_VER) #include #endif diff --git a/c10/util/complex.h b/c10/util/complex.h index 2a565f8f2bf8f..67ed463febd94 100644 --- a/c10/util/complex.h +++ b/c10/util/complex.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include diff --git a/c10/util/either.h b/c10/util/either.h index da765b9a9bb17..757663f5896fb 100644 --- a/c10/util/either.h +++ b/c10/util/either.h @@ -6,7 +6,6 @@ #include #include #include -#include namespace c10 { /** diff --git a/c10/util/typeid.h b/c10/util/typeid.h index e6a5822a3e7ce..240c69e92400e 100644 --- a/c10/util/typeid.h +++ b/c10/util/typeid.h @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include diff --git a/caffe2/core/init.cc b/caffe2/core/init.cc index 529665869b3e1..bafbc825f8b79 100644 --- a/caffe2/core/init.cc +++ b/caffe2/core/init.cc @@ -3,6 +3,7 @@ #include "caffe2/core/scope_guard.h" #include +#include #include C10_DEFINE_bool( diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 846ab8ab55b46..ca66f7846c300 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -1,6 +1,7 @@ #include "caffe2/core/operator.h" #include +#include #include "caffe2/core/init.h" #include "caffe2/core/logging.h" @@ -355,6 +356,17 @@ void SetOpEnginePref( } } +DeviceTypeRegisterer::DeviceTypeRegisterer(DeviceType type, RegistryFunction func) { + if (gDeviceTypeRegistry()->count(type)) { + std::cerr << "Device type " << DeviceTypeName(type) + << "registered twice. This should not happen. Did you have " + "duplicated numbers assigned to different devices?"; + std::exit(1); + } + // Calling the registry function to get the actual registry pointer. + gDeviceTypeRegistry()->emplace(type, func()); +} + unique_ptr CreateOperator( const OperatorDef& operator_def, Workspace* ws, diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index fc9a6769c4e65..b840254612929 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -1330,16 +1330,7 @@ typedef c10::Registry< TORCH_API std::map* gDeviceTypeRegistry(); struct TORCH_API DeviceTypeRegisterer { - explicit DeviceTypeRegisterer(DeviceType type, RegistryFunction func) { - if (gDeviceTypeRegistry()->count(type)) { - std::cerr << "Device type " << DeviceTypeName(type) - << "registered twice. This should not happen. Did you have " - "duplicated numbers assigned to different devices?"; - std::exit(1); - } - // Calling the registry function to get the actual registry pointer. - gDeviceTypeRegistry()->emplace(type, func()); - } + explicit DeviceTypeRegisterer(DeviceType type, RegistryFunction func); }; #if defined(_MSC_VER) diff --git a/caffe2/core/operator_schema.cc b/caffe2/core/operator_schema.cc index fbfb8f404d359..29d0b3e78d9a4 100644 --- a/caffe2/core/operator_schema.cc +++ b/caffe2/core/operator_schema.cc @@ -1,6 +1,8 @@ #include "caffe2/core/operator_schema.h" #include "caffe2/core/logging.h" +#include + #include namespace caffe2 { @@ -520,6 +522,22 @@ C10_EXPORT std::ostream& operator<<(std::ostream& out, const OpSchema& schema) { return out; } +OpSchema& OpSchemaRegistry::NewSchema(const string& key, const string& file, const int line) { + auto& m = map(); + auto it = m.find(key); + if (it != m.end()) { + const auto& schema = it->second; + std::ios_base::Init init; + std::cerr << "Trying to register schema with name " << key + << " from file " << file << " line " << line + << ", but it is already registered from file " << schema.file() + << " line " << schema.line(); + abort(); + } + m.emplace(key, OpSchema(key, file, line)); + return m[key]; +} + CaffeMap& OpSchemaRegistry::map() { static CaffeMap map; return map; diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index b19d5be079af2..64f5ef3ed883a 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -460,21 +460,7 @@ class TORCH_API OpSchema { class TORCH_API OpSchemaRegistry { public: static OpSchema& - NewSchema(const string& key, const string& file, const int line) { - auto& m = map(); - auto it = m.find(key); - if (it != m.end()) { - const auto& schema = it->second; - std::ios_base::Init init; - std::cerr << "Trying to register schema with name " << key - << " from file " << file << " line " << line - << ", but it is already registered from file " << schema.file() - << " line " << schema.line(); - abort(); - } - m.emplace(key, OpSchema(key, file, line)); - return m[key]; - } + NewSchema(const string& key, const string& file, const int line); static const OpSchema* Schema(const string& key) { auto& m = map(); diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index 7b2a02fae696b..8b5e116024b81 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -1,5 +1,6 @@ #include "caffe2/operators/utility_ops.h" #include +#include #include "caffe2/utils/eigen_utils.h" namespace caffe2 { From 5e31a3b9044f7724a36bd7e491a63914279ef259 Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Thu, 19 Aug 2021 22:50:32 -0700 Subject: [PATCH 089/530] [nnc] Updated sliceHead to do inplace mutation (#63531) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63531 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D30412183 Pulled By: navahgar fbshipit-source-id: 47ee9482a36e606788d28d22eee4edaca45ffa50 --- test/cpp/tensorexpr/test_loopnest.cpp | 2 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 4a2a1d07db12e..b550f4819e970 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -284,7 +284,7 @@ TEST(LoopNest, ExprSliceHead) { ASSERT_NE(head, nullptr); ASSERT_NE(head, loops[0]); ASSERT_NE(tail, nullptr); - ASSERT_NE(tail, loops[0]); + ASSERT_EQ(tail, loops[0]); BlockPtr body = getSimplifiedBody(l); assertForRanges(body, {{0, 4}, {4, 10}}); diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 2256369e2e9e2..3c39dcde82e46 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -1306,11 +1306,10 @@ void LoopNest::sliceHead(ForPtr f, int factor, ForPtr* head, ForPtr* tail) { ExprPtr head_end = alloc( alloc(f->start(), alloc(factor)), f->stop(), true); *head = alloc(f->var(), f->start(), head_end, Stmt::clone(f->body())); - *tail = alloc( - f->var(), head_end, f->stop(), Stmt::clone(f->body()), f->loop_options()); + p->insert_stmt_before(*head, f); - p->replace_stmt(f, *head); - p->insert_stmt_after(*tail, *head); + f->set_start(head_end); + *tail = f; if (f->loop_options().is_gpu_block_index() || f->loop_options().is_gpu_thread_index()) { From d82667f7e2cd812d98b9cc4f40df46b37a9ef653 Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Thu, 19 Aug 2021 22:50:32 -0700 Subject: [PATCH 090/530] [nnc] Updated sliceTail to do inplace mutation (#63532) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63532 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D30412184 Pulled By: navahgar fbshipit-source-id: e7669d3b9d24e14501f3feb6505c88d1d42030c6 --- test/cpp/tensorexpr/test_loopnest.cpp | 2 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index b550f4819e970..898ee5293edab 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -380,7 +380,7 @@ TEST(LoopNest, ExprSliceTail) { LoopNest::sliceTail(loops[0], 4, &head, &tail); ASSERT_NE(head, nullptr); - ASSERT_NE(head, loops[0]); + ASSERT_EQ(head, loops[0]); ASSERT_NE(tail, nullptr); ASSERT_NE(tail, loops[0]); diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 3c39dcde82e46..a296d8c7af79b 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -1345,16 +1345,11 @@ void LoopNest::sliceTail(ForPtr f, int factor, ForPtr* head, ForPtr* tail) { ExprPtr tail_start = alloc( f->start(), alloc(f->stop(), alloc(factor)), true); - *head = alloc( - f->var(), - f->start(), - tail_start, - Stmt::clone(f->body()), - f->loop_options()); *tail = alloc(f->var(), tail_start, f->stop(), Stmt::clone(f->body())); + p->insert_stmt_after(*tail, f); - p->replace_stmt(f, *head); - p->insert_stmt_after(*tail, *head); + f->set_stop(tail_start); + *head = f; if (f->loop_options().is_gpu_block_index() || f->loop_options().is_gpu_thread_index()) { From bec75daa77ade04ccef4f3de67fcae216bfd122d Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 19 Aug 2021 23:42:24 -0700 Subject: [PATCH 091/530] Update protobuf to 3.13.1 (#62571) Summary: Update bazel to 4.10.0 Update ASAN_SYMBOLIZER_PATH to llvm-7 Suppress `vptr` ubsan violations in `test_jit` Fix ProtoBuf patching for ONNX which caused Windows builds to crash while attempting to free `std::string` allocated on stack Fixes https://github.com/pytorch/pytorch/issues/62569 Pull Request resolved: https://github.com/pytorch/pytorch/pull/62571 Reviewed By: walterddr Differential Revision: D30048685 Pulled By: malfet fbshipit-source-id: 6462c1bef9c42318551d2cf906bbab41e1d4e1cd --- .jenkins/pytorch/common_utils.sh | 4 ++-- .jenkins/pytorch/test.sh | 2 +- cmake/ProtoBuf.cmake | 4 ++-- cmake/ProtoBufPatch.cmake | 4 ++-- third_party/protobuf | 2 +- ubsan.supp | 1 + 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index fd94ce14a1c5f..49db051a0f484 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -59,9 +59,9 @@ function file_diff_from_base() { function get_bazel() { # download bazel version - wget https://github.com/bazelbuild/bazel/releases/download/3.1.0/bazel-3.1.0-linux-x86_64 -O tools/bazel + wget https://github.com/bazelbuild/bazel/releases/download/4.1.0/bazel-4.1.0-linux-x86_64 -O tools/bazel # verify content - echo '753434f4fa730266cf5ce21d1fdd425e1e167dd9347ad3e8adc19e8c0d54edca tools/bazel' | sha256sum --quiet -c + echo '0eb2e378d2782e7810753e2162245ad1179c1bb12f848c692b4a595b4edf779b tools/bazel' | sha256sum --quiet -c chmod +x tools/bazel } diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 124fd7c8cdb3e..9f3e378a45fb8 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -91,7 +91,7 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then export PYTORCH_TEST_WITH_ASAN=1 export PYTORCH_TEST_WITH_UBSAN=1 # TODO: Figure out how to avoid hard-coding these paths - export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-5.0/bin/llvm-symbolizer + export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-7/bin/llvm-symbolizer export TORCH_USE_RTLD_GLOBAL=1 # NB: We load libtorch.so with RTLD_GLOBAL for UBSAN, unlike our # default behavior. diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index d8a2c279aee47..8d7633c4ab037 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -196,7 +196,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var) # If we remove all reference to these pb.h files from external # libraries and binaries this rewrite can be removed. - COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -DLOCAL_PROTOBUF=${CAFFE2_LINK_LOCAL_PROTOBUF} -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake + COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil} COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM ) @@ -209,7 +209,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var) COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" COMMAND ${CAFFE2_PROTOC_EXECUTABLE} -I${PROJECT_SOURCE_DIR} --cpp_out=${DLLEXPORT_STR}${PROJECT_BINARY_DIR} ${abs_fil} COMMAND ${CAFFE2_PROTOC_EXECUTABLE} -I${PROJECT_SOURCE_DIR} --python_out "${PROJECT_BINARY_DIR}" ${abs_fil} - COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -DLOCAL_PROTOBUF=${CAFFE2_LINK_LOCAL_PROTOBUF} -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake + COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -DSYSTEM_PROTOBUF=YES -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil} COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM ) endif() diff --git a/cmake/ProtoBufPatch.cmake b/cmake/ProtoBufPatch.cmake index 704dcd7da1545..7f1de9a4a1de9 100644 --- a/cmake/ProtoBufPatch.cmake +++ b/cmake/ProtoBufPatch.cmake @@ -4,7 +4,7 @@ file(READ ${FILENAME} content) -if(LOCAL_PROTOBUF) +if(NOT SYSTEM_PROTOBUF) # protobuf-3.6.0 pattern string( REPLACE @@ -77,7 +77,7 @@ if(LOCAL_PROTOBUF) file(WRITE ${SOURCE_FILENAME} "${content_cc}") endif() -endif() +endif(NOT SYSTEM_PROTOBUF) # constexpr int TensorBoundShape_DimType_DimType_ARRAYSIZE = TensorBoundShape_DimType_DimType_MAX + 1; # throws diff --git a/third_party/protobuf b/third_party/protobuf index d0bfd5221182d..d1eca4e4b421c 160000 --- a/third_party/protobuf +++ b/third_party/protobuf @@ -1 +1 @@ -Subproject commit d0bfd5221182da1a7cc280f3337b5e41a89539cf +Subproject commit d1eca4e4b421cd2997495c4b4e65cea6be4e9b8a diff --git a/ubsan.supp b/ubsan.supp index 62e64b785b94c..395f5208c8437 100644 --- a/ubsan.supp +++ b/ubsan.supp @@ -1 +1,2 @@ vptr:libtorch_python.so +vptr:test_jit From 913c1f83f49f9e1e2a494186cc0069d780cee852 Mon Sep 17 00:00:00 2001 From: Don Jang Date: Fri, 20 Aug 2021 00:43:40 -0700 Subject: [PATCH 092/530] [Static Runtime] Add native op for aten::detach (#63625) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63625 This change adds a static runtime's native op implementation for `aten::detach` op. See the standard `aten::detach`'s implementation (https://codebrowser.bddppq.com/pytorch/pytorch/aten/src/ATen/native/TensorShape.cpp.html#_ZN2at6native6detachERKNS_6TensorE ) for comparison. Test Plan: - Added `StaticRuntime.IndividualOps_Detach`. - Observed ``` V0819 18:55:33.181188 3092034 impl.cpp:1398] Switch to native impl for node: %a.1 : Tensor = aten::detach(%input.1) ``` Reviewed By: hlu1 Differential Revision: D30443187 fbshipit-source-id: d6e0eadb1b817e0a126c4fc97526abc276ee8a17 --- benchmarks/static_runtime/test_scripts.h | 12 ++++++++++++ benchmarks/static_runtime/test_static_runtime.cc | 11 +++++++++++ torch/csrc/jit/runtime/static/native_ops.cpp | 15 +++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 8db8da2887799..9946c7af02e5a 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -286,6 +286,18 @@ const auto to_script_4 = R"JIT( return (c) )JIT"; +const auto detach_script_0 = R"JIT( + def forward(self, input: Tensor): + a = input.detach() + return input is a +)JIT"; + +const auto detach_script_1 = R"JIT( + def forward(self, input: Tensor): + a = input.detach() + return a.clone() +)JIT"; + const std::string embedding_bag_default = R"JIT( def forward(self, a: Tensor, b: Tensor, c: Tensor): return torch.embedding_bag(a, b, c) diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 14d613f074858..ec703ef8a2ec1 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -589,6 +589,17 @@ TEST(StaticRuntime, IndividualOps_to) { test_to(at::ScalarType::Half, false, true, c10::MemoryFormat::ChannelsLast); } +TEST(StaticRuntime, IndividualOps_Detach) { + auto a = at::randn({4, 3, 1, 2}); + auto b = at::randn({3, 2, 2}); + std::vector args{a}; + std::vector args2{b}; + testStaticRuntime(detach_script_0, args); + testStaticRuntime(detach_script_0, args, args2); + testStaticRuntime(detach_script_1, args); + testStaticRuntime(detach_script_1, args, args2); +} + TEST(StaticRuntime, IndividualOps_Full) { auto dtype = at::ScalarType::Int; auto cpu = at::Device(DeviceType::CPU); diff --git a/torch/csrc/jit/runtime/static/native_ops.cpp b/torch/csrc/jit/runtime/static/native_ops.cpp index 616ad87119ab3..61a6554a3c5cc 100644 --- a/torch/csrc/jit/runtime/static/native_ops.cpp +++ b/torch/csrc/jit/runtime/static/native_ops.cpp @@ -355,6 +355,21 @@ REGISTER_NATIVE_OPERATOR_FUNCTOR(aten::to, aten_to, [](Node* n) -> SROperator { }; }); +REGISTER_NATIVE_OPERATOR_FUNCTOR( + aten::detach, + aten_detach, + [](Node* n) -> SROperator { + if (!n->matches( + torch::schema("aten::detach(Tensor(a) self) -> Tensor(a)"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto& in0_t = p_node->Input(0).toTensor(); + p_node->Output(0) = at::native::alias(in0_t); + }; + }); + REGISTER_NATIVE_OPERATOR_FUNCTOR( prim::isinstance, prim_isinstance, From ae901e372e7b05fe1802e44fe2f1f6aa015710af Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Fri, 20 Aug 2021 06:14:13 -0700 Subject: [PATCH 093/530] [Static Runtime] Enable RemoveListMutation (#63536) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63536 Enable a pass that transforms sequences like this: ``` li = [] li.append(1) li.append(2) ``` into this: ``` li = [1, 2] ``` Initially I implemented this pass myself (D30387213), but I discovered that there is an existing pass that does the same thing. Reviewed By: hlu1 Differential Revision: D30412970 fbshipit-source-id: 0810ef03480878d5039bd800a40f5fd31c2652ec --- torch/csrc/jit/passes/freeze_module.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 063b867319629..df1c64bcc4740 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -90,6 +91,7 @@ class AttributePropagator { auto applyOptimizations = [](std::shared_ptr& subgraph) { runOptimization( subgraph, /* unroll? */ false, /* const_prop_user_classes? */ false); + RemoveListMutation(subgraph); LowerSimpleTuples(subgraph); }; From 0f2c60f0e37fe1738d30d3db3fd48e88b8087af7 Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Fri, 20 Aug 2021 08:36:14 -0700 Subject: [PATCH 094/530] Adding IterableAsDataPipe IterDataPipe (#63522) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63522 Supports sharding and batching on loader level * **#63522 Adding IterableAsDataPipe IterDataPipe usefull for tests and simple cases** usefull for tests and simple cases Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30426528 Pulled By: VitalyFedyunin fbshipit-source-id: 535b5cc1505bb58731fcca8170541ac5ee7bd417 --- torch/utils/data/datapipes/iter/__init__.py | 4 ++++ torch/utils/data/datapipes/iter/utils.py | 10 ++++++++++ 2 files changed, 14 insertions(+) create mode 100644 torch/utils/data/datapipes/iter/utils.py diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py index 0bcfdc44c31cf..bdaef95e9fa56 100644 --- a/torch/utils/data/datapipes/iter/__init__.py +++ b/torch/utils/data/datapipes/iter/__init__.py @@ -43,6 +43,9 @@ from torch.utils.data.datapipes.iter.tobytes import ( ToBytesIterDataPipe as ToBytes, ) +from torch.utils.data.datapipes.iter.utils import ( + IterableAsDataPipeIterDataPipe as IterableAsDataPipe, +) __all__ = ['Batch', 'BucketBatcher', @@ -51,6 +54,7 @@ 'Filter', 'GroupByKey', 'HttpReader', + 'IterableAsDataPipe', 'ListDirFiles', 'LoadFilesFromDisk', 'Map', diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py new file mode 100644 index 0000000000000..ea241d9f2716c --- /dev/null +++ b/torch/utils/data/datapipes/iter/utils.py @@ -0,0 +1,10 @@ +from torch.utils.data import IterDataPipe + + +class IterableAsDataPipeIterDataPipe(IterDataPipe): + def __init__(self, iterable): + self.iterable = iterable + + def __iter__(self): + for data in self.iterable: + yield data From 99e28baeba4f1ffb2623e64694b2aac13df5e0fb Mon Sep 17 00:00:00 2001 From: albanD Date: Fri, 20 Aug 2021 08:42:31 -0700 Subject: [PATCH 095/530] Small custom function refactor which doesn't change anything (#63433) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63433 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D30431970 Pulled By: albanD fbshipit-source-id: 905fa4d2ddeca18005b1bcb13dd6f8a080327e7c --- torch/csrc/autograd/custom_function.cpp | 34 ++++++++++++++++++------- torch/csrc/autograd/custom_function.h | 2 ++ torch/csrc/autograd/python_function.cpp | 19 ++++++++++++-- torch/csrc/autograd/python_function.h | 11 -------- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp index 502919ff3a6a4..fdcf9971a0606 100644 --- a/torch/csrc/autograd/custom_function.cpp +++ b/torch/csrc/autograd/custom_function.cpp @@ -26,17 +26,13 @@ Variable VariableInfo::zeros(at::OptionalDeviceGuard& device_guard) const { } } -std::vector> _wrap_outputs(const variable_list &input_vars, +optional_variable_list _process_backward_mode_ad( + const std::unordered_set &inputs_set, const std::unordered_set &non_differentiable, const std::unordered_set &dirty_inputs, const at::ArrayRef> raw_outputs, const std::shared_ptr &cdata) { - std::unordered_set inputs; - inputs.reserve(input_vars.size()); - for (auto& var : input_vars) { - inputs.emplace(var.unsafeGetTensorImpl()); - } int num_outputs = raw_outputs.size(); @@ -63,7 +59,7 @@ std::vector> _wrap_outputs(const variable_list &input_va // Here, `y` requires_grad (!). } else if (is_modified) { if (var.is_leaf() && var.requires_grad()) { - throw std::runtime_error("a leaf Variable that requires grad has been used in an in-place operation."); + TORCH_CHECK(false, "a leaf Variable that requires grad has been used in an in-place operation."); } // No need to mark as modified Tensors that are not inputs. if (!is_input) { @@ -105,7 +101,7 @@ std::vector> _wrap_outputs(const variable_list &input_va } }; - std::vector> outputs; + optional_variable_list outputs; std::unordered_set outputs_impl; // For dirty_inputs check outputs.reserve(num_outputs); int num_diff_outputs = 0; @@ -125,7 +121,7 @@ std::vector> _wrap_outputs(const variable_list &input_va Variable var = raw_outputs[i].value(); auto out_tensor_impl = var.unsafeGetTensorImpl(); - bool is_input = inputs.count(out_tensor_impl) > 0; + bool is_input = inputs_set.count(out_tensor_impl) > 0; bool is_modified = dirty_inputs.count(out_tensor_impl) > 0; bool is_differentiable = cdata && non_differentiable.count(out_tensor_impl) == 0 && isDifferentiableType(var.scalar_type()); @@ -177,6 +173,26 @@ std::vector> _wrap_outputs(const variable_list &input_va return outputs; } + + +optional_variable_list _wrap_outputs(const variable_list &input_vars, + const std::unordered_set &non_differentiable, + const std::unordered_set &dirty_inputs, + const at::ArrayRef> raw_outputs, + const std::shared_ptr &cdata) { + + std::unordered_set inputs_set; + inputs_set.reserve(input_vars.size()); + for (auto& var : input_vars) { + inputs_set.emplace(var.unsafeGetTensorImpl()); + } + + auto outputs = _process_backward_mode_ad(inputs_set, non_differentiable, dirty_inputs, raw_outputs, cdata); + + + return outputs; +} + void check_variable_result(const Variable& original, const Variable& result, std::string hook_name) { if (!original.options().type_equal(result.options())) { std::stringstream ss; diff --git a/torch/csrc/autograd/custom_function.h b/torch/csrc/autograd/custom_function.h index 243622f650666..376cab693e453 100644 --- a/torch/csrc/autograd/custom_function.h +++ b/torch/csrc/autograd/custom_function.h @@ -9,6 +9,8 @@ namespace torch { namespace autograd { +using optional_variable_list = std::vector>; + TORCH_API std::vector> _wrap_outputs( const variable_list &input_vars, const std::unordered_set &non_differentiable, diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index dd58a68134b8f..14874186d6f22 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -45,14 +45,29 @@ PyObject *THPFunctionClass = nullptr; #define THPFunction_assert(condition, ...) \ if (!(condition)) { THPUtils_setError(__VA_ARGS__); throw python_error(); } -namespace torch { namespace autograd { +// Anonymous namespace for helpful functions used in this file +namespace { -void PyNode::throw_python_error() { +// Throw a python_error with the PyErr state persisted, so that we +// don't lose the error state if the GIL is released when we don't +// have a PyThreadState created beforehand, this is made so that +// even for pure C++ thread without a pre-created PyThreadState could +// also capture the correct error message. +// TODO: This is a temporary approach to allow C++ thread to correctly +// capture Python Error in autograd, remove this when c10 thread pool +// allow to do one time initialization. +// see discussion in https://github.com/pytorch/pytorch/pull/34845 +// Follow up issue: https://github.com/pytorch/pytorch/issues/35006 +void throw_python_error() { python_error err; err.persist(); throw err; } +} + +namespace torch { namespace autograd { + // NOTE: this function is written in a way that assumes it's only called for backward; // it's used by engine.cpp. This is responsible for forwarding a call from // C++'s Node::apply to a Python method "apply". diff --git a/torch/csrc/autograd/python_function.h b/torch/csrc/autograd/python_function.h index 8f4d12ba640fc..3657807f35964 100644 --- a/torch/csrc/autograd/python_function.h +++ b/torch/csrc/autograd/python_function.h @@ -27,17 +27,6 @@ struct PyNode : public Node { variable_list apply(variable_list&& inputs) override; - // Throw a python_error with the PyErr state persisted, so that we - // don't lose the error state if the GIL is released when we don't - // have a PyThreadState created beforehand, this is made so that - // even for pure C++ thread without a pre-created PyThreadState could - // also capture the correct error message. - // TODO: This is a temporary approach to allow C++ thread to correctly - // capture Python Error in autograd, remove this when c10 thread pool - // allow to do one time initialization. - // see discussion in https://github.com/pytorch/pytorch/pull/34845 - // Follow up issue: https://github.com/pytorch/pytorch/issues/35006 - void throw_python_error(); void release_variables() override; std::string name() const override; bool is_traceable() override; From 5a7133b87fe2fd7d025d36855ed4cc06539a9299 Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Fri, 20 Aug 2021 09:00:23 -0700 Subject: [PATCH 096/530] Adding DataLoader2 class as future replacement of DataLoader (#63523) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63523 Supports sharding and batching on loader level** * #63522 Adding IterableAsDataPipe IterDataPipe usefull for tests and simple cases Supports sharding and batching on loader level Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30426527 Pulled By: VitalyFedyunin fbshipit-source-id: e5905d3364c4880e720dd62fb066f08881c71a6e --- test/test_dataloader.py | 25 +++++- torch/utils/data/__init__.py | 6 +- torch/utils/data/dataloader_experimental.py | 89 +++++++++++++++++++++ 3 files changed, 117 insertions(+), 3 deletions(-) create mode 100644 torch/utils/data/dataloader_experimental.py diff --git a/test/test_dataloader.py b/test/test_dataloader.py index c68d7e2e14b33..71230cfbb7a67 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -13,9 +13,20 @@ import warnings import tempfile from torch import multiprocessing as mp -from torch.utils.data import _utils, Dataset, IterableDataset, TensorDataset, DataLoader, ConcatDataset, ChainDataset, Subset +from torch.utils.data import ( + ChainDataset, + ConcatDataset, + DataLoader, + DataLoader2, + Dataset, + IterableDataset, + Subset, + TensorDataset, + _utils +) from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL from torch.utils.data.dataset import random_split +from torch.utils.data.datapipes.iter import IterableAsDataPipe from torch._utils import ExceptionWrapper from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS, IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest, @@ -1934,6 +1945,18 @@ def test_excessive_thread_creation_warning(self): dataloader = DataLoader(self.dataset, batch_size=2, num_workers=1000) +@unittest.skipIf( + TEST_WITH_TSAN, + "Fails with TSAN with the following error: starting new threads after multi-threaded " + "fork is not supported. Dying (set die_after_fork=0 to override)") +class TestDataLoader2(TestCase): + def test_basics(self): + dp = IterableAsDataPipe(list(range(10))) + dl = DataLoader(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2) + dl2 = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2) + self.assertEquals(list(dl), list(dl2)) + + class StringDataset(Dataset): def __init__(self): self.s = '12345' diff --git a/torch/utils/data/__init__.py b/torch/utils/data/__init__.py index 1d18b7b030894..0af9e6193af3d 100644 --- a/torch/utils/data/__init__.py +++ b/torch/utils/data/__init__.py @@ -11,9 +11,9 @@ from torch.utils.data.dataset import ( ChainDataset, ConcatDataset, + DataChunk, Dataset, Dataset as MapDataPipe, - DataChunk, IterableDataset, IterableDataset as IterDataPipe, Subset, @@ -34,11 +34,14 @@ runtime_validation, runtime_validation_disabled, ) +from torch.utils.data.dataloader_experimental import DataLoader2 + __all__ = ['BatchSampler', 'ChainDataset', 'ConcatDataset', 'DataLoader', + 'DataLoader2', 'Dataset', 'DistributedSampler', 'IterDataPipe', @@ -68,4 +71,3 @@ ################################################################################ # import subpackage ################################################################################ -from torch.utils.data import datapipes diff --git a/torch/utils/data/dataloader_experimental.py b/torch/utils/data/dataloader_experimental.py new file mode 100644 index 0000000000000..85028afd22124 --- /dev/null +++ b/torch/utils/data/dataloader_experimental.py @@ -0,0 +1,89 @@ + +import functools + +import torch.utils.data.backward_compatibility +from torch.utils.data import DataLoader, IterDataPipe +from torch.utils.data.datapipes.iter import IterableAsDataPipe + +class DataLoader2: + def __new__(cls, + dataset, + batch_size=1, + shuffle=False, + sampler=None, + batch_sampler=None, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + timeout=0, + worker_init_fn=None, + *, + prefetch_factor=2, + persistent_workers=False, + batch_outside_worker=False): + if isinstance(dataset, IterDataPipe): + datapipe = dataset + if batch_sampler is not None: + raise Exception( + 'batch_sampler is not yet supported for DataPipes') + if sampler is not None: + raise Exception( + 'sampler is not yet supported for DataPipes') + if shuffle: + datapipe = datapipe.shuffle() + if batch_outside_worker and pin_memory: + raise Exception( + 'pin_memory is not yet compatible with batch_outside_worker') + if not batch_outside_worker: + if batch_size is not None: + datapipe = datapipe.batch(batch_size, drop_last=drop_last) + if collate_fn is None: + collate_fn = torch.utils.data._utils.collate.default_collate + + def sharding_worker_init_fn(worker_init_fn, worker_id): + if worker_init_fn is not None: + worker_init_fn(worker_id) + torch.utils.data.backward_compatibility.worker_init_fn( + worker_id) + + my_worker_init_fn = functools.partial( + sharding_worker_init_fn, worker_init_fn) + + data_loader = DataLoader(datapipe, + batch_size=None, # Replaced by .batch DataPipe + shuffle=False, # Replaced by .shuffle DataPipe + sampler=None, + batch_sampler=None, + num_workers=num_workers, + collate_fn=collate_fn, + pin_memory=pin_memory, + drop_last=False, # Replaced by .batch DataPipe + timeout=timeout, + worker_init_fn=my_worker_init_fn, + prefetch_factor=prefetch_factor, + persistent_workers=persistent_workers) + + if not batch_outside_worker: + return data_loader + else: + if collate_fn is None: + collate_fn = torch.utils.data._utils.collate.default_collate + datapipe = IterableAsDataPipe(data_loader).batch( + batch_size, drop_last=drop_last).map(collate_fn) + return datapipe + + else: + return DataLoader(dataset, + batch_size=batch_size, + shuffle=shuffle, + sampler=sampler, + batch_sampler=batch_sampler, + num_workers=num_workers, + collate_fn=collate_fn, + pin_memory=pin_memory, + drop_last=drop_last, + timeout=timeout, + worker_init_fn=worker_init_fn, + prefetch_factor=prefetch_factor, + persistent_workers=persistent_workers) From 5f997a7d2fcd81584d1c9f6e173e30c867892ee8 Mon Sep 17 00:00:00 2001 From: Pavithran Ramachandran Date: Fri, 20 Aug 2021 09:34:53 -0700 Subject: [PATCH 097/530] [PyTorch][Edge] Improve InflatableArgs for Bundled Inputs (#62368) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62368 # Context The bundled inputs accepts an expression in the form of string InflatableArg.fmt that can be applied on the inputs to inflate. The InflatableArg.fmt provides flexibility to have custom transformation to inflate. When the input arguments to a function are not Tensor type, TorchScript casts the inputs from type T to Optional[T] expects the function to handle Nullable (None) clause as well. This becomes tricky to handle in one line code or lambda functions. We propose an alternative way which allows InflatableArg to include the text of a TorchScript function that would be defined on the module as a helper, then use that in its inflation expression. This can be provided by InflatableArg.fmt_fn. Please refer to pytorch/test/test_bundled_inputs.py for example on how to use the same. Also refer JacobSzwejbka comment on the same [here](https://github.com/pytorch/pytorch/pull/62368#issuecomment-892012812) # Mitigation Allow InflatedArg to include the text of a TorchScript function that would be defined on the module as a helper, then use that in its inflation expression. ghstack-source-id: 135158680 Test Plan: To run `test_dict_args` ``` (base) [pavithran@devvm1803.vll0 /data/users/pavithran/fbsource/fbcode] buck test //caffe2/test:test_bundled_inputs -- test_dict_args Action graph will be rebuilt because files have been added or removed. Building: finished in 5.4 sec (100%) 12180/12180 jobs, 0/12180 updated Total time: 5.8 sec More details at https://www.internalfb.com/intern/buck/build/fafcf277-1095-4cba-978d-6022f0d391ad Tpx test run coordinator for Facebook. See https://fburl.com/tpx for details. Running with tpx session id: 5ef9de71-c1b1-406b-a6c0-3321c2368b8d Trace available for this run at /tmp/tpx-20210727-163946.454212/trace.log Started reporting to test run: https://www.internalfb.com/intern/testinfra/testrun/7036874465805934 ✓ ListingSuccess: caffe2/test:test_bundled_inputs - main (11.365) ✓ Pass: caffe2/test:test_bundled_inputs - test_dict_args (test_bundled_inputs.TestBundledInputs) (12.307) Summary Pass: 1 ListingSuccess: 1 If you need help understanding your runs, please follow the wiki: https://fburl.com/posting_in_tpx_users Finished test run: https://www.internalfb.com/intern/testinfra/testrun/7036874465805934 ``` To check the py code of TS module: P433043973 Reviewed By: dreiss Differential Revision: D29950421 fbshipit-source-id: c819ec5c94429b7fbf6c4beb0259457f169b08ec --- test/test_bundled_inputs.py | 115 +++++++++++++++++++++++++++++++++- torch/utils/bundled_inputs.py | 75 +++++++++++++++++++--- 2 files changed, 180 insertions(+), 10 deletions(-) diff --git a/test/test_bundled_inputs.py b/test/test_bundled_inputs.py index a0fb535da8a86..62263e130fd8b 100644 --- a/test/test_bundled_inputs.py +++ b/test/test_bundled_inputs.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import io import textwrap -from typing import List +from typing import List, Optional, Dict import torch import torch.utils.bundled_inputs @@ -324,5 +324,118 @@ def forward(self, arg): ) self.assertEqual(bundled_model2.get_all_bundled_inputs(), [(torch.ones(2),)]) + + def test_dict_args(self): + class MyModel(torch.nn.Module): + def forward( + self, + arg1: Optional[Dict[str, torch.Tensor]], + arg2: Optional[List[torch.Tensor]], + arg3: torch.Tensor, + ): + if arg1 is None: + return arg3 + elif arg2 is None: + return arg1["a"] + arg1["b"] + else: + return arg1["a"] + arg1["b"] + arg2[0] + + small_sample = dict( + a=torch.zeros([10, 20]), + b=torch.zeros([1, 1]), + c=torch.zeros([10, 20]), + ) + small_list = [torch.zeros([10, 20])] + + big_sample = dict( + a=torch.zeros([1 << 5, 1 << 8, 1 << 10]), + b=torch.zeros([1 << 5, 1 << 8, 1 << 10]), + c=torch.zeros([1 << 5, 1 << 8, 1 << 10]), + ) + big_list = [torch.zeros([1 << 5, 1 << 8, 1 << 10])] + + def condensed(t): + ret = torch.empty_like(t).flatten()[0].clone().expand(t.shape) + assert ret.storage().size() == 1 + # ret.storage()[0] = 0 + return ret + + def bundle_optional_dict_of_randn(template): + return torch.utils.bundled_inputs.InflatableArg( + value=( + None + if template is None + else {k: condensed(v) for (k, v) in template.items()} + ), + fmt="{}", + fmt_fn=""" + def {}(self, value: Optional[Dict[str, Tensor]]): + if value is None: + return None + output = {{}} + for k, v in value.items(): + output[k] = torch.randn_like(v) + return output + """, + ) + + def bundle_optional_list_of_randn(template): + return torch.utils.bundled_inputs.InflatableArg( + value=(None if template is None else [condensed(v) for v in template]), + fmt="{}", + fmt_fn=""" + def {}(self, value: Optional[List[Tensor]]): + if value is None: + return None + output = [] + for v in value: + output.append(torch.randn_like(v)) + return output + """, + ) + + out : List[str] = [] + sm = torch.jit.script(MyModel()) + original_size = model_size(sm) + small_inputs = ( + bundle_optional_dict_of_randn(small_sample), + bundle_optional_list_of_randn(small_list), + torch.zeros([3, 4]), + ) + big_inputs = ( + bundle_optional_dict_of_randn(big_sample), + bundle_optional_list_of_randn(big_list), + torch.zeros([1 << 5, 1 << 8, 1 << 10]), + ) + + torch.utils.bundled_inputs.augment_model_with_bundled_inputs( + sm, + [ + big_inputs, + small_inputs, + ], + _receive_inflate_expr=out, + ) + augmented_size = model_size(sm) + # assert the size has not increased more than 8KB + self.assertLess(augmented_size, original_size + (1 << 13)) + + loaded = save_and_load(sm) + inflated = loaded.get_all_bundled_inputs() + self.assertEqual(len(inflated[0]), len(small_inputs)) + + methods, _ = torch.utils.bundled_inputs._get_bundled_inputs_attributes_and_methods( + loaded + ) + + # One Function (forward) + # two bundled inputs (big_inputs and small_inputs) + # two args which have InflatableArg with fmt_fn + # 1 * 2 * 2 = 4 + self.assertEqual( + sum([method.startswith("_inflate_helper") for method in methods]), 4 + ) + + if __name__ == '__main__': run_tests() diff --git a/torch/utils/bundled_inputs.py b/torch/utils/bundled_inputs.py index bce658b997255..8a6d466f20da4 100644 --- a/torch/utils/bundled_inputs.py +++ b/torch/utils/bundled_inputs.py @@ -21,13 +21,18 @@ class InflatableArg(NamedTuple): the appropriate input. It can use 'value' as an input to the format str. It must result in a value of the same type as 'value'. + 'fmt_fn' is a formatable function code string that is executed to inflate the compressed + data into the appropriate input. It must result in a value of the same type as 'value'. + The function name should be the formatable part of the string. + Note: Only top level InflatableArgs can be inflated. i.e. you cannot place an inflatable arg inside of some other structure. You should instead create an inflatable arg such that the fmt code string returns the full structure of your input. """ value: Any - fmt: str + fmt: str = "{}" + fmt_fn: str = "" def bundle_inputs( @@ -279,13 +284,21 @@ def augment_many_model_functions_with_bundled_inputs( deflated_args = [] parts.append("(") for arg_idx, arg in enumerate(args): - deflated, inflater = _inflate_expr(arg, f"deflated[{inp_idx}][{arg_idx}]") + inflate_helper_fn_name = _get_inflate_helper_fn_name(arg_idx, inp_idx, function_name) + deflated, inflater, helper_definition = _inflate_expr( + arg, + f"deflated[{inp_idx}][{arg_idx}]", + inflate_helper_fn_name, + ) deflated_args.append(deflated) parts.append(f" {inflater},") + if helper_definition: + model.define(textwrap.dedent(helper_definition)) deflated_inputs.append(tuple(deflated_args)) parts.append("),") parts.append("") expr = "\n".join(parts) + # Back-channel return this expr for debugging. if _receive_inflate_expr is not None: _receive_inflate_expr.append(expr) @@ -332,7 +345,6 @@ def get_num_bundled_inputs(self): return len(self.get_all_bundled_inputs_for_forward()) """)) - # Define some high level helper methods that act on all bundled inputs model.define(textwrap.dedent(""" def get_bundled_inputs_functions_and_info(self): @@ -341,27 +353,44 @@ def get_bundled_inputs_functions_and_info(self): return all_inputs """.format(template=get_bundled_inputs_functions_and_info_template))) -def _inflate_expr(arg: T, ref: str) -> Tuple[Union[T, torch.Tensor], str]: +def _inflate_expr( + arg: T, ref: str, inflate_helper_fn_name: str +) -> Tuple[Union[T, torch.Tensor], str, Optional[str]]: # Allow custom inflation expressions any object. # For example, calling custom image-decoding ops. # Or just use "{}" as the format string to ignore size limits. if isinstance(arg, InflatableArg): - return arg.value, arg.fmt.format(ref) + if arg.fmt_fn: + if arg.fmt not in ["{}", ""]: + raise Exception( + f"Bundled input argument at position '{ref}' has " + f"both arg.fmt_fn => \n{arg.fmt_fn} " + f"\n and arg.fmt => {arg.fmt}. " + "Please choose `arg.fmt` if the deflater is straightforward or " + "`arg.fmt_fn` if you need a function." + ) + + helper_definition = arg.fmt_fn.format(inflate_helper_fn_name) + expr = f"self.{inflate_helper_fn_name}({ref})" + + return arg.value, expr, helper_definition + else: + return arg.value, arg.fmt.format(ref), None if isinstance(arg, torch.Tensor): # Small-storage tensors can just be saved directly. if arg.storage().size() <= MAX_RAW_TENSOR_SIZE: - return arg, ref + return arg, ref, None # Small contiguous tensors can be cloned to have small storage. # TODO: Should we do this even for non-contiguous tensors? if arg.is_contiguous() and arg.numel() <= MAX_RAW_TENSOR_SIZE: - return arg.clone(), ref + return arg.clone(), ref, None # Example inputs commonly come from torch.zeros, torch.ones, or torch.full. # These can be represented compactly. for fmt in [torch.contiguous_format, torch.channels_last]: if arg.is_contiguous(memory_format=fmt) and (arg == arg.flatten()[0]).all().item(): return (arg.flatten()[0].clone().expand(*arg.size()), - f"{ref}.contiguous(memory_format={fmt})") + f"{ref}.contiguous(memory_format={fmt})", None) # Prevent big tensors from being bundled by default. # TODO: Provide more useful diagnostics. raise Exception( @@ -370,7 +399,7 @@ def _inflate_expr(arg: T, ref: str) -> Tuple[Union[T, torch.Tensor], str]: f"You probably don't want to bundle this as an input. " ) else: - return arg, ref + return arg, ref, None def _get_bundled_inputs_attributes_and_methods(script_module: torch.jit.ScriptModule) -> Tuple[List[str], List[str]]: methods: List[str] = [] @@ -389,9 +418,37 @@ def _get_bundled_inputs_attributes_and_methods(script_module: torch.jit.ScriptMo methods.append("get_all_bundled_inputs_for_" + function_name) methods.append("_generate_bundled_inputs_for_" + function_name) attributes.append("_bundled_inputs_deflated_" + function_name) + + bundled_inputs_fn = getattr( + script_module, + f"get_all_bundled_inputs_for_{function_name}" + ) + num_bundled_inputs: int = len(bundled_inputs_fn()) + + # Check inflate helper functions for each function, argument and bundled input + func = getattr(script_module, function_name, None) + for arg_idx in range(len(func.schema.arguments) - 1): + for input_idx in range(num_bundled_inputs): + helper_fn_name = _get_inflate_helper_fn_name( + arg_idx=arg_idx, + input_idx=input_idx, + function_name=function_name + ) + # if the arg has an InflatableArg with fmt_fn, add the helper function name + if hasattr(script_module, helper_fn_name): + methods.append(helper_fn_name) + return (methods, attributes) +def _get_inflate_helper_fn_name( + arg_idx: int, + input_idx: int, + function_name: str, +) -> str: + return f"_inflate_helper_for_{function_name}_input_{input_idx}_arg_{arg_idx}" + + def bundle_randn(*size, dtype=None): """Generate a tensor that will be inflated with torch.randn.""" From 03cc46a0acadcba618402a5b366f1d02bc3e21af Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Fri, 20 Aug 2021 10:49:21 -0700 Subject: [PATCH 098/530] [fx2trt] Add layernorm plugin for dynamic shape (#63620) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63620 Added layernorm dynamic plugin, so that it works when explicit batch dim is required. Needed for ig model. Changed the way of how we creating a plugin layer from instantiating the plugin directly to use plugin creator with `PluginFieldCollection`. Follow ups: Another way to convert layernorm is by breaking it down to supported trt layers. T97398182 Test Plan: layernorm unittest Reviewed By: yinghai Differential Revision: D30138205 fbshipit-source-id: aebe021d8de818e20376634f30e84579b9807f9b --- .../fx2trt/converters/acc_ops_converters.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index e7fcb94475b3e..eddb079afcac5 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -414,6 +414,66 @@ def acc_ops_batch_norm(network, target, args, kwargs, name): return layer.get_output(0) +@tensorrt_converter(acc_ops.layer_norm) +def acc_ops_layer_norm(network, target, args, kwargs, name): + input_val = kwargs["input"] + + if not isinstance(input_val, trt.tensorrt.ITensor): + raise RuntimeError(f"LayerNorm received input {input_val} that is not part " + "of the TensorRT region!") + + shape = kwargs["weight"].shape + broadcasted_shape = (1,) * (len(input_val.shape) - len(shape)) + shape + gamma = to_numpy(kwargs["weight"].reshape(*shape)) + beta = to_numpy(kwargs["bias"].reshape(*shape)) + eps = kwargs["eps"] + normalized_shape = kwargs["normalized_shape"] + + axes = 0 + for d in range(len(normalized_shape)): + axes |= 1 << (len(input_val.shape) - d - 1) + + # E[x] + mean_expected_layer = network.add_reduce(input_val, trt.ReduceOperation.AVG, axes, keep_dims=True) + mean_expected_layer.name = f"{name}_mean_expected" + # X-E[x] + sub_trt = add_binary_elementwise_layer( + network, input_val, mean_expected_layer.get_output(0), trt.ElementWiseOperation.SUB, f"{name}_sub" + ) + # Variance = mean(pow(x_sub_mean,2)) + pow_tensor = network.add_constant( + (1,) * len(input_val.shape), trt.Weights(np.ascontiguousarray([2.0], dtype=np.float32)) + ) + pow_tensor.name = f"{name}_power" + pow_var = add_binary_elementwise_layer( + network, sub_trt, pow_tensor.get_output(0), trt.ElementWiseOperation.POW, f"{name}_pow_var" + ) + mean_trt_layer = network.add_reduce(pow_var, trt.ReduceOperation.AVG, axes, keep_dims=True) + mean_trt_layer.name = f"{name}_mean" + # Variance + eps + eps_tensor = network.add_constant( + (1,) * len(input_val.shape), trt.Weights(np.ascontiguousarray([eps], dtype=np.float32)) + ) + eps_tensor.name = f"{name}_eps" + add_trt = add_binary_elementwise_layer( + network, mean_trt_layer.get_output(0), eps_tensor.get_output(0), trt.ElementWiseOperation.SUM, f"{name}_add" + ) + # SQRT((Var + eps)) + sqrt_trt = add_unary_layer(network, add_trt, trt.UnaryOperation.SQRT, f"{name}_sqrt") + # (x - E[x]) / sqrt((var + eps)) + div_trt = add_binary_elementwise_layer(network, sub_trt, sqrt_trt, trt.ElementWiseOperation.DIV, f"{name}_div_trt") + + gamma_tensor = network.add_constant(gamma.shape, trt.Weights(np.ascontiguousarray(gamma))) + gamma_tensor.name = f"{name}_gamma" + beta_tensor = network.add_constant(gamma.shape, trt.Weights(np.ascontiguousarray(beta))) + beta_tensor.name = f"{name}_beta" + # y * gamma + beta + scale_layer = add_binary_elementwise_layer( + network, div_trt, gamma_tensor.get_output(0), trt.ElementWiseOperation.PROD, f"{name}_scale" + ) + return add_binary_elementwise_layer( + network, scale_layer, beta_tensor.get_output(0), trt.ElementWiseOperation.SUM, name + ) @tensorrt_converter(acc_ops.softmax) def acc_ops_softmax(network, target, args, kwargs, name): From b95ce1591d56d545391ad5651f17ceb3b398a666 Mon Sep 17 00:00:00 2001 From: Victor Quach Date: Fri, 20 Aug 2021 11:07:22 -0700 Subject: [PATCH 099/530] Add docs describing saved tensor hooks (#62362) Summary: Add section to the Autograd mechanics docs to describe the recently exposed saved tensors (https://github.com/pytorch/pytorch/issues/52451), how to register packing / unpacking hooks (https://github.com/pytorch/pytorch/issues/60975) and how to use default hooks (https://github.com/pytorch/pytorch/issues/61834) Sister PR: https://github.com/pytorch/pytorch/issues/62361 (will add a link from autograd.rst to notes/autograd in whatever PR does not land first) Pull Request resolved: https://github.com/pytorch/pytorch/pull/62362 Reviewed By: soulitzer Differential Revision: D30453177 Pulled By: Varal7 fbshipit-source-id: f5759977b069ff0ef36a47b08856d297691a6caa --- docs/source/autograd.rst | 1 + docs/source/notes/autograd.rst | 199 +++++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+) diff --git a/docs/source/autograd.rst b/docs/source/autograd.rst index 6423d5d6d088c..8aace1ef12ab8 100644 --- a/docs/source/autograd.rst +++ b/docs/source/autograd.rst @@ -252,6 +252,7 @@ You can define how these saved tensors should be packed / unpacked using hooks. A common application is to trade compute for memory by saving those intermediary results to disk or to CPU instead of leaving them on the GPU. This is especially useful if you notice your model fits on GPU during evaluation, but not training. +Also see :ref:`saved-tensors-hooks-doc`. .. autoclass:: torch.autograd.graph.saved_tensors_hooks diff --git a/docs/source/notes/autograd.rst b/docs/source/notes/autograd.rst index 0c1eed3f42457..2a59d976e9a6a 100644 --- a/docs/source/notes/autograd.rst +++ b/docs/source/notes/autograd.rst @@ -36,6 +36,57 @@ flow statements, that can change the overall shape and size of the graph at every iteration. You don't have to encode all possible paths before you launch the training - what you run is what you differentiate. +.. _saved-tensors-doc: + +Saved tensors +^^^^^^^^^^^^^ + +Some operations need intermediary results to be saved during the forward pass +in order to execute the backward pass. For example, the function +:math:`x\mapsto x^2` saves the input :math:`x` to compute the gradient. + +When defining a custom Python :class:`~torch.autograd.Function`, you can use +:func:`~torch.autograd.function._ContextMethodMixin.save_for_backward` to save +tensors during the forward pass and +:attr:`~torch.autograd.function.Function.saved_tensors` to retrieve them +during the backward pass. See :doc:`/notes/extending` for more information. + +For operations that PyTorch defines (e.g. :func:`torch.pow`), tensors are +automatically saved as needed. You can explore (for educational or debugging +purposes) which tensors are saved by a certain ``grad_fn`` by looking for its +attributes starting with the prefix ``_saved``. + +.. code:: + + x = torch.randn(5, requires_grad=True) + y = x.pow(2) + print(x.equal(y.grad_fn._saved_self)) # True + print(x is y.grad_fn._saved_self) # True + + +In the previous code, ``y.grad_fn._saved_self`` refers to the same Tensor object as `x`. +But that may not always be the case. For instance: + +.. code:: + + x = torch.randn(5, requires_grad=True) + y = x.exp() + print(y.equal(y.grad_fn._saved_result)) # True + print(y is y.grad_fn._saved_result) # False + + +Under the hood, to prevent reference cycles, PyTorch has *packed* the tensor +upon saving and *unpacked* it into a different tensor for reading. Here, the +tensor you get from accessing ``y.grad_fn._saved_result`` is a different tensor +object than ``x`` (but they still share the same storage). + +Whether a tensor will be packed into a different tensor object depends on +whether it is an output of its own `grad_fn`, which is an implementation detail +subject to change and that users should not rely on. + +You can control how PyTorch does packing / unpacking with :ref:`saved-tensors-hooks-doc`. + + .. _locally-disable-grad-doc: Locally disabling gradient computation @@ -598,3 +649,151 @@ chain rule: .. math:: \frac{\partial L}{\partial z^*} = 2 * Re(grad\_out^* * \frac{\partial s}{\partial z^{*}}) + +.. _saved-tensors-hooks-doc: + +Hooks for saved tensors +----------------------- + +You can control :ref:`how saved tensors are packed / unpacked +` by defining a pair of ``pack_hook`` / ``unpack_hook`` +hooks. The ``pack_hook`` function should take a tensor as its single argument +but can return any python object (e.g. another tensor, a tuple, or even a +string containing a filename). The ``unpack_hook`` function takes as its single +argument the output of ``pack_hook`` and should return a tensor to be used in +the backward pass. The tensor returned by ``unpack_hook`` only needs to have +the same content as the tensor passed as input to ``pack_hook``. In particular, +any autograd-related metadata can be ignored as they will be overwritten during +unpacking. + +An example of such pair is: + +.. code:: + + class SelfDeletingTempFile(): + def __init__(self): + self.name = os.path.join(tmp_dir, str(uuid.uuid4())) + + def __del__(self): + os.remove(self.name) + + def pack_hook(tensor): + temp_file = SelfDeletingTempFile() + torch.save(tensor, temp_file.name) + return temp_file + + def unpack_hook(temp_file): + return torch.load(temp_file.name) + +Notice that the ``unpack_hook`` should not delete the temporary file because it +might be called multiple times: the temporary file should be alive for as long +as the returned `SelfDeletingTempFile` object is alive. In the above example, +we prevent leaking the temporary file by closing it when it is no longer needed +(on deletion of the `SelfDeletingTempFile` object). + +.. note:: + + We guarantee that ``pack_hook`` will only be called once but ``unpack_hook`` can + be called as many times as the backward pass requires it and we expect it to + return the same data each time. + +.. warning:: + + Performing inplace operations on the input of any of the functions is forbidden + as they may lead to unexpected side-effects. PyTorch will throw an error if the + input to a pack hook is modified inplace but does not catch the case where the + input to an unpack hook is modified inplace. + + +Registering hooks for a saved tensor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can register a pair of hooks on a saved tensor by calling the +:meth:`~torch.autograd.SavedTensor.register_hooks` method on a +:class:`SavedTensor` object. Those objects are exposed as attributes of a +``grad_fn`` and start with the ``_raw_saved_`` prefix. + +.. code:: + + x = torch.randn(5, requires_grad=True) + y = x.pow(2) + y.grad_fn._raw_saved_self.register_hooks(pack_hook, unpack_hook) + +The ``pack_hook`` method is called as soon as the pair is registered. +The ``unpack_hook`` method is called each time the saved tensor needs to be +accessed, either by means of ``y.grad_fn._saved_self`` or during the backward +pass. + +.. warning:: + + If you maintain a reference to a :class:`SavedTensor` after the saved + tensors have been released (i.e. after backward has been called), calling + its :meth:`~torch.autograd.SavedTensor.register_hooks` is forbidden. + PyTorch will throw an error most of the time but it may fail + to do so in some cases and undefined behavior may arise. + +Registering default hooks for saved tensors +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Alternatively, you can use the context-manager +:class:`~torch.autograd.graph.saved_tensors_hooks` to register a pair of +hooks which will be applied to *all* saved tensors that are created in +that context. + +Example: + +.. code:: + + # Only save on disk tensors that have size >= 1000 + SAVE_ON_DISK_THRESHOLD = 1000 + + def pack_hook(x): + if x.numel() < SAVE_ON_DISK_THRESHOLD: + return x + temp_file = SelfDeletingTempFile() + torch.save(tensor, temp_file.name) + return temp_file + + def unpack_hook(tensor_or_sctf): + if isinstance(tensor_or_sctf, torch.Tensor): + return tensor_or_sctf + return torch.load(tensor_or_sctf.name) + + class Model(nn.Module): + def forward(self, x): + with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook): + # ... compute output + output = x + return output + + model = Model() + net = nn.DataParallel(model) + + + +The hooks defined with this context manager are thread-local. +Hence, the following code will not produce the desired effects because the hooks do not go +through `DataParallel`. + +.. code:: + + # Example what NOT to do + + net = nn.DataParallel(model) + with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook): + output = net(input) + + +Note that using those hooks disables all the optimization in place to reduce +Tensor object creation. For example: + +.. code:: + + with torch.autograd.graph.saved_tensors_hooks(lambda x: x, lambda x: x): + x = torch.randn(5, requires_grad=True) + y = x * x + +Without the hooks, ``x``, ``y.grad_fn._saved_self`` and +``y.grad_fn._saved_other`` all refer to the same tensor object. +With the hooks, PyTorch will pack and unpack `x` into two new tensor objects +that share the same storage with the original `x` (no copy performed). From c78ab28441f5616899d363d57317f4de29147e5c Mon Sep 17 00:00:00 2001 From: Aaron Bockover Date: Fri, 20 Aug 2021 11:11:47 -0700 Subject: [PATCH 100/530] Add support for the ONNX Runtime Eager Mode backend (#58248) Summary: This PR implements the necessary hooks/stubs/enums/etc for complete ONNX Runtime (ORT) Eager Mode integration. The actual extension will live out of tree at https://github.com/pytorch/ort. We have been [working on this at Microsoft](https://github.com/microsoft/onnxruntime-pytorch/tree/eager-ort/torch_onnxruntime) for the last few months, and are finally ready to contribute the PyTorch core changes upstream (nothing major or exciting, just the usual boilerplate for adding new backends). The ORT backend will allow us to ferry [almost] all torch ops into granular ONNX kernels that ORT will eagerly execute against any devices it supports (therefore, we only need a single ORT backend from a PyTorch perspective). Pull Request resolved: https://github.com/pytorch/pytorch/pull/58248 Reviewed By: astaff Differential Revision: D30344992 Pulled By: albanD fbshipit-source-id: 69082b32121246340d686e16653626114b7714b2 --- aten/src/ATen/Context.h | 8 ++++ aten/src/ATen/Version.cpp | 4 ++ aten/src/ATen/core/aten_interned_strings.h | 1 + aten/src/ATen/core/op_registration/README.md | 4 +- aten/src/ATen/detail/ORTHooksInterface.cpp | 31 +++++++++++++++ aten/src/ATen/detail/ORTHooksInterface.h | 36 ++++++++++++++++++ aten/src/ATen/templates/TensorBody.h | 6 +++ aten/src/ATen/test/extension_backend_test.cpp | 23 ++++++----- c10/core/Backend.h | 18 ++++----- c10/core/Device.cpp | 4 +- c10/core/DeviceType.cpp | 6 +-- c10/core/DeviceType.h | 4 +- c10/core/DispatchKey.cpp | 4 +- c10/core/DispatchKey.h | 13 +++++-- c10/core/DispatchKeySet.cpp | 1 + c10/core/DispatchKeySet.h | 2 +- c10/core/TensorImpl.h | 4 ++ c10/core/TensorOptions.h | 10 ++--- caffe2/proto/caffe2.proto | 2 +- caffe2/proto/caffe2_pb2.pyi | 4 +- ...{msnpu_extension.cpp => ort_extension.cpp} | 38 +++++++++---------- test/cpp_extensions/setup.py | 2 +- test/test_cpp_extensions_aot.py | 38 +++++++++---------- test/test_gen_backend_stubs.py | 12 +++--- test/test_torch.py | 6 +-- tools/build_variables.bzl | 1 + tools/codegen/model.py | 2 +- tools/pyi/gen_pyi.py | 1 + torch/_C/_autograd.pyi | 2 +- torch/_tensor.py | 27 +++++-------- torch/_utils.py | 9 ++--- torch/csrc/Device.h | 4 +- torch/csrc/autograd/init.cpp | 2 +- torch/csrc/autograd/python_variable.cpp | 12 ++++++ torch/csrc/jit/frontend/sugared_value.cpp | 2 +- torch/csrc/jit/runtime/register_prim_ops.cpp | 8 ++++ torch/library.h | 4 +- torch/overrides.py | 1 + 38 files changed, 236 insertions(+), 120 deletions(-) create mode 100644 aten/src/ATen/detail/ORTHooksInterface.cpp create mode 100644 aten/src/ATen/detail/ORTHooksInterface.h rename test/cpp_extensions/{msnpu_extension.cpp => ort_extension.cpp} (78%) diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 26f1d11f92b48..4a45ac6f8ac18 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -79,6 +80,9 @@ class TORCH_API Context { static bool hasMLC() { return c10::impl::hasDeviceGuardImpl(at::DeviceType::MLC); } + static bool hasORT() { + return c10::impl::hasDeviceGuardImpl(at::DeviceType::ORT); + } // defined in header so that getNonVariableType has ability to inline // call_once check. getNonVariableType is called fairly frequently THCState* lazyInitCUDA() { @@ -292,6 +296,10 @@ static inline bool hasMLC() { return globalContext().hasMLC(); } +static inline bool hasORT() { + return globalContext().hasORT(); +} + // Despite its name, this function returns the number of *CUDA* GPUs. static inline size_t getNumGPUs() { // WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index 750c90bb4c59f..0c0ea61ceb3c2 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -184,6 +184,10 @@ std::string show_config() { ss << detail::getCUDAHooks().showConfig(); } + if (hasORT()) { + ss << detail::getORTHooks().showConfig(); + } + ss << " - Build settings: "; for (const auto& pair : caffe2::GetBuildOptions()) { if (!pair.second.empty()) { diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index 584e3db9ee193..abdf397544468 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -405,6 +405,7 @@ _(aten, is_complex) \ _(aten, is_contiguous) \ _(aten, is_cuda) \ _(aten, is_mlc) \ +_(aten, is_ort) \ _(aten, is_distributed) \ _(aten, is_floating_point) \ _(aten, is_inference) \ diff --git a/aten/src/ATen/core/op_registration/README.md b/aten/src/ATen/core/op_registration/README.md index edd9f911cd0e1..5605e962a6e5e 100644 --- a/aten/src/ATen/core/op_registration/README.md +++ b/aten/src/ATen/core/op_registration/README.md @@ -13,13 +13,13 @@ There’s four main use cases * You’re writing a new operator that isn’t supposed to be part of the public PyTorch API. * You’re writing a new operator but don’t want to change the core pytorch code base, say you’re developing a shared library with operators. * You’re writing a C++ extension for PyTorch or you’re using inline c++ in your .py model files. -* You’re writing a backend library like XLA or MSNPU that adds new kernels to all operators defined in `native_functions.yaml`. +* You’re writing a backend library like XLA or ORT that adds new kernels to all operators defined in `native_functions.yaml`. For these use cases, the custom operator API is the better solution. ### What is the price for using the custom operator API instead of `native_functions.yaml`? -If you’re just using the custom operator API to add new kernels for existing operators (e.g. the XLA/MSNPU example above), then you’re fine and don’t pay any price. If, however, you define a new operator purely using the custom op API, i.e. your operator never shows up in `native_functions.yaml`, then you need to be aware of a few caveats. +If you’re just using the custom operator API to add new kernels for existing operators (e.g. the XLA/ORT example above), then you’re fine and don’t pay any price. If, however, you define a new operator purely using the custom op API, i.e. your operator never shows up in `native_functions.yaml`, then you need to be aware of a few caveats. * It will not get a C++ API generated. There will not be `Tensor::your_op()` methods or `at::your_op()` functions to call your operator. * The API for calling the operator from Python looks a little bit different. It needs to be called through `torch.ops.your_op()` instead of `torch._C`. diff --git a/aten/src/ATen/detail/ORTHooksInterface.cpp b/aten/src/ATen/detail/ORTHooksInterface.cpp new file mode 100644 index 0000000000000..33f70935a04d0 --- /dev/null +++ b/aten/src/ATen/detail/ORTHooksInterface.cpp @@ -0,0 +1,31 @@ +#include + +#include + +#include +#include +#include + +namespace at { +namespace detail { + +// See getCUDAHooks for some more commentary +const ORTHooksInterface& getORTHooks() { + static std::unique_ptr ort_hooks; + static std::once_flag once; + std::call_once(once, [] { + ort_hooks = ORTHooksRegistry()->Create("ORTHooks", {}); + if (!ort_hooks) { + ort_hooks = + // NOLINTNEXTLINE(modernize-make-unique) + std::unique_ptr(new ORTHooksInterface()); + } + }); + return *ort_hooks; +} +} // namespace detail + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +C10_DEFINE_REGISTRY(ORTHooksRegistry, ORTHooksInterface, ORTHooksArgs) + +} // namespace at diff --git a/aten/src/ATen/detail/ORTHooksInterface.h b/aten/src/ATen/detail/ORTHooksInterface.h new file mode 100644 index 0000000000000..caee55cdfaf99 --- /dev/null +++ b/aten/src/ATen/detail/ORTHooksInterface.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +constexpr const char* ORT_HELP = + " You need to 'import torch_ort' to use the 'ort' device in PyTorch. " + "The 'torch_ort' module is provided by the ONNX Runtime itself " + "(https://onnxruntime.ai)."; + +// NB: Class must live in `at` due to limitations of Registry.h. +namespace at { + +struct TORCH_API ORTHooksInterface { + // This should never actually be implemented, but it is used to + // squelch -Werror=non-virtual-dtor + virtual ~ORTHooksInterface() {} + + virtual std::string showConfig() const { + TORCH_CHECK(false, "Cannot query detailed ORT version information.", ORT_HELP); + } +}; + +// NB: dummy argument to suppress "ISO C++11 requires at least one argument +// for the "..." in a variadic macro" +struct TORCH_API ORTHooksArgs {}; + +C10_DECLARE_REGISTRY(ORTHooksRegistry, ORTHooksInterface, ORTHooksArgs); +#define REGISTER_ORT_HOOKS(clsname) \ + C10_REGISTER_CLASS(ORTHooksRegistry, clsname, clsname) + +namespace detail { +TORCH_API const ORTHooksInterface& getORTHooks(); +} // namespace detail + +} // namespace at diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index be14980fb2d14..a6e6583c7b19c 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -492,6 +492,12 @@ class TORCH_API Tensor { return impl_->is_mlc(); } + /// Returns if a `Tensor` is ort tensor. + bool is_ort() const { + // NB: this is not a native function to avoid dispatching overhead. + return impl_->is_ort(); + } + /// Returns if a `Tensor` is vulkan tensor. bool is_vulkan() const { // NB: this is not a native function to avoid dispatching overhead. diff --git a/aten/src/ATen/test/extension_backend_test.cpp b/aten/src/ATen/test/extension_backend_test.cpp index 531507e96697e..9b215a90ae74a 100644 --- a/aten/src/ATen/test/extension_backend_test.cpp +++ b/aten/src/ATen/test/extension_backend_test.cpp @@ -6,6 +6,11 @@ #include +// NB. These tests use the ORT dispatch key to test backend dispatching +// machinery, but these tests are not specific to ORT at all. The ORT +// backend is fully out-of-tree, so it's safe to use this key for +// in-tree tests. + using namespace at; static int test_int; @@ -17,16 +22,16 @@ Tensor empty_override(IntArrayRef size, c10::optional dtype, c10::op Storage( Storage::use_byte_size_t(), 0, - at::DataPtr(nullptr, Device(DeviceType::MSNPU, 1)), + at::DataPtr(nullptr, Device(DeviceType::ORT, 1)), nullptr, false), - DispatchKey::MSNPU, + DispatchKey::ORT, caffe2::TypeMeta::Make()); return Tensor(std::move(tensor_impl)); } Tensor add_override(const Tensor & a, const Tensor & b , const Scalar& c) { - auto out = empty({5, 5}, at::kMSNPU); // Don't return self as-is + auto out = empty({5, 5}, at::kORT); // Don't return self as-is test_int = 2; return out; } @@ -42,28 +47,28 @@ Tensor empty_strided_override( return empty_override(size, dtype, layout, device, pin_memory, c10::nullopt); } -TORCH_LIBRARY_IMPL(aten, MSNPU, m) { +TORCH_LIBRARY_IMPL(aten, ORT, m) { m.impl("aten::empty.memory_format", empty_override); m.impl("aten::empty_strided", empty_strided_override); m.impl("aten::add.Tensor", add_override); } TEST(BackendExtensionTest, TestRegisterOp) { - Tensor a = empty({5, 5}, at::kMSNPU); - ASSERT_EQ(a.device().type(), at::kMSNPU); + Tensor a = empty({5, 5}, at::kORT); + ASSERT_EQ(a.device().type(), at::kORT); ASSERT_EQ(a.device().index(), 1); ASSERT_EQ(a.dtype(), caffe2::TypeMeta::Make()); ASSERT_EQ(test_int, 1); - Tensor b = empty_like(a, at::kMSNPU); - ASSERT_EQ(b.device().type(), at::kMSNPU); + Tensor b = empty_like(a, at::kORT); + ASSERT_EQ(b.device().type(), at::kORT); ASSERT_EQ(b.device().index(), 1); ASSERT_EQ(b.dtype(), caffe2::TypeMeta::Make()); add(a, b); ASSERT_EQ(test_int, 2); - // Ensure that non-MSNPU operator still works + // Ensure that non-ORT operator still works Tensor d = empty({5, 5}, at::kCPU); ASSERT_EQ(d.device().type(), at::kCPU); } diff --git a/c10/core/Backend.h b/c10/core/Backend.h index 2f071345311f2..e17a1bc4226c6 100644 --- a/c10/core/Backend.h +++ b/c10/core/Backend.h @@ -40,7 +40,7 @@ enum class Backend { SparseHIP, SparseVE, SparseXPU, - MSNPU, + ORT, XLA, Vulkan, Metal, @@ -66,8 +66,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) { return Backend::VE; } else if (t == DispatchKey::FPGA) { return Backend::FPGA; - } else if (t == DispatchKey::MSNPU) { - return Backend::MSNPU; + } else if (t == DispatchKey::ORT) { + return Backend::ORT; } else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) { return Backend::XLA; } else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) { @@ -123,8 +123,8 @@ static inline DispatchKey backendToDispatchKey(Backend b) { return DispatchKey::VE; case Backend::FPGA: return DispatchKey::FPGA; - case Backend::MSNPU: - return DispatchKey::MSNPU; + case Backend::ORT: + return DispatchKey::ORT; case Backend::XLA: return DispatchKey::XLA; case Backend::Lazy: @@ -178,8 +178,8 @@ static inline DeviceType backendToDeviceType(Backend b) { return DeviceType::VE; case Backend::FPGA: return DeviceType::FPGA; - case Backend::MSNPU: - return DeviceType::MSNPU; + case Backend::ORT: + return DeviceType::ORT; case Backend::XLA: return DeviceType::XLA; case Backend::Lazy: @@ -235,8 +235,8 @@ static inline const char* toString(Backend b) { return "FPGA"; case Backend::XPU: return "XPU"; - case Backend::MSNPU: - return "MSNPU"; + case Backend::ORT: + return "ORT"; case Backend::XLA: return "XLA"; case Backend::Lazy: diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp index 2709c29ce8460..2531e3942271a 100644 --- a/c10/core/Device.cpp +++ b/c10/core/Device.cpp @@ -28,7 +28,7 @@ DeviceType parse_type(const std::string& device_string) { {"hip", DeviceType::HIP}, {"ve", DeviceType::VE}, {"fpga", DeviceType::FPGA}, - {"msnpu", DeviceType::MSNPU}, + {"ort", DeviceType::ORT}, {"xla", DeviceType::XLA}, {"lazy", DeviceType::Lazy}, {"vulkan", DeviceType::Vulkan}, @@ -47,7 +47,7 @@ DeviceType parse_type(const std::string& device_string) { } TORCH_CHECK( false, - "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ", + "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, ort, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ", device_string); } enum DeviceStringParsingState { START, INDEX_START, INDEX_REST, ERROR }; diff --git a/c10/core/DeviceType.cpp b/c10/core/DeviceType.cpp index 4ff939806f980..4635acdb148c2 100644 --- a/c10/core/DeviceType.cpp +++ b/c10/core/DeviceType.cpp @@ -25,8 +25,8 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) { return lower_case ? "ve" : "VE"; case DeviceType::FPGA: return lower_case ? "fpga" : "FPGA"; - case DeviceType::MSNPU: - return lower_case ? "msnpu" : "MSNPU"; + case DeviceType::ORT: + return lower_case ? "ort" : "ORT"; case DeviceType::XLA: return lower_case ? "xla" : "XLA"; case DeviceType::Lazy: @@ -75,7 +75,7 @@ bool isValidDeviceType(DeviceType d) { case DeviceType::HIP: case DeviceType::VE: case DeviceType::FPGA: - case DeviceType::MSNPU: + case DeviceType::ORT: case DeviceType::XLA: case DeviceType::Lazy: case DeviceType::MLC: diff --git a/c10/core/DeviceType.h b/c10/core/DeviceType.h index 2ae028d144026..c6bd56914d6d1 100644 --- a/c10/core/DeviceType.h +++ b/c10/core/DeviceType.h @@ -21,7 +21,7 @@ enum class DeviceType : int8_t { IDEEP = 5, // IDEEP. HIP = 6, // AMD HIP FPGA = 7, // FPGA - MSNPU = 8, // MSNPU + ORT = 8, // ONNX Runtime / Microsoft XLA = 9, // XLA / TPU Vulkan = 10, // Vulkan Metal = 11, // Metal @@ -42,7 +42,7 @@ constexpr DeviceType kCPU = DeviceType::CPU; constexpr DeviceType kCUDA = DeviceType::CUDA; constexpr DeviceType kHIP = DeviceType::HIP; constexpr DeviceType kFPGA = DeviceType::FPGA; -constexpr DeviceType kMSNPU = DeviceType::MSNPU; +constexpr DeviceType kORT = DeviceType::ORT; constexpr DeviceType kXLA = DeviceType::XLA; constexpr DeviceType kMLC = DeviceType::MLC; constexpr DeviceType kMeta = DeviceType::Meta; diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp index 5c414484b38fd..18aa4fc32fb64 100644 --- a/c10/core/DispatchKey.cpp +++ b/c10/core/DispatchKey.cpp @@ -19,8 +19,8 @@ const char* toString(DispatchKey t) { return "FPGA"; case DispatchKey::XPU: return "XPU"; - case DispatchKey::MSNPU: - return "MSNPU"; + case DispatchKey::ORT: + return "ORT"; case DispatchKey::XLA: return "XLA"; case DispatchKey::Lazy: diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h index 5b20a1ca327df..07222b79ee964 100644 --- a/c10/core/DispatchKey.h +++ b/c10/core/DispatchKey.h @@ -59,8 +59,15 @@ enum class DispatchKey : uint8_t { // CUDA] FPGA, // Xilinx support lives out of tree at // https://gitlab.com/pytorch-complex/vitis_kernels - MSNPU, // unused externally, but tested at - // test/cpp_extensions/msnpu_extension.cpp + + // ONNX Runtime, lives out of tree at https://github.com/pytorch/ort and + // https://github.com/microsoft/onnxruntime, and is also used to test general + // backend/extension machinery in the core. cf: + // - test/cpp_extensions/ort_extension.cpp + // - test/test_torch.py + // - aten/src/ATen/test/extension_backend_test.cpp + ORT, + XLA, // lives out of tree at https://github.com/pytorch/xla MLC, // lives out of tree at https://github.com/pytorch/MLCompute Vulkan, @@ -114,7 +121,7 @@ enum class DispatchKey : uint8_t { // Here are reserved backends for user-defined backends, see Note [Private use // DispatchKey] - // To see some example about how to use this, check out MSNPU + // To see some example about how to use this, check out ORT PrivateUse1, PrivateUse2, PrivateUse3, diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp index b796114d4a608..404acc7cb1db3 100644 --- a/c10/core/DispatchKeySet.cpp +++ b/c10/core/DispatchKeySet.cpp @@ -19,6 +19,7 @@ constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends | DispatchKey::PrivateUse3, DispatchKey::MLC, DispatchKey::HPU, + DispatchKey::ORT, DispatchKey::Meta, }); diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 0d3a25ea9d8d1..b1f5f04524d19 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -248,7 +248,7 @@ constexpr DispatchKeySet autogradother_backends = DispatchKeySet( {DispatchKey::HIP, DispatchKey::VE, DispatchKey::FPGA, - DispatchKey::MSNPU, + DispatchKey::ORT, DispatchKey::Vulkan, DispatchKey::Metal, DispatchKey::QuantizedCPU, diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 65d7af38e3599..7051e36b35516 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -873,6 +873,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return key_set_.has(DispatchKey::MLC); } + bool is_ort() const { + return key_set_.has(DispatchKey::ORT); + } + // TODO: remove this once we don't automatically enabled Autograd dispatch // keys // in TensorImpl constructor. diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index fff9433e270f7..287b2fa41b2a3 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -663,8 +663,8 @@ inline DispatchKey computeDispatchKey( return DispatchKey::VE; case DeviceType::FPGA: return DispatchKey::FPGA; - case DeviceType::MSNPU: - return DispatchKey::MSNPU; + case DeviceType::ORT: + return DispatchKey::ORT; case DeviceType::XLA: return DispatchKey::XLA; case DeviceType::Lazy: @@ -790,10 +790,8 @@ inline DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) { case DispatchKey::HPU: case DispatchKey::AutogradHPU: return DeviceType::HPU; - - // stuff that isn't real - case DispatchKey::MSNPU: - return DeviceType::MSNPU; + case DispatchKey::ORT: + return DeviceType::ORT; default: TORCH_CHECK( false, diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index 6e055778578ab..90a2020195f60 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -219,7 +219,7 @@ enum DeviceTypeProto { PROTO_IDEEP = 5; // IDEEP. PROTO_HIP = 6; // AMD HIP PROTO_FPGA = 7; // FPGA - PROTO_MSNPU = 8; // MSNPU + PROTO_ORT = 8; // ONNX Runtime PROTO_XLA = 9; // XLA / TPU PROTO_MLC = 10; // ML Compute // Change the following number if you add more devices in the code. diff --git a/caffe2/proto/caffe2_pb2.pyi b/caffe2/proto/caffe2_pb2.pyi index 1258664bee165..f7f4430d7b761 100644 --- a/caffe2/proto/caffe2_pb2.pyi +++ b/caffe2/proto/caffe2_pb2.pyi @@ -23,7 +23,7 @@ class _DeviceTypeProto(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapp PROTO_IDEEP = DeviceTypeProto.V(5) PROTO_HIP = DeviceTypeProto.V(6) PROTO_FPGA = DeviceTypeProto.V(7) - PROTO_MSNPU = DeviceTypeProto.V(8) + PROTO_ORT = DeviceTypeProto.V(8) PROTO_XLA = DeviceTypeProto.V(9) PROTO_MLC = DeviceTypeProto.V(10) PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = DeviceTypeProto.V(11) @@ -37,7 +37,7 @@ PROTO_OPENCL = DeviceTypeProto.V(4) PROTO_IDEEP = DeviceTypeProto.V(5) PROTO_HIP = DeviceTypeProto.V(6) PROTO_FPGA = DeviceTypeProto.V(7) -PROTO_MSNPU = DeviceTypeProto.V(8) +PROTO_ORT = DeviceTypeProto.V(8) PROTO_XLA = DeviceTypeProto.V(9) PROTO_MLC = DeviceTypeProto.V(10) PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = DeviceTypeProto.V(11) diff --git a/test/cpp_extensions/msnpu_extension.cpp b/test/cpp_extensions/ort_extension.cpp similarity index 78% rename from test/cpp_extensions/msnpu_extension.cpp rename to test/cpp_extensions/ort_extension.cpp index e47347c40fbfa..b646f3b14939d 100644 --- a/test/cpp_extensions/msnpu_extension.cpp +++ b/test/cpp_extensions/ort_extension.cpp @@ -10,10 +10,10 @@ Tensor get_tensor(caffe2::TypeMeta dtype, IntArrayRef size) { Storage( Storage::use_byte_size_t(), 0, - at::DataPtr(nullptr, Device(DeviceType::MSNPU, 0)), + at::DataPtr(nullptr, Device(DeviceType::ORT, 0)), nullptr, false), - DispatchKey::MSNPU, + DispatchKey::ORT, dtype); // This is a hack to workaround the shape checks in _convolution. tensor_impl->set_sizes_contiguous(size); @@ -52,7 +52,7 @@ std::tuple fake_convolution_backward( get_tensor(input.dtype(), {})); } -TORCH_LIBRARY_IMPL(aten, MSNPU, m) { +TORCH_LIBRARY_IMPL(aten, ORT, m) { m.impl("empty.memory_format", empty_override); m.impl("add.out", add_out_override); m.impl("convolution_overrideable", fake_convolution); @@ -61,34 +61,34 @@ TORCH_LIBRARY_IMPL(aten, MSNPU, m) { // TODO: Extend this to exercise multi-device setting. In that case, // we need to add a thread local variable to track the current device. -struct MSNPUGuardImpl final : public c10::impl::DeviceGuardImplInterface { - static constexpr DeviceType static_type = DeviceType::MSNPU; - MSNPUGuardImpl() {} - MSNPUGuardImpl(DeviceType t) { - AT_ASSERT(t == DeviceType::MSNPU); +struct ORTGuardImpl final : public c10::impl::DeviceGuardImplInterface { + static constexpr DeviceType static_type = DeviceType::ORT; + ORTGuardImpl() {} + ORTGuardImpl(DeviceType t) { + AT_ASSERT(t == DeviceType::ORT); } DeviceType type() const override { - return DeviceType::MSNPU; + return DeviceType::ORT; } Device exchangeDevice(Device d) const override { - AT_ASSERT(d.type() == DeviceType::MSNPU); + AT_ASSERT(d.type() == DeviceType::ORT); AT_ASSERT(d.index() == 0); return d; } Device getDevice() const override { - return Device(DeviceType::MSNPU, 0); + return Device(DeviceType::ORT, 0); } void setDevice(Device d) const override { - AT_ASSERT(d.type() == DeviceType::MSNPU); + AT_ASSERT(d.type() == DeviceType::ORT); AT_ASSERT(d.index() == 0); } void uncheckedSetDevice(Device d) const noexcept override { } Stream getStream(Device d) const noexcept override { - return Stream(Stream::DEFAULT, Device(DeviceType::MSNPU, 0)); + return Stream(Stream::DEFAULT, Device(DeviceType::ORT, 0)); } Stream exchangeStream(Stream s) const noexcept override { - return Stream(Stream::DEFAULT, Device(DeviceType::MSNPU, 0)); + return Stream(Stream::DEFAULT, Device(DeviceType::ORT, 0)); } DeviceIndex deviceCount() const noexcept override { return 1; @@ -99,23 +99,23 @@ struct MSNPUGuardImpl final : public c10::impl::DeviceGuardImplInterface { const Stream& stream, const DeviceIndex device_index, const EventFlag flag) const override { - TORCH_CHECK(false, "MSNPU backend doesn't support events."); + TORCH_CHECK(false, "ORT backend doesn't support events."); } void block( void* event, const Stream& stream) const override { - TORCH_CHECK(false, "MSNPU backend doesn't support events."); + TORCH_CHECK(false, "ORT backend doesn't support events."); } bool queryEvent(void* event) const override { - TORCH_CHECK(false, "MSNPU backend doesn't support events."); + TORCH_CHECK(false, "ORT backend doesn't support events."); } void destroyEvent( void* event, const DeviceIndex device_index) const noexcept override { } }; -constexpr DeviceType MSNPUGuardImpl::static_type; -C10_REGISTER_GUARD_IMPL(MSNPU, MSNPUGuardImpl); +constexpr DeviceType ORTGuardImpl::static_type; +C10_REGISTER_GUARD_IMPL(ORT, ORTGuardImpl); int get_test_int() { return test_int; diff --git a/test/cpp_extensions/setup.py b/test/cpp_extensions/setup.py index 8f77938ae3226..7888d0e3a88bb 100644 --- a/test/cpp_extensions/setup.py +++ b/test/cpp_extensions/setup.py @@ -21,7 +21,7 @@ 'torch_test_cpp_extension.cpp', ['extension.cpp'], extra_compile_args=CXX_FLAGS), CppExtension( - 'torch_test_cpp_extension.msnpu', ['msnpu_extension.cpp'], + 'torch_test_cpp_extension.ort', ['ort_extension.cpp'], extra_compile_args=CXX_FLAGS), CppExtension( 'torch_test_cpp_extension.rng', ['rng_extension.cpp'], diff --git a/test/test_cpp_extensions_aot.py b/test/test_cpp_extensions_aot.py index 307df0eed5e9a..cf35e6b13265d 100644 --- a/test/test_cpp_extensions_aot.py +++ b/test/test_cpp_extensions_aot.py @@ -19,11 +19,11 @@ try: if HAS_PYTEST: cpp_extension = pytest.importorskip("torch_test_cpp_extension.cpp") - msnpu_extension = pytest.importorskip("torch_test_cpp_extension.msnpu") + ort_extension = pytest.importorskip("torch_test_cpp_extension.ort") rng_extension = pytest.importorskip("torch_test_cpp_extension.rng") else: import torch_test_cpp_extension.cpp as cpp_extension - import torch_test_cpp_extension.msnpu as msnpu_extension + import torch_test_cpp_extension.ort as ort_extension import torch_test_cpp_extension.rng as rng_extension except ImportError as e: raise RuntimeError( @@ -100,45 +100,45 @@ def test_optional(self): self.assertFalse(has_value) -class TestMSNPUTensor(common.TestCase): +class TestORTTensor(common.TestCase): def test_unregistered(self): a = torch.arange(0, 10, device='cpu') with self.assertRaisesRegex(RuntimeError, "Could not run"): - b = torch.arange(0, 10, device='msnpu') + b = torch.arange(0, 10, device='ort') def test_zeros(self): a = torch.empty(5, 5, device='cpu') self.assertEqual(a.device, torch.device('cpu')) - b = torch.empty(5, 5, device='msnpu') - self.assertEqual(b.device, torch.device('msnpu', 0)) - self.assertEqual(msnpu_extension.get_test_int(), 0) + b = torch.empty(5, 5, device='ort') + self.assertEqual(b.device, torch.device('ort', 0)) + self.assertEqual(ort_extension.get_test_int(), 0) self.assertEqual(torch.get_default_dtype(), b.dtype) - c = torch.empty((5, 5), dtype=torch.int64, device='msnpu') - self.assertEqual(msnpu_extension.get_test_int(), 0) + c = torch.empty((5, 5), dtype=torch.int64, device='ort') + self.assertEqual(ort_extension.get_test_int(), 0) self.assertEqual(torch.int64, c.dtype) def test_add(self): - a = torch.empty(5, 5, device='msnpu', requires_grad=True) - self.assertEqual(msnpu_extension.get_test_int(), 0) + a = torch.empty(5, 5, device='ort', requires_grad=True) + self.assertEqual(ort_extension.get_test_int(), 0) - b = torch.empty(5, 5, device='msnpu') - self.assertEqual(msnpu_extension.get_test_int(), 0) + b = torch.empty(5, 5, device='ort') + self.assertEqual(ort_extension.get_test_int(), 0) c = a + b - self.assertEqual(msnpu_extension.get_test_int(), 1) + self.assertEqual(ort_extension.get_test_int(), 1) def test_conv_backend_override(self): # To simplify tests, we use 4d input here to avoid doing view4d( which # needs more overrides) in _convolution. - input = torch.empty(2, 4, 10, 2, device='msnpu', requires_grad=True) - weight = torch.empty(6, 4, 2, 2, device='msnpu', requires_grad=True) - bias = torch.empty(6, device='msnpu') + input = torch.empty(2, 4, 10, 2, device='ort', requires_grad=True) + weight = torch.empty(6, 4, 2, 2, device='ort', requires_grad=True) + bias = torch.empty(6, device='ort') # Make sure forward is overriden out = torch.nn.functional.conv1d(input, weight, bias, 2, 0, 1, 1) - self.assertEqual(msnpu_extension.get_test_int(), 2) + self.assertEqual(ort_extension.get_test_int(), 2) self.assertEqual(out.shape[0], input.shape[0]) self.assertEqual(out.shape[1], weight.shape[0]) @@ -146,7 +146,7 @@ def test_conv_backend_override(self): # Double backward is dispatched to _convolution_double_backward. # It is not tested here as it involves more computation/overrides. grad = torch.autograd.grad(out, input, out, create_graph=True) - self.assertEqual(msnpu_extension.get_test_int(), 3) + self.assertEqual(ort_extension.get_test_int(), 3) self.assertEqual(grad[0].shape, input.shape) diff --git a/test/test_gen_backend_stubs.py b/test/test_gen_backend_stubs.py index e1a66c69fe6f5..f788a8f34c761 100644 --- a/test/test_gen_backend_stubs.py +++ b/test/test_gen_backend_stubs.py @@ -138,11 +138,11 @@ def test_supported_invalid_op(self): self.assertExpectedInline(output_error, '''Found an invalid operator name: abs_BAD''') # The backend is valid, but doesn't have a valid autograd key. They can't override autograd kernels in that case. - # Only using MSNPU here because it has a valid backend key but not an autograd key- if this changes we can update the test. + # Only using Vulkan here because it has a valid backend key but not an autograd key- if this changes we can update the test. def test_backend_has_no_autograd_key_but_provides_entries(self): yaml_str = '''\ -backend: MSNPU -cpp_namespace: torch_msnpu +backend: Vulkan +cpp_namespace: torch_vulkan supported: - add autograd: @@ -155,7 +155,7 @@ def test_backend_has_no_autograd_key_but_provides_entries(self): def test_backend_autograd_kernel_mismatch_out_functional(self): yaml_str = '''\ backend: XLA -cpp_namespace: torch_msnpu +cpp_namespace: torch_xla supported: - add.Tensor autograd: @@ -168,7 +168,7 @@ def test_backend_autograd_kernel_mismatch_out_functional(self): def test_backend_autograd_kernel_mismatch_functional_inplace(self): yaml_str = '''\ backend: XLA -cpp_namespace: torch_msnpu +cpp_namespace: torch_xla supported: - add.Tensor autograd: @@ -182,7 +182,7 @@ def test_backend_autograd_kernel_mismatch_functional_inplace(self): def test_op_appears_in_supported_and_autograd_lists(self): yaml_str = '''\ backend: XLA -cpp_namespace: torch_msnpu +cpp_namespace: torch_xla supported: - add.Tensor autograd: diff --git a/test/test_torch.py b/test/test_torch.py index 515052ae5ad67..d0f631a2eab52 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -221,10 +221,10 @@ def test_namespace(ns, *skips): # TODO: add torch.* tests when we have proper namespacing on ATen functions # test_namespace(torch) - def test_msnpu_error(self): + def test_ort_error(self): with self.assertRaisesRegex(RuntimeError, - "Could not run 'aten::empty.memory_format' with arguments from the 'MSNPU' backend"): - torch.zeros(1, device=torch.device('msnpu')) + "Could not run 'aten::empty.memory_format' with arguments from the 'ORT' backend"): + torch.zeros(1, device=torch.device('ort')) def test_has_storage(self): self.assertIsNotNone(torch.tensor([]).storage()) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 89697b4428ca1..e20d97333c83e 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -829,6 +829,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/detail/CPUGuardImpl.cpp", "aten/src/ATen/detail/CUDAHooksInterface.cpp", "aten/src/ATen/detail/HIPHooksInterface.cpp", + "aten/src/ATen/detail/ORTHooksInterface.cpp", "aten/src/ATen/metal/Context.cpp", "aten/src/ATen/native/AutogradComposite.cpp", "aten/src/ATen/native/BatchLinearAlgebraKernel.cpp", diff --git a/tools/codegen/model.py b/tools/codegen/model.py index d6f02d5a6898d..4f82b70ee31f2 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -56,7 +56,7 @@ class DispatchKey(Enum): CUDA = auto() HIP = auto() FPGA = auto() - MSNPU = auto() + ORT = auto() XLA = auto() Lazy = auto() Vulkan = auto() diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 4f39fec2188fc..882b7f114e2e3 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -469,6 +469,7 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - 'is_sparse_csr' : ['is_sparse_csr: _bool'], 'is_quantized': ['is_quantized: _bool'], 'is_meta': ['is_meta: _bool'], + 'is_ort': ['is_ort: _bool'], 'is_mkldnn': ['is_mkldnn: _bool'], 'is_vulkan': ['is_vulkan: _bool'], 'storage_offset': ['def storage_offset(self) -> _int: ...'], diff --git a/torch/_C/_autograd.pyi b/torch/_C/_autograd.pyi index 6468eb551f9cd..7ffb618e3f072 100644 --- a/torch/_C/_autograd.pyi +++ b/torch/_C/_autograd.pyi @@ -24,7 +24,7 @@ class DeviceType(Enum): IDEEP = ... HIP = ... FPGA = ... - MSNPU = ... + ORT = ... XLA = ... MLC = ... HPU = ... diff --git a/torch/_tensor.py b/torch/_tensor.py index 2bd617d3971a9..b4cee9aa2a32c 100644 --- a/torch/_tensor.py +++ b/torch/_tensor.py @@ -90,7 +90,7 @@ def __deepcopy__(self, memo): # does accurate alias tracking; however, the code below # doesn't work because of # https://github.com/pytorch/pytorch/issues/47442 - if self.is_sparse or self.device.type in ['xla', 'mlc', 'meta']: + if self.is_sparse or self.device.type in ['xla', 'mlc', 'ort', 'meta']: new_tensor = self.clone() else: new_storage = self.storage().__deepcopy__(memo) @@ -153,28 +153,21 @@ def _reduce_ex_internal(self, proto): # See Note [Don't serialize hooks] torch.utils.hooks.warn_if_has_hooks(self) backward_hooks: Dict[Any, Any] = OrderedDict() - # Note: Numpy array is chosen to be the rebuild component for XLA Tensor. + # Note: Numpy array is chosen to be the rebuild component for XLA, ORT, MLC Tensors. # We considered a few options: # 1. CPU tensor can't be used here. # Otherwise in torch.load CPU storage is reconstructed with randomly - # initialized data, moved onto XLA device, and then storage is updated - # to the serialized content. This works perfectly for CPU/CUDA but not XLA. - # XLA tensor is disconnected with storage so it doesn't get the update. + # initialized data, moved onto backend device, and then storage is updated + # to the serialized content. This works perfectly for CPU/CUDA but not these backends; + # their tensors are disconnected with storage so they don't get the update. # 2. Python list is not a good fit due to performance reason. # `tolist()` converts every single element in the tensor into python objects # and serialize them one by one. - if self.device.type == 'xla': - arg_xla = (self.cpu().numpy(), - self.dtype, - str(self.device), - self.requires_grad) - return (torch._utils._rebuild_xla_tensor, arg_xla) - if self.device.type == 'mlc': - arg_mlc = (self.cpu().numpy(), - self.dtype, - str(self.device), - self.requires_grad) - return (torch._utils._rebuild_mlc_tensor, arg_mlc) + if self.device.type in ['xla', 'ort', 'mlc']: + return (torch._utils._rebuild_device_tensor_from_numpy, (self.cpu().numpy(), + self.dtype, + str(self.device), + self.requires_grad)) if self.device.type == 'meta': # NB: This implementation BREAKS storage sharing. Current # hypothesis is that no one cares for meta tensors. diff --git a/torch/_utils.py b/torch/_utils.py index 210b0cde793a6..75e9075e4250f 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -173,16 +173,15 @@ def _rebuild_sparse_tensor(layout, data): raise NotImplementedError("rebuilding sparse tensor for layout %s" % (layout)) -def _rebuild_xla_tensor(data, dtype, device, requires_grad): +def _rebuild_device_tensor_from_numpy(data, dtype, device, requires_grad): tensor = torch.from_numpy(data).to(dtype=dtype, device=device) tensor.requires_grad = requires_grad return tensor -def _rebuild_mlc_tensor(data, dtype, device, requires_grad): - tensor = torch.from_numpy(data).to(dtype=dtype, device=device) - tensor.requires_grad = requires_grad - return tensor +# Should not be used, only here to be able to load Tensors serialized with older versions of pytorch +_rebuild_xla_tensor = _rebuild_device_tensor_from_numpy +_rebuild_mlc_tensor = _rebuild_device_tensor_from_numpy def _rebuild_meta_tensor_no_storage(dtype, size, stride, requires_grad): diff --git a/torch/csrc/Device.h b/torch/csrc/Device.h index b1f18dcebd1ab..32868120c06a1 100644 --- a/torch/csrc/Device.h +++ b/torch/csrc/Device.h @@ -17,6 +17,6 @@ inline bool THPDevice_Check(PyObject *obj) { return Py_TYPE(obj) == &THPDeviceType; } -PyObject * THPDevice_New(const at::Device& device); +TORCH_API PyObject * THPDevice_New(const at::Device& device); -void THPDevice_init(PyObject *module); +TORCH_API void THPDevice_init(PyObject *module); diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 2eacbf1cd3839..697ca871f83c5 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -114,7 +114,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .value("IDEEP", c10::DeviceType::IDEEP) .value("HIP", c10::DeviceType::HIP) .value("FPGA", c10::DeviceType::FPGA) - .value("MSNPU", c10::DeviceType::MSNPU) + .value("ORT", c10::DeviceType::ORT) .value("XLA", c10::DeviceType::XLA) .value("Lazy", c10::DeviceType::Lazy) .value("MLC", c10::DeviceType::MLC) diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 303584603aaa0..50d6eb9ab7e05 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -834,6 +834,17 @@ PyObject *THPVariable_is_mlc(THPVariable *self, void *unused) END_HANDLE_TH_ERRORS } +PyObject *THPVariable_is_ort(THPVariable *self, void *unused) +{ + HANDLE_TH_ERRORS + if (check_has_torch_function((PyObject *)self)) { + return handle_torch_function_getter(self, "is_ort"); + } + auto& self_ = THPVariable_Unpack(self); + return torch::autograd::utils::wrap(self_.is_ort()); + END_HANDLE_TH_ERRORS +} + PyObject *THPVariable_is_vulkan(THPVariable *self, void *unused) { HANDLE_TH_ERRORS @@ -980,6 +991,7 @@ static struct PyGetSetDef THPVariable_properties[] = { {"is_sparse_csr", (getter)THPVariable_is_sparse_csr, nullptr, nullptr, nullptr}, {"is_mkldnn", (getter)THPVariable_is_mkldnn, nullptr, nullptr, nullptr}, {"is_mlc", (getter)THPVariable_is_mlc, nullptr, nullptr, nullptr}, + {"is_ort", (getter)THPVariable_is_ort, nullptr, nullptr, nullptr}, {"is_vulkan", (getter)THPVariable_is_vulkan, nullptr, nullptr, nullptr}, {"is_complex", (getter)THPVariable_is_complex, nullptr, nullptr, nullptr}, {"is_quantized", (getter)THPVariable_is_quantized, nullptr, nullptr, nullptr}, diff --git a/torch/csrc/jit/frontend/sugared_value.cpp b/torch/csrc/jit/frontend/sugared_value.cpp index ab70d6c6f326a..a5f000769badc 100644 --- a/torch/csrc/jit/frontend/sugared_value.cpp +++ b/torch/csrc/jit/frontend/sugared_value.cpp @@ -119,7 +119,7 @@ std::shared_ptr SimpleValue::attr( {"layout", "prim"}, {"T", "prim"}, {"ndim", "prim"}, {"name", "prim"}, {"real", "aten"}, {"imag", "aten"}, - {"retains_grad", "aten"}, + {"retains_grad", "aten"}, {"is_ort", "prim"}, }}, {TypeKind::DeviceObjType, {{"type", "prim"}, {"index", "prim"}}}}; auto kind = value_->type()->kind(); diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index a61cb48b1ddce..984073fbf72c1 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -2211,6 +2211,14 @@ RegisterOperators reg1( push(stack, a.is_meta()); }, aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("prim::is_ort(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_ort()); + }, + aliasAnalysisFromSchema()), OperatorGenerator( TORCH_SELECTIVE_SCHEMA("prim::name(Tensor a) -> str?"), [](Stack* stack) { diff --git a/torch/library.h b/torch/library.h index ce2bb92e5723e..a873b4226dbca 100644 --- a/torch/library.h +++ b/torch/library.h @@ -317,8 +317,8 @@ inline CppFunction dispatch(c10::DeviceType type, Func&& raw_f) { return c10::DispatchKey::Meta; case c10::DeviceType::HIP: return c10::DispatchKey::HIP; - case c10::DeviceType::MSNPU: - return c10::DispatchKey::MSNPU; + case c10::DeviceType::ORT: + return c10::DispatchKey::ORT; case c10::DeviceType::HPU: return c10::DispatchKey::HPU; default: diff --git a/torch/overrides.py b/torch/overrides.py index 5a0ea6ca81737..09748b982b428 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -1030,6 +1030,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: Tensor.retains_grad.__get__: lambda self: -1, Tensor.is_meta.__get__: lambda self: -1, Tensor.is_mlc.__get__: lambda self: -1, + Tensor.is_ort.__get__: lambda self: -1, Tensor.is_mkldnn.__get__: lambda self: -1, Tensor.is_quantized.__get__: lambda self: -1, Tensor.is_sparse.__get__: lambda self: -1, From d6d86efb1c839ddafd1398d6dab9caa4f31a9f0b Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Fri, 20 Aug 2021 11:11:49 -0700 Subject: [PATCH 101/530] [nnc] Support thread level parallelism in fused kernels (#63386) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63386 Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D30360382 Pulled By: bertmaher fbshipit-source-id: 29acf4e932c669ce0f35823faea9099bcd8119b6 --- test/cpp/tensorexpr/test_kernel.cpp | 30 ++++++++ torch/csrc/jit/tensorexpr/kernel.cpp | 87 ++++++++++++++++++++++ torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 24 ++++-- torch/csrc/jit/tensorexpr/llvm_jit.h | 8 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 7 ++ 5 files changed, 148 insertions(+), 8 deletions(-) diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 8f36f54395f49..8d4e48c4a0bff 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -206,6 +206,36 @@ TEST_F(Kernel, _3) { } } +TEST_F(Kernel, ParallelStrided) { + KernelScope kernel_scope; + + const auto graph_string = R"IR( + graph(%0 : Float(5, 3, 40005, strides=[120015, 40005, 1], device=cpu), + %1 : Float(5, 3, 40005, strides=[960120, 160020, 2], device=cpu)): + %2 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %1) + %3 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %2) + return (%3))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + + auto a = at::rand({5, 3, 40005}, TensorOptions(kCPU).dtype(at::kFloat)); + auto b = at::rand({10, 6, 80010}, TensorOptions(kCPU).dtype(at::kFloat)) + .index( + {Slice(None, None, 2), + Slice(None, None, 2), + Slice(None, None, 2)}); + auto ref = a * (a * b); + auto o = at::zeros_like(ref); + TensorExprKernel k(graph); + std::vector inputs = {a, b}; + std::vector stack = fmap(inputs); + k.run(stack); + o = stack[0].toTensor(); + for (size_t i = 0; i < 5 * 3; i++) { + CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]); + } +} + TEST_F(Kernel, DISABLED_Shape_Inference) { // disabled: doesn't do stride propagation, and isn't being used currently diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index faacd022e7e0b..c5333b2010610 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -2487,6 +2488,86 @@ void fuseAllLoops(StmtPtr st) { } } +// Compute the trip count of a loop if it is a constant. +c10::optional tripCount(ForPtr loop) { + auto tc = IRSimplifier::simplify( + cast(ExprHandle(loop->stop()) - ExprHandle(loop->start()))); + if (auto val = to(tc.node())) { + return val->value(); + } + return c10::nullopt; +} + +// Prune innermost loops until iterations satisfies a minimum grain size. +static void pruneByGrainSize(std::vector& loops) { + constexpr int64_t minGrainSize = 32768; + int64_t grainSize = 1; + for (int64_t i = loops.size(); i > 0; i--) { + auto tc = tripCount(loops[i - 1]); + if (!tc) { + break; + } + grainSize *= *tc; + if (grainSize < minGrainSize) { + loops.pop_back(); + } + } +} + +// Retain enough outermost loops to fill the number of threads. +static void pruneByThreadCount(std::vector& loops) { + int64_t trips = 1; + auto threads = at::get_num_threads(); + auto it = loops.begin(); + for (; it != loops.end(); it++) { + if (trips >= threads) { + break; + } + auto tc = tripCount(*it); + if (!tc) { + break; + } + trips *= *tc; + } + loops.erase(it, loops.end()); +} + +// Flatten and parallelize outer loops, subject to a minimum number of elements +// in the inner loop, and a maximum level of thread-level parallelism in the +// outer loops. +template +static void parallelizeOuterLoops(LoopNest& l, Bufs&& bufs) { + for (auto const& buf : bufs) { + auto loops = l.getLoopStmtsFor(buf); + pruneByGrainSize(loops); + pruneByThreadCount(loops); + + // There are no loops to parallelize; give up. + if (loops.size() == 0) { + continue; + } + // The loop nest contains a reduction; give up. + auto reductions = NodeFinder::find(loops[0]); + if (reductions.size() > 0) { + continue; + } + // The loop nest has loop carried dependences; give up. + if (LoopNest::hasLoopCarriedDependence(loops[0])) { + continue; + } + // Try to flatten the outer loops and parallelize them if successful. + ForPtr flattened = nullptr; + if (loops.size() == 1) { + flattened = loops[0]; + } else { + LoopNest::flatten(loops, &flattened); + } + if (flattened) { + flattened->set_parallel(); + } + } +} + StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { torch::jit::tensorexpr::LoopNest l(st, bufOutputs_); GRAPH_DEBUG("Original Stmt:\n", std::to_string(l.root_stmt()), "\n"); @@ -2528,6 +2609,8 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { if (backendType == kLLVMCodeGen) { fuseAllLoops(l.root_stmt()); GRAPH_DEBUG("after fuse", *l.root_stmt()); + parallelizeOuterLoops(l, bufOutputs_); + GRAPH_DEBUG("after parallelize", *l.root_stmt()); } if (backendType == kCudaCodeGen) { @@ -2602,9 +2685,13 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { } l.prepareForCodegen(); + GRAPH_DEBUG("after prepareForCodegen", *l.root_stmt()); + l.simplify(); + GRAPH_DEBUG("after simplification", *l.root_stmt()); if (backendType == kLLVMCodeGen && !hasReduction) { l.vectorizeInnerLoops(); + GRAPH_DEBUG("after vectorization", *l.root_stmt()); } StmtPtr stmt = l.root_stmt(); diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index eac1f82f25c4b..d5a95bc4cf886 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -274,15 +274,24 @@ class LLVMCodeGenImpl : public IRVisitor { } }; +extern "C" { typedef void (*ParallelCallee)(int index, int8_t* packed_data); -void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data) { +void DispatchParallel( + int8_t* func, + int start, + int stop, + int8_t* packed_data) noexcept { // TODO: preserve the func type. - ParallelCallee callee = reinterpret_cast(func); - at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) { - for (int index = f_begin; index < f_end; index++) { - callee(index, packed_data); - } - }); + try { + ParallelCallee callee = reinterpret_cast(func); + at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) { + for (int index = f_begin; index < f_end; index++) { + callee(index, packed_data); + } + }); + } catch (...) { + } +} } } // namespace tensorexpr @@ -1287,6 +1296,7 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) { module_->getOrInsertFunction("DispatchParallel", dispatcher_fntype); llvm::Function* dispatcher = llvm::cast(dispatcher_callee.getCallee()); + dispatcher->addFnAttr(llvm::Attribute::NoUnwind); irb_.CreateCall( dispatcher, {func_value, start, stop, packed_caller_args_ptr}); value_ = llvm::ConstantInt::get(IntTy_, 0); diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.h b/torch/csrc/jit/tensorexpr/llvm_jit.h index 30ad5317a1b3c..8585900abc8d6 100644 --- a/torch/csrc/jit/tensorexpr/llvm_jit.h +++ b/torch/csrc/jit/tensorexpr/llvm_jit.h @@ -17,7 +17,13 @@ namespace torch { namespace jit { namespace tensorexpr { -void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data); +extern "C" { +void DispatchParallel( + int8_t* func, + int start, + int stop, + int8_t* packed_data) noexcept; +} inline std::string formatError(llvm::Error&& err, const char* msg) { static constexpr char* defaultErrorMsg = "Unexpected failure in LLVM JIT"; diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index a296d8c7af79b..7bcdd1a666f7b 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -179,6 +179,13 @@ class Vectorizer : public IRMutator { }); } + ExprPtr mutate(ModPtr v) override { + std::vector inputs = {v->lhs(), v->rhs()}; + return try_vectorize(v, inputs, [&]() { + return ExprHandle(inputs[0]) % ExprHandle(inputs[1]); + }); + } + ExprPtr mutate(AndPtr v) override { std::vector inputs = {v->lhs(), v->rhs()}; return try_vectorize(v, inputs, [&]() { From 6600bc96517269c608ea47b76b6bda9476c7bcef Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Fri, 20 Aug 2021 11:11:49 -0700 Subject: [PATCH 102/530] Remove flag to toggle CPU fusion in the presence of parallelism (#63514) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63514 Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D30417127 Pulled By: bertmaher fbshipit-source-id: b77d7c68364f2af73570740540f3b1152313016e --- test/cpp/tensorexpr/test_te_fuser_pass.cpp | 6 +----- test/jit/test_profiler.py | 3 --- test/test_jit_fuser_te.py | 5 ----- test/test_tensorexpr.py | 4 ---- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 19 +------------------ torch/csrc/jit/passes/tensorexpr_fuser.h | 2 -- torch/csrc/jit/python/init.cpp | 2 -- 7 files changed, 2 insertions(+), 39 deletions(-) diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index 8dd616453362b..91fb4c2b7582c 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -15,19 +15,15 @@ namespace jit { using namespace torch::jit::tensorexpr; struct WithCPUFuser { - WithCPUFuser(bool val = true) - : cpuFuserEnabled(canFuseOnCPU()), parallel(texprParallelCPUEnabled()) { + WithCPUFuser(bool val = true) : cpuFuserEnabled(canFuseOnCPU()) { overrideCanFuseOnCPU(val); - setTexprParallelCPUEnabled(true); } ~WithCPUFuser() { overrideCanFuseOnCPU(cpuFuserEnabled); - setTexprParallelCPUEnabled(parallel); } bool cpuFuserEnabled; - bool parallel; }; TEST(TEFuserPass, FuserPass_1) { diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py index aa8be0518385f..b9ed9d0b78eb5 100644 --- a/test/jit/test_profiler.py +++ b/test/jit/test_profiler.py @@ -29,8 +29,6 @@ def setUp(self): torch._C._debug_set_fusion_group_inlining(False) self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) - self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() - torch._C._jit_set_texpr_parallel_cpu_enabled(True) def tearDown(self): torch._C._jit_set_profiling_executor(self.prev_exec) @@ -42,7 +40,6 @@ def tearDown(self): torch._C._jit_set_texpr_reductions_enabled(self.old_reduction_enabled) torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining) torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) - torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def test_tensor_type_not_determined_by_inputs(self): @torch.jit.script diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 64c26b7936b54..614226ff871ba 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -85,10 +85,6 @@ def setUp(self): self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) - # TODO: CPU fuser currently is disabled when multithreading. - self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() - torch._C._jit_set_texpr_parallel_cpu_enabled(True) - self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] self.int_dtypes = [ torch.int8, @@ -116,7 +112,6 @@ def tearDown(self): torch._C._jit_set_texpr_fuser_enabled(self.texpr_fuser_state) torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) - torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def assertLastGraphAllFused(self): self.assertAllFused(torch.jit.last_executed_optimized_graph()) diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 6353113a1ec4c..47c7e689aa6a4 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -24,9 +24,6 @@ def setUp(self): torch._C._debug_set_fusion_group_inlining(False) self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) - # TODO: CPU fuser currently is disabled when multithreading. - self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() - torch._C._jit_set_texpr_parallel_cpu_enabled(True) self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] @@ -39,7 +36,6 @@ def tearDown(self): torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuser_state) torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining) torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) - torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def assertLastGraphAllFused(self): self.assertAllFused(torch.jit.last_executed_optimized_graph()) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index d4add03506c4f..52bf4539479df 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -1,6 +1,5 @@ #include -#include #include #include #include @@ -250,15 +249,6 @@ bool isSupported(Node* node) { } // namespace tensorexpr static bool texpr_fuser_enabled_ = true; -static bool texpr_parallel_cpu_enabled = false; - -bool texprParallelCPUEnabled() { - return texpr_parallel_cpu_enabled; -} - -void setTexprParallelCPUEnabled(bool val) { - texpr_parallel_cpu_enabled = val; -} void setTensorExprFuserEnabled(bool val) { texpr_fuser_enabled_ = val; @@ -898,14 +888,7 @@ class TensorExprFuser { return false; } if (device->is_cpu()) { - // CPU fusion is only supported for single-thread. - if (!canFuseOnCPU()) { - return false; - } - if (at::get_num_threads() == 1 || texprParallelCPUEnabled()) { - return true; - } - return false; + return canFuseOnCPU(); } else if (device->is_cuda()) { return canFuseOnGPU(); } else if (device->is_xpu()) { diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.h b/torch/csrc/jit/passes/tensorexpr_fuser.h index 3f6538b7e587a..254aebd91d12f 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.h +++ b/torch/csrc/jit/passes/tensorexpr_fuser.h @@ -24,8 +24,6 @@ TORCH_API void setTensorExprFuserEnabled(bool val); TORCH_API bool tensorExprFuserEnabled(); TORCH_API bool setTexprReductionsEnabled(bool value); TORCH_API bool texprReductionsEnabled(); -TORCH_API bool texprParallelCPUEnabled(); -TORCH_API void setTexprParallelCPUEnabled(bool val); TORCH_API void RemoveProfileNodesAndSpecializeTypes( std::shared_ptr& graph); diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 5fca575593551..992e60edd7d19 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -711,8 +711,6 @@ void initJITBindings(PyObject* module) { .def("_jit_texpr_set_fallback_allowed", &tensorexpr::setFallbackAllowed) .def("_jit_set_texpr_reductions_enabled", &setTexprReductionsEnabled) .def("_jit_texpr_reductions_enabled", &texprReductionsEnabled) - .def("_jit_set_texpr_parallel_cpu_enabled", &setTexprParallelCPUEnabled) - .def("_jit_texpr_parallel_cpu_enabled", &texprParallelCPUEnabled) .def( "_jit_set_te_generate_block_code", [](bool gen_block_code) { From b9fc656cf26d60127bd695e4e5a7d27622f2563d Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Fri, 20 Aug 2021 11:11:49 -0700 Subject: [PATCH 103/530] [nnc] Enable CPU fusion (#63545) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63545 Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D30417370 Pulled By: bertmaher fbshipit-source-id: 84ce7a578a3678d5562bab99d1dc00330c4f72d1 --- torch/csrc/jit/codegen/fuser/interface.cpp | 8 ++------ torch/csrc/jit/passes/graph_fuser.cpp | 12 +++++++++++- torch/csrc/jit/passes/graph_fuser.h | 3 +++ torch/csrc/jit/python/init.cpp | 2 ++ 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/torch/csrc/jit/codegen/fuser/interface.cpp b/torch/csrc/jit/codegen/fuser/interface.cpp index ec67c4bd83773..ef7e9e0b629d5 100644 --- a/torch/csrc/jit/codegen/fuser/interface.cpp +++ b/torch/csrc/jit/codegen/fuser/interface.cpp @@ -8,15 +8,12 @@ #include #include -C10_DEFINE_bool(torch_jit_enable_cpu_fusion, false, "enable cpu fusion"); - namespace torch { namespace jit { namespace detail { -// Note: CPU fusion is currently disabled due to test flakiness -#if defined(FBCODE_CAFFE2) +#ifdef TORCH_ENABLE_LLVM bool cpu_fuser_enabled = true; #else bool cpu_fuser_enabled = false; @@ -37,8 +34,7 @@ void runFusion(const int64_t key, Stack& stack) { } bool canFuseOnCPU() { - return fuser::hasFusionBackend(DeviceType::CPU) && - (detail::cpu_fuser_enabled || FLAGS_torch_jit_enable_cpu_fusion); + return fuser::hasFusionBackend(DeviceType::CPU) && detail::cpu_fuser_enabled; } bool canFuseOnGPU() { diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index f7dd466de4ff4..653f9fec08b32 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -183,7 +183,7 @@ struct GraphFuser { return !strict_fuser_check; } if ((*device).is_cpu()) { - return canFuseOnCPU(); + return canFuseOnCPULegacy(); } else if ((*device).is_cuda()) { return canFuseOnGPU(); } else if ((*device).is_xpu()) { @@ -1244,6 +1244,16 @@ void PeepholeOptimizeShapeExpressions(Block* block, AliasDb* db) { } // anonymous namespace +static bool cpu_fuser_enabled_legacy = false; + +bool canFuseOnCPULegacy() { + return cpu_fuser_enabled_legacy; +} + +void overrideCanFuseOnCPULegacy(bool value) { + cpu_fuser_enabled_legacy = value; +} + void FuseGraph(std::shared_ptr& graph, bool strict_fuser_check) { AliasDb db(graph); GraphFuser(&db, graph->block(), strict_fuser_check).run(); diff --git a/torch/csrc/jit/passes/graph_fuser.h b/torch/csrc/jit/passes/graph_fuser.h index 0cdcc2e20f469..d710e5a098098 100644 --- a/torch/csrc/jit/passes/graph_fuser.h +++ b/torch/csrc/jit/passes/graph_fuser.h @@ -5,6 +5,9 @@ namespace torch { namespace jit { +TORCH_API bool canFuseOnCPULegacy(); +TORCH_API void overideCanFuseOnCPULegacy(bool value); + // NB: Be sure to run DCE before fusion, because dead instructions // can prevent fusion opportunities from being exploited. // On Windows will noop, NYI diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 992e60edd7d19..f5da7b30c29d7 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -589,6 +589,8 @@ void initJITBindings(PyObject* module) { .def("_jit_override_can_fuse_on_gpu", &overrideCanFuseOnGPU) .def("_jit_can_fuse_on_cpu", &canFuseOnCPU) .def("_jit_can_fuse_on_gpu", &canFuseOnGPU) + .def("_jit_can_fuse_on_cpu_legacy", &canFuseOnCPULegacy) + .def("_jit_override_can_fuse_on_cpu_legacy", &canFuseOnCPULegacy) .def( "_jit_differentiate", [](Graph& g) { From 70a3210ecaa0162b4673f53faa17675a9d3ca8de Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 20 Aug 2021 11:43:07 -0700 Subject: [PATCH 104/530] Add `BinaryUfuncOpInfo` and broadcasting tests (#61964) Summary: As proof of concept, this PR uses the new `BinaryUfuncOpInfo` in broadcasting tests for `add`, `sub`, `mul`, `div`, `floor_div`, and `true_div`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61964 Reviewed By: ngimel Differential Revision: D30407734 Pulled By: mruberry fbshipit-source-id: ada28994f43b0635f279f45a02ecba18bc8ee033 --- test/test_binary_ufuncs.py | 80 ++++- test/test_jit_fuser_te.py | 2 + .../_internal/common_methods_invocations.py | 333 +++++++++++------- 3 files changed, 287 insertions(+), 128 deletions(-) diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index f952911d206f6..4995e0dfc6cc7 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -17,8 +17,9 @@ from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyOnCPUAndCUDA, - skipCUDAIfRocm, skipIf) + skipCUDAIfRocm, skipIf, ops) from torch.testing import all_types_and_complex_and, integral_types_and +from torch.testing._internal.common_methods_invocations import binary_ufuncs if TEST_SCIPY: import scipy.special @@ -89,6 +90,74 @@ def _make_tensor(shape, dtype, device, fill_ones=False) -> torch.Tensor: # TODO: update to use opinfos consistently class TestBinaryUfuncs(TestCase): + @ops(binary_ufuncs, allowed_dtypes=(torch.float32,)) + def test_broadcasting(self, device, dtype, op): + for shape_lhs, shape_rhs in ( + ((1,), ()), + ((2,), ()), + ((1,), (2,)), + ((2,), (2,)), + ((2, 1), (2,)), + ((1, 2), (2,)), + ((3, 2), (2,)), + ((3, 2), (3, 2)), + ((1, 3, 2), (2,)), + ((1, 3, 2), (3, 2)), + ((3, 1, 2), (3, 2)), + ((1, 3, 2), (1, 3, 2)), + ((2, 3, 2), ()), + ((2, 3, 2), (2, 3, 2)), + ((3, 1, 2), (1, 3, 2)), + ): + lhs = make_tensor(shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs) + rhs = make_tensor(shape_rhs, device=device, dtype=dtype, **op.rhs_make_tensor_kwargs) + + actual = op(lhs, rhs).shape + expected = torch.broadcast_shapes(shape_lhs, shape_rhs) + + msg = ( + f"On {device}, torch.{op.name} broadcasts inputs of shapes {shape_lhs} and {shape_rhs} incorrectly: " + f"{actual} != {expected}" + ) + self.assertEqual(actual, expected, msg=msg) + + @ops(binary_ufuncs, allowed_dtypes=(torch.float32,)) + def test_broadcast_python_scalar(self, device, dtype, op): + for shape_lhs in ((), (1,), (2,), (1, 2, 3),): + lhs = make_tensor(shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs) + rhs_tensor = make_tensor((), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs) + rhs_python = rhs_tensor.item() + + actual = op(lhs, rhs_python) + expected = op(lhs, rhs_tensor) + + self.assertEqual( + actual.shape, + expected.shape, + msg=f"On {device}, torch.{op.name} broadcasts Python scalars different than 0d tensors.", + ) + + @ops(binary_ufuncs, allowed_dtypes=(torch.float32,)) + def test_not_broadcastable(self, device, dtype, op): + for shape_lhs, shape_rhs in ( + ((2,), (3,)), + ((3, 1), (2, 1)), + ((1, 3, 2), (3,)), + ((3, 1, 2), (2, 1, 2)), + ): + lhs = make_tensor(shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs) + rhs = make_tensor(shape_rhs, device=device, dtype=dtype, **op.rhs_make_tensor_kwargs) + + try: + broadcasted_shape = op(lhs, rhs).shape + except RuntimeError: + continue + + msg = ( + f"On {device}, torch.{op.name} broadcasts inputs shapes {shape_lhs} and {shape_rhs} into " + f"{broadcasted_shape}, although they are not broadcastable." + ) + raise AssertionError(msg) def test_add_broadcast_empty(self, device): # empty + empty @@ -1184,11 +1253,10 @@ def _wrapped_ifloordiv_scalar(a): # Also tests that reverse operations are equivalent to forward ops # NOTE: division ops are tested separately above def test_binary_ops_with_scalars(self, device): - for ops in ((operator.add, torch.add), - (operator.sub, torch.sub), - (operator.mul, torch.mul), - (operator.truediv, torch.div)): - python_op, torch_op = ops + for python_op, torch_op in ((operator.add, torch.add), + (operator.sub, torch.sub), + (operator.mul, torch.mul), + (operator.truediv, torch.div)): for a, b in product(range(-10, 10), range(-10, 10)): for op in (lambda x: x * .5, lambda x: math.floor(x)): diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 614226ff871ba..b89caca44a1b2 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1929,6 +1929,8 @@ def eager(x): 'cosh', 'div.no_rounding_mode', 'div.true_rounding', + 'div.floor_rounding', + 'div.trunc_rounding', 'eq', 'erf', 'erfc', diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 873d91c0e1293..617b102642d05 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1268,53 +1268,151 @@ def sample_inputs_linalg_vector_norm(op_info, device, dtype, requires_grad, **kw return inputs -# In order to use the kwarg alpha, partials should be used in an OpInfo's sample_inputs_func -# eg. sample_inputs_func=partial(sample_inputs_binary_pwise, alpha=2) -# Then one sample input would also be generated corresponding to the value of alpha provided. -# In the future, kwargs 'alpha_floating', 'alpha_integral' & 'alpha_complex' can be used to -# specify scalars of floating, integral & complex types as values for "alpha". -# Keyword argument `rhs_exclude_zero` is used to exclude zero values from rhs tensor argument -# This is necessary for operations like `true_divide`, where divide by zero throws an exception. -def sample_inputs_binary_pwise(op_info, device, dtype, requires_grad, extra_kwargs=None, **kwargs): - if extra_kwargs is None: - extra_kwargs = {} - - scalar = 3.14 + 3.14j if dtype.is_complex else (3.14 if dtype.is_floating_point else 3) - scalar = 1 if dtype is torch.bool else scalar - tests_list = [ - ((S, S, S), (S, S, S), False), - ((S, S, S), (S, S), False), - ((), (), False), - ((S, S, S), (), False), - ((S, S, S), scalar, False), - ((), scalar, False) - ] - tests_with_lhs_broadcasting = [ - ((S, S), (S, S, S), True), - ((), (S, S, S), True), - ((S, 1, S), (M, S), True), + +# Metadata class for binary "universal functions (ufuncs)" that accept two +# tensor and have common properties +class BinaryUfuncInfo(OpInfo): + """Operator information for 'universal binary functions (binary ufuncs).' + These are functions of two tensors with common properties like: + - they are elementwise functions + - the output shape is determined by the input shape + - they typically have method and inplace variants + - they typically support the out kwarg + - they typically have NumPy or SciPy references + See NumPy's universal function documentation + (https://numpy.org/doc/stable/reference/ufuncs.html) for more details + about the concept of ufuncs. + """ + def __init__(self, name, *, lhs_make_tensor_kwargs=None, rhs_make_tensor_kwargs=None, **kwargs): + super().__init__(name, **kwargs) + + # [lr]hs_make_tensor_kwargs are part of the OpInfo to be able to dynamically generate valid samples later on. + if lhs_make_tensor_kwargs is None: + lhs_make_tensor_kwargs = {} + self.lhs_make_tensor_kwargs = lhs_make_tensor_kwargs + + if rhs_make_tensor_kwargs is None: + rhs_make_tensor_kwargs = {} + self.rhs_make_tensor_kwargs = rhs_make_tensor_kwargs + + +def _resolve_binay_pwise_kwargs( + op_info, *, op_kwargs=None, lhs_make_tensor_kwargs=None, rhs_make_tensor_kwargs=None +): + """Resolves default values for :func:`sample_inputs_binary_pwise`. + + By default :attr:`op_kwargs`, :attr:`lhs_make_tensor_kwargs`, and :attr:`rhs_make_tensor_kwargs` are just empty + dictionaries. In case :attr:`op_info` is a :class:`BinaryUfuncInfo`, :attr:`BinaryUfuncInfo.lhs_make_tensor_kwargs` + and :attr:`BinaryUfuncInfo.rhs_make_tensor_kwargs` will be used as defaults. + """ + if op_kwargs is None: + op_kwargs = {} + if lhs_make_tensor_kwargs is None: + lhs_make_tensor_kwargs = op_info.lhs_make_tensor_kwargs if isinstance(op_info, BinaryUfuncInfo) else {} + if rhs_make_tensor_kwargs is None: + rhs_make_tensor_kwargs = op_info.rhs_make_tensor_kwargs if isinstance(op_info, BinaryUfuncInfo) else {} + + return op_kwargs, lhs_make_tensor_kwargs, rhs_make_tensor_kwargs + + +def sample_inputs_binary_pwise( + op_info, + device, + dtype, + requires_grad, + *, + python_scalars=False, + op_kwargs=None, + lhs_make_tensor_kwargs=None, + rhs_make_tensor_kwargs=None, + **kwargs, +): + op_kwargs, lhs_make_tensor_kwargs, rhs_make_tensor_kwargs = _resolve_binay_pwise_kwargs( + op_info, + op_kwargs=op_kwargs, + lhs_make_tensor_kwargs=lhs_make_tensor_kwargs, + rhs_make_tensor_kwargs=rhs_make_tensor_kwargs, + ) + + scalar = make_tensor((), device=device, dtype=dtype, **rhs_make_tensor_kwargs) + if python_scalars: + scalar = scalar.item() # type: ignore[assignment] + + shapes = [ + ((), scalar), + ((S,), scalar), + ((S, 1), (S,)), + ((M, S), scalar), + ((S, M, S), (M, S)), + ((S, M, S), (S, M, S)), + ((M, 1, S), (M, S)), + ((M, 1, S), (1, M, S)), ] - test_cases = tests_list + tests_with_lhs_broadcasting # type: ignore[operator] - samples = [] - for first_shape, shape_or_scalar, broadcasts_input in test_cases: - arg = shape_or_scalar - - if isinstance(shape_or_scalar, tuple): - exclude_zero = kwargs.get('rhs_exclude_zero', False) - arg = make_tensor(shape_or_scalar, device=device, dtype=dtype, - requires_grad=requires_grad, exclude_zero=exclude_zero) - samples.append(SampleInput(make_tensor(first_shape, device=device, dtype=dtype, - requires_grad=requires_grad), - args=(arg,), kwargs=extra_kwargs, - broadcasts_input=broadcasts_input)) - # Adds an extra sample using "alpha" if it's passed in kwargs - if 'alpha' in kwargs: - a = make_tensor((S, S, S), device=device, dtype=dtype, requires_grad=requires_grad) - b = make_tensor((S, S, S), device=device, dtype=dtype, requires_grad=requires_grad) - extra_kwargs['alpha'] = kwargs['alpha'] - sample = SampleInput(a, args=(b,), kwargs=extra_kwargs) - samples.append(sample) - return tuple(samples) + + sample_inputs = [] + for shape_lhs, shape_rhs_or_scalar in shapes: + lhs = make_tensor( + shape_lhs, + device=device, + dtype=dtype, + requires_grad=requires_grad, + **lhs_make_tensor_kwargs, + ) + if isinstance(shape_rhs_or_scalar, tuple): + # shape + rhs = make_tensor( + shape_rhs_or_scalar, + device=device, + dtype=dtype, + requires_grad=requires_grad, + **rhs_make_tensor_kwargs, + ) + broadcasts_input = torch.broadcast_shapes(shape_lhs, shape_rhs_or_scalar) != shape_lhs + else: + # scalar + rhs = shape_rhs_or_scalar # type: ignore[assignment] + broadcasts_input = False + + sample_inputs.append(SampleInput(lhs, args=(rhs,), kwargs=op_kwargs, broadcasts_input=broadcasts_input)) + return sample_inputs + + +def sample_inputs_add_sub( + op_info, + device, + dtype, + requires_grad, + python_scalars=False, + alpha=1, + op_kwargs=None, + lhs_make_tensor_kwargs=None, + rhs_make_tensor_kwargs=None, + **kwargs, +): + op_kwargs, lhs_make_tensor_kwargs, rhs_make_tensor_kwargs = _resolve_binay_pwise_kwargs( + op_info, + op_kwargs=op_kwargs, + lhs_make_tensor_kwargs=lhs_make_tensor_kwargs, + rhs_make_tensor_kwargs=rhs_make_tensor_kwargs, + ) + + sample_inputs = sample_inputs_binary_pwise( + op_info, + device, + dtype, + requires_grad, + python_scalars=python_scalars, + op_kwargs=op_kwargs, + lhs_make_tensor_kwargs=lhs_make_tensor_kwargs, + rhs_make_tensor_kwargs=rhs_make_tensor_kwargs, + **kwargs, + ) + + lhs = make_tensor((S, S), device=device, dtype=dtype, requires_grad=requires_grad, **lhs_make_tensor_kwargs) + rhs = make_tensor((S, S), device=device, dtype=dtype, requires_grad=requires_grad, **rhs_make_tensor_kwargs) + sample_inputs.append(SampleInput(lhs, args=(rhs,), kwargs=dict(op_kwargs, alpha=alpha), broadcasts_input=False)) + + return sample_inputs def sample_inputs_t(op_info, device, dtype, requires_grad, **kwargs): @@ -4045,19 +4143,6 @@ def sample_inputs_logit(op_info, device, dtype, requires_grad, **kwargs): return samples -def sample_inputs_floor_divide(op_info, device, dtype, requires_grad, **kwargs): - lhs = make_tensor((S, S, S), device, dtype, low=None, high=None, requires_grad=requires_grad) - rhs = make_tensor((S, S, S), device, dtype, low=None, high=None, requires_grad=requires_grad) - # Avoid integer divide by 0 - if not (dtype.is_floating_point or dtype.is_complex): - rhs[rhs == 0] = 1 - - return [ - SampleInput(lhs, args=(rhs,)), - SampleInput(lhs, args=(rhs[0],)), - SampleInput(lhs, args=(3.14,)), - ] - def sample_inputs_isin(op_info, device, dtype, requires_grad): element = make_tensor((L,), device, dtype, low=None, high=None, requires_grad=requires_grad) indices = torch.randint(0, L, size=[S]) @@ -5452,29 +5537,29 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): SkipInfo('TestGradients', 'test_forward_mode_AD', dtypes=[torch.cdouble]), )), - OpInfo('add', - # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate - ref=lambda input, other, *, alpha=1: np.add(input, np.multiply(alpha, other)), - dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), - assert_autodiffed=True, - sample_inputs_func=partial(sample_inputs_binary_pwise, alpha=2), - supports_inplace_autograd=False, - supports_forward_ad=True), - OpInfo('mul', - aliases=('multiply',), - dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16, torch.bool), - assert_autodiffed=True, - supports_forward_ad=True, - sample_inputs_func=sample_inputs_binary_pwise), - OpInfo('sub', - # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate - ref=lambda input, other, *, alpha=1: np.subtract(input, np.multiply(alpha, other)), - aliases=('subtract',), - dtypes=all_types_and_complex_and(torch.bfloat16, torch.float16), - assert_autodiffed=True, - supports_forward_ad=True, - sample_inputs_func=partial(sample_inputs_binary_pwise, alpha=2), - supports_inplace_autograd=False), + BinaryUfuncInfo('add', + # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate + ref=lambda input, other, *, alpha=1: np.add(input, np.multiply(alpha, other)), + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), + assert_autodiffed=True, + sample_inputs_func=partial(sample_inputs_add_sub, alpha=2), + supports_inplace_autograd=False, + supports_forward_ad=True), + BinaryUfuncInfo('mul', + aliases=('multiply',), + dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16, torch.bool), + assert_autodiffed=True, + supports_forward_ad=True, + sample_inputs_func=sample_inputs_binary_pwise), + BinaryUfuncInfo('sub', + # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate + ref=lambda input, other, *, alpha=1: np.subtract(input, np.multiply(alpha, other)), + aliases=('subtract',), + dtypes=all_types_and_complex_and(torch.bfloat16, torch.float16), + assert_autodiffed=True, + supports_forward_ad=True, + sample_inputs_func=partial(sample_inputs_add_sub, alpha=2), + supports_inplace_autograd=False), OpInfo('addmm', # This addmm OpInfo is for when alpha and beta are not both equal to 1. # alpha=beta=1 is tested in the following opinfo, because that special case will @@ -6029,41 +6114,43 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), supports_forward_ad=True, sample_inputs_func=sample_inputs_diff), - OpInfo('div', - aliases=('divide',), - variant_test_name='no_rounding_mode', - dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - sample_inputs_func=partial(sample_inputs_binary_pwise, rhs_exclude_zero=True), - supports_forward_ad=True, - assert_autodiffed=True), - OpInfo('div', - aliases=('divide',), - variant_test_name='trunc_rounding', - dtypes=all_types_and(torch.half, torch.bfloat16), - sample_inputs_func=partial(sample_inputs_binary_pwise, extra_kwargs={ - "rounding_mode": 'trunc'}, rhs_exclude_zero=True), - supports_forward_ad=True, - skips=( - # Reference: https://github.com/pytorch/pytorch/issues/59174 - SkipInfo('TestJit', 'test_variant_consistency_jit'), - ), - assert_autodiffed=True), - OpInfo('div', - aliases=('divide',), - variant_test_name='floor_rounding', - dtypes=all_types_and(torch.half, torch.bfloat16), - sample_inputs_func=partial(sample_inputs_binary_pwise, extra_kwargs={ - "rounding_mode": 'floor'}, rhs_exclude_zero=True), - supports_forward_ad=True, - skips=( - # Reference: https://github.com/pytorch/pytorch/issues/59174 - SkipInfo('TestJit', 'test_variant_consistency_jit'), - ), - assert_autodiffed=True), - OpInfo('true_divide', - dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - supports_forward_ad=True, - sample_inputs_func=partial(sample_inputs_binary_pwise, rhs_exclude_zero=True)), + BinaryUfuncInfo('div', + aliases=('divide',), + variant_test_name='no_rounding_mode', + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_binary_pwise, + supports_forward_ad=True, + assert_autodiffed=True, + rhs_make_tensor_kwargs=dict(exclude_zero=True)), + BinaryUfuncInfo('div', + aliases=('divide',), + variant_test_name='trunc_rounding', + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=partial(sample_inputs_binary_pwise, rounding_mode="trunc"), + supports_forward_ad=True, + skips=( + # Reference: https://github.com/pytorch/pytorch/issues/59174 + SkipInfo('TestJit', 'test_variant_consistency_jit'), + ), + assert_autodiffed=True, + rhs_make_tensor_kwargs=dict(exclude_zero=True)), + BinaryUfuncInfo('div', + aliases=('divide',), + variant_test_name='floor_rounding', + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=partial(sample_inputs_binary_pwise, rounding_mode="floor"), + supports_forward_ad=True, + skips=( + # Reference: https://github.com/pytorch/pytorch/issues/59174 + SkipInfo('TestJit', 'test_variant_consistency_jit'), + ), + assert_autodiffed=True, + rhs_make_tensor_kwargs=dict(exclude_zero=True)), + BinaryUfuncInfo('true_divide', + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + supports_forward_ad=True, + sample_inputs_func=sample_inputs_binary_pwise, + rhs_make_tensor_kwargs=dict(exclude_zero=True)), UnaryUfuncInfo('exp', ref=np_unary_ufunc_integer_promotion_wrapper(np.exp), dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), @@ -6316,11 +6403,12 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypes=all_types_and(torch.bool, torch.bfloat16), dtypesIfCUDA=all_types_and(torch.bool, torch.bfloat16, torch.float16), safe_casts_outputs=True), - OpInfo('floor_divide', - dtypes=all_types_and(torch.half, torch.bfloat16), - sample_inputs_func=sample_inputs_floor_divide, - supports_autograd=False, - ), + BinaryUfuncInfo('floor_divide', + dtypes=all_types_and(torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_binary_pwise, + supports_autograd=False, + rhs_make_tensor_kwargs=dict(exclude_zero=True), + ), UnaryUfuncInfo('frexp', op=torch.frexp, ref=np.frexp, @@ -8752,6 +8840,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): # Common operator groupings unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo)] +binary_ufuncs = [op for op in op_db if isinstance(op, BinaryUfuncInfo)] spectral_funcs = [op for op in op_db if isinstance(op, SpectralFuncInfo)] sparse_unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo) and op.supports_sparse is True] shape_funcs = [op for op in op_db if isinstance(op, ShapeFuncInfo)] From 71da1144126d07f4ac26aa372ed4f86fa3c0ba62 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Fri, 20 Aug 2021 12:05:32 -0700 Subject: [PATCH 105/530] Revert D30426527: Adding DataLoader2 class as future replacement of DataLoader Test Plan: revert-hammer Differential Revision: D30426527 (https://github.com/pytorch/pytorch/commit/5a7133b87fe2fd7d025d36855ed4cc06539a9299) Original commit changeset: e5905d3364c4 fbshipit-source-id: 794d8a4e9256ccff8cf894aee10eff6adc30d502 --- test/test_dataloader.py | 25 +----- torch/utils/data/__init__.py | 6 +- torch/utils/data/dataloader_experimental.py | 89 --------------------- 3 files changed, 3 insertions(+), 117 deletions(-) delete mode 100644 torch/utils/data/dataloader_experimental.py diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 71230cfbb7a67..c68d7e2e14b33 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -13,20 +13,9 @@ import warnings import tempfile from torch import multiprocessing as mp -from torch.utils.data import ( - ChainDataset, - ConcatDataset, - DataLoader, - DataLoader2, - Dataset, - IterableDataset, - Subset, - TensorDataset, - _utils -) +from torch.utils.data import _utils, Dataset, IterableDataset, TensorDataset, DataLoader, ConcatDataset, ChainDataset, Subset from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL from torch.utils.data.dataset import random_split -from torch.utils.data.datapipes.iter import IterableAsDataPipe from torch._utils import ExceptionWrapper from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS, IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest, @@ -1945,18 +1934,6 @@ def test_excessive_thread_creation_warning(self): dataloader = DataLoader(self.dataset, batch_size=2, num_workers=1000) -@unittest.skipIf( - TEST_WITH_TSAN, - "Fails with TSAN with the following error: starting new threads after multi-threaded " - "fork is not supported. Dying (set die_after_fork=0 to override)") -class TestDataLoader2(TestCase): - def test_basics(self): - dp = IterableAsDataPipe(list(range(10))) - dl = DataLoader(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2) - dl2 = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2) - self.assertEquals(list(dl), list(dl2)) - - class StringDataset(Dataset): def __init__(self): self.s = '12345' diff --git a/torch/utils/data/__init__.py b/torch/utils/data/__init__.py index 0af9e6193af3d..1d18b7b030894 100644 --- a/torch/utils/data/__init__.py +++ b/torch/utils/data/__init__.py @@ -11,9 +11,9 @@ from torch.utils.data.dataset import ( ChainDataset, ConcatDataset, - DataChunk, Dataset, Dataset as MapDataPipe, + DataChunk, IterableDataset, IterableDataset as IterDataPipe, Subset, @@ -34,14 +34,11 @@ runtime_validation, runtime_validation_disabled, ) -from torch.utils.data.dataloader_experimental import DataLoader2 - __all__ = ['BatchSampler', 'ChainDataset', 'ConcatDataset', 'DataLoader', - 'DataLoader2', 'Dataset', 'DistributedSampler', 'IterDataPipe', @@ -71,3 +68,4 @@ ################################################################################ # import subpackage ################################################################################ +from torch.utils.data import datapipes diff --git a/torch/utils/data/dataloader_experimental.py b/torch/utils/data/dataloader_experimental.py deleted file mode 100644 index 85028afd22124..0000000000000 --- a/torch/utils/data/dataloader_experimental.py +++ /dev/null @@ -1,89 +0,0 @@ - -import functools - -import torch.utils.data.backward_compatibility -from torch.utils.data import DataLoader, IterDataPipe -from torch.utils.data.datapipes.iter import IterableAsDataPipe - -class DataLoader2: - def __new__(cls, - dataset, - batch_size=1, - shuffle=False, - sampler=None, - batch_sampler=None, - num_workers=0, - collate_fn=None, - pin_memory=False, - drop_last=False, - timeout=0, - worker_init_fn=None, - *, - prefetch_factor=2, - persistent_workers=False, - batch_outside_worker=False): - if isinstance(dataset, IterDataPipe): - datapipe = dataset - if batch_sampler is not None: - raise Exception( - 'batch_sampler is not yet supported for DataPipes') - if sampler is not None: - raise Exception( - 'sampler is not yet supported for DataPipes') - if shuffle: - datapipe = datapipe.shuffle() - if batch_outside_worker and pin_memory: - raise Exception( - 'pin_memory is not yet compatible with batch_outside_worker') - if not batch_outside_worker: - if batch_size is not None: - datapipe = datapipe.batch(batch_size, drop_last=drop_last) - if collate_fn is None: - collate_fn = torch.utils.data._utils.collate.default_collate - - def sharding_worker_init_fn(worker_init_fn, worker_id): - if worker_init_fn is not None: - worker_init_fn(worker_id) - torch.utils.data.backward_compatibility.worker_init_fn( - worker_id) - - my_worker_init_fn = functools.partial( - sharding_worker_init_fn, worker_init_fn) - - data_loader = DataLoader(datapipe, - batch_size=None, # Replaced by .batch DataPipe - shuffle=False, # Replaced by .shuffle DataPipe - sampler=None, - batch_sampler=None, - num_workers=num_workers, - collate_fn=collate_fn, - pin_memory=pin_memory, - drop_last=False, # Replaced by .batch DataPipe - timeout=timeout, - worker_init_fn=my_worker_init_fn, - prefetch_factor=prefetch_factor, - persistent_workers=persistent_workers) - - if not batch_outside_worker: - return data_loader - else: - if collate_fn is None: - collate_fn = torch.utils.data._utils.collate.default_collate - datapipe = IterableAsDataPipe(data_loader).batch( - batch_size, drop_last=drop_last).map(collate_fn) - return datapipe - - else: - return DataLoader(dataset, - batch_size=batch_size, - shuffle=shuffle, - sampler=sampler, - batch_sampler=batch_sampler, - num_workers=num_workers, - collate_fn=collate_fn, - pin_memory=pin_memory, - drop_last=drop_last, - timeout=timeout, - worker_init_fn=worker_init_fn, - prefetch_factor=prefetch_factor, - persistent_workers=persistent_workers) From 2d671ca41b437ebe0a183d39f4c70ecd19c76a78 Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Fri, 20 Aug 2021 12:09:49 -0700 Subject: [PATCH 106/530] [8/N] Remove c10d/ddp fork tests. (#63454) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63454 Continuation of https://github.com/pytorch/pytorch/pull/63443, this PR removes all fork tests from torch.distributed. ghstack-source-id: 136285511 Test Plan: waitforbuildbot Reviewed By: SciPioneer Differential Revision: D30387872 fbshipit-source-id: f6d6313db126ae7b95b86f78a1e0726887c5c513 --- .jenkins/pytorch/multigpu-test.sh | 1 - .../ddp_comm_hooks/test_ddp_hooks.py | 10 +- .../server/test/local_elastic_agent_test.py | 99 +++++----- .../elastic/multiprocessing/api_test.py | 28 ++- .../multiprocessing/errors/api_test.py | 5 - .../elastic/timer/local_timer_example.py | 11 +- .../elastic/timer/local_timer_test.py | 34 ++-- test/distributed/launcher/api_test.py | 15 +- test/distributed/launcher/launch_test.py | 9 +- test/distributed/launcher/run_test.py | 53 +++--- test/distributed/test_c10d_common.py | 133 +++++++------- test/distributed/test_c10d_gloo.py | 25 +-- test/distributed/test_c10d_nccl.py | 8 - test/distributed/test_c10d_spawn_gloo.py | 172 +++++++++-------- test/distributed/test_distributed_fork.py | 113 ------------ test/distributed/test_jit_c10d.py | 6 +- test/distributed/test_launcher.py | 7 +- test/distributed/test_pg_wrapper.py | 173 +++++++++--------- test/run_test.py | 6 +- test/test_determination.py | 2 - tools/stats/print_test_stats.py | 1 - torch/distributed/CONTRIBUTING.md | 1 - torch/testing/_internal/common_distributed.py | 5 + 23 files changed, 366 insertions(+), 551 deletions(-) delete mode 100644 test/distributed/test_distributed_fork.py diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh index 2b918dad31385..76975310843c4 100755 --- a/.jenkins/pytorch/multigpu-test.sh +++ b/.jenkins/pytorch/multigpu-test.sh @@ -19,7 +19,6 @@ fi python tools/download_mnist.py --quiet -d test/cpp/api/mnist OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api time python test/run_test.py --verbose -i distributed/test_jit_c10d -time python test/run_test.py --verbose -i distributed/test_distributed_fork time python test/run_test.py --verbose -i distributed/test_c10d_common time python test/run_test.py --verbose -i distributed/test_c10d_gloo time python test/run_test.py --verbose -i distributed/test_c10d_nccl diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py index 1f78d50b604e8..7b889fdc3f1bb 100644 --- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py +++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py @@ -21,8 +21,14 @@ requires_nccl, skip_if_lt_x_gpu, ) -from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.common_utils import ( + run_tests, + TEST_WITH_DEV_DBG_ASAN, +) +if TEST_WITH_DEV_DBG_ASAN: + print("Multiprocessing spawn is not compatible with dev/dbg asan", file=sys.stderr) + sys.exit(0) def gpus_for_rank(world_size): visible_devices = list(range(torch.cuda.device_count())) @@ -57,7 +63,7 @@ def forward(self, x, rank): class DistributedDataParallelCommHookTest(MultiProcessTestCase): def setUp(self): super(DistributedDataParallelCommHookTest, self).setUp() - self._fork_processes() + self._spawn_processes() def tearDown(self): try: diff --git a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py index 2536b1033d56c..f8972a2be73cf 100644 --- a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py +++ b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py @@ -37,7 +37,6 @@ from torch.distributed.rpc.backend_registry import BackendType from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, - TEST_WITH_TSAN, sandcastle_skip_if, ) @@ -406,19 +405,19 @@ def dummy_compute(self): self.assertEqual((100, 100), return_value.shape) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_dummy_compute_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.dummy_compute) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_dummy_compute_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.dummy_compute) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_dummy_compute_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.dummy_compute) @@ -431,19 +430,19 @@ def run_happy_function(self): self.assertIsNone(res.return_values[1]) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_happy_function_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_happy_function) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_happy_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_happy_function) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_happy_function_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_happy_function) @@ -465,13 +464,13 @@ def check_master_addr_port_override(self): self.assertIsNone(res.return_values[0]) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_check_master_addr_port_override_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.check_master_addr_port_override) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_check_master_addr_port_override_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.check_master_addr_port_override) @@ -484,7 +483,7 @@ def run_check_env_function(self): self.assertFalse(res.is_failed()) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_check_env_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_check_env_function) @@ -497,19 +496,19 @@ def run_function_with_return_value(self): self.assertEqual("foo", res.return_values[1]) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_function_with_return_value_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_function_with_return_value) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_function_with_return_value_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_function_with_return_value) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_function_with_return_value_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_function_with_return_value) @@ -520,19 +519,19 @@ def simple_dist_sum(self): # _dist_sum internally checks that the sum computed is valid @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_simple_dist_sum_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.simple_dist_sum) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_simple_dist_sum_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.simple_dist_sum) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_simple_dist_sum_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.simple_dist_sum) @@ -556,19 +555,19 @@ def run_distributed_sum_homogeneous(self): self.assertSetEqual(set(range(4 + 4)), ranks) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_distributed_sum_homogeneous_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_homogeneous) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_distributed_sum_homogeneous_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_homogeneous) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_distributed_sum_homogeneous_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_homogeneous) @@ -596,19 +595,19 @@ def run_distributed_sum_heterogeneous(self): self.assertSetEqual(set(range(1 + 2 + 3)), ranks) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_distributed_sum_heterogeneous_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_heterogeneous) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_distributed_sum_heterogeneous_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_heterogeneous) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_distributed_sum_heterogeneous_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_heterogeneous) @@ -636,19 +635,19 @@ def run_sad_function(self): self.assertEqual(int(data["extraInfo"]["timestamp"]), failure.timestamp) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_sad_function_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_sad_function) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_sad_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_sad_function) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_sad_function_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_sad_function) @@ -668,19 +667,19 @@ def run_bipolar_function(self): self.assertTrue(agent._total_execution_time > 0) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_bipolar_function_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_bipolar_function) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_bipolar_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_bipolar_function) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_run_bipolar_function_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_bipolar_function) @@ -711,13 +710,13 @@ def correct_rank_assignment_heterogeneous(self): ) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_correct_rank_assignment_heterogeneous_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_heterogeneous) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_correct_rank_assignment_heterogeneous_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_heterogeneous) @@ -744,13 +743,13 @@ def correct_rank_assignment_homogeneous(self): ) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_correct_rank_assignment_homogeneous_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_homogeneous) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_correct_rank_assignment_homogeneous_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_homogeneous) @@ -852,13 +851,13 @@ def double_agent_fault_tolerance(self): self.assertEqual(0, p.exitcode) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_double_agent_fault_tolerance_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_fault_tolerance) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_double_agent_fault_tolerance_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_fault_tolerance) @@ -905,19 +904,19 @@ def double_agent_elastic(self): self.assertEqual(-signal.SIGKILL, p.exitcode) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_double_agent_elastic_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.double_agent_elastic) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_double_agent_elastic_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_elastic) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_double_agent_elastic_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_elastic) @@ -955,19 +954,19 @@ def torch_rpc(self): self.assertEqual([f"{msg} from worker"], list(master_retvals.values())) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_torch_rpc_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.torch_rpc) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_torch_rpc_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.torch_rpc) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_torch_rpc_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.torch_rpc) @@ -993,13 +992,13 @@ def workers_drift_success(self): self.assertEqual(rank, output) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_workers_drift_success_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.workers_drift_success) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_workers_drift_success_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.workers_drift_success) @@ -1024,13 +1023,13 @@ def workers_drift_fail(self): self.assertEqual(rank, output) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_workers_drift_fail_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.workers_drift_fail) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_workers_drift_fail_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.workers_drift_fail) @@ -1047,19 +1046,19 @@ def barrier_failed(self, barrier_mock): barrier_mock.assert_called_once() @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_barrier_failed_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.barrier_failed) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_barrier_failed_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.barrier_failed) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_barrier_failed_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.barrier_failed) @@ -1081,19 +1080,19 @@ def shutdown_called(self, start_processes_mock): pcontext_mock.close.assert_called_once() @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_shutdown_called_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.shutdown_called) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_shutdown_called_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.shutdown_called) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_shutdown_called_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.shutdown_called) diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index c27d932e43cb9..811137a8d83b4 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -35,8 +35,8 @@ from torch.testing._internal.common_utils import ( NO_MULTIPROCESSING_SPAWN, TEST_WITH_ASAN, - TEST_WITH_DEV_DBG_ASAN, TEST_WITH_TSAN, + TEST_WITH_DEV_DBG_ASAN, IS_IN_CI, IS_WINDOWS, IS_MACOS, @@ -223,15 +223,11 @@ def start_processes_zombie_test( # tests incompatible with tsan or asan -if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS): +if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS): class StartProcessesTest(unittest.TestCase): def setUp(self): self.test_dir = tempfile.mkdtemp(prefix=f"{self.__class__.__name__}_") - - if NO_MULTIPROCESSING_SPAWN: # python 2.7 doesn't have spawn - self._start_methods = ["fork"] - else: - self._start_methods = ["fork", "spawn"] + self._start_methods = ["spawn"] def tearDown(self): shutil.rmtree(self.test_dir) @@ -317,7 +313,7 @@ def test_pcontext_wait(self): args={0: (1,)}, envs={0: {}}, log_dir=self.log_dir(), - start_method="fork", + start_method="spawn", ) self.assertIsNone(pc.wait(timeout=0.1, period=0.01)) @@ -332,7 +328,7 @@ def test_multiprocess_context_close(self): args={0: (1,)}, envs={0: {}}, log_dir=self.log_dir(), - start_method="fork", + start_method="spawn", ) pids = pc.pids() @@ -387,7 +383,7 @@ def test_void_function(self): self.assertEqual({0: None, 1: None}, results.return_values) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan" + TEST_WITH_DEV_DBG_ASAN, "tests incompatible with asan" ) def test_function_large_ret_val(self): # python multiprocessing.queue module uses pipes and actually PipedQueues @@ -549,7 +545,7 @@ def test_multiprocessing_context_poll_raises_exception(self): # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows -if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS): +if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS): class StartProcessesListTest(StartProcessesTest): ######################################## # start_processes as binary tests @@ -630,7 +626,7 @@ def test_binary_redirect_and_tee(self): args={0: ("hello",), 1: ("world",)}, envs={0: {"RANK": "0"}, 1: {"RANK": "1"}}, log_dir=self.log_dir(), - start_method="fork", + start_method="spawn", redirects={0: Std.ERR, 1: Std.NONE}, tee={0: Std.OUT, 1: Std.ERR}, ) @@ -647,7 +643,7 @@ def test_binary_redirect_and_tee(self): # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows -if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS or IS_IN_CI): +if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS or IS_IN_CI): class StartProcessesNotCITest(StartProcessesTest): def test_wrap_bad(self): none = "" @@ -697,8 +693,8 @@ def test_binary_signal(self): failure = results.failures[0] self.assertNotEqual(signal.SIGSEGV, failure.exitcode) - if TEST_WITH_ASAN: - # ASAN exit code is 1. + if TEST_WITH_ASAN or TEST_WITH_TSAN: + # ASAN/TSAN exit code is 1. self.assertEqual("", failure.signal_name()) else: self.assertEqual("SIGSEGV", failure.signal_name()) @@ -714,7 +710,7 @@ def test_function_redirect_and_tee(self): args={0: ("hello",), 1: ("world",)}, envs={0: {"RANK": "0"}, 1: {"RANK": "1"}}, log_dir=log_dir, - start_method="fork", + start_method="spawn", redirects={0: Std.ERR, 1: Std.NONE}, tee={0: Std.OUT, 1: Std.ERR}, ) diff --git a/test/distributed/elastic/multiprocessing/errors/api_test.py b/test/distributed/elastic/multiprocessing/errors/api_test.py index 14b7ab1d13970..859069004ae71 100644 --- a/test/distributed/elastic/multiprocessing/errors/api_test.py +++ b/test/distributed/elastic/multiprocessing/errors/api_test.py @@ -13,7 +13,6 @@ record, ) from torch.distributed.elastic.multiprocessing.errors.error_handler import _write_error -from torch.testing._internal.common_utils import TEST_WITH_TSAN class SentinelError(Exception): @@ -45,10 +44,6 @@ def read_resource_file(resource_file: str) -> str: return "".join(fp.readlines()) -if TEST_WITH_TSAN: - print("test incompatible with tsan", file=sys.stderr) - sys.exit(0) - class ApiTest(unittest.TestCase): def setUp(self): self.test_dir = tempfile.mkdtemp(prefix=self.__class__.__name__) diff --git a/test/distributed/elastic/timer/local_timer_example.py b/test/distributed/elastic/timer/local_timer_example.py index 7845c4b5001e5..b52c64752e413 100644 --- a/test/distributed/elastic/timer/local_timer_example.py +++ b/test/distributed/elastic/timer/local_timer_example.py @@ -15,7 +15,6 @@ import torch.multiprocessing as torch_mp from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, - TEST_WITH_TSAN, run_tests, IS_WINDOWS, IS_MACOS, @@ -55,7 +54,7 @@ class LocalTimerExample(unittest.TestCase): unittest. As of now this will SIGSEGV. """ - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test is asan incompatible") def test_torch_mp_example(self): # in practice set the max_interval to a larger value (e.g. 60 seconds) mp_queue = mp.get_context("spawn").Queue() @@ -80,18 +79,14 @@ def test_torch_mp_example(self): server.stop() - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test is asan incompatible") def test_example_start_method_spawn(self): self._run_example_with(start_method="spawn") - # @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible") + # @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test is asan incompatible") # def test_example_start_method_forkserver(self): # self._run_example_with(start_method="forkserver") - @sandcastle_skip_if(TEST_WITH_TSAN, "test is tsan incompatible") - def test_example_start_method_fork(self): - self._run_example_with(start_method="fork") - def _run_example_with(self, start_method): spawn_ctx = mp.get_context(start_method) mp_queue = spawn_ctx.Queue() diff --git a/test/distributed/elastic/timer/local_timer_test.py b/test/distributed/elastic/timer/local_timer_test.py index 4c977113aa42e..f27e5939660e5 100644 --- a/test/distributed/elastic/timer/local_timer_test.py +++ b/test/distributed/elastic/timer/local_timer_test.py @@ -13,19 +13,28 @@ from torch.distributed.elastic.timer.api import TimerRequest from torch.distributed.elastic.timer.local_timer import MultiprocessingRequestQueue from torch.testing._internal.common_utils import ( - TEST_WITH_TSAN, run_tests, IS_WINDOWS, IS_MACOS, - sandcastle_skip_if, + TEST_WITH_DEV_DBG_ASAN, ) # timer is not supported on windows or macos -if not (IS_WINDOWS or IS_MACOS): +if not (IS_WINDOWS or IS_MACOS or TEST_WITH_DEV_DBG_ASAN): + # func2 should time out + def func2(n, mp_queue): + if mp_queue is not None: + timer.configure(timer.LocalTimerClient(mp_queue)) + if n > 0: + with timer.expires(after=0.1): + func2(n - 1, None) + time.sleep(0.2) + class LocalTimerTest(unittest.TestCase): def setUp(self): - self.mp_queue = mp.Queue() + self.ctx = mp.get_context('spawn') + self.mp_queue = self.ctx.Queue() self.max_interval = 0.01 self.server = timer.LocalTimerServer(self.mp_queue, self.max_interval) self.server.start() @@ -62,7 +71,6 @@ def test_happy_path(self): with timer.expires(after=0.5): time.sleep(0.1) - @sandcastle_skip_if(TEST_WITH_TSAN, "test is tsan incompatible") def test_get_timer_recursive(self): """ If a function acquires a countdown timer with default scope, @@ -82,14 +90,7 @@ def func(n): func(4) - # func2 should time out - def func2(n): - if n > 0: - with timer.expires(after=0.1): - func2(n - 1) - time.sleep(0.2) - - p = mp.Process(target=func2, args=(2,)) + p = self.ctx.Process(target=func2, args=(2, self.mp_queue)) p.start() p.join() self.assertEqual(-signal.SIGKILL, p.exitcode) @@ -102,7 +103,6 @@ def _run(mp_queue, timeout, duration): with timer.expires(after=timeout): time.sleep(duration) - @sandcastle_skip_if(TEST_WITH_TSAN, "test is tsan incompatible") def test_timer(self): timeout = 0.1 duration = 1 @@ -124,7 +124,7 @@ def _enqueue_on_interval(mp_queue, n, interval, sem): # timer is not supported on windows or macos -if not (IS_WINDOWS or IS_MACOS): +if not (IS_WINDOWS or IS_MACOS or TEST_WITH_DEV_DBG_ASAN): class MultiprocessingRequestQueueTest(unittest.TestCase): def test_get(self): mp_queue = mp.Queue() @@ -183,7 +183,7 @@ def test_get_less_than_size(self): # timer is not supported on windows or macos -if not (IS_WINDOWS or IS_MACOS): +if not (IS_WINDOWS or IS_MACOS or TEST_WITH_DEV_DBG_ASAN): class LocalTimerServerTest(unittest.TestCase): def setUp(self): self.mp_queue = mp.Queue() @@ -193,7 +193,6 @@ def setUp(self): def tearDown(self): self.server.stop() - @sandcastle_skip_if(TEST_WITH_TSAN, "test is tsan incompatible") def test_watchdog_call_count(self): """ checks that the watchdog function ran wait/interval +- 1 times @@ -226,7 +225,6 @@ def _valid_timer(self, pid, scope): def _release_timer(self, pid, scope): return TimerRequest(worker_id=pid, scope_id=scope, expiration_time=-1) - @sandcastle_skip_if(TEST_WITH_TSAN, "test is tsan incompatible") @mock.patch("os.kill") def test_expired_timers(self, mock_os_kill): """ diff --git a/test/distributed/launcher/api_test.py b/test/distributed/launcher/api_test.py index d2bfd360f9c31..685e843c10653 100644 --- a/test/distributed/launcher/api_test.py +++ b/test/distributed/launcher/api_test.py @@ -31,7 +31,6 @@ ) from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, - TEST_WITH_TSAN, sandcastle_skip_if, ) @@ -117,7 +116,7 @@ def get_test_launch_config( rdzv_endpoint=endpoint, monitor_interval=1, rdzv_backend=rdzv_backend, - start_method="fork", + start_method="spawn", max_restarts=0, rdzv_configs=rdzv_configs, ) @@ -128,7 +127,7 @@ def check_works_ran(self, world_size: int): ) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_launch_script_python(self): nnodes = 1 @@ -145,7 +144,7 @@ def test_launch_script_python(self): self.check_works_ran(world_size) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_launch_script_python_local_rank_transfer(self): nnodes = 1 @@ -162,7 +161,7 @@ def test_launch_script_python_local_rank_transfer(self): self.check_works_ran(world_size) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_launch_script_bash(self): nnodes = 1 @@ -177,7 +176,7 @@ def test_launch_script_bash(self): self.check_works_ran(world_size) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_launch_function(self): nnodes = 1 @@ -193,7 +192,7 @@ def test_launch_function(self): self.assertEqual(expected_res, actual_res) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_launch_dist_sum_with_static_rdzv(self): nnodes = 1 @@ -224,7 +223,7 @@ def test_launch_dist_sum_with_static_rdzv(self): self.assertEqual(expected_res, actual_res) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_launch_elastic(self): nproc_per_node = 4 diff --git a/test/distributed/launcher/launch_test.py b/test/distributed/launcher/launch_test.py index 73aed1a4ea59f..d79a18d39b995 100644 --- a/test/distributed/launcher/launch_test.py +++ b/test/distributed/launcher/launch_test.py @@ -15,7 +15,6 @@ from torch.distributed.elastic.utils import get_socket_with_port from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, - TEST_WITH_TSAN, sandcastle_skip_if, ) @@ -36,7 +35,7 @@ def tearDown(self): shutil.rmtree(self.test_dir) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_launch_without_env(self): nnodes = 1 @@ -49,7 +48,7 @@ def test_launch_without_env(self): f"--nnodes={nnodes}", f"--nproc_per_node={nproc_per_node}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", "--master_addr=localhost", f"--master_port={master_port}", "--node_rank=0", @@ -58,7 +57,7 @@ def test_launch_without_env(self): launch.main(args) @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan" + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) def test_launch_with_env(self): nnodes = 1 @@ -71,7 +70,7 @@ def test_launch_with_env(self): f"--nnodes={nnodes}", f"--nproc_per_node={nproc_per_node}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", "--master_addr=localhost", f"--master_port={master_port}", "--node_rank=0", diff --git a/test/distributed/launcher/run_test.py b/test/distributed/launcher/run_test.py index a63ec0382dfa4..079fea792ed02 100644 --- a/test/distributed/launcher/run_test.py +++ b/test/distributed/launcher/run_test.py @@ -23,7 +23,6 @@ from torch.distributed.elastic.utils import get_socket_with_port from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, - TEST_WITH_TSAN, sandcastle_skip_if, ) @@ -100,7 +99,7 @@ def test_launch_user_script_python(self): f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] @@ -123,7 +122,7 @@ def test_launch_user_script_python_caffe2_bc(self): f"--nnodes={nnodes}", f"--nproc_per_node={nproc_per_node}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", "--master_addr=localhost", f"--master_port={master_port}", "--node_rank=0", @@ -138,7 +137,7 @@ def test_launch_user_script_python_caffe2_bc(self): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_launch_user_script_bash(self): run_id = str(uuid.uuid4().int) nnodes = 1 @@ -151,7 +150,7 @@ def test_launch_user_script_bash(self): f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", "--no_python", ] @@ -169,7 +168,7 @@ def test_launch_user_script_bash(self): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_launch_user_script_default_nproc(self): run_id = str(uuid.uuid4().int) nnodes = 1 @@ -180,7 +179,7 @@ def test_launch_user_script_default_nproc(self): f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", "--no_python", ] @@ -198,7 +197,7 @@ def test_launch_user_script_default_nproc(self): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_launch_with_env_vars(self): run_id = str(uuid.uuid4().int) nnodes = 1 @@ -211,7 +210,7 @@ def test_launch_with_env_vars(self): os.environ["PET_RDZV_ENDPOINT"] = self._etcd_endpoint os.environ["PET_RDZV_ID"] = run_id os.environ["PET_MONITOR_INTERVAL"] = "1" - os.environ["PET_START_METHOD"] = "fork" + os.environ["PET_START_METHOD"] = "spawn" os.environ["PET_NO_PYTHON"] = "1" script_args = [path("bin/test_script.sh"), f"{self.test_dir}"] @@ -241,7 +240,7 @@ def _test_nproc_launch_configuration(self, nproc_type, expected_number): f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", "--no_python", ] @@ -256,27 +255,27 @@ def _test_nproc_launch_configuration(self, nproc_type, expected_number): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_nproc_launch_auto_configurations(self): self._test_nproc_launch_configuration("auto", os.cpu_count()) - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_nproc_launch_number_configurations(self): self._test_nproc_launch_configuration("4", 4) - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_nproc_launch_unknown_configurations(self): with self.assertRaises(ValueError): self._test_nproc_launch_configuration("unknown", 4) - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") @patch("torch.cuda.is_available", return_value=True) @patch("torch.cuda.device_count", return_value=3) def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): self._test_nproc_launch_configuration("auto", 3) self._test_nproc_launch_configuration("gpu", 3) - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_launch_elastic(self): run_id = str(uuid.uuid4().int) min_nodes = 1 @@ -291,7 +290,7 @@ def test_launch_elastic(self): f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] @@ -304,7 +303,7 @@ def test_launch_elastic(self): ) @mock.patch("torch.distributed.elastic.events.record") - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_launch_elastic_worker_raise_exception(self, record_mock): """ Asserts that when the worker program fails and lancher raieses exception @@ -323,7 +322,7 @@ def test_launch_elastic_worker_raise_exception(self, record_mock): f"--rdzv_id={run_id}", "--monitor_interval=1", "--max_restarts=0", - "--start_method=fork", + "--start_method=spawn", path("bin/test_script.py"), "--fail", ] @@ -332,7 +331,7 @@ def test_launch_elastic_worker_raise_exception(self, record_mock): record_mock.assert_called_once() - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") @mock.patch( "torch.distributed.elastic.agent.server.local_elastic_agent.LocalElasticAgent.run" ) @@ -354,7 +353,7 @@ def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run) f"--rdzv_id={run_id}", "--monitor_interval=1", "--max_restarts=0", - "--start_method=fork", + "--start_method=spawn", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] @@ -364,7 +363,7 @@ def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run) launch.main(args) record_mock.assert_called_once() - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_launch_standalone(self): nnodes = 1 nproc_per_node = 4 @@ -374,7 +373,7 @@ def test_launch_standalone(self): f"--nproc_per_node={nproc_per_node}", "--standalone", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] @@ -386,7 +385,7 @@ def test_launch_standalone(self): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_launch_run_path(self): nnodes = 1 nproc_per_node = 4 @@ -396,7 +395,7 @@ def test_launch_run_path(self): f"--nnodes={nnodes}", f"--nproc_per_node={nproc_per_node}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] @@ -408,7 +407,7 @@ def test_launch_run_path(self): {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)) ) - @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan") + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_launch_elastic_multiple_agents(self): run_id = str(uuid.uuid4().int) min_nodes = 1 @@ -423,7 +422,7 @@ def test_launch_elastic_multiple_agents(self): f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] @@ -462,7 +461,7 @@ def test_launch_shutdown(self, agent_mock_cls): f"--nnodes={nnodes}", f"--nproc_per_node={nproc_per_node}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 6aa5c64658415..33939d093ca3f 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -28,9 +28,13 @@ TestCase, load_tests, run_tests, - TEST_WITH_TSAN, + TEST_WITH_DEV_DBG_ASAN, ) +if TEST_WITH_DEV_DBG_ASAN: + print("Multiprocessing spawn is not compatible with dev/dbg asan", file=sys.stderr) + sys.exit(0) + # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings load_tests = load_tests @@ -438,37 +442,31 @@ def fut_then(fut): return fut.then(fut_then) -# TSAN is not fork-safe since we're forking in a multi-threaded environment -if not TEST_WITH_TSAN: - - class DistributedDataParallelTest( - AbstractDistributedDataParallelTest, MultiProcessTestCase - ): - def setUp(self): - super(DistributedDataParallelTest, self).setUp() - if sys.platform == "win32": - self._spawn_processes() - else: - self._fork_processes() - - def test_invalid_powerSGD_state(self): - for start_powerSGD_iter, use_error_feedback, warm_start in product( - [0, 1], [True, False], [True, False] +class DistributedDataParallelTest( + AbstractDistributedDataParallelTest, MultiProcessTestCase +): + def setUp(self): + super(DistributedDataParallelTest, self).setUp() + self._spawn_processes() + + def test_invalid_powerSGD_state(self): + for start_powerSGD_iter, use_error_feedback, warm_start in product( + [0, 1], [True, False], [True, False] + ): + if not use_error_feedback and not warm_start: + continue + with self.assertRaisesRegex( + ValueError, + "Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, " + "because PowerSGD can only be applied after the first two iterations in DDP.", ): - if not use_error_feedback and not warm_start: - continue - with self.assertRaisesRegex( - ValueError, - "Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, " - "because PowerSGD can only be applied after the first two iterations in DDP.", - ): - state = powerSGD.PowerSGDState( - process_group=None, - matrix_approximation_rank=1, - start_powerSGD_iter=start_powerSGD_iter, - use_error_feedback=use_error_feedback, - warm_start=warm_start, - ) + state = powerSGD.PowerSGDState( + process_group=None, + matrix_approximation_rank=1, + start_powerSGD_iter=start_powerSGD_iter, + use_error_feedback=use_error_feedback, + warm_start=warm_start, + ) class ComputeBucketAssignmentTest(TestCase): @@ -656,49 +654,42 @@ def _test_sequence_num_set_new_group(self, backend): dist.all_gather_object(obj_list, subgroup_seq, group=subgroup) self.assertEqual(len(set(obj_list)), 1) +class CommTest(AbstractCommTest, MultiProcessTestCase): + def setUp(self): + super(CommTest, self).setUp() + self._spawn_processes() -# TSAN is not fork-safe since we're forking in a multi-threaded environment -if not TEST_WITH_TSAN: + def tearDown(self): + super(CommTest, self).tearDown() + try: + os.remove(self.file_name) + except OSError: + pass - class CommTest(AbstractCommTest, MultiProcessTestCase): - def setUp(self): - super(CommTest, self).setUp() - if sys.platform == "win32": - self._spawn_processes() - else: - self._fork_processes() - - def tearDown(self): - super(CommTest, self).tearDown() - try: - os.remove(self.file_name) - except OSError: - pass - - def test_distributed_debug_mode(self): - # Default should be off - default_debug_mode = dist._get_debug_mode() - self.assertEqual(default_debug_mode, dist._DistributedDebugLevel.OFF) - mapping = { - "OFF": dist._DistributedDebugLevel.OFF, - "INFO": dist._DistributedDebugLevel.INFO, - "DETAIL": dist._DistributedDebugLevel.DETAIL, - } - invalid_debug_modes = ["foo", 0, 1, -1] - - for mode in mapping.keys(): - os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) - set_debug_mode = dist._get_debug_mode() - self.assertEqual( - set_debug_mode, - mapping[mode], - f"Expected {mode} to map to {mapping[mode]} but got {set_debug_mode}", - ) + def test_distributed_debug_mode(self): + # Default should be off + default_debug_mode = dist._get_debug_mode() + self.assertEqual(default_debug_mode, dist._DistributedDebugLevel.OFF) + mapping = { + "OFF": dist._DistributedDebugLevel.OFF, + "INFO": dist._DistributedDebugLevel.INFO, + "DETAIL": dist._DistributedDebugLevel.DETAIL, + } + invalid_debug_modes = ["foo", 0, 1, -1] + + for mode in mapping.keys(): + os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) + set_debug_mode = dist._get_debug_mode() + self.assertEqual( + set_debug_mode, + mapping[mode], + f"Expected {mode} to map to {mapping[mode]} but got {set_debug_mode}", + ) - for mode in invalid_debug_modes: - os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) - with self.assertRaisesRegex(RuntimeError, "to be one of"): - dist._get_debug_mode() + for mode in invalid_debug_modes: + os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) + with self.assertRaisesRegex(RuntimeError, "to be one of"): + dist._get_debug_mode() if __name__ == "__main__": diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py index 54f29f3b11a7b..55b2948b93b71 100644 --- a/test/distributed/test_c10d_gloo.py +++ b/test/distributed/test_c10d_gloo.py @@ -43,17 +43,9 @@ TestCase, run_tests, retry_on_connect_failures, - TEST_WITH_TSAN, sandcastle_skip, ) -if TEST_WITH_TSAN: - print( - "Skip as TSAN is not fork-safe since we're forking in a multi-threaded environment", - file=sys.stderr, - ) - sys.exit(0) - def simple_reduce_tests(rank, world_size): tests = [ @@ -218,12 +210,7 @@ def _create_process_group_gloo(self, store, rank, world_size, opts): def setUp(self): super(ProcessGroupGlooTest, self).setUp() - - # For Windows platform, Python does not support fork, change it to spawn here. - if sys.platform == "win32": - self._spawn_processes() - else: - self._fork_processes() + self._spawn_processes() def opts(self, threads=2): opts = c10d.ProcessGroupGloo._Options() @@ -1425,10 +1412,7 @@ class DistributedDataParallelTest( ): def setUp(self): super(DistributedDataParallelTest, self).setUp() - if sys.platform == "win32": - self._spawn_processes() - else: - self._fork_processes() + self._spawn_processes() def _test_gloo_backend( self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False @@ -2197,10 +2181,7 @@ def test_forward_backward_optimizer(self): class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase): def setUp(self): super(CommTest, self).setUp() - if sys.platform == "win32": - self._spawn_processes() - else: - self._fork_processes() + self._spawn_processes() def tearDown(self): super(CommTest, self).tearDown() diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 9efebc94a9288..e42c5c6be1759 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -45,7 +45,6 @@ retry_on_connect_failures, TEST_WITH_DEV_DBG_ASAN, TEST_WITH_ROCM, - TEST_WITH_TSAN, sandcastle_skip, sandcastle_skip_if, ) @@ -57,13 +56,6 @@ from torch.distributed.optim.functional_adam import _FunctionalAdam from torch.distributed.optim.functional_adamw import _FunctionalAdamW -if TEST_WITH_TSAN: - print( - "Skip as TSAN is not fork-safe since we're forking in a multi-threaded environment", - file=sys.stderr, - ) - sys.exit(0) - if TEST_WITH_DEV_DBG_ASAN: print( "Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr diff --git a/test/distributed/test_c10d_spawn_gloo.py b/test/distributed/test_c10d_spawn_gloo.py index 8e5e0519356cf..21f43f7ca95f6 100644 --- a/test/distributed/test_c10d_spawn_gloo.py +++ b/test/distributed/test_c10d_spawn_gloo.py @@ -11,7 +11,7 @@ from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU from torch.testing._internal.common_distributed import requires_gloo, \ create_device, MultiProcessTestCase, skip_if_lt_x_gpu -from torch.testing._internal.common_utils import TestCase, run_tests, sandcastle_skip_if, TEST_WITH_TSAN, TEST_WITH_DEV_DBG_ASAN +from torch.testing._internal.common_utils import TestCase, run_tests, sandcastle_skip_if, TEST_WITH_DEV_DBG_ASAN # Fails on Python-3.9, see https://github.com/pytorch/pytorch/issues/51619 if sys.version_info < (3, 9): @@ -76,102 +76,100 @@ def test_shared_allgather_chunk_gloo(self): self.world_size) -# TSAN is not fork-safe since we're forking in a multi-threaded environment -if not TEST_WITH_TSAN: - class DistributedDataParallelSingleProcessTest(TestCase): - def setUp(self): - self.rank = 0 - self.world_size = 1 - self.file = tempfile.NamedTemporaryFile(delete=False) # noqa: P201 - - def tearDown(self): - try: - os.remove(self.file.name) - except OSError: - pass - - def _test_base(self, net, inp, check_allclose=True): - store = c10d.FileStore(self.file.name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - if inp[0].is_cuda: - device_ids = [torch.cuda.current_device()] - else: - device_ids = None +class DistributedDataParallelSingleProcessTest(TestCase): + def setUp(self): + self.rank = 0 + self.world_size = 1 + self.file = tempfile.NamedTemporaryFile(delete=False) # noqa: P201 - ddp = nn.parallel.DistributedDataParallel( - copy.deepcopy(net), - device_ids=device_ids, - process_group=process_group - ) + def tearDown(self): + try: + os.remove(self.file.name) + except OSError: + pass - net_opt = torch.optim.Adam(net.parameters(), lr=0.001) - ddp_opt = torch.optim.Adam(ddp.parameters(), lr=0.001) + def _test_base(self, net, inp, check_allclose=True): + store = c10d.FileStore(self.file.name, self.world_size) + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) + if inp[0].is_cuda: + device_ids = [torch.cuda.current_device()] + else: + device_ids = None - for i, j in zip(ddp.parameters(), net.parameters()): - self.assertTrue(i.allclose(j)) + ddp = nn.parallel.DistributedDataParallel( + copy.deepcopy(net), + device_ids=device_ids, + process_group=process_group + ) - for _ in range(10): - net_out = net(*inp) - ddp_out = ddp(*inp) + net_opt = torch.optim.Adam(net.parameters(), lr=0.001) + ddp_opt = torch.optim.Adam(ddp.parameters(), lr=0.001) - net_out.sum().backward() - ddp_out.sum().backward() + for i, j in zip(ddp.parameters(), net.parameters()): + self.assertTrue(i.allclose(j)) - net_opt.step() - ddp_opt.step() + for _ in range(10): + net_out = net(*inp) + ddp_out = ddp(*inp) - if check_allclose: - for i, j in zip(ddp.parameters(), net.parameters()): - self.assertTrue(i.allclose(j)) + net_out.sum().backward() + ddp_out.sum().backward() - @requires_gloo() - def test_cpu(self): - self._test_base(nn.Linear(2, 2), [torch.randn(30, 2)]) + net_opt.step() + ddp_opt.step() - @requires_gloo() - @sandcastle_skip_if(not TEST_CUDA, "At least 1 CUDA GPUS needed") - def test_cuda(self): - self._test_base(nn.Linear(2, 2).to(0), [torch.randn(30, 2).to(0)]) + if check_allclose: + for i, j in zip(ddp.parameters(), net.parameters()): + self.assertTrue(i.allclose(j)) - @requires_gloo() - @sandcastle_skip_if(not TEST_CUDA, "At least 1 CUDA GPUS needed") - def test_rnn(self): - # This test is inspired by the bug reported in - # https://github.com/pytorch/pytorch/issues/36268 - BATCH_SIZE = 12 # Divisible by 2, 3, 4 - INPUT_DIM = 256 - OUTPUT_DIM = 256 - HIDDEN_DIM = 256 - N_LAYERS = 3 - SEQ_LEN = 100 - - class Net(nn.Module): - def __init__(self, input_dim, hidden_dim, output_dim, hidden_layers): - super(Net, self).__init__() - self.input_dim = input_dim - self.hidden_dim = hidden_dim - self.output_dim = output_dim - self.hidden_layers = hidden_layers - - self.lstm = nn.LSTM(input_dim, hidden_dim, hidden_layers, batch_first=True) - self.h2o = nn.Linear(hidden_dim, output_dim) - - def forward(self, x, y): - self.lstm.flatten_parameters() - h_t, _ = self.lstm(x) - output = self.h2o(h_t) - loss = nn.functional.mse_loss(output, y) - return loss - - net = Net(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS).to(0) - inp = [ - torch.randn((BATCH_SIZE, SEQ_LEN, INPUT_DIM)).to(0), - torch.rand((BATCH_SIZE, SEQ_LEN, OUTPUT_DIM)).to(0) - ] - - # Not checking result allclose as the parameter inconsistency exist - # prior to this change. See #37079 - self._test_base(net, inp, check_allclose=False) + @requires_gloo() + def test_cpu(self): + self._test_base(nn.Linear(2, 2), [torch.randn(30, 2)]) + + @requires_gloo() + @sandcastle_skip_if(not TEST_CUDA, "At least 1 CUDA GPUS needed") + def test_cuda(self): + self._test_base(nn.Linear(2, 2).to(0), [torch.randn(30, 2).to(0)]) + + @requires_gloo() + @sandcastle_skip_if(not TEST_CUDA, "At least 1 CUDA GPUS needed") + def test_rnn(self): + # This test is inspired by the bug reported in + # https://github.com/pytorch/pytorch/issues/36268 + BATCH_SIZE = 12 # Divisible by 2, 3, 4 + INPUT_DIM = 256 + OUTPUT_DIM = 256 + HIDDEN_DIM = 256 + N_LAYERS = 3 + SEQ_LEN = 100 + + class Net(nn.Module): + def __init__(self, input_dim, hidden_dim, output_dim, hidden_layers): + super(Net, self).__init__() + self.input_dim = input_dim + self.hidden_dim = hidden_dim + self.output_dim = output_dim + self.hidden_layers = hidden_layers + + self.lstm = nn.LSTM(input_dim, hidden_dim, hidden_layers, batch_first=True) + self.h2o = nn.Linear(hidden_dim, output_dim) + + def forward(self, x, y): + self.lstm.flatten_parameters() + h_t, _ = self.lstm(x) + output = self.h2o(h_t) + loss = nn.functional.mse_loss(output, y) + return loss + + net = Net(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS).to(0) + inp = [ + torch.randn((BATCH_SIZE, SEQ_LEN, INPUT_DIM)).to(0), + torch.rand((BATCH_SIZE, SEQ_LEN, OUTPUT_DIM)).to(0) + ] + + # Not checking result allclose as the parameter inconsistency exist + # prior to this change. See #37079 + self._test_base(net, inp, check_allclose=False) # Skip dev-asan as torch + multiprocessing spawn have known issues diff --git a/test/distributed/test_distributed_fork.py b/test/distributed/test_distributed_fork.py deleted file mode 100644 index c707a313a5e47..0000000000000 --- a/test/distributed/test_distributed_fork.py +++ /dev/null @@ -1,113 +0,0 @@ -import os -import sys -import tempfile -from functools import wraps -import torch -import torch.cuda -import torch.distributed as dist -from torch.testing._internal.common_utils import TEST_WITH_TSAN - -if not dist.is_available(): - print("Distributed not available, skipping tests", file=sys.stderr) - sys.exit(0) - -from torch.testing._internal.common_utils import TestCase, find_free_port, run_tests -from torch.distributed.distributed_c10d import _get_default_group -from torch.testing._internal.distributed.distributed_test import ( - DistributedTest, TestDistBackend -) - -torch.backends.cuda.matmul.allow_tf32 = False - -CPP_EXTENSIONS_WARNING = """ -Ninja (https://ninja-build.org) must be available to run C++ extensions tests, -but it could not be found. Install ninja with `pip install ninja` -or `conda install ninja`. -""" - -BACKEND = os.environ["BACKEND"] -INIT_METHOD = os.getenv("INIT_METHOD", "env://") - - -def skip_if_no_ninja(func): - - @wraps(func) - def wrapper(*args, **kwargs): - try: - import torch.utils.cpp_extension - torch.utils.cpp_extension.verify_ninja_availability() - except RuntimeError: - print(CPP_EXTENSIONS_WARNING) - return 0 - - return func(*args, **kwargs) - - return wrapper - -if TEST_WITH_TSAN: - print("Skip as TSAN is not fork-safe since we're forking in a multi-threaded environment", file=sys.stderr) - sys.exit(0) - -if BACKEND == "gloo" or BACKEND == "nccl": - - class TestDistBackendWithFork(TestDistBackend, DistributedTest._DistTestBase): - - def setUp(self): - super().setUp() - self._fork_processes() - torch.backends.cudnn.flags(allow_tf32=False).__enter__() - - -elif BACKEND == "mpi": - WORLD_SIZE = os.environ["WORLD_SIZE"] - dist.init_process_group(init_method=INIT_METHOD, backend="mpi") - - class TestMPIWithFork(TestCase, DistributedTest._DistTestBase): - pass - -elif BACKEND == "test": - class TestBackendDynamicLoad(TestCase): - def setUp(self): - super(TestBackendDynamicLoad, self).setUp() - - def _load_test_backend(self): - temp_dir = tempfile.mkdtemp() - src = "{}/../cpp_extensions/cpp_c10d_extension.cpp".format(os.path.abspath(os.path.dirname(__file__))) - extension = torch.utils.cpp_extension.load( - name="torch_test", - sources=[src], - build_directory=temp_dir - ) - - @skip_if_no_ninja - def test_backend_apis(self): - self._load_test_backend() - - os.environ['WORLD_SIZE'] = '1' - os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = str(find_free_port()) - os.environ['RANK'] = '0' - - dist.init_process_group(backend='test', init_method='env://', world_size=1, rank=0) - self.assertEqual(dist.get_rank(), 0) - self.assertEqual(dist.get_world_size(), 1) - - process_group = _get_default_group() - work = process_group.allreduce([torch.rand(1), torch.rand(1)]) - self.assertTrue(work.wait()) - self.assertTrue(work.is_completed()) - self.assertTrue(work.is_success()) - - work = process_group.broadcast([torch.rand(1)]) - self.assertTrue(work.wait()) - self.assertTrue(work.is_completed()) - self.assertTrue(work.is_success()) - - dist.destroy_process_group() - -if __name__ == "__main__": - assert ( - not torch.cuda._initialized - ), "test_distributed must not have initialized CUDA context on main process" - - run_tests() diff --git a/test/distributed/test_jit_c10d.py b/test/distributed/test_jit_c10d.py index be392730b3fad..65d82fb033b7d 100644 --- a/test/distributed/test_jit_c10d.py +++ b/test/distributed/test_jit_c10d.py @@ -6,7 +6,7 @@ from typing import List from torch.testing._internal.common_distributed import requires_nccl, create_tcp_store -from torch.testing._internal.common_utils import load_tests, TEST_WITH_TSAN, run_tests, sandcastle_skip_if +from torch.testing._internal.common_utils import load_tests, run_tests, sandcastle_skip_if from torch.testing._internal.jit_utils import JitTestCase # load_tests from common_utils is used to automatically filter tests for @@ -29,10 +29,6 @@ def unique_process_group_name(prefix): now = int(time.time() * 1000) return "%s_%d" % (prefix, now) -if TEST_WITH_TSAN: - print("Skip as TSAN is not fork-safe since we're forking in a multi-threaded environment", file=sys.stderr) - sys.exit(0) - class ProcessGroupNCCLJitTest(JitTestCase): MAIN_PROCESS_RANK = 0 diff --git a/test/distributed/test_launcher.py b/test/distributed/test_launcher.py index 53faefba95f95..4565a266bc9ec 100644 --- a/test/distributed/test_launcher.py +++ b/test/distributed/test_launcher.py @@ -12,7 +12,6 @@ from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, - TEST_WITH_TSAN, TestCase, run_tests, ) @@ -25,10 +24,6 @@ def path(script): print("Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr) sys.exit(0) -if TEST_WITH_TSAN: - print("Skip as TSAN is not fork-safe since we're forking in a multi-threaded environment", file=sys.stderr) - sys.exit(0) - class TestDistributedLaunch(TestCase): def test_launch_user_script(self): nnodes = 1 @@ -41,7 +36,7 @@ def test_launch_user_script(self): f"--nnodes={nnodes}", f"--nproc_per_node={nproc_per_node}", "--monitor_interval=1", - "--start_method=fork", + "--start_method=spawn", "--master_addr=localhost", f"--master_port={master_port}", "--node_rank=0", diff --git a/test/distributed/test_pg_wrapper.py b/test/distributed/test_pg_wrapper.py index de3a66712bffe..abf77d4fdaa02 100644 --- a/test/distributed/test_pg_wrapper.py +++ b/test/distributed/test_pg_wrapper.py @@ -20,7 +20,6 @@ ) from torch.testing._internal.common_utils import ( run_tests, - TEST_WITH_TSAN, TEST_WITH_DEV_DBG_ASAN, ) @@ -28,11 +27,7 @@ class AbstractProcessGroupWrapperTest(MultiProcessTestCase): def setUp(self): super(AbstractProcessGroupWrapperTest, self).setUp() - # For Windows platform, Python does not support fork, change it to spawn here. - if sys.platform == "win32": - self._spawn_processes() - else: - self._fork_processes() + self._spawn_processes() def _validate_error(self, exception, op_type, rank, tensor): err = str(exception) @@ -291,91 +286,89 @@ def test_collective_shape_mismatch(self): self._test_collective_shape_mismatch(pg, use_cuda=True) -# TSAN is not fork-safe since we're forking in a multi-threaded environment -if not TEST_WITH_TSAN: - @requires_gloo() - class ProcessGroupGlooWrapperTest(AbstractProcessGroupWrapperTest): - def setUp(self): - super(ProcessGroupGlooWrapperTest, self).setUp() - - def opts(self, threads=2, timeout=10.0): - opts = c10d.ProcessGroupGloo._Options() - opts._timeout = timeout - opts._devices = [create_device(interface=LOOPBACK)] - opts._threads = threads - return opts - - def _create_wrapper_pg(self, with_new_group=False, timeout=10.0): - store = c10d.FileStore(self.file_name, self.world_size) - c10d.init_process_group( - backend="gloo", rank=self.rank, world_size=self.world_size, store=store +@requires_gloo() +class ProcessGroupGlooWrapperTest(AbstractProcessGroupWrapperTest): + def setUp(self): + super(ProcessGroupGlooWrapperTest, self).setUp() + + def opts(self, threads=2, timeout=10.0): + opts = c10d.ProcessGroupGloo._Options() + opts._timeout = timeout + opts._devices = [create_device(interface=LOOPBACK)] + opts._threads = threads + return opts + + def _create_wrapper_pg(self, with_new_group=False, timeout=10.0): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="gloo", rank=self.rank, world_size=self.world_size, store=store + ) + if with_new_group: + pg = c10d.new_group(backend="gloo") + else: + _pg = c10d.ProcessGroupGloo( + store, self.rank, self.world_size, self.opts(timeout=timeout) ) - if with_new_group: - pg = c10d.new_group(backend="gloo") - else: - _pg = c10d.ProcessGroupGloo( - store, self.rank, self.world_size, self.opts(timeout=timeout) - ) - pg = c10d._create_process_group_wrapper( - _pg, - "unused", - store, - self.rank, - self.world_size, - timeout=timeout, - ) - return pg - - def test_collective_hang(self): - pg = self._create_wrapper_pg(timeout=2.0) - self._test_collective_hang(pg) - - # NOTE: these tests are separated by debug level instead of combined into - # one due to https://github.com/pytorch/pytorch/issues/55967, they can be - # combined after that is resolved. - @with_dist_debug_levels(levels=["DETAIL"]) - def test_collectives_op_mismatch_debug_mode(self): - pg = self._create_wrapper_pg(with_new_group=True) - self._test_collectives_op_mismatch(pg) - - @with_dist_debug_levels(levels=["OFF"]) - def test_collectives_op_mismatch(self): - pg = self._create_wrapper_pg(with_new_group=False) - self._test_collectives_op_mismatch(pg) - - @with_dist_debug_levels(levels=["DETAIL"]) - def test_collective_shape_mismatch_debug_mode(self): - pg = self._create_wrapper_pg(with_new_group=True) - self._test_collective_shape_mismatch(pg) - - @with_dist_debug_levels(levels=["OFF"]) - def test_collective_shape_mismatch(self): - pg = self._create_wrapper_pg(with_new_group=False) - self._test_collective_shape_mismatch(pg) - - @skip_if_lt_x_gpu(4) - @with_dist_debug_levels(levels=["DETAIL"]) - def test_collectives_op_mismatch_cuda_debug_mode(self): - pg = self._create_wrapper_pg(with_new_group=True) - self._test_collectives_op_mismatch(pg, use_cuda=True) - - @skip_if_lt_x_gpu(4) - @with_dist_debug_levels(levels=["OFF"]) - def test_collectives_op_mismatch_cuda(self): - pg = self._create_wrapper_pg(with_new_group=False) - self._test_collectives_op_mismatch(pg, use_cuda=True) - - @skip_if_lt_x_gpu(4) - @with_dist_debug_levels(levels=["DETAIL"]) - def test_collective_shape_mismatch_cuda_debug_mode(self): - pg = self._create_wrapper_pg(with_new_group=True) - self._test_collective_shape_mismatch(pg, use_cuda=True) - - @skip_if_lt_x_gpu(4) - @with_dist_debug_levels(levels=["OFF"]) - def test_collective_shape_mismatch_cuda(self): - pg = self._create_wrapper_pg(with_new_group=False) - self._test_collective_shape_mismatch(pg, use_cuda=True) + pg = c10d._create_process_group_wrapper( + _pg, + "unused", + store, + self.rank, + self.world_size, + timeout=timeout, + ) + return pg + + def test_collective_hang(self): + pg = self._create_wrapper_pg(timeout=2.0) + self._test_collective_hang(pg) + + # NOTE: these tests are separated by debug level instead of combined into + # one due to https://github.com/pytorch/pytorch/issues/55967, they can be + # combined after that is resolved. + @with_dist_debug_levels(levels=["DETAIL"]) + def test_collectives_op_mismatch_debug_mode(self): + pg = self._create_wrapper_pg(with_new_group=True) + self._test_collectives_op_mismatch(pg) + + @with_dist_debug_levels(levels=["OFF"]) + def test_collectives_op_mismatch(self): + pg = self._create_wrapper_pg(with_new_group=False) + self._test_collectives_op_mismatch(pg) + + @with_dist_debug_levels(levels=["DETAIL"]) + def test_collective_shape_mismatch_debug_mode(self): + pg = self._create_wrapper_pg(with_new_group=True) + self._test_collective_shape_mismatch(pg) + + @with_dist_debug_levels(levels=["OFF"]) + def test_collective_shape_mismatch(self): + pg = self._create_wrapper_pg(with_new_group=False) + self._test_collective_shape_mismatch(pg) + + @skip_if_lt_x_gpu(4) + @with_dist_debug_levels(levels=["DETAIL"]) + def test_collectives_op_mismatch_cuda_debug_mode(self): + pg = self._create_wrapper_pg(with_new_group=True) + self._test_collectives_op_mismatch(pg, use_cuda=True) + + @skip_if_lt_x_gpu(4) + @with_dist_debug_levels(levels=["OFF"]) + def test_collectives_op_mismatch_cuda(self): + pg = self._create_wrapper_pg(with_new_group=False) + self._test_collectives_op_mismatch(pg, use_cuda=True) + + @skip_if_lt_x_gpu(4) + @with_dist_debug_levels(levels=["DETAIL"]) + def test_collective_shape_mismatch_cuda_debug_mode(self): + pg = self._create_wrapper_pg(with_new_group=True) + self._test_collective_shape_mismatch(pg, use_cuda=True) + + @skip_if_lt_x_gpu(4) + @with_dist_debug_levels(levels=["OFF"]) + def test_collective_shape_mismatch_cuda(self): + pg = self._create_wrapper_pg(with_new_group=False) + self._test_collective_shape_mismatch(pg, use_cuda=True) if __name__ == "__main__": diff --git a/test/run_test.py b/test/run_test.py index e043bcd0ad152..ad3cbb90b6d30 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -65,7 +65,6 @@ 'test_dataloader', 'test_datapipe', 'distributed/test_data_parallel', - 'distributed/test_distributed_fork', 'distributed/test_distributed_spawn', 'distributions/test_constraints', 'distributions/test_distributions', @@ -212,7 +211,6 @@ 'distributed/rpc/test_faulty_agent', 'distributed/rpc/test_tensorpipe_agent', 'distributed/rpc/cuda/test_tensorpipe_agent', - 'distributed/test_distributed_fork', 'distributed/pipeline/sync/skip/test_api', 'distributed/pipeline/sync/skip/test_gpipe', 'distributed/pipeline/sync/skip/test_inspect_skip_layout', @@ -294,7 +292,6 @@ 'test_testing', 'test_view_ops', 'distributed/nn/jit/test_instantiator', - 'distributed/test_distributed_fork', 'distributed/rpc/test_tensorpipe_agent', 'distributed/rpc/cuda/test_tensorpipe_agent', 'distributed/algorithms/ddp_comm_hooks/test_ddp_hooks', @@ -576,7 +573,7 @@ def test_distributed(test_module, test_directory, options): os.environ['INIT_METHOD'] = 'env://' os.environ.update(env_vars) if with_init_file: - if test_module in ["test_distributed_fork", "test_distributed_spawn"]: + if test_module == "test_distributed_spawn": init_method = f'{FILE_SCHEMA}{tmp_dir}/' else: init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file' @@ -611,7 +608,6 @@ def test_distributed(test_module, test_directory, options): 'test_cuda_primary_ctx': test_cuda_primary_ctx, 'test_cpp_extensions_aot_no_ninja': test_cpp_extensions_aot_no_ninja, 'test_cpp_extensions_aot_ninja': test_cpp_extensions_aot_ninja, - 'distributed/test_distributed_fork': test_distributed, 'distributed/test_distributed_spawn': test_distributed, } diff --git a/test/test_determination.py b/test/test_determination.py index 6d338af4b6c8f..6b7fcc0f0d242 100644 --- a/test/test_determination.py +++ b/test/test_determination.py @@ -16,7 +16,6 @@ class DeterminationTest(unittest.TestCase): "test_jit_profiling", "test_jit", "test_torch", - "distributed/test_distributed_fork", "distributed/test_distributed_spawn", "test_cpp_extensions_aot_ninja", "test_cpp_extensions_aot_no_ninja", @@ -104,7 +103,6 @@ def test_torch_file(self): self.assertEqual( self.determined_tests(["torch/utils/cpp_extension.py"]), [ - "distributed/test_distributed_fork", "test_cpp_extensions_aot_ninja", "test_cpp_extensions_aot_no_ninja", "test_utils", diff --git a/tools/stats/print_test_stats.py b/tools/stats/print_test_stats.py index 7cc853e925181..71df463b14516 100755 --- a/tools/stats/print_test_stats.py +++ b/tools/stats/print_test_stats.py @@ -630,7 +630,6 @@ def __init__(self, name: str) -> None: def append(self, test_case: TestCase, test_type: str) -> None: is_multi_test = self.name == 'test_cpp_extensions_aot' or \ - self.name == 'distributed/test_distributed_fork' or \ self.name == 'distributed/test_distributed_spawn' or \ self.name == 'distributed/test_c10d_gloo' or \ self.name == 'cpp' # The caffe2 cpp tests spawn duplicate test cases as well. diff --git a/torch/distributed/CONTRIBUTING.md b/torch/distributed/CONTRIBUTING.md index 5e426466ec67d..6cbaea694f215 100644 --- a/torch/distributed/CONTRIBUTING.md +++ b/torch/distributed/CONTRIBUTING.md @@ -85,7 +85,6 @@ python test/distributed/test_store.py python test/distributed/test_pg_wrapper.py # Run distributed tests, including tests for Distributed Data Parallel. -python test/run_test.py --verbose -i distributed/test_distributed_fork python test/run_test.py --verbose -i distributed/test_distributed_spawn # Run the RPC test suite for the TensorPipeAgent. diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 74ed9a069604c..fb505d105980c 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -9,6 +9,7 @@ import traceback import types import unittest +import warnings from contextlib import contextmanager from datetime import timedelta from enum import Enum @@ -468,6 +469,10 @@ def _start_processes(self, proc) -> None: self.processes.append(process) def _fork_processes(self) -> None: + warnings.warn( + "Fork based multiprocessing is dangerous and should not" + " be used, for tests with ASAN consider using opt-asan", + DeprecationWarning) proc = torch.multiprocessing.get_context("fork").Process self._start_processes(proc) From 125e2d02e575612eb427104e7c67f1c28f090db8 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Fri, 20 Aug 2021 12:26:58 -0700 Subject: [PATCH 107/530] Revert D30417370: [nnc] Enable CPU fusion Test Plan: revert-hammer Differential Revision: D30417370 (https://github.com/pytorch/pytorch/commit/b9fc656cf26d60127bd695e4e5a7d27622f2563d) Original commit changeset: 84ce7a578a36 fbshipit-source-id: cd23774cdc3273fd72f8a05f1900eaf36f373e6b --- torch/csrc/jit/codegen/fuser/interface.cpp | 8 ++++++-- torch/csrc/jit/passes/graph_fuser.cpp | 12 +----------- torch/csrc/jit/passes/graph_fuser.h | 3 --- torch/csrc/jit/python/init.cpp | 2 -- 4 files changed, 7 insertions(+), 18 deletions(-) diff --git a/torch/csrc/jit/codegen/fuser/interface.cpp b/torch/csrc/jit/codegen/fuser/interface.cpp index ef7e9e0b629d5..ec67c4bd83773 100644 --- a/torch/csrc/jit/codegen/fuser/interface.cpp +++ b/torch/csrc/jit/codegen/fuser/interface.cpp @@ -8,12 +8,15 @@ #include #include +C10_DEFINE_bool(torch_jit_enable_cpu_fusion, false, "enable cpu fusion"); + namespace torch { namespace jit { namespace detail { -#ifdef TORCH_ENABLE_LLVM +// Note: CPU fusion is currently disabled due to test flakiness +#if defined(FBCODE_CAFFE2) bool cpu_fuser_enabled = true; #else bool cpu_fuser_enabled = false; @@ -34,7 +37,8 @@ void runFusion(const int64_t key, Stack& stack) { } bool canFuseOnCPU() { - return fuser::hasFusionBackend(DeviceType::CPU) && detail::cpu_fuser_enabled; + return fuser::hasFusionBackend(DeviceType::CPU) && + (detail::cpu_fuser_enabled || FLAGS_torch_jit_enable_cpu_fusion); } bool canFuseOnGPU() { diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index 653f9fec08b32..f7dd466de4ff4 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -183,7 +183,7 @@ struct GraphFuser { return !strict_fuser_check; } if ((*device).is_cpu()) { - return canFuseOnCPULegacy(); + return canFuseOnCPU(); } else if ((*device).is_cuda()) { return canFuseOnGPU(); } else if ((*device).is_xpu()) { @@ -1244,16 +1244,6 @@ void PeepholeOptimizeShapeExpressions(Block* block, AliasDb* db) { } // anonymous namespace -static bool cpu_fuser_enabled_legacy = false; - -bool canFuseOnCPULegacy() { - return cpu_fuser_enabled_legacy; -} - -void overrideCanFuseOnCPULegacy(bool value) { - cpu_fuser_enabled_legacy = value; -} - void FuseGraph(std::shared_ptr& graph, bool strict_fuser_check) { AliasDb db(graph); GraphFuser(&db, graph->block(), strict_fuser_check).run(); diff --git a/torch/csrc/jit/passes/graph_fuser.h b/torch/csrc/jit/passes/graph_fuser.h index d710e5a098098..0cdcc2e20f469 100644 --- a/torch/csrc/jit/passes/graph_fuser.h +++ b/torch/csrc/jit/passes/graph_fuser.h @@ -5,9 +5,6 @@ namespace torch { namespace jit { -TORCH_API bool canFuseOnCPULegacy(); -TORCH_API void overideCanFuseOnCPULegacy(bool value); - // NB: Be sure to run DCE before fusion, because dead instructions // can prevent fusion opportunities from being exploited. // On Windows will noop, NYI diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index f5da7b30c29d7..992e60edd7d19 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -589,8 +589,6 @@ void initJITBindings(PyObject* module) { .def("_jit_override_can_fuse_on_gpu", &overrideCanFuseOnGPU) .def("_jit_can_fuse_on_cpu", &canFuseOnCPU) .def("_jit_can_fuse_on_gpu", &canFuseOnGPU) - .def("_jit_can_fuse_on_cpu_legacy", &canFuseOnCPULegacy) - .def("_jit_override_can_fuse_on_cpu_legacy", &canFuseOnCPULegacy) .def( "_jit_differentiate", [](Graph& g) { From a65d1ae7cc8c104d041f554d460da0a84c0f2d4e Mon Sep 17 00:00:00 2001 From: BowenBao Date: Fri, 20 Aug 2021 12:44:29 -0700 Subject: [PATCH 108/530] [ONNX] Fix controlflow shape inference with contrib op (#60707) (#62762) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62762 `ONNXShapeTypeInference` for node `n` is skipped if `n` is non ONNX namespace, or if `n` contains any non ONNX namespace nodes. This prevents controlflow nodes containing contrib ops from running `SpecialPostProcess`, which sets up correct node output shape/type information in rare cases. This PR depends on opset 14 export https://github.com/pytorch/pytorch/pull/59486 Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D30375180 Pulled By: msaroufim fbshipit-source-id: 5deacec39f091deb4d75ddd9e660e12fca7f16c5 Co-authored-by: BowenBao --- test/onnx/test_custom_ops.py | 32 ++++ .../passes/onnx/fixup_onnx_controlflow.cpp | 145 +++++++++++++---- .../jit/passes/onnx/fixup_onnx_controlflow.h | 1 + .../jit/passes/onnx/shape_type_inference.cpp | 150 +++++++++++------- 4 files changed, 240 insertions(+), 88 deletions(-) diff --git a/test/onnx/test_custom_ops.py b/test/onnx/test_custom_ops.py index 739f267f90a95..04ac9a0066876 100644 --- a/test/onnx/test_custom_ops.py +++ b/test/onnx/test_custom_ops.py @@ -125,5 +125,37 @@ def symbolic_pythonop(g, n, *args, **kwargs): model = MyModule() run_model_test(self, model, input=(x, )) +class TestExportAsContribOps(unittest.TestCase): + opset_version = 14 + keep_initializers_as_inputs = False + onnx_shape_inference = True + + def test_contrib_op_with_loop(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.gelu = torch.nn.GELU() + + def forward(self, x): + res = [] + res2 = [] + for i in range(x.size(0)): + if len(res) > 0: + res2.append(res[0]) + else: + res2.append(self.gelu(x[0])) + res.append(x[0]) + return torch.stack(res), torch.stack(res2) + + def symbolic_custom_gelu(g, input): + return g.op("com.microsoft::Gelu", input).setType(input.type()) + + from torch.onnx import register_custom_op_symbolic + register_custom_op_symbolic("::gelu", symbolic_custom_gelu, 1) + + x = torch.randn(3, 3, 4, requires_grad=True) + model = torch.jit.script(M()) + run_model_test(self, model, input=(x, )) + if __name__ == "__main__": unittest.main() diff --git a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp index abfb547ed5e94..b0a310bfe20ad 100644 --- a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp +++ b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp @@ -238,9 +238,7 @@ std::vector FixupONNXLoopNode(Node* node, int opset_version) { auto new_outputs = ConvertSequenceDependencies(node, opset_version); // Copy type of block output to node output. - for (size_t i = 0; i < node->outputs().size(); ++i) { - node->output(i)->setType(node->blocks().at(0)->outputs().at(i + 1)->type()); - } + FixupONNXControlflowNodeOutputs(node); TORCH_INTERNAL_ASSERT(output_size == new_outputs.size()); return new_outputs; } @@ -347,25 +345,90 @@ void ONNXFixupUninitializedOutput(Node* node) { graph, else_block, else_block_output, then_block_output); if_node->outputs()[i]->setType(else_block->outputs()[i]->type()); } - auto then_tensor_type = - then_block->outputs().at(i)->type()->castRaw(); - auto else_tensor_type = - else_block->outputs().at(i)->type()->castRaw(); - if (then_tensor_type && else_tensor_type) { - const auto& then_shape = then_tensor_type->symbolic_sizes(); - const auto& else_shape = else_tensor_type->symbolic_sizes(); - std::vector<::c10::ShapeSymbol> dims; - if (then_shape.rank() && else_shape.rank() && - then_shape.rank() == else_shape.rank()) { - for (const auto j : c10::irange(then_shape.rank().value())) { - if (then_shape[j] == else_shape[j]) { - dims.emplace_back(then_shape[j]); - } else { - dims.emplace_back(::c10::ShapeSymbol::newSymbol()); - } + } +} + +void ONNXMergeIfBlockOutputShapes(Node* node) { + TORCH_INTERNAL_ASSERT(node->kind() == ::c10::onnx::If); + Block* then_block = node->blocks().at(0); + Block* else_block = node->blocks().at(1); + + TORCH_INTERNAL_ASSERT( + then_block->outputs().size() == else_block->outputs().size()) + + auto findCommonShape = + [](const ::c10::SymbolicShape& a, + const ::c10::SymbolicShape& b) -> ::c10::SymbolicShape { + std::vector<::c10::ShapeSymbol> dims; + if (a.rank() && b.rank() && a.rank() == b.rank()) { + for (const auto j : c10::irange(a.rank().value())) { + if (a[j] == b[j]) { + dims.emplace_back(a[j]); + } else { + dims.emplace_back(::c10::ShapeSymbol::newSymbol()); } - if_node->output(i)->setType( - then_tensor_type->withSymbolicShapes(::c10::SymbolicShape(dims))); + } + return ::c10::SymbolicShape(dims); + } + if (a.rank() && a.rank().value() > 0) { + return a; + } + if (b.rank() && b.rank().value() > 0) { + return b; + } + + return ::c10::SymbolicShape(); + }; + + auto mergeTensorType = + [&findCommonShape](TensorTypePtr a, TensorTypePtr b) -> TensorTypePtr { + if (a && b) { + const auto& a_shape = a->symbolic_sizes(); + const auto& b_shape = b->symbolic_sizes(); + auto commonShape = findCommonShape(a_shape, b_shape); + return a->withSymbolicShapes(commonShape); + } else if (a) { + return a; + } else if (b) { + return b; + } + return nullptr; + }; + + auto mergeListType = [&mergeTensorType]( + ListTypePtr a, ListTypePtr b) -> ListTypePtr { + if (a && b) { + auto a_tensor_type = a->getElementType()->cast(); + auto b_tensor_type = b->getElementType()->cast(); + auto tensor_type = mergeTensorType(a_tensor_type, b_tensor_type); + if (tensor_type) { + return a->withContained({tensor_type})->cast(); + } + // Both branches produce ListType without tensor shape. + return a; + } else if (a) { + return a; + } else if (b) { + return b; + } + return nullptr; + }; + + for (const auto i : c10::irange(else_block->outputs().size())) { + auto then_type = then_block->outputs().at(i)->type(); + auto else_type = else_block->outputs().at(i)->type(); + auto then_tensor_type = then_type->cast(); + auto else_tensor_type = else_type->cast(); + auto then_list_type = then_type->cast(); + auto else_list_type = else_type->cast(); + if (then_tensor_type || else_tensor_type) { + if (auto tensor_type = + mergeTensorType(then_tensor_type, else_tensor_type)) { + node->output(i)->setType(tensor_type); + } + } else if (then_list_type || else_list_type) { + if (auto list_type = mergeListType(then_list_type, else_list_type)) { + node->output(i)->setType(list_type); } } } @@ -376,16 +439,13 @@ std::vector FixupONNXIfNode(Node* node, int opset_version) { return node->outputs().vec(); } GRAPH_DUMP("Graph before fixing controlflow: ", node->owningGraph()); - auto* if_node = node; FixupONNXSubblockOutputs(node); - ONNXFixupUninitializedOutput(if_node); + ONNXFixupUninitializedOutput(node); // Copy type of block output to node output. - for (size_t i = 0; i < node->outputs().size(); ++i) { - node->output(i)->setType(node->blocks().at(0)->outputs().at(i)->type()); - } + ONNXMergeIfBlockOutputShapes(node); GRAPH_DUMP("Graph after fixing controlflow: ", node->owningGraph()); - return if_node->outputs().vec(); + return node->outputs().vec(); } std::vector FixupONNXControlflowNode(Node* n, int opset_version) { @@ -401,5 +461,36 @@ std::vector FixupONNXControlflowNode(Node* n, int opset_version) { } } +void FixupONNXControlflowNodeOutputs(Node* n) { + switch (n->kind()) { + case ::c10::onnx::Loop: { + auto loop_carried_output_size = n->blocks().at(0)->inputs().size() - 2; + for (auto i : c10::irange(n->outputs().size())) { + auto type = n->blocks().at(0)->outputs().at(i + 1)->type(); + if (i < loop_carried_output_size) { + n->output(i)->setType(type); + } else { + if (auto t_type = type->cast()) { + auto sizes = t_type->symbolic_sizes().sizes(); + if (sizes.has_value()) { + sizes.value().emplace( + sizes.value().begin(), c10::ShapeSymbol::newSymbol()); + type = t_type->withSymbolicShapes(sizes.value()); + } + } + n->output(i)->setType(type); + } + } + break; + } + case ::c10::onnx::If: { + ONNXMergeIfBlockOutputShapes(n); + break; + } + default: + break; + } +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.h b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.h index fad7611085223..8d33c2dd1fb5e 100644 --- a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.h +++ b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.h @@ -6,6 +6,7 @@ namespace torch { namespace jit { std::vector FixupONNXControlflowNode(Node* n, int opset_version); +void FixupONNXControlflowNodeOutputs(Node* n); } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index f630cf023f7b5..8ade722fb8bd9 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -180,7 +181,21 @@ void UpdateTorchValueByOnnxValueInfo( } } -bool IsSupportedNode(const Node* n) { +bool IsValidONNXControlflowNode(const Node* n) { + // Skip when block size is zero. This is when the node is being created, + // and doesn't have subblocks attached yet. Run shape inference for these + // nodes later, when the subgraph has already completed shape inferencing. + auto node_kind = n->kind(); + if (node_kind == ::c10::onnx::Loop || node_kind == ::c10::onnx::If) { + if (n->blocks().size() == 0) { + return false; + } + } + + return true; +} + +bool IsValidONNXNode(const Node* n) { auto node_kind = n->kind(); if (!node_kind.is_onnx()) { @@ -188,18 +203,14 @@ bool IsSupportedNode(const Node* n) { return false; } - // Skip when block size is zero. This is when the node is first created, - // doesn't have subblocks attached yet. Run shape inference for these nodes - // when the subgraph has already completed shape inferencing. - if (node_kind == ::c10::onnx::Loop || node_kind == ::c10::onnx::If) { - if (n->blocks().size() == 0) { - return false; - } - for (auto b : n->blocks()) { - for (auto b_n : b->nodes()) { - if (!IsSupportedNode(b_n)) { - return false; - } + if (!IsValidONNXControlflowNode(n)) { + return false; + } + + for (auto b : n->blocks()) { + for (auto b_n : b->nodes()) { + if (!IsValidONNXNode(b_n)) { + return false; } } } @@ -1310,6 +1321,20 @@ void SpecialPostProcess(Node* n) { } break; } + case ::c10::onnx::If: { + if (!IsValidONNXControlflowNode(n)) { + break; + } + FixupONNXControlflowNodeOutputs(n); + break; + } + case ::c10::onnx::Loop: { + if (!IsValidONNXControlflowNode(n)) { + break; + } + FixupONNXControlflowNodeOutputs(n); + break; + } } } @@ -1391,64 +1416,67 @@ void ONNXShapeTypeInference( int opset_version) { GRAPH_UPDATE( "Running ONNX shape inference for node: ", n->kind().toDisplayString()); - if (!IsSupportedNode(n)) { - return; - } - // Create a Graph containing only the single node n. - // This graph is later converted to ONNX to run shape inference. - auto n_graph = std::make_shared(); - auto clone_node = CloneNodeToGraph(n, n_graph, params_dict, opset_version); - n_graph->insertNode(clone_node); + if (IsValidONNXNode(n)) { + // Create a Graph containing only the single node n. + // This graph is later converted to ONNX to run shape inference. + auto n_graph = std::make_shared(); + auto clone_node = CloneNodeToGraph(n, n_graph, params_dict, opset_version); + n_graph->insertNode(clone_node); - // Register all node outputs as graph outputs. - for (auto output : clone_node->outputs()) { - n_graph->registerOutput(output); - } + // Register all node outputs as graph outputs. + for (auto output : clone_node->outputs()) { + n_graph->registerOutput(output); + } - // Use scalar_type_analysis without low precision cast - ScalarTypeAnalysisForONNX(n_graph, false, opset_version); + // Use scalar_type_analysis without low precision cast + ScalarTypeAnalysisForONNX(n_graph, false, opset_version); - GRAPH_DEBUG("Original torch graph: ", n->owningGraph()->toString()); - GRAPH_DEBUG( - "Cloned torch graph to run shape inference: ", n_graph->toString()); - - if (IsGraphValidForInference(n_graph)) { - // TODO: Some ops have conversion happen at Peephole pass. - // The conversion here is incomplete for these ops. - // e.g: ListConstruct, ListUnpack, etc. - std::shared_ptr model_proto; - SymbolDimMap symbol_map; - ConvertGraphToONNXProto(n_graph, model_proto, symbol_map, opset_version); + GRAPH_DEBUG("Original torch graph: ", n->owningGraph()->toString()); GRAPH_DEBUG( - "ONNX graph to run shape inference: ", prettyPrint(*model_proto)); - - // infer shape - try { - onnx::shape_inference::InferShapes(*model_proto); - UpdateOutputTypeByONNXProto(n, clone_node, *model_proto, symbol_map); - } catch (std::runtime_error& ex) { - // TODO: include this as warning once we have a more consolidated warning - // system. + "Cloned torch graph to run shape inference: ", n_graph->toString()); + + if (IsGraphValidForInference(n_graph)) { + // TODO: Some ops have conversion happen at Peephole pass. + // The conversion here is incomplete for these ops. + // e.g: ListConstruct, ListUnpack, etc. + std::shared_ptr model_proto; + SymbolDimMap symbol_map; + ConvertGraphToONNXProto(n_graph, model_proto, symbol_map, opset_version); GRAPH_DEBUG( - "ONNX shape inference fails with: ", - ex.what(), - " on graph: ", - n_graph->toString()); - // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) - const char shape_err[] = "ShapeInferenceError"; - // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) - const char type_err[] = "TypeInferenceError"; - if ((strstr(ex.what(), shape_err) == nullptr) && - (strstr(ex.what(), type_err) == nullptr)) { - throw; + "ONNX graph to run shape inference: ", prettyPrint(*model_proto)); + + // infer shape + try { + onnx::shape_inference::InferShapes(*model_proto); + UpdateOutputTypeByONNXProto(n, clone_node, *model_proto, symbol_map); + } catch (std::runtime_error& ex) { + // TODO: include this as warning once we have a more consolidated + // warning system. + GRAPH_DEBUG( + "ONNX shape inference fails with: ", + ex.what(), + " on graph: ", + n_graph->toString()); + // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) + const char shape_err[] = "ShapeInferenceError"; + // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) + const char type_err[] = "TypeInferenceError"; + // NOLINTNEXTLINE(modernize-use-nullptr) + if ((strstr(ex.what(), shape_err) == NULL) && + // NOLINTNEXTLINE(modernize-use-nullptr) + (strstr(ex.what(), type_err) == NULL)) { + throw; + } } + GRAPH_DEBUG( + "ONNX graph after shape inference: ", prettyPrint(*model_proto)); } - GRAPH_DEBUG( - "ONNX graph after shape inference: ", prettyPrint(*model_proto)); } SpecialPostProcess(n); - ProcessConstantValueMap(n, opset_version); + if (IsValidONNXNode(n)) { + ProcessConstantValueMap(n, opset_version); + } GRAPH_DEBUG( "Torch graph after shape inference:", n->owningGraph()->toString()); } From 87602549112aac84c6f36fa2b2fd18902ec6bef6 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Fri, 20 Aug 2021 12:44:29 -0700 Subject: [PATCH 109/530] [ONNX] Fix an issue that optimizations might adjust graph inputs unexpectedly. (#61280) (#62763) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62763 This PR is to fix the issue that the graph inputs might be updated when we export the model in inference mode. When a model is export in inference mode, some optimizations will be made. One side effect of these optimizations is: the inputs of graph might be adjusted. Such optimizatiosn include: 1. Conv and BatchNorm op fusion. 2. Do constant folding. If the user sets export_params=False, or set keep_initializers_as_inputs=True, it's highly possible that the user wants to provide the corresponding parameters or initiliazers as the inputs of the graph. In such situation, no matter the model is export in inference mode or training mode, exporter needs to prevent above optimizations from adjusting the graph inputs. By this, the inputs of graph could match inputs that users provided. The changes in this PR, add an additional common judgement to see if the above optimizations needs to be done or not. From the value of export_params and keep_initializers_as_inputs arguments, infer if the graph inputs are allowed to be adjusted. If no, these optimizations will be ignored, even other requirements are matched. Besides these code changes, the comments of some parameters below have been updated so that users have more thoughts when they consider how to leverage these parameters for different purposes: 1. export_params 2. training 3. do_constant_folding 4. keep_initializers_as_inputs Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D30375183 Pulled By: msaroufim fbshipit-source-id: 4db8b9695649eb32a3a0fefa950ee2e5651bdba0 Co-authored-by: fatcat-z --- .../expect/TestOperators.test_prelu.expect | 28 +++++----- ...ors.test_retain_param_name_disabled.expect | 52 ++++++++++++++----- torch/_C/__init__.pyi.in | 2 +- torch/csrc/jit/passes/onnx/eval_peephole.cpp | 21 ++++++-- torch/csrc/jit/passes/onnx/eval_peephole.h | 3 +- torch/csrc/jit/python/init.cpp | 5 +- torch/onnx/__init__.py | 22 ++++++-- torch/onnx/utils.py | 19 ++++--- 8 files changed, 107 insertions(+), 45 deletions(-) diff --git a/test/onnx/expect/TestOperators.test_prelu.expect b/test/onnx/expect/TestOperators.test_prelu.expect index e19623cfd4460..be0328e5c61b7 100644 --- a/test/onnx/expect/TestOperators.test_prelu.expect +++ b/test/onnx/expect/TestOperators.test_prelu.expect @@ -2,20 +2,30 @@ ir_version: 6 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { + node { + input: "weight" + output: "2" + name: "Unsqueeze_0" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 1 + ints: 2 + type: INTS + } + } node { input: "input" - input: "4" + input: "2" output: "3" - name: "PRelu_0" + name: "PRelu_1" op_type: "PRelu" } name: "torch-jit-export" initializer { dims: 2 - dims: 1 - dims: 1 data_type: 1 - name: "4" + name: "weight" raw_data: "\000\000\200>\000\000\200>" } input { @@ -41,7 +51,7 @@ graph { } } input { - name: "4" + name: "weight" type { tensor_type { elem_type: 1 @@ -49,12 +59,6 @@ graph { dim { dim_value: 2 } - dim { - dim_value: 1 - } - dim { - dim_value: 1 - } } } } diff --git a/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect b/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect index 5eeaa875feb0c..aa9499e27ac49 100644 --- a/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect +++ b/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect @@ -2,33 +2,57 @@ ir_version: 6 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { + node { + input: "1" + output: "3" + name: "Transpose_0" + op_type: "Transpose" + attribute { + name: "perm" + ints: 1 + ints: 0 + type: INTS + } + } node { input: "input.1" - input: "7" + input: "3" output: "4" - name: "MatMul_0" + name: "MatMul_1" op_type: "MatMul" } + node { + input: "2" + output: "5" + name: "Transpose_2" + op_type: "Transpose" + attribute { + name: "perm" + ints: 1 + ints: 0 + type: INTS + } + } node { input: "4" - input: "8" + input: "5" output: "6" - name: "MatMul_1" + name: "MatMul_3" op_type: "MatMul" } name: "torch-jit-export" initializer { - dims: 4 dims: 5 + dims: 4 data_type: 1 - name: "7" + name: "1" raw_data: "\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@" } initializer { - dims: 5 dims: 6 + dims: 5 data_type: 1 - name: "8" + name: "2" raw_data: "\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@" } input { @@ -48,32 +72,32 @@ graph { } } input { - name: "7" + name: "1" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 4 + dim_value: 5 } dim { - dim_value: 5 + dim_value: 4 } } } } } input { - name: "8" + name: "2" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 5 + dim_value: 6 } dim { - dim_value: 6 + dim_value: 5 } } } diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index b683a60615dc5..4d0245c7786af 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -324,7 +324,7 @@ def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ... def _jit_pass_onnx_fold_if(graph: Graph) -> None: ... def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ... def _jit_pass_inline_fork_wait(graph: Graph) -> None: ... -def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... +def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue], isAllowedToAdjustGraphInputs: _bool) -> Dict[str, IValue]: ... def _jit_pass_onnx_constant_fold(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> Dict[str, IValue]: ... def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ... diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.cpp b/torch/csrc/jit/passes/onnx/eval_peephole.cpp index 05afb69ef0f23..4bad9367af444 100644 --- a/torch/csrc/jit/passes/onnx/eval_peephole.cpp +++ b/torch/csrc/jit/passes/onnx/eval_peephole.cpp @@ -141,14 +141,27 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) { } } -void EvalPeepholeONNX(Block* b, ParamMap& paramsDict) { +void EvalPeepholeONNX( + Block* b, + ParamMap& paramsDict, + bool isAllowedToAdjustGraphInputs) { auto valsToParamsMap = buildValueToParamsMap(b, paramsDict); - fuseConvBatchNorm(b, valsToParamsMap); + + // Optimizations like fusing Conv and BatchNorm ops may adjust the graph + // inputs. If the graph inputs are not allowed to be adjusted, for example + // export_params is False, such optimizations will be skipped. + if (isAllowedToAdjustGraphInputs) { + fuseConvBatchNorm(b, valsToParamsMap); + } + buildParamsMapFromValueToParamsMap(valsToParamsMap, paramsDict); } -void EvalPeepholeONNX(std::shared_ptr& g, ParamMap& paramsDict) { - EvalPeepholeONNX(g->block(), paramsDict); +void EvalPeepholeONNX( + std::shared_ptr& g, + ParamMap& paramsDict, + bool isAllowedToAdjustGraphInputs) { + EvalPeepholeONNX(g->block(), paramsDict, isAllowedToAdjustGraphInputs); GRAPH_DUMP("After EvalPeepholeONNX:", g); } diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.h b/torch/csrc/jit/passes/onnx/eval_peephole.h index 6f8961d08fd5e..d953f2c2e5bda 100644 --- a/torch/csrc/jit/passes/onnx/eval_peephole.h +++ b/torch/csrc/jit/passes/onnx/eval_peephole.h @@ -9,7 +9,8 @@ namespace jit { void EvalPeepholeONNX( std::shared_ptr& g, - std::map& paramDict); + std::map& paramDict, + bool isAllowedToAdjustGraphInputs); } // namespace jit diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 992e60edd7d19..86b64b8342a7d 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -203,8 +203,9 @@ void initJITBindings(PyObject* module) { .def( "_jit_pass_onnx_eval_peephole", [](std::shared_ptr& graph, - std::map& paramsDict) { - EvalPeepholeONNX(graph, paramsDict); + std::map& paramsDict, + bool isAllowedToAdjustGraphInputs) { + EvalPeepholeONNX(graph, paramsDict, isAllowedToAdjustGraphInputs); return paramsDict; }, pybind11::return_value_policy::move) diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index b726b2b55e8b6..e058acce1947d 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -103,11 +103,17 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM export_params (bool, default True): if True, all parameters will be exported. Set this to False if you want to export an untrained model. In this case, the exported model will first take all of its parameters - as arguments, with the ordering as specified by ``model.state_dict().values()`` + as arguments, with the ordering as specified by ``model.state_dict().values()``. + This helps in stripping parameters from the model which is useful for training. + Besides, if this is False, any optimization that may adjust graph inputs will + be skipped - for example, Conv and BatchNorm fusion. verbose (bool, default False): if True, prints a description of the model being exported to stdout. training (enum, default TrainingMode.EVAL): - * ``TrainingMode.EVAL``: export the model in inference mode. + * ``TrainingMode.EVAL``: export the model in inference mode. In this case, optimizations + (e.g., fusing Conv and BatchNorm ops) may adjust graph inputs by modifying model params + and model param names. Such adjustment could be skipped by setting export_params = False + or keep_initializers_as_inputs = True. * ``TrainingMode.PRESERVE``: export the model in inference mode if model.training is False and in training mode if model.training is True. * ``TrainingMode.TRAINING``: export the model in training mode. Disables optimizations @@ -184,6 +190,8 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM do_constant_folding (bool, default False): Apply the constant-folding optimization. Constant-folding will replace some of the ops that have all constant inputs with pre-computed constant nodes. + Since this optimization adjusts model initializers, it will be disabled if + export_params = False or keep_initializers_as_inputs = True. example_outputs (T or a tuple of T, where T is Tensor or convertible to Tensor, default None): Must be provided when exporting a ScriptModule or ScriptFunction, ignored otherwise. Used to determine the type and shape of the outputs without tracing the execution of @@ -265,9 +273,13 @@ def forward(self, x): keep_initializers_as_inputs (bool, default None): If True, all the initializers (typically corresponding to parameters) in the - exported graph will also be added as inputs to the graph. If False, - then initializers are not added as inputs to the graph, and only - the non-parameter inputs are added as inputs. + exported graph will also be added as inputs to the graph. + + If False, then initializers are not added as inputs to the graph, and only + the non-parameter inputs are added as inputs. Meanwhile, the optimization + that might adjust graph inputs will be skipped (e.g., fusing Conv and + BatchNorm ops), even when the user export this model in inference mode. + This may allow for better optimizations (e.g. constant folding) by backends/runtimes. diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 41ba20f3ad102..7860e38034028 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -439,7 +439,8 @@ def _model_to_graph(model, args, verbose=False, example_outputs=None, _retain_param_name=False, do_constant_folding=True, _disable_torch_constant_prop=False, fixed_batch_size=False, - training=None, dynamic_axes=None): + training=None, dynamic_axes=None, export_params=True, + keep_initializers_as_inputs=False): r"""Converts model into an ONNX graph. Returns: @@ -498,10 +499,12 @@ def _model_to_graph(model, args, verbose=False, params_dict = _get_named_param_dict(graph, params) - if training is None or training == TrainingMode.EVAL: - params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict) + allow_adjust_graph_inputs = (export_params and not keep_initializers_as_inputs) + if (training is None or training == TrainingMode.EVAL): + params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict, allow_adjust_graph_inputs) - if do_constant_folding and _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions: + if do_constant_folding and allow_adjust_graph_inputs and \ + _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions: params_dict = torch._C._jit_pass_onnx_constant_fold(graph, params_dict, _export_onnx_opset_version) torch._C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph) @@ -569,7 +572,9 @@ def _export_to_pretty_string(model, args, f, export_params=True, verbose=False, output_names, operator_export_type, example_outputs, _retain_param_name, val_do_constant_folding, fixed_batch_size=fixed_batch_size, - training=training) + training=training, + export_params=export_params, + keep_initializers_as_inputs=val_keep_init_as_ip) return graph._pretty_print_onnx(params_dict, opset_version, False, operator_export_type, google_printer, @@ -685,7 +690,9 @@ def _export(model, args, f, export_params=True, verbose=False, training=None, val_do_constant_folding, fixed_batch_size=fixed_batch_size, training=training, - dynamic_axes=dynamic_axes) + dynamic_axes=dynamic_axes, + export_params=export_params, + keep_initializers_as_inputs=val_keep_init_as_ip) # TODO: Don't allocate a in-memory string for the protobuf defer_weight_export = export_type is not ExportTypes.PROTOBUF_FILE From db0771b05d81e9ca5e46740b09589a8ff0bc3ec0 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Fri, 20 Aug 2021 12:44:29 -0700 Subject: [PATCH 110/530] [ONNX] Update repeat_interleave for dynamic repeats (#59979) (#62764) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62764 Fixes #58733 - Support dynamic interleave for cases with dynamic repeat values - Moved repeat_interleave symbolic from opset 11 to opset 13, as sequence as output types for loop outputs is needed for this change Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D30375179 Pulled By: msaroufim fbshipit-source-id: 787f96bf91d124fd0483761088c5f4ae930d96a9 Co-authored-by: Shubham Bhokare --- test/onnx/test_pytorch_onnx_onnxruntime.py | 61 ++++++++--- torch/onnx/symbolic_opset11.py | 104 ------------------ torch/onnx/symbolic_opset13.py | 118 ++++++++++++++++++++- torch/onnx/symbolic_opset9.py | 7 +- 4 files changed, 171 insertions(+), 119 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 67903fb0bd94c..fd1062946941c 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -4323,7 +4323,7 @@ def forward(self, x): x = torch.tensor([[1, 2], [3, 4]]) self.run_test(RepeatsDimsModel2(), (x,)) - @skipIfUnsupportedMinOpsetVersion(11) + @skipIfUnsupportedMinOpsetVersion(13) def test_dynamic_repeat_interleave(self): class SingleDynamicModel(torch.nn.Module): def forward(self, x): @@ -4345,25 +4345,62 @@ def forward(self, x): self.run_test(NegDynamicModel(), x, test_with_inputs=[another_x], input_names=["input_1"], dynamic_axes={"input_1" : {1 : "w"}}) - class SingleDynamicModel2(torch.nn.Module): + class SingleDynamicModelFloat(torch.nn.Module): def forward(self, x): repeats = torch.tensor([4]) return torch.repeat_interleave(x, repeats, dim=0) - x = torch.tensor([[1, 2], [3, 4]]) - another_x = torch.tensor([[7, 8], [5, 6]]) - self.run_test(SingleDynamicModel2(), x, test_with_inputs=[another_x], + x = torch.tensor([[1.1, 2.1], [3.1, 4.1]]) + another_x = torch.tensor([[7.1, 8.1], [5.1, 6.1]]) + self.run_test(SingleDynamicModelFloat(), x, test_with_inputs=[another_x], input_names=["input_1"], dynamic_axes={"input_1" : {0 : "h"}}) - class AllDynamicModel(torch.nn.Module): - def forward(self, x): - repeats = torch.tensor([4]) - return torch.repeat_interleave(x, repeats, dim=0) + class DynamicRepeatsModel(torch.nn.Module): + def forward(self, x, repeats): + return torch.repeat_interleave(x, repeats, dim=1) - x = torch.tensor([[1, 2, 4, 16], [3, 9, 27, 81], [2, 3, 5, 7]]) + x = torch.tensor([[1, 2, 4], [3, 4, 7]]) another_x = torch.tensor([[7, 8], [5, 6]]) - self.run_test(AllDynamicModel(), x, test_with_inputs=[another_x], - input_names=["input_1"], dynamic_axes={"input_1" : {0 : "h", 1 : "w"}}) + repeats = torch.tensor([2]) + another_repeats = torch.tensor([4]) + self.run_test(DynamicRepeatsModel(), (x, repeats), test_with_inputs=[(another_x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"input_1" : {1 : "w"}, "repeats_1" : {0 : "r"}}) + + class DynamicRepeatsModel2(torch.nn.Module): + def forward(self, x, repeats): + return torch.repeat_interleave(x, repeats, dim=1) + + x = torch.tensor([[1, 2, 4], [3, 4, 7]]) + repeats = torch.tensor([2]) + another_repeats = torch.tensor([4]) + self.run_test(DynamicRepeatsModel2(), (x, repeats), test_with_inputs=[(x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"repeats_1" : {0 : "r"}}) + + @skipIfUnsupportedMinOpsetVersion(13) + def test_multiple_dynamic_repeat_interleave(self): + class DynamicRepeatsModel(torch.nn.Module): + def forward(self, x, repeats): + return torch.repeat_interleave(x, repeats, dim=1) + + x = torch.tensor([[1, 2, 4], [3, 4, 7]]) + repeats = torch.tensor([2, 3, 4]) + another_repeats = torch.tensor([4, 3, 2]) + self.run_test(DynamicRepeatsModel(), (x, repeats), test_with_inputs=[(x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"repeats_1" : {0 : "r"}}) + + class DynamicRepeatsModel2(torch.nn.Module): + def forward(self, x, repeats): + return torch.repeat_interleave(x, repeats, dim=0) + + x = torch.tensor([[1, 2, 4], [3, 4, 7]]) + repeats = torch.tensor([2, 3]) + another_repeats = torch.tensor([4, 3]) + self.run_test(DynamicRepeatsModel2(), (x, repeats), test_with_inputs=[(x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"repeats_1" : {0 : "r"}}) def test_view(self): class ViewModel(torch.nn.Module): diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index ed7abf263f31c..53440f15928ee 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -894,110 +894,6 @@ def chunk(g, self, chunks, dim): chunk_vec = g.op("Concat", *chunk_vec, axis_i=0) return split(g, self, chunk_vec, dim) -def repeat_interleave(g, self, repeats, dim=None, output_size=None): - input = self - final_dim = dim - # if dim is None flatten - # By default, use the flattened input array, and return a flat output array - if sym_help._is_none(dim): - input = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1]))) - dim = 0 - else: - dim = sym_help._maybe_get_scalar(dim) - - repeats_dim = sym_help._get_tensor_rank(repeats) - repeats_sizes = sym_help._get_tensor_sizes(repeats) - input_sizes = sym_help._get_tensor_sizes(input) - if repeats_dim is None: - raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown " - "repeats rank.") - if repeats_sizes is None: - raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown " - "repeats size.") - if input_sizes is None: - raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown " - "input size.") - # Handle cases where dim is negative - if dim < 0: - dim += len(input_sizes) - - output_sizes = input_sizes.copy() - perm_i = [0] - for idx, input_size in enumerate(input_sizes): - perm_i.append(idx + 1) - if input_size is None: - output_sizes[idx], input_sizes[idx] = 0, -1 - perm_i[0], perm_i[dim] = perm_i[dim], perm_i[0] - - # Cases when repeats is a single value tensor and dim has unknown input size - if (repeats_dim == 0 or (repeats_dim == 1 and repeats_sizes[0] == 1)) and output_sizes[dim] == 0: - if not sym_help._is_tensor(repeats): - repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) - reps = sym_help._size_helper(g, input, dim) - reps = unsqueeze(g, reps, 0) - repeats = g.op("Expand", repeats, reps) - # There are cases when the repeats are 1-d tensor with multiple repeats, but dim - # provided along one of the dynamic axes provided. A simple example would be - # input.shape -> [1, 1, *] where * represents the dynamic axes, and dim = 2 - # Now, repeat interleaving can be performed in pytorch when the value of * matches - # with the number of elements in repeat, for example if * -> 2, number of repeats - # should be 2 as well. - else: - return torch.onnx.symbolic_opset9.repeat_interleave(g, self, repeats, final_dim) - - reps_like = g.op("ConstantOfShape", g.op("Shape", repeats), - value_t=torch.tensor([1], dtype=torch.long)) - r_splits = split(g, repeats, reps_like, 0) - i_splits = split(g, input, reps_like, dim) - - output_sizes[dim], input_sizes[dim] = -1, 1 - - # Create a loop to iterate over each value along the dimension - # and perform individual interleaving using the repeats tensor - # Loop is of the following pattern - # input (trip_count, cond) - # int trip_count = ...; - # bool cond = ...; - # for (int i=0; i < trip_count && cond; ++i) { - # cond = ...; - # } - - # Loop conditions - loop_condition = g.op("Constant", value_t=torch.tensor(1)) - loop_condition = g.op("Cast", loop_condition, to_i=9) - loop_len = reps - loop = g.op("Loop", loop_len, loop_condition) - - # Loop inputs - loop_block = _add_block(loop.node()) - block_input_iter = _add_input_to_block(loop_block) - cond = _add_input_to_block(loop_block) - - r_split = loop_block.op("SequenceAt", r_splits, block_input_iter) - i_split = loop_block.op("SequenceAt", i_splits, block_input_iter) - - i_split = unsqueeze(loop_block, i_split, dim + 1) - r_concat = [loop_block.op("Constant", value_t=torch.LongTensor(input_sizes[:dim + 1])), - r_split, - loop_block.op("Constant", value_t=torch.LongTensor(input_sizes[dim + 1:]))] - r_concat = loop_block.op("Concat", *r_concat, axis_i=0) - i_split = expand(loop_block, i_split, r_concat, None) - i_split = sym_help._reshape_helper(loop_block, i_split, - g.op("Constant", value_t=torch.LongTensor(output_sizes))) - - # Loop outputs - cond_out = loop_block.op("Cast", loop_condition, to_i=9) - _add_output_to_block(loop_block, cond_out) - _add_output_to_block(loop_block, i_split) - loop_out = loop.node().output() - - # In this loop, the outputs are scan outputs and are concatenated along - # the zero'th dimension (by default). In order to avoid this and concatenate - # along the dimension provided, some post-processing is required - loop_out = g.op("Transpose", loop_out, perm_i=perm_i) - return sym_help._reshape_helper(g, loop_out, - g.op("Constant", value_t=torch.LongTensor(output_sizes))) - def normal(g, loc, scale, seed): # If you can sample from a given distribution with mean 0 and variance 1, then you can easily sample from a diff --git a/torch/onnx/symbolic_opset13.py b/torch/onnx/symbolic_opset13.py index 7f20833571a53..0baf785757702 100644 --- a/torch/onnx/symbolic_opset13.py +++ b/torch/onnx/symbolic_opset13.py @@ -5,7 +5,9 @@ import torch import torch.onnx.symbolic_helper as sym_help from torch.onnx.symbolic_helper import parse_args, _unimplemented -from torch.onnx.symbolic_opset9 import overload_by_arg_count, _maybe_cast_reduce_op_input, nonzero +from torch.onnx.symbolic_opset9 import overload_by_arg_count, _maybe_cast_reduce_op_input, nonzero, expand +from torch.onnx.symbolic_opset11 import unsqueeze +from torch.onnx.utils import _add_block, _add_input_to_block, _add_output_to_block # EDITING THIS FILE? READ THIS FIRST! @@ -196,3 +198,117 @@ def unsafe_chunk(g, self, chunks, dim, _outputs=None): # user's modules. splits = g.op("Constant", value_t=torch.tensor(splits, dtype=torch.long)) return g.op("Split", self, splits, axis_i=dim, outputs=_outputs) + +def repeat_interleave(g, self, repeats, dim=None, output_size=None): + input = self + final_dim = dim + # if dim is None flatten + # By default, use the flattened input array, and return a flat output array + if sym_help._is_none(dim): + input = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1]))) + dim = 0 + else: + dim = sym_help._maybe_get_scalar(dim) + + repeats_dim = sym_help._get_tensor_rank(repeats) + repeats_sizes = sym_help._get_tensor_sizes(repeats) + input_sizes = sym_help._get_tensor_sizes(input) + if repeats_dim is None: + raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown " + "repeats rank.") + if repeats_sizes is None: + raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown " + "repeats size.") + if input_sizes is None: + raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown " + "input size.") + # Handle cases where dim is negative + if dim < 0: + dim += len(input_sizes) + + output_sizes = input_sizes.copy() + for idx, input_size in enumerate(input_sizes): + if input_size is None: + output_sizes[idx], input_sizes[idx] = 0, -1 + print(output_sizes, input_sizes) + + cond_dynamic_repeats = (repeats_dim == 1 and repeats_sizes[0] is None) + # If input size is dynamic or repeats vector is dynamic + if output_sizes[dim] == 0 or cond_dynamic_repeats: + reps = sym_help._size_helper(g, input, dim) + reps = unsqueeze(g, reps, 0) + # Check if repeats vector is a single integer value + # or a single dimension tensor with non-dynamic values + if repeats_dim == 0 or (repeats_dim == 1 and repeats_sizes[0] == 1): + if not sym_help._is_tensor(repeats): + repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) + repeats = g.op("Expand", repeats, reps) + # Check if repeats is dynamic + # As repeats is dynamic, we use a where node as a substitute for the if statement + # If repests_dim = 1, expand repeats otherwise use original tensor + elif cond_dynamic_repeats: + repeat_dim = sym_help._size_helper(g, repeats, g.op("Constant", value_t=torch.LongTensor([0]))) + repeat_cond = g.op("Equal", repeat_dim, g.op("Constant", value_t=torch.LongTensor([1]))) + repeats = where(g, repeat_cond, g.op("Expand", repeats, reps), repeats) + # There are cases when the repeats are 1-d tensor with multiple repeats, but dim + # provided along one of the dynamic axes provided. A simple example would be + # input.shape -> [1, 1, *] where * represents the dynamic axes, and dim = 2 + # Now, repeat interleaving can be performed in pytorch when the value of * matches + # with the number of elements in repeat, for example if * -> 2, number of repeats + # should be 2 as well. + else: + return torch.onnx.symbolic_opset9.repeat_interleave(g, self, repeats, final_dim) + + reps_like = g.op("ConstantOfShape", g.op("Shape", repeats), + value_t=torch.tensor([1], dtype=torch.long)) + r_splits = split(g, repeats, reps_like, 0) + i_splits = split(g, input, reps_like, dim) + + output_sizes[dim], input_sizes[dim] = -1, 1 + + # Create a loop to iterate over each value along the dimension + # and perform individual interleaving using the repeats tensor + # Loop is of the following pattern + # input (trip_count, cond) + # int trip_count = ...; + # bool cond = ...; + # for (int i=0; i < trip_count && cond; ++i) { + # cond = ...; + # } + + # Loop conditions + loop_condition = g.op("Constant", value_t=torch.tensor(1)) + loop_condition = g.op("Cast", loop_condition, to_i=9) + loop_len = reps + + # Create an empty sequence to store final expansions + final_splits = g.op("SequenceEmpty") + loop = g.op("Loop", loop_len, loop_condition, final_splits) + + # Loop inputs + loop_block = _add_block(loop.node()) + block_input_iter = _add_input_to_block(loop_block) + cond = _add_input_to_block(loop_block) + final_splits = _add_input_to_block(loop_block) + + r_split = loop_block.op("SequenceAt", r_splits, block_input_iter) + i_split = loop_block.op("SequenceAt", i_splits, block_input_iter) + + i_split = unsqueeze(loop_block, i_split, dim + 1) + r_concat = [loop_block.op("Constant", value_t=torch.LongTensor(input_sizes[:dim + 1])), + r_split, + loop_block.op("Constant", value_t=torch.LongTensor(input_sizes[dim + 1:]))] + r_concat = loop_block.op("Concat", *r_concat, axis_i=0) + i_split = expand(loop_block, i_split, r_concat, None) + i_split = sym_help._reshape_helper(loop_block, i_split, + g.op("Constant", value_t=torch.LongTensor(output_sizes))) + final_splits = loop_block.op("SequenceInsert", final_splits, i_split) + + # Loop outputs + cond_out = loop_block.op("Cast", loop_condition, to_i=9) + _add_output_to_block(loop_block, cond_out) + _add_output_to_block(loop_block, final_splits) + + loop_out = loop.node().output() + loop_out = g.op("ConcatFromSequence", loop_out, axis_i=dim) + return loop_out diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 993284a292a96..ce59e15bb354d 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -2058,7 +2058,7 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None): if not sym_help._is_tensor(repeats): repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) if input_sizes[dim] == 0: - return sym_help._onnx_opset_unsupported_detailed("repeat_interleave", 9, 11, + return sym_help._onnx_opset_unsupported_detailed("repeat_interleave", 9, 13, "Unsupported along dimension with unknown input size") else: reps = input_sizes[dim] @@ -2067,8 +2067,11 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None): # Cases where repeats is a 1 dim Tensor elif repeats_dim == 1: if input_sizes[dim] == 0: - return sym_help._onnx_opset_unsupported_detailed("repeat_interleave", 9, 11, + return sym_help._onnx_opset_unsupported_detailed("repeat_interleave", 9, 13, "Unsupported along dimension with unknown input size") + if repeats_sizes[0] is None: + return sym_help._onnx_opset_unsupported_detailed("repeat_interleave", 9, 13, + "Unsupported for cases with dynamic repeats") assert repeats_sizes[0] == input_sizes[dim], "repeats must have the same size as input along dim" reps = repeats_sizes[0] else: From 1dd648f1c40c24a3d5a151581a8129652191fa86 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Fri, 20 Aug 2021 12:44:29 -0700 Subject: [PATCH 111/530] [ONNX] Suppport torch.dot and torch.nn.utils.spectral_norm (#62596) (#62765) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62765 Fixes #27723 Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D30375181 Pulled By: msaroufim fbshipit-source-id: 715f4745899757ec405877980cd20c826028eb2c Co-authored-by: BowenBao --- test/onnx/test_pytorch_onnx_onnxruntime.py | 21 +++++++++++++++++++++ torch/onnx/symbolic_opset9.py | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index fd1062946941c..865b3656dbbdf 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -5722,6 +5722,27 @@ def forward(self, input, other): y = torch.randint(10, (5, )) self.run_test(MatmulModel(), (x, y)) + @skipIfUnsupportedMinOpsetVersion(9) # MatMul long inputs is added in ONNX opset 9. + def test_dot(self): + class MatmulModel(torch.nn.Module): + def forward(self, input, other): + return torch.dot(input, other) + + x = torch.randn(5, requires_grad=True) + y = torch.randn(5, requires_grad=True) + self.run_test(MatmulModel(), (x, y)) + + x = torch.randint(10, (5, )) + y = torch.randint(10, (5, )) + self.run_test(MatmulModel(), (x, y)) + + @disableScriptTest() # SpectralNorm not TorchScript compatible. + def test_spectral_norm(self): + m = torch.nn.utils.spectral_norm(torch.nn.Linear(2, 4)) + + x = torch.randn(6, 2) + self.run_test(m, (x, )) + def test_prelu(self): class PReluModel(torch.nn.Module): def __init__(self): diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index ce59e15bb354d..70bb8282570e2 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -3138,6 +3138,10 @@ def mv(g, self, vec): return matmul(g, self, vec) +def dot(g, self, other): + return matmul(g, self, other) + + @parse_args('v', 'v') def fill(g, self, value): dtype = self.type().scalarType() From 07e41cf2d7e4cd36443c1401e7dbb9970a50df82 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 20 Aug 2021 12:56:01 -0700 Subject: [PATCH 112/530] [easy]Unbreak caffe2benchmarking build (#63655) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63655 ghstack-source-id: 136324310 Test Plan: buck build //fbobjc/Apps/Internal/Caffe2Benchmarking:Caffe2Benchmarking fbobjc/mode/iphonesimulator Reviewed By: hl475, JacobSzwejbka Differential Revision: D30455659 fbshipit-source-id: b6da6be4f89b6e84753ef0849ffedea04785034a --- binaries/benchmark_helper.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index b0e1ae06be8d8..7690e356adaa0 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -16,6 +16,7 @@ #include #include +#include #include #include #ifdef _WIN32 From b008bb4443250276f9bfc50bb338c368f1a414cb Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 20 Aug 2021 13:13:54 -0700 Subject: [PATCH 113/530] Include iostream in ProcessGroupMPI.cpp (#63656) Summary: As it uses `std::cerr`, which in turn results in compilation regression introduced by https://github.com/pytorch/pytorch/pull/61500 Fixes https://github.com/pytorch/pytorch/issues/63653 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63656 Reviewed By: ejguan Differential Revision: D30455824 Pulled By: malfet fbshipit-source-id: 29f316e7f7fd8e7dcbee2666e7a985f25bf56515 --- torch/csrc/distributed/c10d/ProcessGroupMPI.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp index aa6d81bbe4a13..b75f4417e832a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include From a8de0d83fed2d68512c0b0e20716bd63e6769469 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Fri, 20 Aug 2021 14:00:20 -0700 Subject: [PATCH 114/530] empty caching allocator before test_avg_pool2d large subtest (#63528) Summary: Otherwise, unrecoverable OOM occurs on MI25. Fixes broken ROCm CI test1. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63528 Reviewed By: malfet, zhouzhuojie Differential Revision: D30459151 Pulled By: walterddr fbshipit-source-id: 63e205c4f486fcbdd514cfb0ed8e38584f894585 --- test/test_nn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_nn.py b/test/test_nn.py index d21e0477715db..bb109cf20e459 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -13936,6 +13936,9 @@ def helper(n, c, h, w, kernel_size, stride=None, helper(4, 8, 8, 8, 3, count_include_pad=False, padding=2, stride=2) helper(4, 8, 8, 8, 3, divisor_override=42) helper(4, 8, 8, 8, 7) + # ROCm 16GB MI25 hits OOM error. Clear caching allocator prior to running large subtest. + if TEST_WITH_ROCM and 'cuda' in device: + torch.cuda.empty_cache() helper(200, 512, 28, 28, 2) helper(4, 8, 7, 7, 3, stride=1) helper(4, 8, 7, 7, 3, padding=2, stride=1) From da0820e553a1ff89dbfd37c591154e8326748fab Mon Sep 17 00:00:00 2001 From: jiayisun Date: Fri, 20 Aug 2021 14:54:51 -0700 Subject: [PATCH 115/530] add BFloat16 operators on CPU: range, sinh, cosh, frexp, nan_to_num (#61826) Summary: Added BFloat16 support for range, sinh, cosh, frexp, and nan_to_num on CPU, and collected the benchmark data of these OPs(range, sinh, cosh, frexp, and nan_to_num) for BFloat16 and Float32 data type by using the operator_benchmark tool of PyTorch on the platform of Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz Number of cores: 1 core, 28 cores(1 socket) [cosh_sinh_benchmark.txt](https://github.com/pytorch/pytorch/files/6974313/cosh_sinh_benchmark.txt) [frexp_benchmark.txt](https://github.com/pytorch/pytorch/files/6974315/frexp_benchmark.txt) [nan_to_num_benchmark.txt](https://github.com/pytorch/pytorch/files/6974317/nan_to_num_benchmark.txt) [range_benchmark.txt](https://github.com/pytorch/pytorch/files/6974318/range_benchmark.txt) Pull Request resolved: https://github.com/pytorch/pytorch/pull/61826 Reviewed By: saketh-are Differential Revision: D30257259 Pulled By: VitalyFedyunin fbshipit-source-id: 394cd713e6394050a8c90b2160633beb675d71dd --- aten/src/ATen/native/RangeFactories.cpp | 4 ++-- aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 8 ++++---- c10/util/BFloat16-math.h | 6 ++++++ torch/testing/_internal/common_methods_invocations.py | 4 ++++ 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/RangeFactories.cpp b/aten/src/ATen/native/RangeFactories.cpp index 508c157965edc..7d48c63b755ce 100644 --- a/aten/src/ATen/native/RangeFactories.cpp +++ b/aten/src/ATen/native/RangeFactories.cpp @@ -113,7 +113,7 @@ Tensor& logspace_cpu_out(const Scalar& start, const Scalar& end, c10::optional; auto xstart = start.to(); auto xend = end.to(); @@ -133,7 +133,7 @@ Tensor& range_cpu_out(const Scalar& start, const Scalar& end, const Scalar& step scalar_t *data_ptr = r.data_ptr(); at::parallel_for(0, size, internal::GRAIN_SIZE, [&](int64_t p_begin, int64_t p_end) { - scalar_t is = p_begin; + accscalar_t is = p_begin; for (int64_t i = p_begin; i < p_end; ++i, ++is) { data_ptr[i] = xstart + is * xstep; } diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 6288cec2ea3b3..a867a2a0ce519 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -322,7 +322,7 @@ static void sinc_kernel(TensorIteratorBase& iter) { } static void sinh_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "sinh_cpu", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.dtype(), "sinh_cpu", [&]() { cpu_kernel_vec( iter, [=](scalar_t a) -> scalar_t { return std::sinh(a); }, @@ -331,7 +331,7 @@ static void sinh_kernel(TensorIteratorBase& iter) { } static void cosh_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "cosh_cpu", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.dtype(), "cosh_cpu", [&]() { cpu_kernel_vec( iter, [=](scalar_t a) -> scalar_t { return std::cosh(a); }, @@ -407,7 +407,7 @@ static void nan_to_num_kernel( c10::optional nan, c10::optional pos_inf, c10::optional neg_inf) { - AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "nan_to_num", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "nan_to_num", [&]() { scalar_t nan_replacement = static_cast(nan.value_or(0.)); scalar_t pos_inf_replacement = pos_inf.has_value() ? static_cast(pos_inf.value()) @@ -586,7 +586,7 @@ static void entr_kernel(TensorIteratorBase& iter) { } static void frexp_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES_AND(kHalf, + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, // The iter.dtype() here is the dtype of mantissa output. // It's a floating point type and must be the same as the input's dtype. iter.dtype(), diff --git a/c10/util/BFloat16-math.h b/c10/util/BFloat16-math.h index 2760100db6e98..a7b8426ced36a 100644 --- a/c10/util/BFloat16-math.h +++ b/c10/util/BFloat16-math.h @@ -57,6 +57,12 @@ inline c10::BFloat16 sin(c10::BFloat16 a) { inline c10::BFloat16 tan(c10::BFloat16 a) { return std::tan(float(a)); } +inline c10::BFloat16 sinh(c10::BFloat16 a) { + return std::sinh(float(a)); +} +inline c10::BFloat16 cosh(c10::BFloat16 a) { + return std::cosh(float(a)); +} inline c10::BFloat16 tanh(c10::BFloat16 a) { return std::tanh(float(a)); } diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 617b102642d05..63af3965a2e7d 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -6028,6 +6028,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): UnaryUfuncInfo('cosh', ref=np_unary_ufunc_integer_promotion_wrapper(np.cosh), dtypes=all_types_and_complex_and(torch.bool), + dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), safe_casts_outputs=True, assert_autodiffed=True, @@ -6413,6 +6414,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): op=torch.frexp, ref=np.frexp, dtypes=floating_types_and(torch.half), + dtypesIfCPU=floating_types_and(torch.half, torch.bfloat16), # skip testing torch.frexp as it is not supported by ROCm platform yet decorators=[skipCUDAIfRocm], supports_out=False, @@ -7432,6 +7434,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): UnaryUfuncInfo('sinh', ref=np_unary_ufunc_integer_promotion_wrapper(np.sinh), dtypes=all_types_and_complex_and(torch.bool), + dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), safe_casts_outputs=True, assert_autodiffed=True, @@ -7753,6 +7756,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): UnaryUfuncInfo('nan_to_num', ref=np.nan_to_num, dtypes=all_types_and(torch.half, torch.bool), + dtypesIfCPU=all_types_and(torch.half, torch.bool, torch.bfloat16), dtypesIfCUDA=all_types_and(torch.half, torch.bool, torch.bfloat16), supports_forward_ad=True, # Passing numpy_kwargs via sample_kwargs, as numpy does comparison From bcf8e2f57eb8fdcf294276f3f5763b3e42fccec8 Mon Sep 17 00:00:00 2001 From: driazati Date: Fri, 20 Aug 2021 15:45:10 -0700 Subject: [PATCH 116/530] Remove breakpad from docker image (#63598) Summary: As of https://github.com/pytorch/pytorch/issues/63186 we're doing this properly via a third_party cmake build, so we don't need it here anymore. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63598 Reviewed By: walterddr, malfet Differential Revision: D30432250 Pulled By: driazati fbshipit-source-id: d0d5db14355cf574e42c0d0ed786bb26230180bd --- .circleci/docker/README.md | 2 +- .circleci/docker/build.sh | 20 ----------------- .circleci/docker/common/install_breakpad.sh | 25 --------------------- .circleci/docker/ubuntu-cuda/Dockerfile | 8 +++---- .circleci/docker/ubuntu/Dockerfile | 15 ++++--------- 5 files changed, 9 insertions(+), 61 deletions(-) delete mode 100644 .circleci/docker/common/install_breakpad.sh diff --git a/.circleci/docker/README.md b/.circleci/docker/README.md index a87522f622ccd..cc4f97cfae748 100644 --- a/.circleci/docker/README.md +++ b/.circleci/docker/README.md @@ -27,5 +27,5 @@ Docker builds are now defined with `.circleci/cimodel/data/simple/docker_definit ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest # Set flags (see build.sh) and build image -sudo bash -c 'BREAKPAD=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest +sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest ``` diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 7c8477349981a..2b916a19ae117 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -88,7 +88,6 @@ case "$image" in DB=yes VISION=yes KATEX=yes - BREAKPAD=yes ;; pytorch-linux-xenial-py3.6-gcc7.2) ANACONDA_PYTHON_VERSION=3.6 @@ -101,7 +100,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes ;; pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7) CUDA_VERSION=10.2 @@ -112,7 +110,6 @@ case "$image" in DB=yes VISION=yes KATEX=yes - BREAKPAD=yes ;; pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7) CUDA_VERSION=11.1 @@ -123,7 +120,6 @@ case "$image" in DB=yes VISION=yes KATEX=yes - BREAKPAD=yes ;; pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7) CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names @@ -134,7 +130,6 @@ case "$image" in DB=yes VISION=yes KATEX=yes - BREAKPAD=yes ;; pytorch-linux-xenial-py3-clang5-asan) ANACONDA_PYTHON_VERSION=3.6 @@ -142,7 +137,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes ;; pytorch-linux-xenial-py3-clang7-asan) ANACONDA_PYTHON_VERSION=3.6 @@ -150,7 +144,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes ;; pytorch-linux-xenial-py3-clang7-onnx) ANACONDA_PYTHON_VERSION=3.6 @@ -158,7 +151,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes ;; pytorch-linux-xenial-py3-clang5-android-ndk-r19c) ANACONDA_PYTHON_VERSION=3.6 @@ -177,7 +169,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes ;; pytorch-linux-bionic-py3.6-clang9) ANACONDA_PYTHON_VERSION=3.6 @@ -185,7 +176,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes VULKAN_SDK_VERSION=1.2.162.1 SWIFTSHADER=yes ;; @@ -195,8 +185,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes - BREAKPAD=yes ;; pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9) CUDA_VERSION=10.2 @@ -206,7 +194,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes ;; pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7) CUDA_VERSION=10.2 @@ -216,7 +203,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes ;; pytorch-linux-bionic-cuda11.0-cudnn8-py3.6-gcc9) CUDA_VERSION=11.0 @@ -226,7 +212,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes ROCM_VERSION=3.9 ;; pytorch-linux-bionic-rocm4.0.1-py3.6) @@ -235,7 +220,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes ROCM_VERSION=4.0.1 ;; pytorch-linux-bionic-rocm4.1-py3.6) @@ -244,7 +228,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes ROCM_VERSION=4.1 ;; pytorch-linux-bionic-rocm4.2-py3.6) @@ -253,7 +236,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes ROCM_VERSION=4.2 ;; *) @@ -261,7 +243,6 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BREAKPAD=yes echo "image '$image' did not match an existing build configuration" if [[ "$image" == *py* ]]; then extract_version_from_image_name py ANACONDA_PYTHON_VERSION @@ -325,7 +306,6 @@ docker build \ --build-arg "GCC_VERSION=${GCC_VERSION}" \ --build-arg "CUDA_VERSION=${CUDA_VERSION}" \ --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \ - --build-arg "BREAKPAD=${BREAKPAD}" \ --build-arg "ANDROID=${ANDROID}" \ --build-arg "ANDROID_NDK=${ANDROID_NDK_VERSION}" \ --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \ diff --git a/.circleci/docker/common/install_breakpad.sh b/.circleci/docker/common/install_breakpad.sh deleted file mode 100644 index f49f1fb325e2a..0000000000000 --- a/.circleci/docker/common/install_breakpad.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -ex - -git clone https://github.com/driazati/breakpad.git -pushd breakpad - -# breakpad has no actual releases, so this is pinned to the top commit from -# main when this was forked (including the one patch commit). This uses a fork -# of the breakpad mainline that automatically daisy-chains out to any previously -# installed signal handlers (instead of overwriting them). -git checkout 5485e473ed46d065e05489e50dfc59d90dfd7e22 - -git clone https://chromium.googlesource.com/linux-syscall-support src/third_party/lss -pushd src/third_party/lss -# same as with breakpad, there are no real releases for this repo so use a -# commit as the pin -git checkout e1e7b0ad8ee99a875b272c8e33e308472e897660 -popd - -./configure -make -make install -popd -rm -rf breakpad diff --git a/.circleci/docker/ubuntu-cuda/Dockerfile b/.circleci/docker/ubuntu-cuda/Dockerfile index e0e7dc9b6e5bf..003538f576bd5 100644 --- a/.circleci/docker/ubuntu-cuda/Dockerfile +++ b/.circleci/docker/ubuntu-cuda/Dockerfile @@ -61,6 +61,10 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi RUN rm install_vision.sh ENV INSTALLED_VISION ${VISION} +ADD ./common/install_openssl.sh install_openssl.sh +ENV OPENSSL_ROOT_DIR /opt/openssl +RUN bash ./install_openssl.sh + # Install ccache/sccache (do this last, so we get priority in PATH) ADD ./common/install_cache.sh install_cache.sh ENV PATH /opt/cache/bin:$PATH @@ -88,9 +92,5 @@ ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all" # Install LLVM dev version (Defined in the pytorch/builder github repository) COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm -ADD ./common/install_openssl.sh install_openssl.sh -ENV OPENSSL_ROOT_DIR /opt/openssl -RUN bash ./install_openssl.sh - USER jenkins CMD ["bash"] diff --git a/.circleci/docker/ubuntu/Dockerfile b/.circleci/docker/ubuntu/Dockerfile index ea00c083c3d02..76a64bc0ea10d 100644 --- a/.circleci/docker/ubuntu/Dockerfile +++ b/.circleci/docker/ubuntu/Dockerfile @@ -82,13 +82,6 @@ RUN rm AndroidManifest.xml RUN rm build.gradle ENV INSTALLED_ANDROID ${ANDROID} -# (optional) Install breakpad -ARG BREAKPAD -ADD ./common/install_breakpad.sh install_breakpad.sh -RUN if [ -n "${BREAKPAD}" ]; then bash ./install_breakpad.sh; fi -RUN rm install_breakpad.sh -ENV INSTALLED_BREAKPAD ${BREAKPAD} - # (optional) Install Vulkan SDK ARG VULKAN_SDK_VERSION ADD ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh @@ -113,6 +106,10 @@ ADD ./common/install_ninja.sh install_ninja.sh RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi RUN rm install_ninja.sh +ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh +ENV OPENSSL_ROOT_DIR /opt/openssl + # Install ccache/sccache (do this last, so we get priority in PATH) ADD ./common/install_cache.sh install_cache.sh ENV PATH /opt/cache/bin:$PATH @@ -130,9 +127,5 @@ ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT} # Install LLVM dev version (Defined in the pytorch/builder github repository) COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm -ADD ./common/install_openssl.sh install_openssl.sh -RUN bash ./install_openssl.sh -ENV OPENSSL_ROOT_DIR /opt/openssl - USER jenkins CMD ["bash"] From efe01c59e3f64979bf054af8a70705f41b65db4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8C=97=E6=B5=B7=E8=8B=A5?= Date: Fri, 20 Aug 2021 15:45:12 -0700 Subject: [PATCH 117/530] [Doc] Deprecation notice for only_inputs argument (#63631) Summary: Fixes https://github.com/pytorch/pytorch/issues/63544. Changed docstring accordingly. I'm new here, not sure if the style is okay. Please check. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63631 Reviewed By: ejguan Differential Revision: D30459439 Pulled By: soulitzer fbshipit-source-id: 8df3c509d1dd39764815b099ab47229550126cbe --- torch/autograd/__init__.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index 0d4f153d007c1..d11e261efcea1 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -173,17 +173,18 @@ def grad( gradients w.r.t. each of the outputs. If an output doesn't require_grad, then the gradient can be ``None``). - If ``only_inputs`` is ``True``, the function will only return a list of gradients - w.r.t the specified inputs. If it's ``False``, then gradient w.r.t. all remaining - leaves will still be computed, and will be accumulated into their ``.grad`` - attribute. - .. note:: If you run any forward ops, create ``grad_outputs``, and/or call ``grad`` in a user-specified CUDA stream context, see :ref:`Stream semantics of backward passes`. + .. note:: + + ``only_inputs`` argument is deprecated and is ignored now (defaults to ``True``). + To accumulate gradient for other parts of the graph, please use + ``torch.autograd.backward``. + Args: outputs (sequence of Tensor): outputs of the differentiated function. inputs (sequence of Tensor): Inputs w.r.t. which the gradient will be From e0fe5699c4b7dabd132834b690d6cc2513e0e978 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 20 Aug 2021 16:28:39 -0700 Subject: [PATCH 118/530] enable increment build for build_libtorch (#63074) Summary: Since issue https://github.com/pytorch/pytorch/issues/59859 is resolved. rerun_cmake in build_libtorch should not be hardcoded. build_libtorch is necessary to generate debug version libtorch. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63074 Reviewed By: VitalyFedyunin, seemethere Differential Revision: D30306705 Pulled By: malfet fbshipit-source-id: f2077d334191f4973da0681560937bc8bab730c1 --- tools/build_libtorch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/build_libtorch.py b/tools/build_libtorch.py index 800d8eb278481..c263e5084f783 100644 --- a/tools/build_libtorch.py +++ b/tools/build_libtorch.py @@ -14,7 +14,10 @@ if __name__ == '__main__': # Placeholder for future interface. For now just gives a nice -h. parser = argparse.ArgumentParser(description='Build libtorch') + parser.add_argument('--rerun-cmake', action="store_true", help='rerun cmake') + parser.add_argument('--cmake-only', action="store_true", + help='Stop once cmake terminates. Leave users a chance to adjust build options') options = parser.parse_args() build_caffe2(version=None, cmake_python_library=None, build_python=False, - rerun_cmake=True, cmake_only=False, cmake=CMake()) + rerun_cmake=options.rerun_cmake, cmake_only=options.cmake_only, cmake=CMake()) From 7c0f5b9aa4dbdfefce02b10a07c1928d4ec1a66b Mon Sep 17 00:00:00 2001 From: driazati Date: Fri, 20 Aug 2021 16:38:42 -0700 Subject: [PATCH 119/530] [clang-tidy] Enable more folders (#63380) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63380 Crosses off some more of #62011, see the test in the stacked PR #63381 Test Plan: Imported from OSS Reviewed By: malfet, seemethere Differential Revision: D30455843 Pulled By: driazati fbshipit-source-id: d473545d05ffa0b2476968f0b1c55f3a16a2c755 --- .github/workflows/lint.yml | 7 ++++++- tools/linter/clang_tidy/__main__.py | 1 + torch/csrc/api/include/torch/nn/functional/pooling.h | 2 +- torch/csrc/deploy/example/benchmark.cpp | 1 + torch/csrc/deploy/test_deploy.cpp | 3 ++- torch/csrc/deploy/test_deploy_python_ext.cpp | 2 +- 6 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 720e76c4e6a5f..f036bc17d2a2d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -367,7 +367,12 @@ jobs: cd "${GITHUB_WORKSPACE}" python3 -m tools.linter.clang_tidy \ - --paths torch/csrc/fx \ + --paths \ + torch/csrc/fx \ + torch/csrc/utils \ + torch/csrc/generic \ + torch/csrc/deploy \ + torch/csrc/tensor \ --clang-tidy-exe "$(which clang-tidy)" \ --disable-progress-bar 2>&1 | tee "${GITHUB_WORKSPACE}"/clang-tidy-output.txt diff --git a/tools/linter/clang_tidy/__main__.py b/tools/linter/clang_tidy/__main__.py index fc9f2ab4e6687..b99c1f5366848 100644 --- a/tools/linter/clang_tidy/__main__.py +++ b/tools/linter/clang_tidy/__main__.py @@ -74,6 +74,7 @@ def clang_search_dirs() -> List[str]: "-torch/csrc/deploy/interpreter/interpreter.h", "-torch/csrc/deploy/interpreter/interpreter_impl.h", "-torch/csrc/deploy/interpreter/test_main.cpp", + "-torch/csrc/deploy/test_deploy_python_ext.cpp", ], "paths": ["torch/csrc/"], "include-dir": ["/usr/lib/llvm-11/include/openmp"] + clang_search_dirs(), diff --git a/torch/csrc/api/include/torch/nn/functional/pooling.h b/torch/csrc/api/include/torch/nn/functional/pooling.h index c8538858e8a74..f06b68ba2870d 100644 --- a/torch/csrc/api/include/torch/nn/functional/pooling.h +++ b/torch/csrc/api/include/torch/nn/functional/pooling.h @@ -776,7 +776,7 @@ inline std::tuple fractional_max_pool2d_with_indices( Tensor _random_samples_ = _random_samples; if (!_random_samples_.defined()) { - auto n_batch = 1 ? input.dim() == 3 : input.size(0); + auto n_batch = input.dim() == 3; _random_samples_ = torch::rand({n_batch, input.size(-1), 2}, torch::TensorOptions().dtype(input.dtype()).device(input.device())); } return torch::fractional_max_pool2d(input, kernel_size, *output_size_, _random_samples_); diff --git a/torch/csrc/deploy/example/benchmark.cpp b/torch/csrc/deploy/example/benchmark.cpp index 348d84fec02b4..d2f1142965d40 100644 --- a/torch/csrc/deploy/example/benchmark.cpp +++ b/torch/csrc/deploy/example/benchmark.cpp @@ -295,6 +295,7 @@ struct Benchmark { std::function run_one_work_item; }; +// NOLINTNEXTLINE(bugprone-exception-escape) int main(int argc, char* argv[]) { int max_thread = atoi(argv[1]); cuda = std::string(argv[2]) == "cuda"; diff --git a/torch/csrc/deploy/test_deploy.cpp b/torch/csrc/deploy/test_deploy.cpp index f88a23c43bde0..a004db1e0d232 100644 --- a/torch/csrc/deploy/test_deploy.cpp +++ b/torch/csrc/deploy/test_deploy.cpp @@ -63,7 +63,7 @@ TEST(TorchpyTest, InitTwice) { TEST(TorchpyTest, DifferentInterps) { torch::deploy::InterpreterManager m(2); m.register_module_source("check_none", "check = id(None)\n"); - int64_t id0, id1; + int64_t id0 = 0, id1 = 0; { auto I = m.all_instances()[0].acquire_session(); id0 = I.global("check_none", "check").toIValue().toInt(); @@ -312,6 +312,7 @@ TEST(TorchpyTest, SharedLibraryLoad) { I.global("sys", "path").attr("append")({"torch/csrc/deploy"}); I.global("test_deploy_python", "setup")({getenv("PATH")}); } else { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) char buf[PATH_MAX]; strncpy(buf, test_lib_path, PATH_MAX); dirname(buf); diff --git a/torch/csrc/deploy/test_deploy_python_ext.cpp b/torch/csrc/deploy/test_deploy_python_ext.cpp index 42700ead6678b..59a04f5e84853 100644 --- a/torch/csrc/deploy/test_deploy_python_ext.cpp +++ b/torch/csrc/deploy/test_deploy_python_ext.cpp @@ -7,7 +7,7 @@ bool run() { torch::deploy::InterpreterManager m(2); m.register_module_source("check_none", "check = id(None)\n"); - int64_t id0, id1; + int64_t id0 = 0, id1 = 0; { auto I = m.all_instances()[0].acquire_session(); id0 = I.global("check_none", "check").toIValue().toInt(); From 3ee1f81dce748aac73848d0ad45c1eb84fcffd74 Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Fri, 20 Aug 2021 17:09:35 -0700 Subject: [PATCH 120/530] Extend _sharded_tensor constructor to support other ops like torch.ones (#63378) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63378 a) Introduce InitCommonParams to wrap tensor creation params b) Factor local tensor initiation into common_params so that tensor value is not hard specified in ShardedTensor constructor c) Add _sharded_tensor.ones(...) to exemplify - Note memory_format arg is not provided to be consistent as torch.ones d) Follow up: more ops like torch.full, torch.zero, torch.rand, Test: $ python test/distributed/_sharded_tensor/test_sharded_tensor.py TestCreateTensorFromParams --v $ python test/distributed/_sharded_tensor/test_sharded_tensor.py TestShardedTensorChunked.test_create_sharded_tensor_with_ones --v $ python test/distributed/_sharded_tensor/test_sharded_tensor.py TestShardedTensorEnumerable.test_create_sharded_tensor_with_ones --v Test Plan: Imported from OSS Reviewed By: pritamdamania87, wanchaol Differential Revision: D30359245 Pulled By: bowangbj fbshipit-source-id: 85768fcb36e9d9d40213036884b1266930a91701 --- .../_sharded_tensor/test_sharded_tensor.py | 109 +++++++++++- torch/distributed/_sharded_tensor/__init__.py | 61 ++++++- torch/distributed/_sharded_tensor/api.py | 162 ++++++++---------- 3 files changed, 237 insertions(+), 95 deletions(-) diff --git a/test/distributed/_sharded_tensor/test_sharded_tensor.py b/test/distributed/_sharded_tensor/test_sharded_tensor.py index 829855f6be2c5..5067f301b5595 100644 --- a/test/distributed/_sharded_tensor/test_sharded_tensor.py +++ b/test/distributed/_sharded_tensor/test_sharded_tensor.py @@ -1,4 +1,5 @@ from functools import wraps +import math import io import sys import torch @@ -15,6 +16,11 @@ EnumerableShardingSpec, ShardMetadata ) +from torch.distributed._sharded_tensor.api import ( + CreateOp, + TensorInitParams, + _create_tensor_from_params, +) from torch.testing._internal.common_distributed import ( MultiProcessTestCase, requires_nccl, @@ -22,10 +28,11 @@ TEST_SKIPS, ) from torch.testing._internal.common_utils import ( + TestCase, TEST_WITH_DEV_DBG_ASAN, run_tests, + sandcastle_skip_if, ) - if TEST_WITH_DEV_DBG_ASAN: print("Skip dev-asan as torch + multiprocessing spawn have known issues", file=sys.stderr) sys.exit(0) @@ -115,6 +122,38 @@ def wrapper(self): self.destroy_comms() return wrapper +class TestCreateTensorFromParams(TestCase): + @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed') + def test_empty(self): + tensor_init_params = TensorInitParams( + create_op=CreateOp.EMPTY, + dtype=torch.double, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, ) + local_device = torch.device('cuda:0') + local_tensor = _create_tensor_from_params( + 5, 10, local_device=local_device, tensor_init_params=tensor_init_params) + self.assertEqual(local_device, local_tensor.device) + self.assertEqual(torch.double, local_tensor.dtype) + self.assertEqual(torch.strided, local_tensor.layout) + self.assertEqual(False, local_tensor.requires_grad) + + @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed') + def test_ones(self): + tensor_init_params = TensorInitParams( + create_op=CreateOp.ONES, + dtype=torch.double, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, ) + local_device = torch.device('cuda:0') + local_tensor = _create_tensor_from_params( + 5, 10, local_device=local_device, tensor_init_params=tensor_init_params) + expected_tensor = torch.ones(5, 10, device=local_device, dtype=torch.double) + self.assertEqual(expected_tensor, local_tensor) class TestShardedTensorChunked(ShardedTensorTestBase, MultiProcessTestCase): @@ -219,6 +258,35 @@ def test_complete_world_size(self): else: self.assertEqual((3, 20), shard.tensor.size()) + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_create_sharded_tensor_with_ones(self): + """ Test _sharded_tensor.ones(...) """ + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + h, w = 10, 20 + sharded_tensor = _sharded_tensor.ones(spec, h, w) + + # Validate local shard is initialized with torch.ones + local_shards = sharded_tensor.local_shards() + self.assertEqual(1, len(local_shards)) + local_shard = local_shards[0].tensor + self.assertEqual(torch.device(f"cuda:{self.rank}"), local_shard.device) + # The split: for rank!=3 ceil(h/4)=3 for rank=3 1 + expected_h = 1 if self.rank == 3 else math.ceil(h / 4) + self.assertEqual((expected_h, w), local_shard.size()) + self.assertEqual(local_shard, torch.ones(expected_h, w)) + @with_comms @skip_if_lt_x_gpu(4) @requires_nccl() @@ -818,6 +886,45 @@ def test_grid_sharding(self): shard = remote_shard.to_here() self.assertEqual((5, 5), shard.tensor.size()) + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_create_sharded_tensor_with_ones(self): + """ Test _sharded_tensor.ones(...) """ + + spec = EnumerableShardingSpec([ + ShardMetadata( + shard_offsets=[0, 0], + shard_lengths=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[0, 5], + shard_lengths=[5, 5], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_lengths=[5, 5], + placement="rank:2/cuda:2", + ), + ShardMetadata( + shard_offsets=[5, 5], + shard_lengths=[5, 5], + placement="rank:3/cuda:3", + ) + ]) + + sharded_tensor = _sharded_tensor.ones(spec, 10, 10, init_rrefs=True) + self.assertEqual((10, 10), sharded_tensor.size()) + self.assertEqual(1, len(sharded_tensor.local_shards())) + + # Verify local shard is initialized with torch.ones + local_shard = sharded_tensor.local_shards()[0] + self.assertEqual(torch.device(f'cuda:{self.rank}'), local_shard.tensor.device) + self.assertEqual((5, 5), local_shard.tensor.size()) + self.assertEqual(local_shard.tensor, torch.ones(5, 5)) + @skip_if_lt_x_gpu(4) @requires_nccl() def test_uneven_shards(self): diff --git a/torch/distributed/_sharded_tensor/__init__.py b/torch/distributed/_sharded_tensor/__init__.py index d9833159dc9de..ecb7ea1fed8c6 100644 --- a/torch/distributed/_sharded_tensor/__init__.py +++ b/torch/distributed/_sharded_tensor/__init__.py @@ -3,12 +3,15 @@ import torch from torch.distributed._sharding_spec import ShardingSpec from .api import ( + CreateOp, Shard, ShardedTensor, ShardedTensorMetadata, + TensorInitParams, load_with_process_group, ) + def empty( sharding_spec: ShardingSpec, *size, @@ -49,14 +52,62 @@ def empty( Returns: A :class:`ShardedTensor` object on each rank """ + tensor_init_params = TensorInitParams(create_op=CreateOp.EMPTY, dtype=dtype, layout=layout, + requires_grad=requires_grad, + pin_memory=pin_memory, memory_format=memory_format) + return ShardedTensor( + sharding_spec, + *size, + tensor_init_params=tensor_init_params, + process_group=process_group, + init_rrefs=init_rrefs, + ) + +def ones( + sharding_spec: ShardingSpec, + *size, + dtype=None, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, + process_group=None, + init_rrefs=False): + """ + Creates a ones :class:`ShardedTensor`. Needs to be called on all ranks in an SPMD fashion. + + Args: + sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification + describing how to shard the Tensor. + size (int...): a sequence of integers defining the shape of the output + tensor. Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_tensor_type`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + process_group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. + init_rrefs (bool, optional): Whether or not to initialize + :class:`torch.distributed.rpc.RRef`s pointing to remote shards. + Need to initialize the RPC Framework if specified as ``True``. + Default: ``False``. + + Returns: + A :class:`ShardedTensor` object on each rank + """ + tensor_init_params = TensorInitParams(create_op=CreateOp.ONES, dtype=dtype, layout=layout, + requires_grad=requires_grad, + pin_memory=pin_memory, memory_format=memory_format) return ShardedTensor( sharding_spec, *size, - dtype=dtype, - layout=layout, - requires_grad=requires_grad, - pin_memory=pin_memory, - memory_format=memory_format, + tensor_init_params=tensor_init_params, process_group=process_group, init_rrefs=init_rrefs, ) diff --git a/torch/distributed/_sharded_tensor/api.py b/torch/distributed/_sharded_tensor/api.py index ca9a05abffa06..2b6720b059a85 100644 --- a/torch/distributed/_sharded_tensor/api.py +++ b/torch/distributed/_sharded_tensor/api.py @@ -1,6 +1,7 @@ import collections from contextlib import contextmanager from dataclasses import dataclass, field +from enum import Enum from typing import ( Dict, List @@ -22,6 +23,7 @@ validate_non_overlapping_shards_metadata ) + # Tracking for sharded tensor objects. _sharded_tensor_lock = threading.Lock() _sharded_tensor_current_id = 0 @@ -123,6 +125,26 @@ def _register_remote_shards(sharded_tensor_id: int, rrefs: List[rpc.RRef[Shard]] _sharded_tensor_map[sharded_tensor_id]._register_remote_shards(rrefs, rpc_rank) +class CreateOp(Enum): + EMPTY = 0 + ONES = 1 + + +@dataclass +class TensorInitParams(object): + """ Container for list of common params to create new local tensor. """ + + __slots__ = ['create_op', 'dtype', 'layout', 'requires_grad', 'pin_memory', + 'memory_format'] + + create_op: CreateOp + dtype: torch.dtype + layout: torch.layout + requires_grad: bool + pin_memory: bool + memory_format: torch.memory_format + + class ShardedTensor(object): """ ShardedTensor is an abstraction to represent Tensors that are sharded @@ -136,8 +158,9 @@ class ShardedTensor(object): ShardedTensor doesn't provide any Tensor like operations but is a wrapper providing the Tensor representing the local shard and the global metadata. Using these, users can build their custom distributed sharded computations - on top of this primitive. The local shards are all initialized using - :meth:`torch.empty`. + on top of this primitive. The local shards are all initialized using the + create_op specified by tensor_init_params.create_op, e.g., torch.ones, or + torch.empty Args: sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification @@ -146,20 +169,7 @@ class ShardedTensor(object): tensor. Can be a variable number of arguments or a collection like a list or tuple. Keyword args: - dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. - Default: if ``None``, uses a global default (see :func:`torch.set_default_tensor_type`). - layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. - Default: ``torch.strided``. - requires_grad (bool, optional): If autograd should record operations on the - returned tensor. Default: ``False``. - pin_memory (bool, optional): If set, returned tensor would be allocated in - the pinned memory. Works only for CPU tensors. Default: ``False``. - memory_format (:class:`torch.memory_format`, optional): the desired memory format of - returned Tensor. Default: ``torch.contiguous_format``. - process_group (ProcessGroup, optional): The process group to work on. If None, - the default process group will be used. If specified the ShardedTensor is only - built on ranks that are part of this process group and the provided ``sharding_spec`` - is applied in the context of this process group. + tensor_init_params (:class: `TensorInitParams`): common params to create tensor. init_rrefs (bool, optional): Whether or not to initialize :class:`torch.distributed.rpc.RRef`s pointing to remote shards. Need to initialize the RPC Framework if specified as ``True``. @@ -170,11 +180,7 @@ def __init__( self, sharding_spec: ShardingSpec, *size, - dtype=None, - layout=torch.strided, - requires_grad=False, - pin_memory=False, - memory_format=torch.contiguous_format, + tensor_init_params: TensorInitParams, process_group=None, init_rrefs=False, ): @@ -182,13 +188,13 @@ def __init__( # _process_group, _local_shards, etc. self._prepare_init(process_group=process_group, init_rrefs=init_rrefs) - if dtype is None: - dtype = torch.get_default_dtype() + if tensor_init_params.dtype is None: + tensor_init_params.dtype = torch.get_default_dtype() - if layout != torch.strided: + if tensor_init_params.layout != torch.strided: raise ValueError('Only torch.strided layout is currently supported') - if memory_format != torch.contiguous_format: + if tensor_init_params.memory_format != torch.contiguous_format: raise ValueError('Only torch.contiguous_format memory_format is currently supported') if len(size) == 1 and isinstance(size[0], collections.Sequence): @@ -203,23 +209,9 @@ def __init__( self._sharding_spec = sharding_spec if isinstance(self._sharding_spec, ChunkShardingSpec): - self._init_chunked( - dims, - dtype, - layout, - requires_grad, - pin_memory, - memory_format, - ) + self._init_chunked(dims, tensor_init_params) elif isinstance(self._sharding_spec, EnumerableShardingSpec): - self._init_enumerable( - dims, - dtype, - layout, - requires_grad, - pin_memory, - memory_format, - ) + self._init_enumerable(dims, tensor_init_params) else: raise ValueError(f'Unsupported sharding_spec: {self._sharding_spec}') @@ -420,15 +412,7 @@ def _init_from_local_shards( sharded_tensor._post_init() return sharded_tensor - def _init_chunked( - self, - dims, - dtype, - layout, - requires_grad, - pin_memory, - memory_format, - ): + def _init_chunked(self, dims, tensor_init_params: TensorInitParams, ): current_rank = dist.get_rank(self._process_group) sharding_dim = self._sharding_spec.dim # type: ignore[attr-defined] @@ -469,38 +453,22 @@ def _init_chunked( # Build the local shard for the current rank if it is involved in the sharding spec. if current_rank == rank: # Initialize the local shard. - local_shard = torch.empty( - *rank_dims, - dtype=dtype, - layout=layout, - device=local_device, - requires_grad=requires_grad, - memory_format=memory_format, - pin_memory=pin_memory, - ) - + local_shard = _create_tensor_from_params( + *rank_dims, local_device=local_device, tensor_init_params=tensor_init_params) self._local_shards.append(Shard(local_shard, shard_metadata)) # Build overall metadata self._metadata = ShardedTensorMetadata( shards_metadata, dims, - dtype, - layout, - requires_grad, - memory_format, - pin_memory, + tensor_init_params.dtype, + tensor_init_params.layout, + tensor_init_params.requires_grad, + tensor_init_params.memory_format, + tensor_init_params.pin_memory, ) - def _init_enumerable( - self, - dims, - dtype, - layout, - requires_grad, - pin_memory, - memory_format, - ): + def _init_enumerable(self, dims, tensor_init_params: TensorInitParams): # Validate the sharding spec is compatible with the tensor. check_tensor(self._sharding_spec.shards, dims) # type: ignore[attr-defined] @@ -513,27 +481,20 @@ def _init_enumerable( if current_rank == rank: # Initialize the local shard. - local_shard = torch.empty( - *shard_metadata.shard_lengths, - dtype=dtype, - layout=layout, - device=local_device, - requires_grad=requires_grad, - memory_format=memory_format, - pin_memory=pin_memory, - ) - + local_shard = _create_tensor_from_params( + *shard_metadata.shard_lengths, local_device=local_device, + tensor_init_params=tensor_init_params) self._local_shards.append(Shard(local_shard, shard_metadata)) # Build overall metadata self._metadata = ShardedTensorMetadata( shards_metadata, dims, - dtype, - layout, - requires_grad, - memory_format, - pin_memory, + tensor_init_params.dtype, + tensor_init_params.layout, + tensor_init_params.requires_grad, + tensor_init_params.memory_format, + tensor_init_params.pin_memory, ) def _parse_and_validate_remote_device(self, remote_device: torch.distributed._remote_device): @@ -672,3 +633,26 @@ def __setstate__(self, state): f'but at load time was {global_world_size}') self._post_init() + + +def _create_tensor_from_params(*size, local_device, tensor_init_params: TensorInitParams): + """ Helper to construct tensor from size, device and common params. """ + + if tensor_init_params.create_op == CreateOp.ONES: + return torch.ones(*size, + dtype=tensor_init_params.dtype, + layout=tensor_init_params.layout, + device=local_device, + pin_memory=tensor_init_params.pin_memory, + requires_grad=tensor_init_params.requires_grad,) + elif tensor_init_params.create_op == CreateOp.EMPTY: + return torch.empty(*size, + dtype=tensor_init_params.dtype, + layout=tensor_init_params.layout, + device=local_device, + requires_grad=tensor_init_params.requires_grad, + # Note memory_format param is not accepted by torch.ones + memory_format=tensor_init_params.memory_format, + pin_memory=tensor_init_params.pin_memory,) + else: + raise ValueError(f'Unsupported create_op: {tensor_init_params.create_op}') From b4f5809db8511d9517b043b14f48814a9199dea3 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Fri, 20 Aug 2021 18:27:33 -0700 Subject: [PATCH 121/530] Migrate thnn_conv2d from THC to ATen (#63428) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63428 Closes gh-24644, closes gh-24645 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D30441307 Pulled By: ngimel fbshipit-source-id: 9c3dec469c0525831ae398df261cf41b7df7e373 --- BUILD.bazel | 1 - aten/src/ATen/LegacyTHFunctionsCUDA.h | 4 - aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp | 266 --------- aten/src/ATen/native/cuda/ConvolutionMM2d.cu | 519 +++++++++++++++++- aten/src/ATen/native/native_functions.yaml | 4 +- aten/src/THCUNN/CMakeLists.txt | 1 - aten/src/THCUNN/SpatialConvolutionMM.cu | 13 - .../THCUNN/generic/SpatialConvolutionMM.cu | 499 ----------------- aten/src/THCUNN/generic/THCUNN.h | 46 -- 9 files changed, 509 insertions(+), 844 deletions(-) delete mode 100644 aten/src/THCUNN/SpatialConvolutionMM.cu delete mode 100644 aten/src/THCUNN/generic/SpatialConvolutionMM.cu diff --git a/BUILD.bazel b/BUILD.bazel index ca8874d64e857..dab227590072d 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -420,7 +420,6 @@ filegroup( "aten/src/THCUNN/SoftMarginCriterion.cu.cc", "aten/src/THCUNN/SoftPlus.cu.cc", "aten/src/THCUNN/SoftShrink.cu.cc", - "aten/src/THCUNN/SpatialConvolutionMM.cu.cc", "aten/src/THCUNN/Tanh.cu.cc", ], ) diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h index 5670f31a089d9..1a20e0bb8fa0b 100644 --- a/aten/src/ATen/LegacyTHFunctionsCUDA.h +++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h @@ -24,10 +24,6 @@ Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper); Tensor _th_potri(const Tensor & self, bool upper); Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src); Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training); -std::tuple _thnn_conv2d_forward_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output, Tensor & columns, Tensor & ones); -std::tuple _thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const optional & bias, IntArrayRef stride, IntArrayRef padding); -std::tuple _thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones); -std::tuple _thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones, std::array output_mask); } // namespace th } // namespace legacy diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index 30c61a3e8b355..4ead51e6bd26e 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -175,272 +175,6 @@ Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src) { } return self; } -std::tuple _thnn_conv2d_forward_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output, Tensor & columns, Tensor & ones) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); - const Tensor& bias = *bias_maybe_owned; - - const OptionalDeviceGuard device_guard(device_of(self)); - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); - auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaDoubleSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); - auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); - auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaHalfSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - break; - } - case ScalarType::BFloat16: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); - auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type); - THNN_CudaBFloat16SpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - break; - } - default: - AT_ERROR("_thnn_conv2d_forward_out not supported on CUDAType for ", dispatch_scalar_type); - } - return std::tuple(output, columns, ones); -} -std::tuple _thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional& bias_opt, IntArrayRef stride, IntArrayRef padding) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); - const Tensor& bias = *bias_maybe_owned; - - const OptionalDeviceGuard device_guard(device_of(self)); - auto dispatch_scalar_type = infer_scalar_type(self); - auto output_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto output = Tensor(c10::intrusive_ptr::reclaim(output_)); - auto columns_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto columns = Tensor(c10::intrusive_ptr::reclaim(columns_)); - auto ones_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto ones = Tensor(c10::intrusive_ptr::reclaim(ones_)); - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); - auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - THNN_CudaDoubleSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); - auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - THNN_CudaSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); - auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - THNN_CudaHalfSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - break; - } - case ScalarType::BFloat16: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3); - auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - THNN_CudaBFloat16SpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - break; - } - default: - AT_ERROR("_thnn_conv2d_forward not supported on CUDAType for ", dispatch_scalar_type); - } - return std::tuple(output, columns, ones); -} -std::tuple _thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones) { - const OptionalDeviceGuard device_guard(device_of(self)); - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - if (grad_input_) THNN_CudaDoubleSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - if (grad_weight_ || grad_bias_) THNN_CudaDoubleSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); - break; - } - case ScalarType::Float: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - if (grad_input_) THNN_CudaSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - if (grad_weight_ || grad_bias_) THNN_CudaSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); - break; - } - case ScalarType::Half: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - if (grad_input_) THNN_CudaHalfSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - if (grad_weight_ || grad_bias_) THNN_CudaHalfSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); - break; - } - case ScalarType::BFloat16: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type); - if (grad_input_) THNN_CudaBFloat16SpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - if (grad_weight_ || grad_bias_) THNN_CudaBFloat16SpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); - break; - } - default: - AT_ERROR("_thnn_conv2d_backward_out not supported on CUDAType for ", dispatch_scalar_type); - } - return std::tuple(grad_input, grad_weight, grad_bias); -} -std::tuple _thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones, std::array output_mask) { - const OptionalDeviceGuard device_guard(device_of(self)); - auto dispatch_scalar_type = infer_scalar_type(self); - auto grad_input_ = output_mask[0] ? c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; - auto grad_input = Tensor(c10::intrusive_ptr::reclaim(grad_input_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_input_)); - auto grad_weight_ = output_mask[1] ? c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; - auto grad_weight = Tensor(c10::intrusive_ptr::reclaim(grad_weight_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_weight_)); - auto grad_bias_ = output_mask[2] ? c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr; - auto grad_bias = Tensor(c10::intrusive_ptr::reclaim(grad_bias_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_bias_)); - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - if (grad_input_) THNN_CudaDoubleSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - if (grad_weight_ || grad_bias_) THNN_CudaDoubleSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); - break; - } - case ScalarType::Float: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - if (grad_input_) THNN_CudaSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - if (grad_weight_ || grad_bias_) THNN_CudaSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); - break; - } - case ScalarType::Half: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - if (grad_input_) THNN_CudaHalfSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - if (grad_weight_ || grad_bias_) THNN_CudaHalfSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); - break; - } - case ScalarType::BFloat16: { - auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4); - auto stride_ = check_intlist<2>(stride, "stride", 5); - auto padding_ = check_intlist<2>(padding, "padding", 6); - auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type); - if (grad_input_) THNN_CudaBFloat16SpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]); - if (grad_weight_ || grad_bias_) THNN_CudaBFloat16SpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1); - break; - } - default: - AT_ERROR("_thnn_conv2d_backward not supported on CUDAType for ", dispatch_scalar_type); - } - return std::tuple(grad_input, grad_weight, grad_bias); -} } // namespace th } // namespace legacy diff --git a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu index ede7e1fb39b29..bf3f8ac0a6eff 100644 --- a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu +++ b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu @@ -1,12 +1,482 @@ #include -#include -namespace at { -namespace native { +#include +#include +#include +#include +#include +#include -std::tuple slow_conv2d_backward_out_cuda(const Tensor& grad_output, - const Tensor& self, - const Tensor& weight, +namespace at { namespace native { +namespace { + +void slow_conv2d_shape_check( + const Tensor& input, const Tensor& grad_output, + const Tensor& weight, const Tensor& bias, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + bool weight_nullable) { + TORCH_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got kH: ", kH, " kW: ", kW); + TORCH_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got dH: ", dH, " dW: ", dW); + + TORCH_CHECK(weight_nullable || weight.defined(), + "weight tensor is expected to be non-nullable"); + TORCH_CHECK(!weight.defined() || + ((weight.numel() > 0) && (weight.dim() == 2)), + "non-empty 2D weight tensor expected, but got: ", weight.sizes()); + TORCH_CHECK(!bias.defined() || (bias.dim() == 1 && bias.sizes()[0] == weight.sizes()[0]), + "Expected bias to have shape [", weight.sizes()[0], "] but got ", bias.sizes()); + + const auto in_sizes = input.sizes(); + constexpr int ndim = 4; + constexpr int dimf = 1; + constexpr int dimh = 2; + constexpr int dimw = 3; + TORCH_CHECK(in_sizes.size() == ndim, "Expected 4D input tensor, but got ", in_sizes); + + // Allow for empty batch size but not other dimensions + const bool valid_empty = c10::multiply_integers(in_sizes.slice(1)) != 0; + TORCH_CHECK(valid_empty, "non-empty input tensor expected but got: ", in_sizes); + + int64_t inputHeight = in_sizes[dimh]; + int64_t inputWidth = in_sizes[dimw]; + + int64_t exactInputHeight = inputHeight + 2 * padH; + int64_t exactInputWidth = inputWidth + 2 * padW; + + TORCH_CHECK(exactInputHeight >= kH && exactInputWidth >= kW, + "Calculated padded input size per channel: ", + IntArrayRef{exactInputHeight, exactInputWidth}, + ". Kernel size: ", IntArrayRef{kH, kW}, + ". Kernel size can't be greater than actual input size"); + + // NOTE: can't use conv_output_size if the weight isn't defined + auto outputHeight = div_rtn(exactInputHeight - kH, dH) + 1; + auto outputWidth = div_rtn(exactInputWidth - kW, dW) + 1; + + TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1, + "Given input size per channel: ", + IntArrayRef{inputHeight, inputWidth}, + ". Calculated output size per channel: ", + IntArrayRef{outputHeight, outputWidth}, + ". Output size is too small"); + + if (weight.defined()) { + const auto w_sizes = weight.sizes(); + int64_t nInputPlane = w_sizes[1]; + if (w_sizes.size() == 2) { + nInputPlane /= (kH * kW); + } + TORCH_CHECK(in_sizes[dimf] == nInputPlane, + "Expected input dim ", dimf, " to have size ", nInputPlane, + " but got ", in_sizes[dimf]); + } + + if (grad_output.defined()) { + const auto gO_sizes = grad_output.sizes(); + TORCH_CHECK(gO_sizes.size() == ndim, + "Expected grad_output to have ", ndim, + " dimensions but got shape", gO_sizes); + + if (weight.defined()) { + const auto w_sizes = weight.sizes(); + TORCH_CHECK(gO_sizes[dimf] == w_sizes[0], + "Expected dim ", dimf, " to have size ", w_sizes[0], + " but got ", gO_sizes[dimf]); + } else if (bias.defined()) { + const auto b_sizes = bias.sizes(); + int64_t nOutputPlane = b_sizes.size() == 0 ? 1 : b_sizes[0]; + TORCH_CHECK(gO_sizes[dimf] == nOutputPlane, + "Expected grad_output dim ", dimf, " to have size ", + nOutputPlane, " but got ", gO_sizes[dimf]); + } + TORCH_CHECK(gO_sizes[dimh] == outputHeight, + "Expected grad_output dim ", dimh, " to have size ", + outputHeight, " but got ", gO_sizes[dimh]); + TORCH_CHECK(gO_sizes[dimw] == outputWidth, + "Expected grad_output dim ", dimw, " to have size ", + outputWidth, " but got ", gO_sizes[dimw]); + } +} + +Tensor new_view_weight_MM2d(const Tensor& weight_) { + auto weight = weight_.expect_contiguous(); + const auto w_sizes = weight->sizes(); + TORCH_CHECK(w_sizes.size() == 4); + int64_t s1 = w_sizes[0]; + int64_t s2 = c10::multiply_integers(w_sizes.slice(1)); + return weight->view({s1, s2}); +} + +void slow_conv2d_forward( + const Tensor &input, + const Tensor &output, + const Tensor &weight_, + const Tensor &bias, + const Tensor &columns, + const Tensor &ones_, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW) { + auto weight = new_view_weight_MM2d(weight_); + slow_conv2d_shape_check( + input, {}, weight, bias, kH, kW, dH, dW, padH, padW, /*weight_nullable*/false); + + TORCH_CHECK(!bias.defined() || bias.is_contiguous(), + "bias tensor has to be contiguous"); + + constexpr int ndim = 4; + constexpr int dimf = 1; + constexpr int dimh = 2; + constexpr int dimw = 3; + + auto in_sizes = input.sizes(); + int64_t batchSize = in_sizes[0]; + int64_t nInputPlane = in_sizes[dimf]; + int64_t inputHeight = in_sizes[dimh]; + int64_t inputWidth = in_sizes[dimw]; + int64_t nOutputPlane = weight.sizes()[0]; + int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + + // Resize output + resize_output(output, {batchSize, nOutputPlane, outputHeight, outputWidth}); + + // Resize temporary columns + resize_output(columns, {nInputPlane * kW * kH, outputHeight * outputWidth}); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + Tensor ones; + if (bias.defined()) { + ones = at::ones({outputHeight, outputWidth}, input.options()); + } + const bool requires_columns = ( + kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0); + + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), + "slow_conv2d_cuda", [&] { + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + auto input_n = input.select(0, elt); + auto output_n = output.select(0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t n_ = outputHeight * outputWidth; + int64_t k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias.defined()) { + at::cuda::blas::gemm( + 't', 'n', + n_, m_, k_, + scalar_t(1), + ones.data_ptr(), k_, + bias.data_ptr(), k_, + scalar_t(0), + output_n.data_ptr(), n_ + ); + } else { + output_n.zero_(); + } + + if (requires_columns) { + // Extract columns: + at::native::im2col( + c10::cuda::getCurrentCUDAStream(), + input_n.data_ptr(), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + 1, 1, + columns.data_ptr() + ); + } + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = columns.size(1); + int64_t k = nInputPlane*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + auto gemm_in_ptr = requires_columns ? + columns.data_ptr() : + input_n.data_ptr(); + at::cuda::blas::gemm( + 'n', 'n', + n, m, k, + scalar_t(1), + gemm_in_ptr, n, + weight.data_ptr(), k, + scalar_t(1), + output_n.data_ptr(), n + ); + } + }); +} + +void slow_conv2d_backward( + const Tensor &input, + const Tensor &grad_output, + const Tensor &grad_input, + const Tensor &weight_, + const Tensor &grad_columns, + const Tensor &ones, + int kH, int kW, + int dH, int dW, + int padH, int padW) { + Tensor weight = new_view_weight_MM2d(weight_); + slow_conv2d_shape_check(input, grad_output, weight, {}, + kH, kW, dH, dW, padH, padW, /*weight_nullable=*/false); + + // Params + auto weight_sizes = weight.sizes(); + int nInputPlane = weight_sizes[1]/(kW*kH); + int nOutputPlane = weight_sizes[0]; + + TORCH_INTERNAL_ASSERT(grad_output.is_contiguous()); + + auto input_sizes = input.sizes(); + int64_t inputWidth = input_sizes[3]; + int64_t inputHeight = input_sizes[2]; + auto output_sizes = grad_output.sizes(); + int64_t outputWidth = output_sizes[3]; + int64_t outputHeight = output_sizes[2]; + + // Batch size + input planes + int64_t batchSize = input_sizes[0]; + + // Resize output + resize_output(grad_input, input_sizes); + TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); + + // Resize temporary columns + resize_output(grad_columns, {nInputPlane*kW*kH, outputHeight*outputWidth}); + TORCH_CHECK(grad_columns.is_contiguous(), "grad_columns must be contiguous"); + + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), + "slow_conv2d_backward_cuda", [&] { + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + auto grad_input_n = grad_input.select(0, elt); + auto grad_output_n = grad_output.select(0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nInputPlane*kW*kH; + int64_t n = grad_columns.sizes()[1]; + int64_t k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + at::cuda::blas::gemm( + 'n', 't', + n, m, k, + scalar_t(1), + grad_output_n.data_ptr(), n, + weight.data_ptr(), m, + scalar_t(0), + grad_columns.data_ptr(), n + ); + + // Unpack columns back into input: + using acc_t = at::acc_type; + at::native::col2im( + c10::cuda::getCurrentCUDAStream(), + grad_columns.data_ptr(), + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, grad_input_n.data_ptr() + ); + } + }); +} + +void slow_conv2d_grad_weight_bias( + const Tensor &input, + const Tensor &grad_output, + const Tensor &grad_weight_, + const Tensor &grad_bias, + const Tensor &columns, + const Tensor &ones, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW) { + if (grad_weight_.defined()) { + TORCH_CHECK(grad_weight_.is_contiguous(), "grad_weight needs to be contiguous"); + } + if (grad_bias.defined()) { + TORCH_CHECK(grad_bias.is_contiguous(), "grad_bias needs to be contiguous"); + TORCH_CHECK(ones.is_contiguous(), "ones needs to be contiguous"); + } + + auto grad_weight = new_view_weight_MM2d(grad_weight_); + slow_conv2d_shape_check(input, grad_output, grad_weight, grad_bias, + kH, kW, dH, dW, padH, padW, /*weight_nullable=*/true); + + // Params + TORCH_INTERNAL_ASSERT(input.is_contiguous()); + TORCH_INTERNAL_ASSERT(grad_output.is_contiguous()); + + auto input_sizes = input.sizes(); + int64_t nInputPlane = input_sizes[1]; + int64_t nOutputPlane = grad_output.sizes()[1]; + + int64_t inputWidth = input_sizes[3]; + int64_t inputHeight = input_sizes[2]; + int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + + // Batch size + input planes + int64_t batchSize = input_sizes[0]; + + // Define a buffer of ones, for bias accumulation + if (ones.defined() && ones.numel() < outputHeight * outputWidth) { + ones.resize_({outputHeight, outputWidth}); + ones.fill_(1); + } + + // Resize temporary columns + resize_output(columns, {nInputPlane * kH * kW, outputHeight * outputWidth}); + + const bool requires_columns = ( + kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0); + + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), + "slow_conv2d_grad_weight_bias_cuda", [&] { + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + auto grad_output_n = grad_output.select(0, elt); + + // Do Weight: + if (grad_weight.defined()) { + // Matrix mulitply per output: + auto input_n = input.select(0, elt); + + if (requires_columns) { + // Extract columns: + at::native::im2col( + c10::cuda::getCurrentCUDAStream(), + input_n.data_ptr(), + nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + kH, kW, padH, padW, dH, dW, + 1, 1, + columns.data_ptr() + ); + } + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m = nOutputPlane; + int64_t n = nInputPlane*kW*kH; + int64_t k = columns.sizes()[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + auto gemm_in_ptr = requires_columns ? + columns.data_ptr() : + input_n.data_ptr(); + at::cuda::blas::gemm( + 't', 'n', + n, m, k, + scalar_t(1), + gemm_in_ptr, k, + grad_output_n.data_ptr(), k, + scalar_t(1), + grad_weight.data_ptr(), n + ); + } + + // Do Bias: + if (grad_bias.defined()) { + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + int64_t m_ = nOutputPlane; + int64_t k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + at::cuda::blas::gemv( + 't', + k_, m_, + scalar_t(1), + grad_output_n.data_ptr(), k_, + ones.data_ptr(), 1, + scalar_t(1), + grad_bias.data_ptr(), 1 + ); + } + } + }); +} + +} // namespace (anonymous) + + +std::tuple slow_conv2d_forward_out_cuda( + const Tensor &self_, + const Tensor &weight_, + IntArrayRef kernel_size, + const c10::optional &bias_, + IntArrayRef stride, + IntArrayRef padding, + Tensor &output, + Tensor &finput, + Tensor &fgrad_input) { + TORCH_CHECK(kernel_size.size() == 2); + TORCH_CHECK(stride.size() == 2); + TORCH_CHECK(padding.size() == 2); + + auto self = self_.expect_contiguous(); + auto weight = weight_.expect_contiguous(); + auto bias = [&] { + if (bias_.has_value() && bias_->defined()) { + return bias_->expect_contiguous(); + } + return MaybeOwned::owned(c10::in_place); + }(); + + slow_conv2d_forward( + *self, + output, + *weight, + *bias, + finput, + fgrad_input, + kernel_size[0], kernel_size[1], + stride[0], stride[1], + padding[0], padding[1] + ); + return std::tuple{ + output, finput, fgrad_input}; +} + +std::tuple slow_conv2d_forward_cuda( + const Tensor &self, + const Tensor &weight, + IntArrayRef kernel_size, + const c10::optional &bias, + IntArrayRef stride, + IntArrayRef padding) { + auto output = at::empty({0}, self.options()); + auto finput = at::empty({0}, self.options()); + auto fgrad_input = at::empty({0}, self.options()); + return slow_conv2d_forward_out_cuda( + self, weight, kernel_size, bias, stride, padding, output, finput, fgrad_input); +} + +std::tuple slow_conv2d_backward_out_cuda( + const Tensor& grad_output_, + const Tensor& self_, + const Tensor& weight_, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, @@ -16,17 +486,42 @@ std::tuple slow_conv2d_backward_out_cuda(const Tensor Tensor& grad_weight, Tensor& grad_bias) { if (grad_weight.defined()) { - grad_weight.resize_(weight.sizes()); + resize_output(grad_weight, weight_.sizes()); grad_weight.zero_(); } if (grad_bias.defined()) { - grad_bias.resize_({ weight.size(0) }); + resize_output(grad_bias, {weight_.sizes()[0]}); grad_bias.zero_(); } - return legacy::cuda::_thnn_conv2d_backward_out(grad_input, grad_weight, grad_bias, - grad_output, self, weight, - kernel_size, stride, padding, - finput, fgrad_input); + auto grad_output = grad_output_.expect_contiguous(); + if (grad_input.defined()) { + resize_output(grad_input, self_.sizes()); + auto weight = weight_.expect_contiguous(); + + slow_conv2d_backward( + self_, *grad_output, + grad_input, *weight, + finput, fgrad_input, + kernel_size[0], kernel_size[1], + stride[0], stride[1], + padding[0], padding[1]); + } + if (grad_weight.defined() || grad_bias.defined()) { + auto self = self_.expect_contiguous(); + slow_conv2d_grad_weight_bias( + *self, + *grad_output, + grad_weight, + grad_bias, + finput, + fgrad_input, + kernel_size[0], kernel_size[1], + stride[0], stride[1], + padding[0], padding[1] + ); + } + return std::tuple{ + grad_input, grad_weight, grad_bias}; } std::tuple slow_conv2d_backward_cuda( diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 40245cc7607af..9bce764b1ee1a 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -9498,13 +9498,13 @@ python_module: nn dispatch: CPU: slow_conv2d_forward_out_cpu - CUDA: legacy::cuda::_thnn_conv2d_forward_out + CUDA: slow_conv2d_forward_out_cuda - func: thnn_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> (Tensor output, Tensor finput, Tensor fgrad_input) python_module: nn dispatch: CPU: slow_conv2d_forward_cpu - CUDA: legacy::cuda::_thnn_conv2d_forward + CUDA: slow_conv2d_forward_cuda - func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt index 55197277b3779..f84005e7e92f6 100644 --- a/aten/src/THCUNN/CMakeLists.txt +++ b/aten/src/THCUNN/CMakeLists.txt @@ -1,5 +1,4 @@ set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} -${CMAKE_CURRENT_SOURCE_DIR}/SpatialConvolutionMM.cu PARENT_SCOPE) set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} diff --git a/aten/src/THCUNN/SpatialConvolutionMM.cu b/aten/src/THCUNN/SpatialConvolutionMM.cu deleted file mode 100644 index 020bfa1ebf8ce..0000000000000 --- a/aten/src/THCUNN/SpatialConvolutionMM.cu +++ /dev/null @@ -1,13 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu deleted file mode 100644 index af492b3e7da02..0000000000000 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ /dev/null @@ -1,499 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/SpatialConvolutionMM.cu" -#else - -#include -#include - -static inline void THNN_(SpatialConvolutionMM_shapeCheck)( - THCState *state, - THCTensor *input, THCTensor *gradOutput, - THCTensor *weight, THCTensor *bias, - int kH, int kW, int dH, int dW, int padH, int padW, - int weight_nullable) { - THArgCheck(kW > 0 && kH > 0, 9, - "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); - THArgCheck(dW > 0 && dH > 0, 11, - "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); - - if (weight != NULL) { - THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, - "non-empty 2D or 4D weight tensor expected, but got: %s"); - if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0)); - } - } else if (!weight_nullable) { - THError("weight tensor is expected to be non-nullable"); - } - - int ndim = input->dim(); - int dimf = 0; - int dimh = 1; - int dimw = 2; - - if (ndim == 4) { - dimf++; - dimh++; - dimw++; - } - - // Allow for empty batch size but not other dimensions - bool valid_empty = false; - if (ndim == 3) { - valid_empty = input->size(0) == 0 && input->size(1) != 0 && input->size(2) != 0; - } else if (ndim == 4) { - valid_empty = input->size(0) == 0 && input->size(1) != 0 && input->size(2) != 0 && input->size(3) != 0; - } - - - THCUNN_argCheck(state, (!input->is_empty() || valid_empty) && (ndim == 3 || ndim == 4), 2, input, - "non-empty 3D or 4D input tensor expected but got: %s"); - - int64_t inputHeight = input->size(dimh); - int64_t inputWidth = input->size(dimw); - - int64_t exactInputHeight = inputHeight + 2 * padH; - int64_t exactInputWidth = inputWidth + 2 * padW; - - if (exactInputHeight < kH || exactInputWidth < kW) { - THError("Calculated padded input size per channel: (%ld x %ld). " - "Kernel size: (%d x %d). Kernel size can't be greater than actual input size", - exactInputHeight, exactInputWidth, kH, kW); - } - - int64_t outputHeight = div_rtn(exactInputHeight - kH, dH) + 1; - int64_t outputWidth = div_rtn(exactInputWidth - kW, dW) + 1; - - if (outputWidth < 1 || outputHeight < 1) { - THError("Given input size per channel: (%ld x %ld). " - "Calculated output size per channel: (%ld x %ld). Output size is too small", - inputHeight, inputWidth, outputHeight, outputWidth); - } - - if (weight != NULL) { - int64_t nInputPlane = weight->size(1); - if (weight->dim() == 2) { - nInputPlane /= (kH * kW); - } - THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); - } - - if (gradOutput != NULL) { - if (weight != NULL) { - int64_t nOutputPlane = weight->size(0); - THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); - } else if (bias != NULL) { - int64_t nOutputPlane = bias->dim() == 0 ? 1 : bias->size(0); - THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); - } - THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); - THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth); - } -} - -static THCTensor* THNN_(newViewWeightMM2d)(THCState *state, THCTensor *weight) { - weight = THCTensor_(newContiguous)(state, weight); - if (weight->dim() == 4) { - int64_t s1 = weight->size(0); - int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); - THCTensor *old_weight = weight; - weight = THTensor_wrap(weight).view({s1, s2}).unsafeReleaseTensorImpl(); - THCTensor_(free)(state, old_weight); - } - return weight; -} - -void THNN_(SpatialConvolutionMM_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - THCTensor *weight, - THCTensor *bias, - THCTensor *columns, - THCTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH) { - THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); - if (bias) { - THCUNN_assertSameGPU(state, 2, weight, bias); - } - weight = THNN_(newViewWeightMM2d)(state, weight); - THNN_(SpatialConvolutionMM_shapeCheck) - (state, input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, 0); - THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5, - "bias tensor has to be contiguous"); - - int ndim = input->dim(); - int dimf = 0; - int dimh = 1; - int dimw = 2; - - if (ndim == 4) { - dimf++; - dimh++; - dimw++; - } - - int64_t nInputPlane = input->size(dimf); - int64_t inputHeight = input->size(dimh); - int64_t inputWidth = input->size(dimw); - int64_t nOutputPlane = weight->size(0); - int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; - int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; - - - input = THCTensor_(newContiguous)(state, input); - int is_batch = 1; - if (input->dim() == 3) { - // Force batch - is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); - } - - // Batch size + input planes - int64_t batchSize = input->size(0); - - // Resize output - THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); - - // Resize temporary columns - THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); - - // Define a buffer of ones, for bias accumulation - // Note: this buffer can be shared with other modules, it only ever gets increased, - // and always contains ones. - if (bias) { - if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { - // Resize plane and fill with ones... - THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); - THCTensor_(fill)(state, ones, ScalarConvert::to(1)); - } - } - - // Helpers - THCTensor *input_n = THCTensor_(new)(state); - THCTensor *output_n = THCTensor_(new)(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCTensor_(select)(state, input_n, input, 0, elt); - THCTensor_(select)(state, output_n, output, 0, elt); - - // Do Bias first: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m_ = nOutputPlane; - int64_t n_ = outputHeight * outputWidth; - int64_t k_ = 1; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - if (bias) { - at::cuda::blas::gemm( - 't', 'n', - n_, m_, k_, - ScalarConvert::to(1), - THCTensor_(data)(state, ones), k_, - THCTensor_(data)(state, bias), k_, - ScalarConvert::to(0), - THCTensor_(data)(state, output_n), n_ - ); - } else { - THCTensor_(zero)(state, output_n); - } - - if (kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0) { - // Extract columns: - at::native::im2col( - c10::cuda::getCurrentCUDAStream(), - THCTensor_(data)(state, input_n), - nInputPlane, inputHeight, inputWidth, - outputHeight, outputWidth, - kH, kW, padH, padW, dH, dW, - 1, 1, - columns->data() - ); - } - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = nOutputPlane; - int64_t n = columns->size(1); - int64_t k = nInputPlane*kH*kW; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - auto gemm_in_ptr = - (kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0) - ? THCTensor_(data)(state, columns) - : THCTensor_(data)(state, input_n); - at::cuda::blas::gemm( - 'n', 'n', - n, m, k, - ScalarConvert::to(1), - gemm_in_ptr, n, - THCTensor_(data)(state, weight), k, - ScalarConvert::to(1), - THCTensor_(data)(state, output_n), n - ); - } - - // Free - THCTensor_(free)(state, input_n); - THCTensor_(free)(state, output_n); - - // Resize output - if (is_batch == 0) { - THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); - THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); - } - - THCTensor_(free)(state, input); - THCTensor_(free)(state, weight); -} - -void THNN_(SpatialConvolutionMM_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput, - THCTensor *weight, - THCTensor *gradColumns, - THCTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH) { - THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, - gradColumns, gradInput); - weight = THNN_(newViewWeightMM2d)(state, weight); - - THNN_(SpatialConvolutionMM_shapeCheck) - (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, 0); - - // Params - int nInputPlane = weight->dim() == 2 ? weight->size(1)/(kW*kH) : weight->size(1); - int nOutputPlane = weight->size(0); - - input = THCTensor_(newContiguous)(state, input); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - - int is_batch = 1; - if (input->dim() == 3) { - // Force batch - is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); - } - - int64_t inputWidth = input->size(3); - int64_t inputHeight = input->size(2); - int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; - int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; - - // Batch size + input planes - int64_t batchSize = input->size(0); - - // Resize output - THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); - - // Resize temporary columns - THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); - - // Helpers - THCTensor *gradInput_n = THCTensor_(new)(state); - THCTensor *gradOutput_n = THCTensor_(new)(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per sample: - THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); - THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = nInputPlane*kW*kH; - int64_t n = gradColumns->size(1); - int64_t k = nOutputPlane; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - at::cuda::blas::gemm( - 'n', 't', - n, m, k, - ScalarConvert::to(1), - THCTensor_(data)(state, gradOutput_n), n, - THCTensor_(data)(state, weight), m, - ScalarConvert::to(0), - THCTensor_(data)(state, gradColumns), n - ); - - // Unpack columns back into input: - at::native::col2im( - c10::cuda::getCurrentCUDAStream(), - THCTensor_(data)(state, gradColumns), - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, - 1, 1, THCTensor_(data)(state, gradInput_n) - ); - } - - // Free - THCTensor_(free)(state, gradInput_n); - THCTensor_(free)(state, gradOutput_n); - THCTensor_(free)(state, weight); - - // Resize output - if (is_batch == 0) { - THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); - THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); - THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); - } - - THCTensor_(free)(state, input); - THCTensor_(free)(state, gradOutput); -} - -void THNN_(SpatialConvolutionMM_accGradParameters)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradWeight, - THCTensor *gradBias, - THCTensor *columns, - THCTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH, - accreal scale_) { - scalar_t scale = ScalarConvert::to(scale_); - THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, gradBias, columns, ones); - if (gradWeight) { - THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous"); - gradWeight = THNN_(newViewWeightMM2d)(state, gradWeight); - } - if (gradBias) { - THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous"); - THArgCheck(THCTensor_(isContiguous)(state, ones), 7, "ones needs to be contiguous"); - } - - THNN_(SpatialConvolutionMM_shapeCheck) - (state, input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, 1); - - // Params - input = THCTensor_(newContiguous)(state, input); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - - int is_batch = 1; - if (input->dim() == 3) { - // Force batch - is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); - } - - int64_t nInputPlane = input->size(1); - int64_t nOutputPlane = gradOutput->size(1); - - int64_t inputWidth = input->size(3); - int64_t inputHeight = input->size(2); - int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; - int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; - - // Batch size + input planes - int64_t batchSize = input->size(0); - - // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { - // Resize plane and fill with ones... - THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); - THCTensor_(fill)(state, ones, ScalarConvert::to(1)); - } - - // Resize temporary columns - THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); - - // Helpers - THCTensor *input_n = THCTensor_(new)(state); - THCTensor *gradOutput_n = THCTensor_(new)(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); - - // Do Weight: - if (gradWeight) { - // Matrix mulitply per output: - THCTensor_(select)(state, input_n, input, 0, elt); - - if (kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0) { - // Extract columns: - at::native::im2col( - c10::cuda::getCurrentCUDAStream(), - THCTensor_(data)(state, input_n), - nInputPlane, inputHeight, inputWidth, - outputHeight, outputWidth, - kH, kW, padH, padW, dH, dW, - 1, 1, - columns->data() - ); - } - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = nOutputPlane; - int64_t n = nInputPlane*kW*kH; - int64_t k = columns->size(1); - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - auto gemm_in_ptr = - (kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0) - ? THCTensor_(data)(state, columns) - : THCTensor_(data)(state, input_n); - at::cuda::blas::gemm( - 't', 'n', - n, m, k, - scale, - gemm_in_ptr, k, - THCTensor_(data)(state, gradOutput_n), k, - ScalarConvert::to(1), - THCTensor_(data)(state, gradWeight), n - ); - } - - // Do Bias: - if (gradBias) { - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m_ = nOutputPlane; - int64_t k_ = outputHeight * outputWidth; - - // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) - //#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_BFLOAT16) - at::cuda::blas::gemv( - 't', - k_, m_, - scale, - THCTensor_(data)(state, gradOutput_n), k_, - THCTensor_(data)(state, ones), 1, - ScalarConvert::to(1), - THCTensor_(data)(state, gradBias), 1 - ); - } - } - - // Free - THCTensor_(free)(state, input_n); - THCTensor_(free)(state, gradOutput_n); - if (gradWeight) - THCTensor_(free)(state, gradWeight); - - // Resize - if (is_batch == 0) { - THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); - THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); - } - - THCTensor_(free)(state, input); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h index 87a6105293057..d624fdd090177 100644 --- a/aten/src/THCUNN/generic/THCUNN.h +++ b/aten/src/THCUNN/generic/THCUNN.h @@ -26,50 +26,4 @@ TORCH_CUDA_CU_API void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor* weights, // [OPTIONAL] accreal margin); -TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_updateOutput)( - THCState* state, - THCTensor* input, - THCTensor* output, - THCTensor* weight, - THCTensor* bias, // [OPTIONAL] - THCTensor* columns, - THCTensor* ones, - int kW, - int kH, - int dW, - int dH, - int padW, - int padH); - -TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_updateGradInput)( - THCState* state, - THCTensor* input, - THCTensor* gradOutput, - THCTensor* gradInput, - THCTensor* weight, - THCTensor* columns, - THCTensor* ones, - int kW, - int kH, - int dW, - int dH, - int padW, - int padH); - -TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_accGradParameters)( - THCState* state, - THCTensor* input, - THCTensor* gradOutput, - THCTensor* gradWeight, - THCTensor* gradBias, // [OPTIONAL] - THCTensor* columns, - THCTensor* ones, - int kW, - int kH, - int dW, - int dH, - int padW, - int padH, - accreal scale); - #endif From 2d58f3f56d7be4eced403454d561179c0d5527c5 Mon Sep 17 00:00:00 2001 From: Akshit Khurana Date: Fri, 20 Aug 2021 21:08:59 -0700 Subject: [PATCH 122/530] NNAPI: Support const values in binary ops Summary: NNAPI converter failed with 1 const value and one tensor earlier Code suggestions from dreiss Test Plan: pytest test/test_nnapi.py::TestNNAPI::test_pointwise_binary Imported from OSS Reviewed By: anshuljain1 Differential Revision: D28893881 fbshipit-source-id: 59240373fb03c6fdafa4cb2fa4d8408dd20092f6 --- test/test_nnapi.py | 25 +++++++++++++++++++++++++ torch/backends/_nnapi/serializer.py | 25 ++++++++++++++++++------- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/test/test_nnapi.py b/test/test_nnapi.py index 19efa7f0ae738..d70bebf547a1e 100644 --- a/test/test_nnapi.py +++ b/test/test_nnapi.py @@ -49,6 +49,7 @@ def check( convert_args=None, atol_rtol=None, limit=None, + expected_memory_format=None ): with torch.no_grad(): if isinstance(arg_or_args, torch.Tensor): @@ -76,6 +77,8 @@ def check( # Too many mismatches. Re-run the check with no tolerance # to get a nice message. self.assertEqual(eager_output, nnapi_output, atol=0, rtol=0) + if expected_memory_format: + self.assertTrue(nnapi_out.is_contiguous(memory_format=expected_memory_format)) def float_and_quant_and_nhwc(self, inp_float, scale, zero_point): torch.manual_seed(29) @@ -319,6 +322,28 @@ def forward(self, lhs, rhs): torch.tensor([[3.0, 4.0], [5.0, 6.0]]), ]) + def test_pointwise_binary_const(self): + const = torch.randn(1, 4, 6, 6) + + class ArgPlusConst(torch.nn.Module): + def forward(self, arg): + return arg + const + + class ConstPlusArg(torch.nn.Module): + def forward(self, arg): + return const + arg + + arg_contig = torch.randn(2, 4, 6, 6) + arg_nhwc = nhwc(torch.randn(2, 4, 6, 6)) + + for mod_class in [ArgPlusConst, ConstPlusArg]: + for use_nhwc in [False, True]: + with self.subTest(mod_class=mod_class.__name__, use_nhwc=use_nhwc): + arg = arg_nhwc if use_nhwc else arg_contig + memory_format = torch.channels_last if use_nhwc else torch.contiguous_format + self.check(mod_class(), arg, + expected_memory_format=memory_format) + def test_hardtanh(self): inp = torch.tensor([-2.0, -0.5, 0.5, 2.0, 7.0]) self.check(torch.nn.Hardtanh(), inp) diff --git a/torch/backends/_nnapi/serializer.py b/torch/backends/_nnapi/serializer.py index f85d51a040995..a2530df478833 100644 --- a/torch/backends/_nnapi/serializer.py +++ b/torch/backends/_nnapi/serializer.py @@ -404,8 +404,8 @@ def add_tensor_operand_for_input(self, arg_idx, jitval, tensor): self.compute_operand_shape(operand_id, dim, f"args[{arg_idx}].shape[{dim}]") return operand_id - def add_tensor_operand_for_weight(self, tensor): - toper = self.torch_tensor_to_operand(tensor, DimOrder.UNKNOWN_CONSTANT) + def add_tensor_operand_for_weight(self, tensor, dim_order=DimOrder.UNKNOWN_CONSTANT): + toper = self.torch_tensor_to_operand(tensor, dim_order) operand_id = len(self.operands) self.operands.append(toper) tsize = tensor_size(toper.op_type, toper.shape) @@ -418,6 +418,9 @@ def add_tensor_operand_for_weight(self, tensor): buf_num, offset, tsize)) + # For NHWC NNAPI op, lay out data in the same dim order by permuting torch tensor + if dim_order == DimOrder.CHANNELS_LAST: + tensor = tensor.permute(0, 2, 3, 1) self.used_weights.append(tensor) return operand_id @@ -456,6 +459,9 @@ def add_immediate_int_vector(self, value): array.array("i", value).tobytes(), (len(value),)) + def has_operand_for_jitval(self, jitval): + return jitval in self.jitval_operand_map + def get_tensor_operand_by_jitval(self, jitval): operand_id = self.jitval_operand_map[jitval] return (operand_id, self.operands[operand_id]) @@ -469,11 +475,11 @@ def get_tensor_operand_by_jitval_fixed_size(self, jitval): raise Exception("Flexible size is not supported for this operand.") return op_id, oper - def get_tensor_operand_or_constant(self, jitval): + def get_tensor_operand_or_constant(self, jitval, dim_order=DimOrder.PRESUMED_CONTIGUOUS): operand_id = self.jitval_operand_map.get(jitval) if operand_id is None: _, value = self.get_constant_value(jitval, "TensorType") - operand_id = self.add_tensor_operand_for_weight(value) + operand_id = self.add_tensor_operand_for_weight(value, dim_order) return (operand_id, self.operands[operand_id]) def get_tensor_operand_for_weight(self, jitval): @@ -1233,9 +1239,14 @@ def _do_add_binary(self, node, opcode, fuse_code, *, qparams=None): assert node.inputsAt(0).type().kind() == "TensorType" assert node.inputsAt(1).type().kind() == "TensorType" - # TODO: Should support constant as either operand. - in0_id, in0_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0)) - in1_id, in1_oper = self.get_tensor_operand_by_jitval(node.inputsAt(1)) + if self.has_operand_for_jitval(node.inputsAt(0)): + in0_id, in0_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0)) + in1_id, in1_oper = self.get_tensor_operand_or_constant(node.inputsAt(1), in0_oper.dim_order) + elif self.has_operand_for_jitval(node.inputsAt(1)): + in1_id, in1_oper = self.get_tensor_operand_by_jitval(node.inputsAt(1)) + in0_id, in0_oper = self.get_tensor_operand_or_constant(node.inputsAt(0), in1_oper.dim_order) + else: + raise Exception(f"Can't do a NNAPI binary op: {opcode} on two constants") assert in0_oper.op_type == in1_oper.op_type in0_id, in0_oper, in1_id, in1_oper = self.transpose_for_broadcast( From b2a601ffe54294100d9967b98cc6576675dcdfcf Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Fri, 20 Aug 2021 21:41:19 -0700 Subject: [PATCH 123/530] [Static Runtime] Implement out variant for fb::quantized_linear (#63635) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63635 Reviewed By: ajyu Differential Revision: D30446234 fbshipit-source-id: 1ef014186ff725930a97d0159626f9233ee74030 --- benchmarks/static_runtime/test_scripts.h | 11 +++++ .../static_runtime/test_static_runtime.cc | 14 ++++++ torch/csrc/jit/runtime/static/ops.cpp | 47 +++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 9946c7af02e5a..73380129731ed 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -719,3 +719,14 @@ const auto append_tensor_script = R"JIT( lst.append(a) return lst )JIT"; + +const std::string quantize_script = R"IR( + graph(%input: Tensor, %weights: Tensor): + %scale: float = prim::Constant[value=1.]() + %zero_point: int = prim::Constant[value=1]() + %bias: None = prim::Constant() + %packed_params = quantized::linear_prepack(%weights, %bias) + %1254 = quantized::linear(%input, %packed_params, %scale, %zero_point) + %1249: Tensor = aten::dequantize(%1254) + return (%1249) +)IR"; diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index ec703ef8a2ec1..dfe2c14e9489b 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1172,3 +1172,17 @@ TEST(StaticRuntime, IndividualOps_Append) { testStaticRuntime(append_tensor_script, args_tensor); testStaticRuntime(append_tensor_script, args_tensor, args_tensor_large); } + +TEST(StaticRuntime, QuantizedLinear) { + at::Tensor weight = + at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQInt8); + at::Tensor input = + at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQUInt8); + + at::Tensor weight_2 = + at::quantize_per_tensor(torch::randn({4, 3}), 2, 3, torch::kQInt8); + at::Tensor input_2 = + at::quantize_per_tensor(torch::randn({4, 3}), 2, 3, torch::kQUInt8); + + testStaticRuntime(quantize_script, {input, weight}, {input_2, weight_2}); +} diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index eef5595cee7b2..2543182db138c 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -1528,6 +1528,53 @@ REGISTER_OPERATOR_FUNCTOR(quantized::linear, quantized_linear, [](Node* n) -> SR }; }); +REGISTER_OPERATOR_FUNCTOR( + fb::quantized_linear, + fb_quantized_linear, + [](Node* n) -> SROperator { + if (!n->matches(torch::schema( + "fb::quantized_linear(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase w_prepack, Tensor Y_scale_i, Tensor Y_zero_point_i) -> Tensor"))) { + LogAndDumpSchema(n); + return nullptr; + } + const auto w = toIValue(n->inputs()[1]); + c10::intrusive_ptr packed_weight; + if (w) { + packed_weight = w->toCustomClass(); + } + return [packed_weight](ProcessedNode* p_node) { + const auto& input = p_node->Input(0).toTensor(); + const auto output_scale = p_node->Input(2).toTensor().item().toFloat(); + const auto output_zero_point = + p_node->Input(3).toTensor().item().toLong(); + + if (p_node->Output(0).isNone()) { + p_node->Output(0) = at::native::empty_affine_quantized( + {0}, + c10::kQUInt8, + c10::nullopt, + c10::kCPU, + false, + output_scale, + output_zero_point, + c10::nullopt); + } + auto& out_t = p_node->Output(0).toTensor(); + fastResizeToZero(out_t); + + if (packed_weight) { + packed_weight->apply_out( + input, output_scale, output_zero_point, out_t); + } else { + // Weights could be quantized on the fly + auto packed_weight_tmp = + p_node->Input(1).toCustomClass(); + packed_weight_tmp->apply_out( + input, output_scale, output_zero_point, out_t); + } + }; + }); + REGISTER_OPERATOR_FUNCTOR(aten::full, aten_full, [](Node* n) -> SROperator { if (!n->matches(torch::schema( "aten::full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"))) { From 8871ff29b743948d1225389d5b7068f37b22750b Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Fri, 20 Aug 2021 22:15:55 -0700 Subject: [PATCH 124/530] [sharded_tensor] add readonly tensor properties (#63679) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63679 This PR add read only tensor properties to sharded tensor, to match the torch.Tensor behaviors. Test Plan: test_sharded_tensor_metadata Reviewed By: pritamdamania87 Differential Revision: D30459343 fbshipit-source-id: 9aec8ecfe76479eed25f3b843495e5719ed2956d --- .../_sharded_tensor/test_sharded_tensor.py | 43 +++++++++---------- torch/distributed/_sharded_tensor/api.py | 29 +++++++++++++ 2 files changed, 50 insertions(+), 22 deletions(-) diff --git a/test/distributed/_sharded_tensor/test_sharded_tensor.py b/test/distributed/_sharded_tensor/test_sharded_tensor.py index 5067f301b5595..26a176b1455c1 100644 --- a/test/distributed/_sharded_tensor/test_sharded_tensor.py +++ b/test/distributed/_sharded_tensor/test_sharded_tensor.py @@ -174,19 +174,17 @@ def test_sharded_tensor_metadata(self): sharded_tensor = _sharded_tensor.empty(spec, 10, 20, init_rrefs=True) sharded_tensor_metadata = sharded_tensor.metadata() self.assertEqual(torch.Size([10, 20]), sharded_tensor_metadata.size) - self.assertEqual(torch.float, sharded_tensor_metadata.dtype) - self.assertEqual(torch.strided, sharded_tensor_metadata.layout) - self.assertEqual(False, sharded_tensor_metadata.requires_grad) - self.assertEqual(torch.contiguous_format, sharded_tensor_metadata.memory_format) - self.assertEqual(False, sharded_tensor_metadata.pin_memory) + self.assertEqual(torch.float, sharded_tensor.dtype) + self.assertEqual(torch.strided, sharded_tensor.layout) + self.assertEqual(False, sharded_tensor.requires_grad) + self.assertTrue(sharded_tensor.is_contiguous()) + self.assertFalse(sharded_tensor.is_pinned()) sharded_tensor = _sharded_tensor.empty(spec, 10, 20, requires_grad=True, init_rrefs=True) - sharded_tensor_metadata = sharded_tensor.metadata() - self.assertEqual(True, sharded_tensor_metadata.requires_grad) + self.assertEqual(True, sharded_tensor.requires_grad) sharded_tensor = _sharded_tensor.empty(spec, 10, 20, dtype=torch.double, init_rrefs=True) - sharded_tensor_metadata = sharded_tensor.metadata() - self.assertEqual(torch.double, sharded_tensor_metadata.dtype) + self.assertEqual(torch.double, sharded_tensor.dtype) # Need CPU for pin_memory spec = ChunkShardingSpec( @@ -200,8 +198,12 @@ def test_sharded_tensor_metadata(self): ) sharded_tensor = _sharded_tensor.empty(spec, 10, 20, pin_memory=True, init_rrefs=True) - sharded_tensor_metadata = sharded_tensor.metadata() - self.assertEqual(True, sharded_tensor_metadata.pin_memory) + self.assertEqual(True, sharded_tensor.is_pinned()) + + # test read only properties, they're read only as we can't simply change + # the global metadata without changing the underlying shard's properties + with self.assertRaisesRegex(AttributeError, "can't set attribute"): + sharded_tensor.requires_grad = True @with_comms @skip_if_lt_x_gpu(4) @@ -782,19 +784,17 @@ def test_sharded_tensor_metadata(self): sharded_tensor = _sharded_tensor.empty(spec, 10, 10, init_rrefs=True) sharded_tensor_metadata = sharded_tensor.metadata() self.assertEqual(torch.Size([10, 10]), sharded_tensor_metadata.size) - self.assertEqual(torch.float, sharded_tensor_metadata.dtype) - self.assertEqual(torch.strided, sharded_tensor_metadata.layout) - self.assertEqual(False, sharded_tensor_metadata.requires_grad) - self.assertEqual(torch.contiguous_format, sharded_tensor_metadata.memory_format) - self.assertEqual(False, sharded_tensor_metadata.pin_memory) + self.assertEqual(torch.float, sharded_tensor.dtype) + self.assertEqual(torch.strided, sharded_tensor.layout) + self.assertEqual(False, sharded_tensor.requires_grad) + self.assertTrue(sharded_tensor.is_contiguous()) + self.assertFalse(sharded_tensor.is_pinned()) sharded_tensor = _sharded_tensor.empty(spec, 10, 10, requires_grad=True, init_rrefs=True) - sharded_tensor_metadata = sharded_tensor.metadata() - self.assertEqual(True, sharded_tensor_metadata.requires_grad) + self.assertEqual(True, sharded_tensor.requires_grad) sharded_tensor = _sharded_tensor.empty(spec, 10, 10, dtype=torch.double, init_rrefs=True) - sharded_tensor_metadata = sharded_tensor.metadata() - self.assertEqual(torch.double, sharded_tensor_metadata.dtype) + self.assertEqual(torch.double, sharded_tensor.dtype) # Need CPU for pin_memory spec = EnumerableShardingSpec([ @@ -821,8 +821,7 @@ def test_sharded_tensor_metadata(self): ]) sharded_tensor = _sharded_tensor.empty(spec, 10, 10, pin_memory=True, init_rrefs=True) - sharded_tensor_metadata = sharded_tensor.metadata() - self.assertEqual(True, sharded_tensor_metadata.pin_memory) + self.assertTrue(sharded_tensor.is_pinned()) @with_comms @skip_if_lt_x_gpu(4) diff --git a/torch/distributed/_sharded_tensor/api.py b/torch/distributed/_sharded_tensor/api.py index 2b6720b059a85..5f501b7689e4e 100644 --- a/torch/distributed/_sharded_tensor/api.py +++ b/torch/distributed/_sharded_tensor/api.py @@ -551,6 +551,35 @@ def size(self) -> torch.Size: """ return self._metadata.size + def is_pinned(self) -> bool: + """ + Returns True if the sharded tensor (each local shard) resides in pinned memory. + """ + return self._metadata.pin_memory + + def is_contiguous(self) -> bool: + """ + Returns True if the sharded tensor (each local shard) is contiguous in memory + in the order specified by memory format. + """ + return self._metadata.memory_format == torch.contiguous_format + + @property + def shape(self): + return self._metadata.size + + @property + def requires_grad(self): + return self._metadata.requires_grad + + @property + def dtype(self): + return self._metadata.dtype + + @property + def layout(self): + return self._metadata.layout + def _register_remote_shards(self, remote_shards: List[rpc.RRef[Shard]], rpc_rank: int): self._remote_shards[rpc_rank] = remote_shards From 76da46ccdccd2fa06fd17b58edbaca98100be5ed Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Sat, 21 Aug 2021 03:36:09 -0700 Subject: [PATCH 125/530] Revert D30417127: Remove flag to toggle CPU fusion in the presence of parallelism Test Plan: revert-hammer Differential Revision: D30417127 (https://github.com/pytorch/pytorch/commit/6600bc96517269c608ea47b76b6bda9476c7bcef) Original commit changeset: b77d7c68364f fbshipit-source-id: 6b52fb83a84fe241945e3cb3eeb71050d1d9c8f1 --- test/cpp/tensorexpr/test_te_fuser_pass.cpp | 6 +++++- test/jit/test_profiler.py | 3 +++ test/test_jit_fuser_te.py | 5 +++++ test/test_tensorexpr.py | 4 ++++ torch/csrc/jit/passes/tensorexpr_fuser.cpp | 19 ++++++++++++++++++- torch/csrc/jit/passes/tensorexpr_fuser.h | 2 ++ torch/csrc/jit/python/init.cpp | 2 ++ 7 files changed, 39 insertions(+), 2 deletions(-) diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index 91fb4c2b7582c..8dd616453362b 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -15,15 +15,19 @@ namespace jit { using namespace torch::jit::tensorexpr; struct WithCPUFuser { - WithCPUFuser(bool val = true) : cpuFuserEnabled(canFuseOnCPU()) { + WithCPUFuser(bool val = true) + : cpuFuserEnabled(canFuseOnCPU()), parallel(texprParallelCPUEnabled()) { overrideCanFuseOnCPU(val); + setTexprParallelCPUEnabled(true); } ~WithCPUFuser() { overrideCanFuseOnCPU(cpuFuserEnabled); + setTexprParallelCPUEnabled(parallel); } bool cpuFuserEnabled; + bool parallel; }; TEST(TEFuserPass, FuserPass_1) { diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py index b9ed9d0b78eb5..aa8be0518385f 100644 --- a/test/jit/test_profiler.py +++ b/test/jit/test_profiler.py @@ -29,6 +29,8 @@ def setUp(self): torch._C._debug_set_fusion_group_inlining(False) self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) + self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() + torch._C._jit_set_texpr_parallel_cpu_enabled(True) def tearDown(self): torch._C._jit_set_profiling_executor(self.prev_exec) @@ -40,6 +42,7 @@ def tearDown(self): torch._C._jit_set_texpr_reductions_enabled(self.old_reduction_enabled) torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining) torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) + torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def test_tensor_type_not_determined_by_inputs(self): @torch.jit.script diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index b89caca44a1b2..5e8204a4c7b14 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -85,6 +85,10 @@ def setUp(self): self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) + # TODO: CPU fuser currently is disabled when multithreading. + self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() + torch._C._jit_set_texpr_parallel_cpu_enabled(True) + self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] self.int_dtypes = [ torch.int8, @@ -112,6 +116,7 @@ def tearDown(self): torch._C._jit_set_texpr_fuser_enabled(self.texpr_fuser_state) torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) + torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def assertLastGraphAllFused(self): self.assertAllFused(torch.jit.last_executed_optimized_graph()) diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 47c7e689aa6a4..6353113a1ec4c 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -24,6 +24,9 @@ def setUp(self): torch._C._debug_set_fusion_group_inlining(False) self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) + # TODO: CPU fuser currently is disabled when multithreading. + self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() + torch._C._jit_set_texpr_parallel_cpu_enabled(True) self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] @@ -36,6 +39,7 @@ def tearDown(self): torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuser_state) torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining) torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) + torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def assertLastGraphAllFused(self): self.assertAllFused(torch.jit.last_executed_optimized_graph()) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 52bf4539479df..d4add03506c4f 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -249,6 +250,15 @@ bool isSupported(Node* node) { } // namespace tensorexpr static bool texpr_fuser_enabled_ = true; +static bool texpr_parallel_cpu_enabled = false; + +bool texprParallelCPUEnabled() { + return texpr_parallel_cpu_enabled; +} + +void setTexprParallelCPUEnabled(bool val) { + texpr_parallel_cpu_enabled = val; +} void setTensorExprFuserEnabled(bool val) { texpr_fuser_enabled_ = val; @@ -888,7 +898,14 @@ class TensorExprFuser { return false; } if (device->is_cpu()) { - return canFuseOnCPU(); + // CPU fusion is only supported for single-thread. + if (!canFuseOnCPU()) { + return false; + } + if (at::get_num_threads() == 1 || texprParallelCPUEnabled()) { + return true; + } + return false; } else if (device->is_cuda()) { return canFuseOnGPU(); } else if (device->is_xpu()) { diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.h b/torch/csrc/jit/passes/tensorexpr_fuser.h index 254aebd91d12f..3f6538b7e587a 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.h +++ b/torch/csrc/jit/passes/tensorexpr_fuser.h @@ -24,6 +24,8 @@ TORCH_API void setTensorExprFuserEnabled(bool val); TORCH_API bool tensorExprFuserEnabled(); TORCH_API bool setTexprReductionsEnabled(bool value); TORCH_API bool texprReductionsEnabled(); +TORCH_API bool texprParallelCPUEnabled(); +TORCH_API void setTexprParallelCPUEnabled(bool val); TORCH_API void RemoveProfileNodesAndSpecializeTypes( std::shared_ptr& graph); diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 86b64b8342a7d..c92ab1b46e41c 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -712,6 +712,8 @@ void initJITBindings(PyObject* module) { .def("_jit_texpr_set_fallback_allowed", &tensorexpr::setFallbackAllowed) .def("_jit_set_texpr_reductions_enabled", &setTexprReductionsEnabled) .def("_jit_texpr_reductions_enabled", &texprReductionsEnabled) + .def("_jit_set_texpr_parallel_cpu_enabled", &setTexprParallelCPUEnabled) + .def("_jit_texpr_parallel_cpu_enabled", &texprParallelCPUEnabled) .def( "_jit_set_te_generate_block_code", [](bool gen_block_code) { From 37d60c08e547e63cef8a80a9e187c4d5e3b9b418 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Sat, 21 Aug 2021 03:45:21 -0700 Subject: [PATCH 126/530] Revert D30360382: [nnc] Support thread level parallelism in fused kernels Test Plan: revert-hammer Differential Revision: D30360382 (https://github.com/pytorch/pytorch/commit/d6d86efb1c839ddafd1398d6dab9caa4f31a9f0b) Original commit changeset: 29acf4e932c6 fbshipit-source-id: e0531113135d30eabb172dc1537d5dd6d65dc438 --- test/cpp/tensorexpr/test_kernel.cpp | 30 -------- torch/csrc/jit/tensorexpr/kernel.cpp | 87 ---------------------- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 24 ++---- torch/csrc/jit/tensorexpr/llvm_jit.h | 8 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 7 -- 5 files changed, 8 insertions(+), 148 deletions(-) diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 8d4e48c4a0bff..8f36f54395f49 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -206,36 +206,6 @@ TEST_F(Kernel, _3) { } } -TEST_F(Kernel, ParallelStrided) { - KernelScope kernel_scope; - - const auto graph_string = R"IR( - graph(%0 : Float(5, 3, 40005, strides=[120015, 40005, 1], device=cpu), - %1 : Float(5, 3, 40005, strides=[960120, 160020, 2], device=cpu)): - %2 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %1) - %3 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %2) - return (%3))IR"; - auto graph = std::make_shared(); - parseIR(graph_string, &*graph); - - auto a = at::rand({5, 3, 40005}, TensorOptions(kCPU).dtype(at::kFloat)); - auto b = at::rand({10, 6, 80010}, TensorOptions(kCPU).dtype(at::kFloat)) - .index( - {Slice(None, None, 2), - Slice(None, None, 2), - Slice(None, None, 2)}); - auto ref = a * (a * b); - auto o = at::zeros_like(ref); - TensorExprKernel k(graph); - std::vector inputs = {a, b}; - std::vector stack = fmap(inputs); - k.run(stack); - o = stack[0].toTensor(); - for (size_t i = 0; i < 5 * 3; i++) { - CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]); - } -} - TEST_F(Kernel, DISABLED_Shape_Inference) { // disabled: doesn't do stride propagation, and isn't being used currently diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index c5333b2010610..faacd022e7e0b 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include #include @@ -2488,86 +2487,6 @@ void fuseAllLoops(StmtPtr st) { } } -// Compute the trip count of a loop if it is a constant. -c10::optional tripCount(ForPtr loop) { - auto tc = IRSimplifier::simplify( - cast(ExprHandle(loop->stop()) - ExprHandle(loop->start()))); - if (auto val = to(tc.node())) { - return val->value(); - } - return c10::nullopt; -} - -// Prune innermost loops until iterations satisfies a minimum grain size. -static void pruneByGrainSize(std::vector& loops) { - constexpr int64_t minGrainSize = 32768; - int64_t grainSize = 1; - for (int64_t i = loops.size(); i > 0; i--) { - auto tc = tripCount(loops[i - 1]); - if (!tc) { - break; - } - grainSize *= *tc; - if (grainSize < minGrainSize) { - loops.pop_back(); - } - } -} - -// Retain enough outermost loops to fill the number of threads. -static void pruneByThreadCount(std::vector& loops) { - int64_t trips = 1; - auto threads = at::get_num_threads(); - auto it = loops.begin(); - for (; it != loops.end(); it++) { - if (trips >= threads) { - break; - } - auto tc = tripCount(*it); - if (!tc) { - break; - } - trips *= *tc; - } - loops.erase(it, loops.end()); -} - -// Flatten and parallelize outer loops, subject to a minimum number of elements -// in the inner loop, and a maximum level of thread-level parallelism in the -// outer loops. -template -static void parallelizeOuterLoops(LoopNest& l, Bufs&& bufs) { - for (auto const& buf : bufs) { - auto loops = l.getLoopStmtsFor(buf); - pruneByGrainSize(loops); - pruneByThreadCount(loops); - - // There are no loops to parallelize; give up. - if (loops.size() == 0) { - continue; - } - // The loop nest contains a reduction; give up. - auto reductions = NodeFinder::find(loops[0]); - if (reductions.size() > 0) { - continue; - } - // The loop nest has loop carried dependences; give up. - if (LoopNest::hasLoopCarriedDependence(loops[0])) { - continue; - } - // Try to flatten the outer loops and parallelize them if successful. - ForPtr flattened = nullptr; - if (loops.size() == 1) { - flattened = loops[0]; - } else { - LoopNest::flatten(loops, &flattened); - } - if (flattened) { - flattened->set_parallel(); - } - } -} - StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { torch::jit::tensorexpr::LoopNest l(st, bufOutputs_); GRAPH_DEBUG("Original Stmt:\n", std::to_string(l.root_stmt()), "\n"); @@ -2609,8 +2528,6 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { if (backendType == kLLVMCodeGen) { fuseAllLoops(l.root_stmt()); GRAPH_DEBUG("after fuse", *l.root_stmt()); - parallelizeOuterLoops(l, bufOutputs_); - GRAPH_DEBUG("after parallelize", *l.root_stmt()); } if (backendType == kCudaCodeGen) { @@ -2685,13 +2602,9 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { } l.prepareForCodegen(); - GRAPH_DEBUG("after prepareForCodegen", *l.root_stmt()); - l.simplify(); - GRAPH_DEBUG("after simplification", *l.root_stmt()); if (backendType == kLLVMCodeGen && !hasReduction) { l.vectorizeInnerLoops(); - GRAPH_DEBUG("after vectorization", *l.root_stmt()); } StmtPtr stmt = l.root_stmt(); diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index d5a95bc4cf886..eac1f82f25c4b 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -274,24 +274,15 @@ class LLVMCodeGenImpl : public IRVisitor { } }; -extern "C" { typedef void (*ParallelCallee)(int index, int8_t* packed_data); -void DispatchParallel( - int8_t* func, - int start, - int stop, - int8_t* packed_data) noexcept { +void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data) { // TODO: preserve the func type. - try { - ParallelCallee callee = reinterpret_cast(func); - at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) { - for (int index = f_begin; index < f_end; index++) { - callee(index, packed_data); - } - }); - } catch (...) { - } -} + ParallelCallee callee = reinterpret_cast(func); + at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) { + for (int index = f_begin; index < f_end; index++) { + callee(index, packed_data); + } + }); } } // namespace tensorexpr @@ -1296,7 +1287,6 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) { module_->getOrInsertFunction("DispatchParallel", dispatcher_fntype); llvm::Function* dispatcher = llvm::cast(dispatcher_callee.getCallee()); - dispatcher->addFnAttr(llvm::Attribute::NoUnwind); irb_.CreateCall( dispatcher, {func_value, start, stop, packed_caller_args_ptr}); value_ = llvm::ConstantInt::get(IntTy_, 0); diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.h b/torch/csrc/jit/tensorexpr/llvm_jit.h index 8585900abc8d6..30ad5317a1b3c 100644 --- a/torch/csrc/jit/tensorexpr/llvm_jit.h +++ b/torch/csrc/jit/tensorexpr/llvm_jit.h @@ -17,13 +17,7 @@ namespace torch { namespace jit { namespace tensorexpr { -extern "C" { -void DispatchParallel( - int8_t* func, - int start, - int stop, - int8_t* packed_data) noexcept; -} +void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data); inline std::string formatError(llvm::Error&& err, const char* msg) { static constexpr char* defaultErrorMsg = "Unexpected failure in LLVM JIT"; diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 7bcdd1a666f7b..a296d8c7af79b 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -179,13 +179,6 @@ class Vectorizer : public IRMutator { }); } - ExprPtr mutate(ModPtr v) override { - std::vector inputs = {v->lhs(), v->rhs()}; - return try_vectorize(v, inputs, [&]() { - return ExprHandle(inputs[0]) % ExprHandle(inputs[1]); - }); - } - ExprPtr mutate(AndPtr v) override { std::vector inputs = {v->lhs(), v->rhs()}; return try_vectorize(v, inputs, [&]() { From e926f75b0bc66c789365cb1c48ba41e8447b97fb Mon Sep 17 00:00:00 2001 From: jiej Date: Sat, 21 Aug 2021 09:05:04 -0700 Subject: [PATCH 127/530] BatchNorm autodiff re-enabled (#57321) Summary: Turns on BN in autodiff: 1. outputs an empty tensor for running stats to by pass autodiff issue on None; 2. fixing BN inference backward in cudnn & miopen, where backward falls back to native batchnorm kernel instead; Pull Request resolved: https://github.com/pytorch/pytorch/pull/57321 Reviewed By: albanD, ngimel Differential Revision: D30250419 Pulled By: jansel fbshipit-source-id: a62553789c20fb50a820003a056f40d9d642dfaa --- aten/src/ATen/native/Normalization.cpp | 54 +++++++++++++--- aten/src/ATen/native/cuda/Normalization.cu | 6 +- aten/src/ATen/native/cudnn/BatchNorm.cpp | 3 + .../ATen/native/miopen/BatchNorm_miopen.cpp | 2 + test/test_jit.py | 62 +++++++++++++++++++ torch/csrc/jit/runtime/symbolic_script.cpp | 2 +- .../_internal/jit_metaprogramming_utils.py | 35 ++++++++++- 7 files changed, 149 insertions(+), 15 deletions(-) diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 40ee1d5d4a152..611faf010abaf 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -240,7 +240,7 @@ std::tuple batch_norm_backward_cpu_template( grad_weight = at::empty_like(weight, at::MemoryFormat::Contiguous); } if (grad_input_mask[2]) { - grad_bias = at::empty_like(weight, at::MemoryFormat::Contiguous); + grad_bias = at::empty({input.size(1)}, input.options()); } // since we are directly manipulating pointers in contiguous path, @@ -416,6 +416,22 @@ std::tuple _batch_norm_impl_index( const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); auto num_features = input.sizes()[1]; + + if (input.numel() == 0) { + Tensor reserve = at::empty({0}, input.options().dtype(kByte)); + auto options = input.options().dtype( + at::toAccumulateType(input.scalar_type(), /*is_cuda=*/input.is_cuda())); + auto save_mean = at::empty({num_features}, options); + auto save_invstd = at::empty({num_features}, options); + + // don't return view of input, don't return empty tensor because it will break gradient chain + auto out = input.clone(); + if (weight.defined()) out = out * weight[0]; + if (bias.defined()) out = out + bias[0]; + return std::tuple( + out, save_mean, save_invstd, reserve, 0); + } + if (running_mean.defined()) { check_dims_match_num_input_features("running_mean", num_features, running_mean.numel()); } else if (!training) { @@ -508,7 +524,30 @@ std::tuple _batch_norm_impl_index_backward( const Tensor& save_mean = c10::value_or_else(save_mean_opt, [] {return Tensor();}); const Tensor& save_var_transform = c10::value_or_else(save_var_transform_opt, [] {return Tensor();}); - if (impl_index == 0) { + if (input.numel() == 0) { + std::vector dims(input.dim() - 1); + dims[0] = 0; + std::iota(dims.begin() + 1, dims.end(), 2); + + // don't return empty tensor because it will break gradient chain + Tensor grad_input; + Tensor grad_weight; + Tensor grad_bias; + if (output_mask[2]) { + grad_bias = grad_output.sum(dims); + } + if (output_mask[1]) { + grad_weight = (grad_output * input).sum(dims); + } + if (output_mask[0] && weight.defined()) { + grad_input = grad_output * weight[0]; + } + return std::make_tuple(grad_input, grad_weight, grad_bias); + } + + // backward in inference mode is not supported in cudnn, fallback to native + // TODO: verify the same thing in miopen + if (impl_index == 0 || (!train)) { return at::native_batch_norm_backward(grad_output, input, weight, running_mean, running_var, save_mean, save_var_transform, train, epsilon, output_mask); } else if (impl_index == 1) { // TODO: _batch_norm_impl_index_backward is only used in JIT. cudnn NHWC @@ -528,13 +567,6 @@ Tensor batch_norm( const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); - if (input.numel()==0){ - //don't return view of input, don't return empty tensor because it will break gradient chain - auto out = input.clone(); - if (weight.defined()) out = out * weight[0]; - if (bias.defined()) out = out + bias[0]; - return out; - } return std::get<0>(at::_batch_norm_impl_index(input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled)); } @@ -602,7 +634,9 @@ std::tuple batch_norm_cpu(const Tensor& self, const c10: return AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "batch_norm", [&] { if (!train) { - return batch_norm_cpu_transform_input_template(self, weight, bias, {}, {}, running_mean, running_var, train, eps); + auto save_mean = at::empty({0}, self.options()); + auto save_var = at::empty({0}, self.options()); + return batch_norm_cpu_transform_input_template(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps); } else { auto save_stats = batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, eps); return batch_norm_cpu_transform_input_template(self, weight, bias, std::get<0>(save_stats), std::get<1>(save_stats), running_mean, running_var, train, eps); diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index dff3f69bcc43c..0238b1b682877 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -487,7 +487,8 @@ std::tuple batch_norm_backward_cuda(const Tensor& grad_o // save_mean and save_invstd, so it needs recalculated. const auto acc_type = at::toAccumulateType(input.scalar_type(), /*is_cuda=*/true); Tensor mean; - if (save_mean->defined()) { + TORCH_INTERNAL_ASSERT(save_mean->defined(), "save_mean should always be defined\n"); + if (save_mean->numel() != 0) { mean = *save_mean; } else if (needs_reduction) { TORCH_CHECK(!train && running_mean->defined()); @@ -496,7 +497,8 @@ std::tuple batch_norm_backward_cuda(const Tensor& grad_o } Tensor invstd; - if (save_invstd->defined()) { + TORCH_INTERNAL_ASSERT(save_invstd->defined(), "save_invstd should always be defined\n"); + if (save_invstd->numel() != 0) { invstd = *save_invstd; } else { TORCH_CHECK(!train && running_var->defined()); diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index 3a34e327e2697..1c70aa353b517 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -212,6 +212,9 @@ std::tuple cudnn_batch_norm( #endif // CUDNN_VERSION >= 7400 } else { reserve = at::empty({0}, input->options().dtype(kByte)); + // This keeps a consistent output with native_batch_norm + save_mean = at::empty({0}, weight_t.options()); + save_var = at::empty({0}, weight_t.options()); AT_CUDNN_CHECK(cudnnBatchNormalizationForwardInference( handle, mode, &one, &zero, idesc.desc(), input->data_ptr(), diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index d78fe079ed442..28e20e90b2997 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -120,6 +120,8 @@ std::tuple miopen_batch_norm( save_mean.data_ptr(), save_var.data_ptr())); } else { + save_mean = at::empty({0}, weight_t.options()); + save_var = at::empty({0}, weight_t.options()); MIOPEN_CHECK(miopenBatchNormalizationForwardInference( handle, mode, &one, &zero, idesc.desc(), input->data_ptr(), diff --git a/test/test_jit.py b/test/test_jit.py index 2dd0d4764c46c..06afe656a8d3c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -10774,6 +10774,68 @@ def addmm_grad_test(b, x, w): self.assertEqual(w.grad, w_ref.grad) self.assertEqual(b.grad, b_ref.grad) + @unittest.skipIf(not RUN_CUDA, "running tests on cuda to verify cudnn fix") + def test_batch_norm_inference_backward_cuda(self): + with enable_profiling_mode_for_profiling_tests(): + class MyBatchNorm(torch.nn.Module): + def __init__(self, num_features, affine, track_running_stats): + super(MyBatchNorm, self).__init__() + self.bn = torch.nn.BatchNorm2d( + num_features, 1e-5, affine=affine, track_running_stats=track_running_stats).float() + + def forward(self, x: torch.Tensor): + o = self.bn(x) + o = torch.nn.functional.relu(o) + return o + + batch = 4 + c = 2 + hw = 3 + # Initialize param and input values + x_init = torch.randn(batch, c, hw, hw, dtype=torch.float).cuda() + grad = torch.randn(batch, c, hw, hw, dtype=torch.float).cuda() + + training = False + affine = True + track_running_stats = True + + module = torch.jit.script(MyBatchNorm(c, affine, track_running_stats)).cuda() + ref_module = MyBatchNorm(c, affine, track_running_stats).cuda() + module.eval() + ref_module.eval() + + jit_module = torch.jit.script(module) + ref_module.load_state_dict(module.state_dict()) + + x = x_init.detach().clone() + x.requires_grad_() + x_ref = x_init.detach().clone() + x_ref.requires_grad_() + + # Test symbolic differentiation + # Run Forward and Backward thrice to trigger autodiff graph + for i in range(0, 3): + y = jit_module(x) + y.backward(grad) + x.grad.zero_() + + module.bn.running_mean.zero_() + module.bn.running_var.fill_(1.0) + ref_module.bn.running_mean.zero_() + ref_module.bn.running_var.fill_(1.0) + + # run jitted module + y = jit_module(x) + y.backward(grad) + # reference computation + y_ref = ref_module(x_ref) + y_ref.backward(grad) + + self.assertEqual(y_ref, y) + self.assertEqual(x.grad, x_ref.grad) + self.assertEqual(module.bn.running_mean, ref_module.bn.running_mean) + self.assertEqual(module.bn.running_var, ref_module.bn.running_var) + def test_zeros(self): class M(torch.jit.ScriptModule): __constants__ = ['d'] diff --git a/torch/csrc/jit/runtime/symbolic_script.cpp b/torch/csrc/jit/runtime/symbolic_script.cpp index 453a83cd4507e..29ce74a7d3ef7 100644 --- a/torch/csrc/jit/runtime/symbolic_script.cpp +++ b/torch/csrc/jit/runtime/symbolic_script.cpp @@ -1117,7 +1117,7 @@ const std::vector functions = { return result, backward )", R"( - def batch_norm_disabled(input : Tensor, + def batch_norm(input : Tensor, weight : Optional[Tensor], bias : Optional[Tensor], running_mean : Optional[Tensor], diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py index a21717bc5f9a1..350866cdbf083 100644 --- a/torch/testing/_internal/jit_metaprogramming_utils.py +++ b/torch/testing/_internal/jit_metaprogramming_utils.py @@ -109,8 +109,39 @@ ('bilinear', (S, S, S), ((S, S, M), torch.zeros(M, S, M),),), ('embedding', torch.tensor([[1, 2, 4, 5], [4, 3, 2, 5]]), (torch.rand(6, 3), ), '', (True,)), ('embedding_bag', torch.tensor([1, 2, 4, 2]), (torch.rand(5, 3), torch.tensor([0, 4]),),), - ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), ), - '', (False, 'aten::_batch_norm_impl_index')), + ('batch_norm', (S, S), + (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), None, None, True, ), + 'training', (True, 'aten::_batch_norm_impl_index')), + ('batch_norm', (0, S, S, S), + (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), + non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), True, ), + 'size_zero', (True, 'aten::_batch_norm_impl_index')), + ('batch_norm', (0, S, S, S), + (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), + non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), True, ), + 'size_zero_inference', (True, 'aten::_batch_norm_impl_index')), + ('batch_norm', (S, S), + (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), + non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), True, ), + 'with_weight_and_bias_training', (True, 'aten::_batch_norm_impl_index')), + ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), + None, non_differentiable(torch.ones(S)), True, ), + 'with_only_bias_training', (True, 'aten::_batch_norm_impl_index')), + ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), + non_differentiable(torch.randn(S)), None, True, ), + 'with_only_weight_training', (True, 'aten::_batch_norm_impl_index')), + ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), + None, None, False, ), + 'inference', (True, 'aten::_batch_norm_impl_index')), + ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), + non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), False, ), + 'with_weight_and_bias_inference', (True, 'aten::_batch_norm_impl_index')), + ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), + None, non_differentiable(torch.ones(S)), False, ), + 'with_only_bias_inference', (True, 'aten::_batch_norm_impl_index')), + ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), + non_differentiable(torch.randn(S)), None, False, ), + 'with_only_weight_inference', (True, 'aten::_batch_norm_impl_index')), ('instance_norm', (S, S, S), (non_differentiable(torch.zeros(S)), non_differentiable(torch.ones(S))),), ('layer_norm', (S, S, S, S), ([5],), '', (True, ['aten::native_layer_norm'])), From 2289a12f21c54da93bf5d696e3f9aea83dd9c10d Mon Sep 17 00:00:00 2001 From: Horace He Date: Sat, 21 Aug 2021 17:13:27 -0700 Subject: [PATCH 128/530] Made FuncTorchBatched decompose CompositeImplicitAutograd (#63616) Summary: See https://github.com/facebookresearch/functorch/issues/56 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63616 Reviewed By: zou3519 Differential Revision: D30438316 Pulled By: Chillee fbshipit-source-id: e84446d9f68b87daa0cfff75b3b8a972f36ec85a --- c10/core/DispatchKeySet.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp index 404acc7cb1db3..21433d4ace8d7 100644 --- a/c10/core/DispatchKeySet.cpp +++ b/c10/core/DispatchKeySet.cpp @@ -32,8 +32,8 @@ bool isBackendDispatchKey(DispatchKey t) { // math_dispatch_keyset contains all keys in backend_dispatch_keyset and // autograd_dispatch_keyset Alias key DispatchKey::CompositeImplicitAutograd // maps to math_dispatch_keyset. -constexpr DispatchKeySet math_dispatch_keyset = - backend_dispatch_keyset | autograd_dispatch_keyset; +constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset | + autograd_dispatch_keyset | DispatchKeySet({DispatchKey::FuncTorchBatched}); DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); From d6133b2fe6b863dd49fb21641bd04e24e19ac794 Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Sun, 22 Aug 2021 18:55:45 -0700 Subject: [PATCH 129/530] Remove `_fork_processes` from common_distributed.py (#63711) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63711 This removes `_fork_process` from common_distributed.py and fixes all other callpoints to use `spawn_process` instead. ghstack-source-id: 136395719 Test Plan: waitforbuildbot Reviewed By: xush6528 Differential Revision: D30463834 fbshipit-source-id: 0c09e8a996d0e5b912c8cdd45488a39951bac4db --- torch/testing/_internal/common_distributed.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index fb505d105980c..01e167f528af2 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -9,7 +9,6 @@ import traceback import types import unittest -import warnings from contextlib import contextmanager from datetime import timedelta from enum import Enum @@ -468,14 +467,6 @@ def _start_processes(self, proc) -> None: self.pid_to_pipe[process.pid] = parent_conn self.processes.append(process) - def _fork_processes(self) -> None: - warnings.warn( - "Fork based multiprocessing is dangerous and should not" - " be used, for tests with ASAN consider using opt-asan", - DeprecationWarning) - proc = torch.multiprocessing.get_context("fork").Process - self._start_processes(proc) - def _spawn_processes(self) -> None: proc = torch.multiprocessing.get_context("spawn").Process self._start_processes(proc) From 726fd26b3e5ecf205569e59cf7cd5a6cbf4387a0 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Sun, 22 Aug 2021 22:29:04 -0700 Subject: [PATCH 130/530] Update ROCm PyTorch persons of interest (#55206) Summary: cc jeffdaily sunway513 Pull Request resolved: https://github.com/pytorch/pytorch/pull/55206 Reviewed By: VitalyFedyunin Differential Revision: D30296584 Pulled By: dzhulgakov fbshipit-source-id: 6e5c610cc6b7c7fd58b80fa3f9de31f269341a88 --- docs/source/community/persons_of_interest.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst index 5c1fcbf1c7ecb..c220ae80806e8 100644 --- a/docs/source/community/persons_of_interest.rst +++ b/docs/source/community/persons_of_interest.rst @@ -97,8 +97,9 @@ MKLDNN AMD/ROCm/HIP ~~~~~~~~~~~~ -- Junjie Bai (`bddppq `__) -- Johannes M. Dieterich (`iotamudelta `__) +- Peng Sun (`sunway513 `__) +- Jithun Nair (`jithunnair-amd `__) +- Jeff Daily (`jeffdaily `__) Build + CI ~~~~~~~~~~ From bafd875f743d93ccb3463676ea29101cae1760d7 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Mon, 23 Aug 2021 07:05:51 -0700 Subject: [PATCH 131/530] Allow implementing either backward or vjp for Function (#63434) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63434 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30431968 Pulled By: albanD fbshipit-source-id: 0bb88664283486a9fd3364e6c3d79442a44625c2 --- test/test_autograd.py | 18 +++++++++++++++++- torch/autograd/function.py | 21 +++++++++++++++++---- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 7200bd525acf2..8b7aeb4159f23 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -5477,13 +5477,29 @@ class BadBw(Function): def forward(ctx, foo): return foo.clone() + class BadBw2(Function): + @staticmethod + def forward(ctx, foo): + return foo.clone() + + @staticmethod + def backward(ctx, foo): + return foo + + @staticmethod + def vjp(ctx, foo): + return foo + inp = torch.rand(1, requires_grad=True) with self.assertRaisesRegex(NotImplementedError, "must implement the forward"): BadFw.apply(inp) - with self.assertRaisesRegex(RuntimeError, "must implement the backward"): + with self.assertRaisesRegex(RuntimeError, "must implement either the backward"): BadBw.apply(inp).sum().backward() + with self.assertRaisesRegex(RuntimeError, "Implementing both 'backward' and 'vjp'"): + BadBw2.apply(inp).sum().backward() + def test_custom_function_local_inplace(self): class MyFn(torch.autograd.Function): @staticmethod diff --git a/torch/autograd/function.py b/torch/autograd/function.py index 4d6122924ec14..90aeea5f1dfea 100644 --- a/torch/autograd/function.py +++ b/torch/autograd/function.py @@ -188,7 +188,15 @@ def _register_hook(backward_hooks, hook): class BackwardCFunction(_C._FunctionBase, FunctionCtx, _HookMixin): def apply(self, *args): # _forward_cls is defined by derived class - return self._forward_cls.backward(self, *args) # type: ignore[attr-defined] + # The user should define either backward or vjp but never both. + backward_fn = self._forward_cls.backward # type: ignore[attr-defined] + vjp_fn = self._forward_cls.vjp # type: ignore[attr-defined] + if backward_fn is not Function.backward and vjp_fn is not Function.vjp: + raise RuntimeError("Implementing both 'backward' and 'vjp' for a custom " + "Function is not allowed. You should only implement one " + "of them.") + user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn + return user_fn(self, *args) class FunctionMeta(type): @@ -271,7 +279,8 @@ def forward(ctx: Any, *args: Any, **kwargs: Any) -> Any: @staticmethod def backward(ctx: Any, *grad_outputs: Any) -> Any: - r"""Defines a formula for differentiating the operation. + r"""Defines a formula for differentiating the operation with backward mode + automatic differentiation. This function is to be overridden by all subclasses. @@ -291,8 +300,12 @@ def backward(ctx: Any, *grad_outputs: Any) -> Any: first input to :func:`forward` needs gradient computated w.r.t. the output. """ - raise NotImplementedError("You must implement the backward function for custom" - " autograd.Function.") + raise NotImplementedError("You must implement either the backward or vjp method for " + "your custom autograd.Function to use it with backward " + "mode AD.") + + # vjp and backward are alias of each other + vjp = backward def once_differentiable(fn): From f1d865346fa6cd191dd0f3102a8f58ec04d6cda1 Mon Sep 17 00:00:00 2001 From: Gary Miguel Date: Mon, 23 Aug 2021 07:41:33 -0700 Subject: [PATCH 132/530] [ONNX] add test images to repo (#63717) Summary: This is better than the status quo: * Test doesn't download files from the internet -> faster and more reliable. * Test doesn't leave the git working directory dirty. Rather than using the original images, I've copied some images from the pytorch/vision repo. This will keep the tests in the two repos in sync, while avoiding adding new assets to the vision repo. See https://github.com/pytorch/vision/pull/4176. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63717 Reviewed By: janeyx99 Differential Revision: D30466016 Pulled By: malfet fbshipit-source-id: 2c56d4c11b5c74db1764576bf1c95ce4ae714574 --- test/onnx/assets/grace_hopper_517x606.jpg | Bin 0 -> 73746 bytes test/onnx/assets/rgb_pytorch.png | Bin 0 -> 575 bytes test/onnx/test_pytorch_onnx_onnxruntime.py | 31 ++++++--------------- 3 files changed, 8 insertions(+), 23 deletions(-) create mode 100644 test/onnx/assets/grace_hopper_517x606.jpg create mode 100644 test/onnx/assets/rgb_pytorch.png diff --git a/test/onnx/assets/grace_hopper_517x606.jpg b/test/onnx/assets/grace_hopper_517x606.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d2a427810f679db537236c5430873a81a62ef412 GIT binary patch literal 73746 zcmcG#byQu=vM)MuclU+6ySux)yE_C3mOu#Z?(PuWEm(rPyE{qHK!CUM?S0O9`;Pm@ zz5l$LqkHx0s_v@l?$x8$Z&tspyzKz!3NrFC00bliUg4TZ@9F}83H`4u00Dmf{rOi48USa~Eq*YtVmo0w98$|BE31@#+pw($&e;10-ke zWaH)PVhz%?_WEn{UyX7AtBnm<-U*z;{?iEJ-!+AqS^r(b0p>se2H^a^WVrvSVg8?* ze>6S*Ta%WDxr?Wbwa5QklN)%{|F;Bn0`BGi2i`w(01qqcZEt1$*H!Rr{`CTY6#G{K z{x!y=DB2oIAXgu2508Hf{++>p@d#lxo$M|D)!GkS`7iJPvH?gFaDo9BkO5i1126|H z0c!vR9#2=m4g6Sx>;BR9SN1>Xzvc#9`M3Swe*RDNk0%%y@{*DuZ5J@K*+$%)kjeQxG5k2m%)|QUQDb;_W|wL2_U(fwwCFRou$e!Wys&0SiLB ztpi~IGCVvy0z5JT0x|{?A`%7;8Zt5(4n7tZ4i*+Z2J%1j?~i|d`CoGg6eJ`RR1|bn zRCH`qR8;K087lU_nqd6jbnw;>V8DYNFoS}i03b0SpfDibHo*e``!)n23bqmk;vWJV z{Fk7>8v-;8EF3%nA`Ht$qpl~ zZjMFa9?AiWomki)K}m(9F<`;TSo@88JKrj&#?rw%O~+?KWsFCO0M*!X-(D{CEDI=bhT)wT7FNH1?2T|R!m z03ad3)C!~Jy*Y(z1hm9#dGm-XI%oACc_0qy^o@U{jZL;a%@0}usnMuq!Vp$FFBiP1^A zOfMOYMFnZgk9j#U1~ReJpHwM)(z)il(Y73&?pe~2&XoOvHCfO3(U`uEG+yMkKf45G z9CdESYfoB^+XeczECux3sh_)Ad_N`A5lGiPjhjBbyxTTWA8qX+9Qm?*{!+S%i`&qm z+cw|y+FFO4K%?WevK^VU#-f93qJq>&NYtArHigFconq^Z$R}66isij3r?=*Ijl3z!F+^rQ2nZ%BZIRK|CcvFNNb$LN%*lN;1g~A{rjl%JNg&u8f)*T zmImzz_RB#6Tcddwr!=-h~o-;cbz^Tv6J-vbmYw%}cF(*lHf zg`XX}en*SWmEpt=*)y~4Zjc?a)SvgNs-6iW3EetHH8mQaw!Q&PD=uH!7R%byjkVUT zyF}-8n`YjCk9X+)x&}`z3r{+5s!xN;?fkE)dUxJPYli1~Yv(W2!qSBk+bxT|V;*ln z@O)>BBi;}%0nBV2UCZ&}!6e56PaXWa||qR4BwmKyckCLuW0z zI1n3GSQUU%h@1PtSL!~64~zb*Lh#$nE>UKHfbj~lAbsO*vX!0Deaphqoa_;ItMGWi zX})pekKTIHL^BspQ+LaTU(gF|J)+8wmyPu*OCsyJPolS^Nz-S+I-F0za`n{)Pp<>1 z@iNjne3zd2?>6rR&P1MBinpH0{%jXMxju4d`|zlcJXjuToz;y7mwOV6M8}<=ipp-c zc?X~3^s`Td7S6p^EC|CblJ?Da%E0M3gXebLuPo<%RzzhIQm*5RtID!JuI|Uv*6Uea z<-Gcm27%-a0bhBaQ}@x1@NMMm4deQ*%?@>_O|$&0t_A$#+pU<9vt#W5*`b5AY8zrW z;!s}t{1@6R#@&j9OS0#7{##QY?%FQG(OFN_BYneYVq~e73+4cwD^) z*s?D#JfWvg0dn_?P3xzew~8fCJ$IZXoo!a$6n~OmQJa{|$a?R!pJanI{ZATRYhP5p z*;}$zx!a|q6K{&Vj5RlXeKlb?nt)Fk~-)kaT*hyrG#4#M_eYa)a;N6}{vtdwYxq8ks;a~Mad(&IJ9B8{3Uxkly z=kyieQt7=4c#d0?WG#A8JL78C)z;1q1D>kxX>w+0^;m?O`{#lqF5g{+ zW@o1_=3Uf8b1=20!}TtAj$)W>t8SY6&N!S(`#a#`!pU@%1zOh=_bt}wIz) zmW5GM>zZ}jHJG@-G<6&P>Urz-*EHt*Z+AG;^}%{>{)0bH2U;%hr(<%=W*7o{vmKv} zecLt@Pq_QcUo3C)(r-jnhQApsFZWgx>RXbuj_^rl?f7)m**~#3X62v*kBsc25FnWx zir2X!MVF#4WwZfbvjJ2_#Dy~=xe&7yOKdgu+PpYyn#ik`7M)zfMNA;)a^ z8wl(C78 z4_-`=AI84WOdH%Ut&^QeK%^HD1;#d_lb!u`KfyN^bqEafE%l z{8`ZWmiP@&eMK=_oZa%t!I(toZ-F{-dS?MIMuWA3y zP56WHdyq03VPpI4g`Mxo_1z1PQ$S$07QsUH0=r?N_GHHcmrytFJP6IV2rj^Tq>Q6o0p64H1B zisI)((iSv}Vj$ly9;J%D&3%2^Pn{R2HyEr&Txv-(ShCTCtmvHAtxow<@2&HcdRe^u z2Eg10AHAGs^!`nO_#b^0BDC_JgxqoonF2Sj? z#xeUFOMuMvMZM7JYTfDT-N6D^Nu=4;;ALR&5zBRjQ6NJIQD(B`dENp0geT=|($o>h zpY$b<`X3FGb3zp(VmLKY)-M+}Lc#m|8G(+Kn@g+Fi;jUxw@CJm{mhklyA_Stq^?~x?shyOMh z_RSP6nDDiUN{PZ6gM}PRD!2Z@^{KA#%-3Ys(1VT45w4k|OP;o?O<-(ox+vmgZ+HLD zs^57TujkU~f@S@-dh<2&>J1Rqs6W3#wH#I&^j_+dX)_BY%lQ>p)oZFTNkYq5bvQRS zwyj@}kHunHYiVqmx9u$YLoitMwEVN&6ex}e!fSX{;2dGX%YO8@M@ z-)!v%kWT95+Vp2=CZOPU&5;Falb;m#3=e)NL6D$$1?D5#_DZ7+R2jJr{Vy6&EsYqb{fS(t)v!^wqZawRc+gL%`HS*Pml@HV$-uwP*c?!#o5m z66y-@G8Me}?iuZPAhr?Qp~_o(XLQd~lao%Ue@k!M*ihhl;}l=oc9V*kAZ8g-FE{T} z^?l;#`^`k+iofsc`9g5?gdk^UT@VZ!_Gaq*r8Q4}(P-nw`>323w3CPVYr|YIUh*^p zy))u{T#ZZOQ8?$;Y{KiJvl^rHMUk4T#JhMX-r#ag7iT}z%afHr#=*WXm*Y!8U43Ww zNN3|`{K~^!w70GmI_JiAwzKV;@{VhNf^Xk|eA5%BHvpnh=y9S-{r87gLoRdo^l_G+ z=z)C;b4&K+aRW)}YdBN))i)r>P}JDbRsJbq!ZAxfebKdwL4ZGX{wc8D^$%GS_+r2wEVBZmqX8t^9r70GGa}y<>yP_gZfNLmR@FAff#jd)n^R z&)!xqr4RV$rcwK&YeX%5XgY0%roYb(#2t<1h(|+?ga{l%cPHHLJF3SO3ta-$&RB~p zd10pAM#mhp)~081Lwn1)f@~fGgSNMI*l(Z{2~cjo3=4rEEqqSE>u;g{m0eL*wnp$Y zurM^Aq@FeYqQP&Y?);CH;ZrY+X`gd_{8>Zn8!%fPHB@4nE{*DD>D&_I`F#!W{AhG6 zc;nN;i8eN#eO&i7k*bIA4X6m#W**1cIoNtaf{I>ESO?zcEs`uG-E%9}oCxOa#4bwhzBOUb{%Pra9{i2qI zd@5kWJk@y4F+8qZW?lQE^$4VJQ4ex-5R$xx1cI-;uwDq5BxLxamUR_RGZ_d+|vUW%gxR{ z6F0ivRPWHrL8=6C zXZ@+unLOSoMk9$&2p2*<9s1mRe5K8&Jo-!adv<0#s#7BDkd@^t!*1*mcF!9%1Qg8`Ev`ojZOgNFt!SxX2fLmsq}>yLK(xHPJ~=;pouD7j5QE7ypjp z7u6FOQOr4Sgo@sBcE3HteBh}vx^}kwL&`0B#Pl0Ndipnr5}gi!T{K{Q@Nh@?IUb{} z`1vO)Sz1Xpr69{wxUPP0&%n=?;ti;bWfw<->w-HISq2t&4`~pv0lHaLMXA^<=G!~rWX zIA!q|X-s@W}9R@W`l$@bHMJ=qO;o2L=8A2Hb%_0HaHY2uKJBNT^6CNT?Wp z(V>6CLjU*Z5Tpqt%>RN8k-rTABVf0n!2rkq&|d%u78(W){%_%*G5l{H#6Qpw7!~>p z$iP6u!N7n8ptWv2p}-jw?NCr{(xdPI%b;AD2`;1hu+81 zP?r}UO`u8)?j?8ErTY~Uy}ph4rK362D;-kD+U66j>z;+BK5g1~6tVdIk!AUx3>06D zP5v>tf(?VOX|a6AZd=;mIrAYUO2L(p4Cslpjj_JUHGY7}wwzm~`$LHBgVW5L>QMD)vLt z069U1W?jXs&+YlwCs1<+`EE2Tms;y}*1pWD$9s|vK@g*QkTh)FzI5@_M{1NY-fik8Je+!S zu)#~PUedX29rN68kXOuVcN&UVQKq;@Xc6kGcZ0E10x4VwFfdYdCad`3;Ex99kobqa z^%Xl%lRjdoYq|SU*k#k0NKexzEsY&w7{_*ZoLD*x4jZ1GLXY(Evd39dS6*2WiVzlU!_&iDRFDo6rPFj(o?S znZKPFeFTSG`^u4?Yr!+6r3Pl$nhPq6WgTvOs&Xq!SWt9s4rE_#A{~tA5HEJM9(6Mo z!B1PnVoLcjmSl2hViD_NPPl<}N+hjPJHZZ4+zP!l9` z*x}2YcR&DJp@3EyG`0v546y+eu0v)>M&38_$TCTF`1alYLsVu0DEi7hM1=c(3d~s$ zaVa(d4Q(&+4$;7x78E%@g4vf}OWk3KA{O%ck6(=qdYR@$>kvb6(;|t*GOtJyB;|)8 zhG7A$b+q}?!Obidh4#U$MThU!lzALeh#^Drw)bjE0R@Esi)1#_a=YZ8Qdx(ON&ON4 zWpmv-a{!rHd_p0QbOjo;7{W0iIHBa1oaM+lx{D9-p&y0(tF(k7<(g-RH)V4795qKG zTKj^yV- zjxXwJIE$aJC#d*Z3IGXc7&rXgmzjP`IoGhy5B)4!w5nbyN&^x$g|f(Hx@RA3Bj_oy zDC3;~NV%K$3`JHjy9id|!wSR_Zeb9K`0dInh_)p^L(9yy640Sv`u71e=$~S;GiJp` zTrI_zar)z|d!N30GcaTl5PTa$lIn+%7p}Y&gZhX7D^)4K`)JOXKrYs9R^EvTjqqVF z^Oz6ft8pH`7ywi94a8{&>h!C(!l53erw8PGRj{(=uy^uh9|ju^FOs9M0B8qyLZdi` zPc3dBx_xM_4())6pA3qHFz%Uj?t73@vF0E+`ZUo7(TlGYfTm;UI|L?lAKSx7X&i~L z9)775DEhiQc0ozwAK?}sBp}}tmLU!G!_UEqAY6vG+oSt4{kpNIRiV}-;VVNga#ha~ z!P);&Poh?^fi3zrb6_4JPd`R;m;lv0f+MCU+vmJK&A z$X!BX-dA7P8~%v0MZ~j6&$f2gbcWOdjkeI`-UDd6ca* z<}fAqsrhTvs2E5C)ap9Qj03^%6~51l^FUEwMU!V^!H761l)>;>4#rq9W+>tpe#5CH zhlj2!5fZ^4HJiH>Bi9D8hV6M%REtR@1993|j}4KdC?&|nE=>@}6 z5bx;H+>^CX0A+EY;s%-rJYI&;4oGAxf#*RO#(1d6fkP?7xgpK}>$P{#}OPKtK z-xjb*ZaWg;_t-HX4q^I3=7?iq#iVD(horP2`SR&SEH~gtuUpWNN>mAOhM3sDL8fSC zrhHD}LorbLowGE-Py{7Jfimw&{W4nex-^Nj^%9H#Px31P8Y_G3N_>R@3O1RU+p}X6vbogcd=_3YN-qG?t)x8gBX(P zM^^|HjV2&{6x6rdbVe|*nAZ=Tz+8aHcmq%?I-eB~O?{YOki)yJ(sg={t5<#M~b zN_y1MXO$A7_3~@PXATxSqJDkyy5*zY(Bgyxk@*N6HXFU>y_9OF zQr+<-X?U`q#tzm#acApy_ij3`I_pO$3ya?|_-GVu`Mv?4gr>WVS2kSIbpwTp{e;!%?LH>$)3%2@{qh-{WST-s2YRxheeHbTz`buRA^+2Ja#8%NIU zSbc{cH(AxA#z^Ox-g&@1Kdkmc4Q#?D(Aw`T!b3ekp&<@0ezA~%ti z1qifX1bqTE)a%L+bS4KCx@3s+%3@8S9T``RoPE8WSnUL+e#J98u=u1ydWwPLFWvxR z8IfXLS)2$Nez)F>{77dvw}oXCP?8EK={JZ_fCYgGn+#_cM*$D#;-fTCXLHsd1B*{t zJESUG@G!xG|EB^b?{y-LoPkOu+;#|2#_qvvx=!MAjv$BO`&88kcdL2c;R48FWC$2t zf_LJ+>`8nCG?ei^=*%vc_4vllYFZAAnIFS?r(+ZpZR8j}<4|D;Q5qRIRAbvD+0Ryq z%wZyEIXGw_jQIJE;nKm&lMi96qd}B5xk7LaVIfqhIz&>Xii;yldL6G+Rez3vJ@sJf zpB_}g4N)ma$V|BoBVwjyD$(r)00GJ9kcAAN8c4)+gqSzLik(7!PMj@-__+|>>a;us zdpG{F3?F$$Y6Ksozq^XY42px=<)rLJIE3dq;|oN(_!#xA4q$Zx*l8;gs1P(Fh<7OD zCdVl=Bs?NMmZV@gRHwLyg&!nxahk5F!N(FR&>)p41pY{tlbYvZ!N-&;Jf|QEi;3^1 zf((I#TOBsu`hx604mv7HQhF8ywP3ds96;a)F>!n>MdwPA$fVnauOKc}C_=VgasLql zTOd7wMuthwl1|3~2SJs__E~A+$ShVsA;$)CdIUdw47;R1EH(?9!TX&rygTuLhNcvv z?KVV{*eUry;BUEsiXi|M4X~2od)E&KqSA_?iu{Vnk9`nOXiWe5+@ zWkp?4!bpM8`OzQu$NZ}q4kLxCm(-@*aay8s3FLu*xEpmre)#m?42O>COAsF%0vdwCypeYB#Glrb{h-_Tu|3h2dbUPWRhi{?Hc450vzkH-C) z&^Q?k?jPTP>49iX+(dz_WDcqim}WA8>}842Tn^A@`upto*r_@m2!mlZn(o-&7`b1+ z)kpt$jFX}QG^Aoe(}{}=9G*Ni@F4gi`|-#bE@i~gnqsWj>uwbrkUq;bZVwIpl2Yb` z0qBRt$7sZQV(~31Bge?if$;GtAX930Xh92#=HrJX=)IMLu5XQkk!;Paauop*$M}2s4b(Sh)M8Rv&YDWJJew+l@G2Gp=Ok8)T`+cvdn5O+9!q6X$3%QkJo?i zSVibtq_%uadjkjsTqnMc+`iz}vsd(wdWL84kigQ@Kx_XWxmQB72%WRNu2I;VDP#5x`*p2kAI#}TBkJD7ltI*q7Mqet>~Ud zGo`Ra$xzKd!p9^bDcwk_Jnv=p%b#=u&!SqG?}-T0U2K_cW0?w7BP#U|Q`W8*LL%$95BU#PIC`_(cy?;}-6BT&5OZ<; zcvd}(``!AdqqF@qIw+g=nmO6iZg%n?WuGWeN^F^xNdrut$ zZV0Oh9vuuqX9q5|+#;94u5ss$cfn(Q>0Nqjt%kFKMu}&yv2pF~9o3)O zwCtQ|GhYn@l|@nbVoKJ76`{Y>1;Tl4f9p+OzogGALfgC^m#zLCU+k*9Qi|vmOIsYY zsp}b-=4WZx`=@5R)8fzdHWQ1n_?5l$$i~B$ZmxFW$zqKi{uX;tNpJP`${)e(uQKA@ z?+?#%doN7#GHPubB4FovZ>!ZSwXlxv6axiZnSXKSG5L@QJlXW5GER<6f!`XGYo#^T zI{FHH%)!&N*R^OZbhHiA?PSHFIQSH3jx^`^4Y&l1KXkp+uo%D1*ZEUzBd`d?s;bPy? z@%;7DE9;MDzisczRcW+tU^L&{UdCu>tZP;_RzoN5s2FSG;Ml^uST?Er$@ETSwq-jRL2a*~_ml&6^R%%uGAY!S11HkxF5OsjP}X%ow_t#RG0 zwKCYR#J{b+Rwp}pp3v)5IHqjr4Jg}k?S2@htqM}SBGPnlA>lMBJGNU4R75NYs^Bi* z?~PvL(v^J8FB{9ZT#VLL`*X2!$;=>F&Rl)9nA*JRX17pow-*1awN&=g!r}vwY8Gwr z4O!L6x>nzYKh5t}pqfF0`R9rdLCC^;>c@S#&0tH8e%|6@HlTA)JpR8+`1!2<$ja*&9S7wIp-hA%W2dFkvLzl17E<~<6xsnd|vD^~X zwHwP&h*eh*SekC)iSBejPiGZ;SB+Ob_<{Ufnu=mn!LrYzstl;K3yRFX>foyh2Tq#}X zoBFnm>YeG>);-7~wAYzpVL-qo^UVTp4TK5VGAu*S?`uhru7ej4XgX)25!hlbD%0eJ zt$|bU`@?(X7_FTWcRu3M5pMdjszy5dav980;-0U7lp1k}l@?3{M4~hYP0%Ykg3&Vn zH!>@Egh)3#1_e+OJ~R#=pruS@PDh0t14Vr zyUbXDM6JkrFGy67B0}8i;VK0DBt#x1LJ4{Zh0=G953zX(@W6+!;wGYh*nnkjY>>UZ z)a4ah4*O5Bs_H?x@Cgoy?cj$R^ipCfh@4U(5CpI%2Nf1&y^Vqloe(9KEn&S09cQdi zsMM}VC-s52k@LvZf{G0?ot%FUj~xn+9;itvnYNBRKM>h3U9&Sg-#6q9zua$7XK7JW z$EsIaoZAn(YktU>a!DArNF?%qLF-#}PUVeWcXyS@SS&pxyNG3c9|jaZxJN%us?QT* zB!PT6XVr4Vl^$8#GnSfX;Y)XtB0(9Gw)k?2M;#S_0?)d2?9!pT%G~$7Qd!8OFt~9F zO(BbM55<6Y7&d^0nD&n9dj;efdZQ7(L5CUdp7G0buGUgoT>cW8q&dkC_{_5VY+m7X zm`1YrIrh>@RUjP(u1tG%Asu$TJp>vvvl6)xZ4HDSv4NHQ5enGu$#+oPdUe-BkxXHW z#+sZtH|$9h==X*EGW3j8m-`nnSppKR(gi7M5%9T{mg4dVlSyVW&_6%lQYIQwKvp3T zn@^*l9>HfmO3;Z>y!#+>jUPKPfDk+JjiBE5DsFj3I-;}^QDGT<=f3Nb-t?zmHnDkF zImnov$&HiqQA3bcytDqLNpV4$8BTH(P;hu2gdz=o`PMfp{s!>;bMi4gqkbgoA&zh} zJ>wP2mDk^u@f;E}gT}3@VhF55f0?W$%H6k*Gc_suV5O=4IRvJIbXC@8u*ULHxBYn1 zuXJlb-C$D>T?>nyDPZ4nZjEe#$kS+VdJ?9@>h$z}q~(P6QVUVA*nKuMqLOJbYsJVL ztr2VT4-NfMr~9?&;q<-y&*f|9SMF=pd~OoWC+m`sKTBuyD{em*i8E55lVTB%aUP!c zEB~yxm)l!iig9TPgwj>ap>;C70rVm|^R1<>i@C}!Wd1MQtv$V8)TP51SscIAzbJMu zWWE8X>(g&Q`>QPF%u(TKT3+q=eP;basvn1GQ7qi}b#hzRN@>@fCd2Bb3;FE@t;*hk zMGFRrv-tKK(4hOOOS#{-qNnfd(|qA`?(fsu{3%O|u9-cSB-TMj?Q`{pTAO*LNwIxH zLDECQ43L=_o6f?|OW%#xK!7>Ruv|zBzZ3i_vBh`Q1rrl(uhw2?xUPPid8-lEepT68 zt$kZlu~2(D@2n8-X);NENbuWVdGOBDlOru-H(0o?uKG@7X+6!}S-4la${w$?`bAZF z2o8I0(x8}&pQoG~$z$WL(Dr){?kR(B&dhAt5qF)#?lAQ*`-lM9m?NV@`g6;0c12%B zt#fku$;PRpsrp)vL-T^CPN{MM?*&hBFN*q9hDy?Xi&|tz6CWT%F)+I+^_qDm$o%vE zs=ZvSCmnmxFBY*_T4Ff8V(F&o@};8n)~Hs2-KCitktWY;tb??uIe^!>ZGNlu*bs$~ z<>D$=r3+pZDQ8)$Tw7D8UdX|Jx~TdtXwkW8w##O-ZLVJa&WQwBn;@BnSABVt+6MUiDdS*QGVfqQPvTGDu0L??=;g^@_XaF1<31D0x;7QtHXHJk zma^@9eVS6M_&K$!y~y1$eY8lt#i4SwTE+MK%OKemtvg@UDD3*cA=x$6X6t(NIoaHb zahFTos74@Clr!q=sc`-LDWk!?mZh=u#p1zxvrVsYmrj}eG8RU{tB%%cncDqx&UDjna7oPSmhbJn) zvA8pY*$;QJlH==fFi76?+MoDuxdX}P;!jEQOB;=J%i9~`0~J~VG2}2Gva*`-v~{@> zws}`$qN53Q$fwqvn$q47W8lA7YCatL1n~aW|IAW*+>*7R&V2 z7Zj%t7iy==_hW=A)GwopbM3a=>wSIeWQEP3>qc882^E zLy?KFN2&NRLCBzGkUxrZjBRYz@o4WxF4%kP{8U~=v=DH2|2XbChm_vTv#rq%tBHNb z|4o#8D__WAa+8)s)uA~=F2koROt&8Wu&C@Gj*R>?C#>FZ7!C4NKRea z|Er@Lylg22HiDE~%2k=tJ~6iDu)ixNxuKKe93mzAL>O9}CXIohrER2xlobk}2U8|( z4!KG8qr?`=TARTN$QQiuuxgBAVs|2xyYKRjQG4z8l3Dr(1*@rbn zn@~aF>Sfj%W`&RDoe2pQa&$P@6403Hn+IYL)OeT$#ZsRbfm8v*^g)pA`aU#8&cLvz zD!xO7eAOYnBv6#nk3sd+laOLnFiuZ{F6M(Noe1=22E7M2bb z@R16!hsDux%cT(P#$e7TBoz^djzPpQOO}MIhj5T}jo`7=ih)s1Yqo?-3hQ@dEbnQN z6xGg@6+)?KZ~GzU8&P%yCeLKod;!q2SwoHU8{*5h4cvS%GOu zlb#$tLH=734s@#RnU{K}^9va_mQdX)V-MSou$3n;Tu zeT;7sdK?}+@~(Y6i(1CfULXZZY{X~@hQ$jzjg}z=RWDabNFhtXZ--lDZf;n<0>sDm z_!ek=d?%*#r;X)0b1e4$bADHdREg6cW$N6yC6G(ENoWtwiLYwQ)J3Vs_#h>ut~3vg zuE>e0@|~4}vju@Qf%3bQ6+_2%t`4z;iunoQFaf$){bo9!T>4M+_m=Zy?48;#s?9oW zZ3ux**9NPr>AbtIG}V^*xB1^|NA7C7&+uO3A_c;WX%3#Jqntc;iR6{{i4j!R>&P6j z($t!|yWGnil^2a5Sdd}X+%m=STTT`VjTg>QSnWY=#6KwMZf!N&%zg*Yx4>>XE4Seg zoz6f5$HHHw+2aL_wE_*(b!xP)-93+g3OM4*oVT}WL<$itXoO>q;uS(g(uN)6&T?b! z{UAiF&yb?33h1+nW);_)+Z0HfoBM)Vw)3JWlpLntA?cMDRiaU?SY23H0na&PfvPtM z8OkLkc1SKrjk}$ZE8>Il+JgIm`3k5N+u;)~Ud5q@3{{G={~XbyEaP}-fx@-sj|pNG zg5b+(m$m%us@`{h6XMfrduIMpn6O&M^!@Y)g0k1~BI=JGnZ_WYu_n{M^4^ zGJPqF_A>eg+^nDaplx{xGd@Lmhs7E8aZhf!e*CDrT;UZaMGN)VJsUTNWVK~ZQKfYh zIGehFjJ(AQe&2W8Cz*;cNN|YnG$=>92x}oo?Bu*2XinxHbv-IN^s?h`&piIgkti;q zQYT$qPau-;t`OQI!9d*Mek7|92Kg&J$|x3*NtTRj8u>I6Yz4$V5JdK##h^BN{I}-= z>IfbPe2<0pec@cIv3%#smW$S~AxZ7mojEt1#k%~YuZ>)5qCCYxA~J%BKj&61rXOUY ztxD+i)Ep>tHeIQdjqo+BBUMVg3&kqpfsxRFe;`5}fx$x53D@ZE~800LS)!j3Is9&># z*|(45^cRfnCBkI|D_LR<$2`1~B^lc0E4;OC2@S#F<-dbcann8h?iMSFr~S^l#@AA% zb7eKl)!LWaeQ(bBhth?whzpf;HyALDCw?7+MFlmRTuP37)6}gVA6RF%)4timgJa<^ zxcw)?TPJ*{!O>pfd*??!Pvfxx__)gDI+eu+1vuxh3x~))wMq(3^X2J3hE3hKe1(sW z?*dv56+4H5IjU1WC0Cr-k)>5vD;g4h^~RLVgdeaaVXS%LEOinPbU53}Y_HW%*RC|& zoUK-GF<9#KG)CQ$I zrm8W-ArP!vZ?^OtwhZ5W>1ixfPjz7?-ka90;k2!YP&BIi8HB&gD=AJDH@rLeQt{2w z>Su@>h+d7qNz77mSbet;dQ(%FeL+~Yt?959sVmxdMPr+3xFxlx;!c<$9h>;sU+fEK zI%1IA_HF;h*<9;por6Gl?O}LZWy`X;<%v;4`GOm*N?K2xqXYK@`1SzT3wf{C`IDrj z&3>S*@~tx+ZsXyEP5NwkL7%e3HtU*`OFi@If|tt$I3#p^ah>&&L_EJF_?WT#PG5`v zVs7?u?%RHK9i~QpRWqymz(_1Z>%nW$1QtHK{>p`KPiW10UES1m8#jTYr#gYJ z{09RUq2@ZmA+nrpm9{@^BhD;kD=xS5>urL9)ls#cryAD7B4%Hqrs+MK+x8CQiaPM{ zSTc(_TjxvOFD?=>=DL*W%TJ>-*ArIQm%FAbd78Os}IR;0c9O&eT zKS_Q`t0`K)VXjBWw>;Z;Ss!kjkMV8Mz4Cs)&Z}+mrP#}^)Yt1=vG8}Q z-Db$ngS_2ZZ(hg#o7Aehn%EP{WlIkpDUqIR=D#bCaywTY=OIw)7 zm`knZ$8kH?Q%cs1pH4euUU^ftk~0LX+2@~2qf3F3^K1I8TTiXEeicqE!8{+&(4!0r zSO2ia)y@a=)K;bO^w_R$QVKg7CY?3ck@BMm@}h8E>R0>%@4P;sE7PVwa(`9=A8m2g zG2Uk=&#kZ1rdU_5-$cnyu;d7)#@_|d)V_SyZMM|-oSntjo1d5ebL7GN%ux6~S8U*9 zYZmH`r{s*8_M+QE+jXm)z}ulGo{9ijh5;d#r-Ok7&sm_pT1(wy@WwE!_+(nRS`Oxu z96bK+Av&hC+gD2wNYB_~9cL(;t#gjmNP9P!$qISycv@rD7lzPD*Ns_1DMBlpf(aP> zoc+1=tQ<DsZO}*gSZ`{)Q96cF%bPttEL1NPKwa9 zW->g1dz9na*UA37;*Ip>IQtcuOI3*7c*I{qrVCV$lewXntfx(T~l*6 z6(Gbt_4Xjcq0|QXS}udR{l^CK^3R5Spvf|DhS0h(O$V{FBRtVdnz2gDA*wGTUov=Z zP_?zjH%ahZusY&0Ht>|k$>pf6byJIFJwJa~cMt8|);-QA8FVxT9+#4UU@oqdQ6 zU{Tr2w|x>jR&>|vj5L~vCza{c2PnBog$bx$DsJ*k?<70Z{VVk&UXwQM*F!&k=Je92 z!l;Euu}a~}G=DDk?7e-tR&nueU2hC#{H-cVB!UOAc@#?`ljJ8+GovDiAbT)qqo-D+ zy>Z4BZh=7!GYFkx(yOM9)7hKJd{i!fM9*cX)8UiFH7QFV#~^F`TXwFi=hF|~=(C_) zy!pA29EA$K;zYiR79)d#UEDS6^kXuW*ikE&V^<}Cp6HJoSQ%eZ0%Rk&JYpjXcwhb? zeVXUFkC!~xhF92$y^2?KQ7JuHEY~lbaLe#@D$4D>UV0I3T#r4;%b}uS0hnt{Q4^6_C5uZpVP_|mM~QJ2bQ#U9fkJoh8r7GTd^{o zRWBJMbw5?Ik|3)XNQ^WmN7#&Z6q4$ogoMANCG*XW3V$!(V0_2*B#;+bc{5tt zvAjJxO$}ok_G3))r&9)&zW@ozJK=!45s%PU7{jtDMyugi7JSF%rW>>C+)mqjcQNjD z%LCZWOs4j^Uhv>OCh1nPTA%7WmOIzV?fTxoNm=dQm6c7;XSsjrtg4nWtChU9=<5Ff zTCb~Cn`$>#*p2|MGbnom`zI)pl}5GgsKsS)Lz?#Txug{iX1^yPNYj)>UwWgx2H~J^x3MHo-6$m(;Xu}bS>1nubmYb0C`E@A$mw&0D%;Wdl_WiosvWD%s+~3g% zDSFkO^iIM^ewe*&-!rZHd!=^$pQg5!Qm;kS-MCyU+Sr^xMEY4b&QE#Hf)l$@Gv(?#A9r7E3T!5a?fZ7uX+Y6+mF+K8 zh5GesqSmp`Xw&g}yx(Kvj&Id3`Lom2ZFeop)BQf$_tm$&u%k%ZHSTMunzfkLHlX~v z+J8EmS@wCN$o~MIT`bGXrnhaouST0gZ0@AvS$yHt>JVr*?73YP-OJ~u`F6^b>^9qL zYE=jn&2s9;Ov8iO%E#+(!xOV=>Y{ejYggOa8>f?!Noi7vf7Ltw>ho7|-L9^VtwEyT z=xJO-DVEd8v8wecmKK_A(n1vzm`z%Y^2#nXK44g0urqM!9ge7$y0mB-H46MqsKVzl z{{R$MHoL8=({ML;+i=`%*c@nDR;uRG*Z!ZX=JV8J7t8U+_@<)#l z(J333H$fD~txkmh05j8b{{TOm{ISmVf13NA%<6RZy{Bxnv1*fPZAH{|E+^F0QN_pb z#oeDhcB5eGl{Tw^exNp!3m-(!nclL8HLV}={Lz-}xi|bYv(s@V=SoGVTd8(`XwZ5B4016m!W9^ylNM>k%GZvO!Ci==mJKTmG68#T)u*9*Hw z1ubbuG1Szx`kV1MYSsKvlS}4jMDANl?;p6+qRNhK3YND=)2P?fI&`_H=js0dn$1jX zQMc}>-ugeN_NH|H{Cc9o>Y1fSXx7^GLsS;)sYE-}d;b8^tbaF4J}*2`X`ovk1}KZ{%RzMJa5_%v^} zUzU9(uwB|++wGSZdyd}1MeS#5rCzyBHC;{4a|79{@}9;uwE4f(Y-_*n-RGz+cRN0+ z=sViB2AT9NnMl02fH__2Xme_6cBWRP%+E79qmxkTub90K8@1P~cek*DTWfg2T1pjl z7d#$GujM;TN-E{e=}9S9#z>pNMNitY@4)JRF>LO4Ev~Oj^!CqbeYkElP3_|?0|i4^iHK<@|9(XUT4&qh((6cfPZ-Z7W~Y8cuPj{{T}4+tdxx#(7-N zTAS)O>@o4V=#QJUZ74Tu>c>>0qycZc+tX=&gUl&hGfu5M+?80^wA|fnoq1!q?R`CW zM9_06(ZAj9YCKn>9$e;BZ8wHCrk_tp{MvqajAh8<$79vi$>Z2Pu$!vNI;~DU(w%R) zbp6%4&1b%a5`8DU>Nfh%<~TG?ttx|-qkoQ{JbG(o-R*jpY^AwaxPD_nsYE%hbvF>Y zNohEacE<+lFO^$2PiAzWm2iwRJGzU(M*Uiz&VJnXgZ#_nmC}+_7HSI?Hw1>^CF8wXz=KRM7OJw7WhIr5xAwHle|yyEV9$v}y`Wib(zFpl(~{^}AMe*6n{* zyWD@&d3qC!-Tv%gf72XX=ha^@Bm^O3sE&@OAD1gOqi<@GU3wTU@!@iATbgZ9>PSH| zLzqu=GzSD$pjRW9$)NplbN7Ab7Ceh0B2ikA>wMNfLSphV{N~u2R>qcsE zQdu=qohZ1;#pu8g5g|DMGD5S20MbZQ#E`PW12m4}iq*|egb)EbP#Gu^_n2Z4C;8qAyWJ2N-y#xXo=5x@f zB~)+;v!Y$R=6oBQ}&z!2rDw7Wh7bW z0KkV5WingE6Bq<(9A%W{iugo;AP8om0-O(bbO9nr2vfd!;|xO)$e$N~j~)=9=?a=m z%8uX@k8CIa|JCCa!32>YqJHjf5iPlq1O#aE&yN^}unM5bI|}y!xAnC_a zEBC%0;t%0nhg0c3rlqiaxxRB8Mbq6;v|A!|J;jXrW0(Tx9(6B5t{_ainiFm+nm?xX z7K($Q5v4)Zln}!p0<3@-@L$~pn|oZJ-jzv7*8>8m{Nyt|oIE0CYAqn*Ow%*_VGz~? z+>n6fsG#l6CS$o&6iJErdtwlNY4?$0Z7Ro2QpYx(T|$${A+FG#qeHVTE^e(c(LPc+ z&FYpk5?bm+=~4kCiKKISU~5l>NCcqAJY%JNpQh*aOKN;9q)U!yzu6GGPs`nJ{xKy7 zNh7%Rk5p6XU6!Tuiq-!BO^N|7I0&ZfKRLEt!$r349cNJdPYTuiLMdN2`jOkV`TZ_~ zkLj!cZDcZaXJS8vqB*(%;?ljHf^sokt{i1)In_6paW`^~i$-@pnHxsm)Z0kiwwkQ1 zb8HLPSV1ppO|ijg)Tc(fA6}oSO{ABwwU1Ora!!soGX1jkXHM&7H`@vo0g|^4;ybgB%hQ{- z-*;{etyaA-(m^rh2L%%{nv}r=bzhRktM{9MsJeh+wwjU}PX*%`?^sbzad}fd(%9YI zNVV)ce)CGLYd=+k3}c6U*QRZG06ikpuek%xt0ioQW z7X=7};=FKy+85{$kERBYik+u~AP@z*kmbM`c0vSdM@eTX{SdT?km*UJ;Dz2_4p?rY zxJ!WjrvMTc6U@?70)=>YLTyKYOI(d0>S~lEL}wIhq-NuS!Bi>fB@znt7zhWI$bQt4 z5=mrBb#pR(SA}%J5(flaCQ5OD9OWRLL@Iv5AU6tmm;e(11jp%v>aoO?Eg1#v?1mLG z8rLF%q)Jnm_`9IARK-99sX3U$N=`x?;7H@y+XWc9n)2KPjFbohrcl-b>PI)qxrO&b zn{iYG62?U3D8oR~zN@=QKFL9CQpYH$NyLJa?}M-uyG+w|f_$qYK$$1#fGX291Hm-h z=FFl2lDEq*EMvCjoXe?#P?er8m$qE6!E(?!8ix!qfuu`w%BN!JoBpNhotEvkTb)XK z$4dHDw)Yy3EWEdVw|AvUR(>1 zk%8bb#F)!2B9!+hM4D=vbtm(c5_QbB^-jsKZMSuuud#TVo%ynAs zV`(|f^61p))@=PlxOCmy?>0-9TGw+;I)$kAxv7Kcy=kAg;~FyOI4+}tq~J{ZIB<;$ z3uf8*lj=d-#<5g0F83?D%C45SiBeP$n9T5(WF|M^2v?Vfz5>W1rV#8fD3V;~4G6Pz zWoI{VuWSiEr43DJ$;@(*g!x?;6(k|SWjXgn7oP1HjAHFbLd#xKoiDupvZB#LJx zTw}Asb+Ee|Us=!PFWz^m{X=yULP)3+5&*$jBZ`KU;EH=OSaW~~6cLP$TtuLOr8P_e ziS6%=WdcA{xmw}@S^Z`Pkf7j61QV1!GZ+;!koUq-gFpnye8;;?AONQ)AzV+gQ=e=C z2nnPrDw${B4t@ww1Vl;q6%Y=UBuPxb5`X}JIY7f9`}2T9&Lpk~pSKuzR}fK4M=%2~ zWCA9#UfhA>fQBKZpb)ddXSDwSWDo)u5V)L#&~X!jLu3TP1XUCIAqIe@$Z?Q9`9J_^ zDiNrHV1>sxa|)Gr`J_w)21Tk$cgh~p0SSgmgm%CKIgpeS$xupa4;VmWfLyWc&QM2~ z1OSQqf4&gG5kT$XIluyJyumvZ3^)%MDx^77gyE8zLew)ec1fCWgR2BmQ4h8N8l1kK zp4DIMg%9|WUC1$o3=jX-;}tsyB!IYre(!JMi6{UVLx7u5%3&v;bT*?4iU6eG5Ry=K zL8YWM$uT~|9Z2DEzQP2Q+;u6K%1EOInMC08O%!L292dGFr6wbtCkX`XAMC`SlrECu znPK;0=n*yb&{qT!pp8Tc0qFb+|-CAn8k!>sxkgiGp01Q=F^QlNK(g;v6k;M2! zl`1(EMhFsMAV{%AO5CF?rS&kYO)`4-HLk1IySa9%_39dx7c%UAm}&y{xu7_v2gN{G zV?g3&W@ZZFxWnG3PZ>^q>lUn$7nqzA1ZzTopm6StI^DV>kdmDzR<0U~sB8X3XTe^Fys_Z>K zePd}Wi?43Ui_2zW> zOlf*^tvAk`dudwNHLaF*l}|75E)k=sK0YN2h(%o2;=(br#|0~z#nZJI!9l95MZLFE zebk$+rul#UO4^a3Qn&u$qYYV@vV5(oTe)0-;@3HiC5a+F*Ww@VM?VKsh8YP?ItR+< zmhN@Hq)j(5qts6;ry<@K{4ve-Wlv4yMJlbD_oH70*OzI%U)0PGnL3YAYH;Os5P(dL zE-MtLb3D03hzAL%I023kborXy9YEq;sm6myAw-qIsT8MSqYwz@N#Vs}JU3ns6@M)< zJiqF0{w5_P7a5_RLkRydGVAqgQ;2~}AEFYShqP?AnXy{14##RHweLWNw` z6JGYssUSqaK)fJ0AT1;)CTh5FNCgfA%QBDvL!W?94mkouHb?@`8OjU8n&Yf#cbK;f zdm#dbSgfG{0-RAWf=Iv;3CwpwnJP-IW&oM`ga9I%u`7i0geJTP2tXmnu%{8*;RO|~ zm%^$}AxV&~A;CG33PKGyuWYzLFnGyA!Z^zeq4~F0loire8j_wuONI~% zf*ej+!XWHFwedL zGbx8;DLb$U!vWGvW$(st;9BC~(7nBo%n;-I;nM(>m)HG}z_4^|x8|oR(X51KG>E^I z0Op}H9>D15Jt|YEY}ycm>ILld-hQa*KoX@0kajWE;5yl-Nk4Iml)V1{;rhlQR9+ZJ zcS;tOA#8) zhwg_A5FGLxPZ16*LI4ReNkuqAv^GaMd5I7K9OR&s>|V!x0s#&Hf|;sLAEgeI4VN$& znf4ecgens-GtK}b0Fi=+U_=QJ0mgG*WH?B6C?t%hIYWv`prZ*@B47awR3ji21bL(f z7$rfNB&l5RmS5(AjzHv8Y>DkA5`f?qW=3Zp2nHa46eMO$u@Fy~0p*gBGua51rlL@s zW=x?IsT_c~PGFp601QLt%M|d14$$QxIAts$Lj(WU<5h^67*E{_o}j}iNDRdLM4;7f zkXH$kqvH3%U{-2mkc9CO)%$(~31|&Tl+kfWaF`+9WRXcEMNFVWm-gclgC}YTfC*v2 zv1)Yo3Fd36DXI@|wsDZdCWfey01o(qs*2!=A}H;NSU^zbsWNcMN=8^=IrfryATmfs zDn>b^O;8XEuux3BkqbaxP@qkGaY&i!Kw|S01G^Zu8Awcf09_%*NCNmQJ4%AHDV|-> z8>C`LDpZd11Q3fR;53Zfo^KerWQc?-0)kKw9Wgk;s_tpZK|EtE0Z?*N;}~_NiB9#3 zZ+s&}Xi3O-6r4{(0^x`uR(xUtfX;YIe_TyOs)C8{in#v(7}+Tts1j0Np>l{G0dk@) z90wSk(g`r5GEcfO-qk{dP6v!zct}8!K$zf?i|5N4l?LxwjCPu3jWub4W`v{v0EK_e z6cCC46CQ3SvM;|ZMKiqBBppJS(C)%{n~eC!C)Ha20LOA`{{T zZkjOfRYN&rIN0s`J?7z1GKDf%;z>>~J5S5H zk8K*4I*O*IltVbGccBd3UqooP#kndxkkmv5h@;-twGc%*i# zZjQ3F4W=ouFE*C}Fb*-Yqp;a%bF?+B6Kh%qAmtgKHP(q6Bf+;txwdeIY?rk(hSDLb ziiHOr=#%;l_JH9`p~1?7IhgM>wrdlsLrcH}BO0WX!0?8*Vy5+VS^YA|@UY6Z%4lV(8T= zITr|jo0`(3evKQ#ryu^CTm&r05JKZ9dp3)QE|!%TBF%A*z&cio0d|t{ zAd-+?*vO~TRKtNZ!$~7cWjRj>i{?uthxscjGm+EjtA~K?GT<;+$!|$qyaK^9@Z%kk z-91W_+fAU1M+s*W4U&s#hrW|fl^stN9AQ31%?7yk63F7O=^Ma&r-A`L0T`9KX8x13 zmKDh`NbXOuw`0^0u24ao@y;W5*JkRvUeJ!7hYUN6MRKkun5D+F*TQ-JzTI;`A6lE6 zrxKF)#v__qC&b5~?)Dox?0YJ)L2ofYxaSnT^X2_FYJHdtehy-ItZ0WZIw`LcHO>m7 zcHe_JehVz+iv1!MV~S#9KXKY__S&YEc406;DZ>&3U>3Qns3G8q8q|};I+!O3Mbb)w zIj%BZFy??kDuq4$kN_*g>VgBj&1}FbdpPz)h?^~f;LzVJD-=JR#uX?L=h7z;gIyg+ zv@~fxW|1PYob$#w{Q%)snFmXn;O3VS2q1$wc1LBXKR;4n<0IHm$pm!0J^5z3H2(n2 zfs>Z8i|U&85s;|T5VcT}Ke7rGQUZ8(lOYa5KtiC+c%d+eAWg{W$R;tk$^r{Uf>p~V zth^y|k};6MGtM5?$xMWm+)7FlMlLF#Ak0d6cfcWp1j1*UQ%`5%f&oXO%oLcCP6=rW z3%dbbIPl{D06}r)37jwxLaB*UpY1h-6;J>iP&si=-w%*bs3w4C!v_drhbYy|XFl*a zc0d9kVigqwNZ|vC2eYtnUJ!r?7bA-B5N4`EIdcQu3<0M?kP*NsGZO89bJS4C1KKC; z5bj?AFpKrVm>`oPaX6BhEC5DHUQ!5EK_%kj-vp^t&zgPnY!J}tgDj7XIjt%r95XH_ zB|?gLzyl2O1R@mre-t%5poC1)e$3)PptQTUn*lxmh-f5&;3lf#6F6WqS1N1G7qg1O zha|Bmi>Lnpi6KA-|Ip?8Zq;+XUmHs{txYB(_d8wBObutN85K7DvL`i-re!8Hw4m29$>5{5i zJ}($)W5Co}bg!6AzR}+m#rdTBSJDh?2oI4`)$?S%kt@RXfK?7Fht&+G32^`wpD65t zNOQnENK=u*Er-M6M?)a;fDRzxERsPnhJ^rgNk{L5;0Kf}=ip>SM%QXHm_!G{EUg4f zB<05p5^8%)VbxWffGAfe%ckVV40FaYYlXxWCSg-4MK}HiG(?~yifTZau3fNg;;xrL19&8`~1+KK{5$pMz5D8GEIC;F|x zYyJkSKGRgpeiIyDSYP`7PvQFFZS?;D>aBOb*=7&S($Ulm((Px_YM#NXOj1f_6A5rg zy!_B=?{o-i15C>Tt12&l=!AF`cz+1qiNG&eOHj$ zRMNIWu+^^Swv#zuq|MJyD**E~xwK|M#*PrG%}EE zftWZFe{^n{MU>l2DIf%HU9MP;i@^#4WSC(YOgi2hrDQ+@B4ae3y0Y}D`w@mMs#kHU zTmdB-1}XJMOK5}|T2llw!-Qor+!u17Y9cOtVeMv@Bice0QG!GiaV&Gvxe9}t7(?2G z&qGt+?Tm)eVQ4L;@jX&QED{^~t9Hm~5U1h4XLH9FES`FUc(NfbWKSj09D0tNL4P{K?)p46NhvvMu7JwIkfeX#a&#)eNgAp zE^A9eLZHH3v4c-^456(E(n^E=Q5ZEkkPR+-z;m!#Q` zp$H7681*PLw6)>D$PB^pfX=0~9F<-Pdg*Zr7stLJx!jvse@^CAbb>GmcAZ{WgN$?< zbE>K&Z8bVvNopb-08tpX6qTd@w(CtxO7f37I7T@0g8s z^^vV=@lOYoLQ$$GrO|z>vWurutG44L_alZ~+19kIDb;8J0clw{5tp(c0g}~*1q{&) z=N$v6wt96b2Ueu!J&qKrGzD`Dj+Rvln0#&9 zB#;7WI5_tod;m0s1L0B!?2wVGBC}854|7noIz(lqa>j5P47APIwI;qDu+R+WT$dB< z!W2oTl@MTvv&a3YPyzqZsC=ldX?2~pqB-e3vcY*{(fhvLM|Ik6cH4w3xM^=R@y;mS z7xM3B>325jsW)4eF0AZ{#54>9U(20VlA#)6yVq3iIY*&%V@T1IzLoiBvUIIBSE%3C zUpxI;fD_mnxJ7g3_ft0B&X-TxJhr9E8sVrCpZ1vQ7Zr%2`pg>Mz)`*t<({YAF859P z`sV$noyV%`Vw=7OvO%+p_=(-Ux^BBwNY0Us7Pu9v-_0{Uu9G19R?m}FIB)g>^h50B+>4!E%3a?OmT{xdAZANW6dC?n5ZpWr4 z3AnPju)Dajxd~p~?x#_%C>5$XRXpJ%94@7t_+uS|=LbidJH~Y#l{2E~idQUPRCX+X zf5R2NmHAt}Z{?Nm7;J3`1zr5dwrT@Umr~;zA;`D#V>z_2$EgWH)`+ATw2w5D8B4Yy z1ylnh9oS7{s{Fy)I)9_KsylAoN7dXe5PGDk3?CUjJh95@1yH6#9>q+tSh^Qg#GM>d z!BkR)+$&Z+j?j3Jy{(8Vs96z&rxNoxl8Ow~6WfeKa0n{VoVa*Fci>{wgRuBUO%g~} zi2@fgv;EPGQ$ACOILaqv*ho@^4mibDADktTGU;#xk^4>)i$NrW2aZTYu*w?ehZR&@ z)k2euq6d=WrLH892|#5fc03~Mml9ZsAdNblHTkpm#qZ^LXZ3o_s1d=Swc{1dyizD> zjwnRRd*b`@xK;kCa^|LuEe;1AN=QW~??)rm-~PXl-`xKInHzmQf2+0L{l+)VCOmJ{ z+Fw6Na;^kBAL0fmjtF}(J{ZO8=4`!a)VUASXGDbpul!eoCwg9O2jh!Hm90wkXiP1^pPOk7#!LA-f$5WBKkz=!FwrU|Mvjj7s-+O$|2}Z4Sa!HGz-W)ePJ$ol z&f!9#EoPl69Kgi@jKKmkDc;_-%0XyE8kS(^I4kTYvAl3EbRG*D(sPvX;TbMvR=btO z7GPuB6`Kv%RHiAo0$fz8C`=MIwJs_OL^K+%dJ#-wPijEqkzU4`u@7V3#BY=alH~1& zyF**~fdZqNlA7itU1W8?(r_fUYPcQ&ct*wGnYrquAVimpH7bp&U~9SUOWOT40ONgfSgNTuy1~fZ`$fV*Jh1dcx!AOWU>PdNsHwx}F zPpDkRoh8Llr~<6xNn3KWeN!hqk&8e@PkcLYH&-;vhc1jwLm}bW6?@Ib(#7Pu@71gE zdMqwPmzl^L7Dg^uC003Yit|ZCu{UyPs(nf>dU*qmIN}g-jsDqw(u`wV!z5{IRTsAy z<-@6~5>2(YsP*&JCB&!QVC{^SZ@pUauHM@~XT;hLkP}s?V$YK-p4j)YnO?N=-20QE z*J{+YsZH*yOg5*&?LSOF`$oFo*Q2G*1zga-?vbWBVe0O#xGF9UO<6Lg4m05(jHrl6CcXvvj<3_CtfV!PVmgt`;MOshu zvHt*eIfm0+NupQnpULXj%gz{&DxvxVxPMZ z-O{B}OIcS*xNw;oTSvy(MS=0P==9xGjAOb*fZraBG0msidA@?Q0z_Onj0`CTlEZJV1!9~d8H0OF)J}8l;n#Ep$G>RCVkzozzs8)m@2*eV1yZG zJ{%x50~Y}zN=%*@!CZ*R3Cp~9tRNK197bn|5FFJ2#@PG)@liWMk6@?OWE~%~vq7*Pc|I)c_-!%F@ z_06S9sHaq!HlQGr!Y?*AV(qP~Axwn>h>sakf2VOt%?qXNEfT~hbb6lCai_QRqc=LQ zLrUB|R3e7K$gr)>J93hE7e-r?u7#2G={lpILBNsbYR4b>wb5d$9n?Ez1gydT09M8X z^%x53m&780L*_44+Fw?^w7hU-T8?<7P-N|=urDqy}qW}tV z&Nzc_wu@e{rMTD)HcqRoFw8Si9Anb^M&UzpwWn5%K<4QLI2>gZ@@_Wv<-5#E+;DF? zU~R4`TijacxnQUm;94CppvI$j-DzzWHLvTXJ9;cGk_d(DHXw6|PV*?3pfmWez!dO^xrJ5{i(Cp4lU~@3>>~StW;=4*t~R@m)7%8*Yub2% zBXQIHAH46+XrV;YpdHOxpeUIZh`R2yEO!b!ZF~ z*nKuON@jw1ta72sm3F~aPY&Z$xan3>cpP`DHVbZ?t4hRIHSS1Pg-|feVHJk}szy1+ z!*|?k?$`Bq8-lei9jFX|m<=8guQMTgC+?1ePij(ibi~f+-U^5<&$y@_)Q*Ajs8ar; zbmH`=8r>~9wCCXHpbXM$f#Q*T`C7)*-7B^2M`&rMQzQx=-q{%D`o2D&fAD|Ijeei~ zRlfK8EXVnd*A4xza4lo0fYii!gTYU{d19GwOV+H`5{)hZoB|A`-4}kpZC~raRMX8$ zN>wzHuG2Nt%`tHZD(Y&vz_rgrUPW~_mcAw0bWIVqNRyQlUZn$V;2@eNKpz&|r&NFUu zp5|y!af-lg*6_``nJzw~nWsf35vlGj{K|xYRsR5Y;cOQC!QR}@(^y#2LQvEVeMb_B z=KEi2E8CR{msKffs9#ycG}3dLz3ljAv4s20%7qW1riU8U?`ryC#+YA8zkSK2je*MKq}94pwG;U^t;IXjmgmv4 zYd>D`fP9_-u0uJ-mVn_k;>vf`63gQ$oz1O zmtvtSR^dAQo3dU10Nz*jzP8)NX8yjDti$mjIIxej(~g#y>H8J^h4nU6rCEiQ+sRN6 zNlbKon5f-4g)-+)K8dQ`ZfGUC!jtOm>og+sf9b~%*|*y4 zB95Y3Oo7O*9nrV9-0!WfU3a$Db**rtTTYT%M>8E2+J7!>NhY6%u)C3O=Rs|i8pVHk zrs|!OcIn;A{@>ke)U8U~ngWx~y;NzJa_ZEsXD>8GX|`SNOsM)HDN_Kw?kE8x zmY8w)!$fX38auA#VOrwwI=x`#K9k(8a5}WO4#Lxi-Ha{Q^){5P>}}fM%0pE`*=wld ziS|aH%q^)xo1CfOVf?S}9_IF_rs>p<(yeMbG&SD69&`)L*5-JmZ1+v7waq#kog^Zl z535WLV@=|2;@w|mywRqzqq$qvWz#6ss>0&eF@$P%r3KAYSiEqOAFW-xBnpmYrO-E;p=_u~PwZ}tO(l(auy4))Q=Za}+97)n$OzT=@ z!5gSGjh3jjRjAh&)T+l7a-y`P@Bte6ea>IEsy5UqTGw%iTCY-N^i;nv`=-UgMkUTuIrQxai|7pF;)gra9=l zl$Fwa%^_te7JcYXYB(%|njCROJ^j%sI+`9iNO1wBN)!_Kp|5h1P#sIy4`;qQiL;c* z(I%3?KDY!(gT{UyF+U@TEdV&2j$8)_XER)=gyeW-5~&&tITA=*7qc}B2($sD;J9SG zcupK7vQ-FSXqfW#JA}s2L3?Q@AJB2rbDE5S;{=Bhkp@JGcM`Ax8ac=SP^bs?!3_pu6Or$NCP2r7 ziQ~c$3P30VB+0}>7)f#=DXS&n_Mjb31Wc&~pbbgI2WmkkM^ZOG*XuL334pKlf$LobsOj&B-Ba~?B%JneG z9-tvPW;-Vxm*lPe&03dsb*j1a>4ug9r4~u=jx=WO6ZVMf{{SSk)2q1GaX{T5`z0LC zO5JX6RVdK9%cQAX(bf0PyV?3`;@;13svGM`cqVa_>z${6v3a$vPQAT$5c*((po7D{ zJmc2;*x$O6?&g!V+fyc$0fQ0 z2_U_wo?(fZ!Y6y+Xo~obq~-=$SsILT7Bmtvitk=k+?wlZWkrYq!NF5fognwd)!kiT zY;CNofcid|)^P$XV{h8Jmu9PJN~3BAG@2zrBQTr@jvbBGt|>Kv(7Wf0j9JAA)UJ_U zJ}}S`p}>HNFMOjXaIvqqUDdO=YE+{1B=*P=*%;-h2$YT^Jfp7Naq3{DFbe_l@i^kK zd-+=TvvjT+KPk;67~BiyQo;!f5F%4WY+UB`o9LPPWYP}u7um@v9-4<#Cn7ac2v*+EgzZc z4>6aiqwPkI=2M<`ifrW!(oE#8CrN4l0BSZn72P{-k!5zBS{+iGriW2+YCG`=#c8$a zmMHUzi|#kZ3S8O8ramUp{{VB_t6seyQmF^hC8LqGT1$&u6PjmFENHiFpY>&?lib?Q zUq+)`Mw9X3@F$!2B6~Y=cV2~ULDXrmf;A}*)U@5DbptXrM#1{aii4?55YyCCm*CE^$(=<(XdIPsI@LTSd*)fV`0?fnYo>aHR3!3(4v{rJP?*NocD2z#mD&q^Cx@Du={%2XzV4F}ccm7k zs;N@78O6b@*V0m(Uh2xtmX`YIid5N6rLNM_;?|uhIV!;DzD4laomT;axX-8TFVX=BoRlKQn;$}A(zP~$w4FTED0 zbmbbhwCLMZH0~&!Pqt6Jx{mo}Ro9WDnS2+7{)+gVx~j7nmrD=8r}lu>f7 zE?qM%l`c9i#PmQt%-&2120U5p?*?u8Q=#hSNDZ4LlFiFHC|@b_xMOBIfU zqWVHjUzBiW&Q@n@2NKCUu~|qr%XEH&>EE5M9yz$ zJ<$%=UY6lyPU(nSQGs2X2<0IU-Vt8c8SPm*)3|%wpr@*Im()=AEtK6|Rk+;mAZ0_O1pgu%od-d4!5Lrun?%Yq+B-|ZlzGxxew674zEm>BH&!kF_UcD5*p@G zeR`Dvb4w|Z(q}N|1op+TldcT1RFqPbQ^4g_dV6jaoLg;G$vrM{0WiuVr?b+c)U~2U zqICkUL*gU13kzqlrmeB2rDkSlfYTUv+iu-WOWFw`TG7Q;YZg1VGH$dp#x_%`f=uN$ znDkmM~*IG`iT;R2DGYd+GUUe&bf3YMxCMWqjE zn#Nly+}%1(BBPwO9ZfI|(AkCi&W2Q>QTGOuiF*`~4q9+Yi5Rh2?X0t7LqU1csE$oZ z$L_ifYX^wUrBchnc8M@YEF}Z>358cGqf}f8}h4ti?Z0 zl$ym#H1U!{pD-Cr9WC;%C2=R&4FEJ^0YD@;cfkOGh$LVZV0b|a&Ox*8DR}V42G$qCkS9O$u~%XCC3;cbgEB|3*54TiiamEqJpe7N!$`E*x-E7M+u*YHYhb{k6N2wbHe&90<}J3{{*;s8A$A zV;Kwu$O25v%&cUDRNSL1s-LiuZaA)ltVJN;;;JW%;m((v%n6apu*^d#qPXb*nFE3z zFrIZ10R9t>6OD@^BV4+MWV;SCR)OOVrzxzogvwYf;h^(iPLUzPSwxhYTy&rXCp1Ml zcYIM8{6Y(uj*(XJ3P_Y=R3$Q!8F5&iqB+F2OE7T6=NRvpCRBmp1}$-*Sy72F0TbBk$2k|Nt~tZEV@ z4yP4Cfdyjo^1`E;yVa%|3m(>m_?qFX{ivg(P=)1G?Fqs!KP(ga%|KnH=8!;=BT*+$ z-fVNdR-cjNeh;kI!2YV&d-o0bl3305lbr2uG>9AkCQkfN{ehzAqi?9Oqtu~w>K5RQ z(#<6Oty)Yo=8#S9-vB)#gA=H@Dn`SjHa+UwX}-I-wm+e`YdJ_N6PMIavpC0R&FNOU zsKu`aZiM#wn{OuQ{{YHM(YAD-by}@P2L}#j=~I#rk!ZTw+R*1w6+Im&h~88L7M(>? zm~%!aNZ7vKrDhiCAsW{MkqeJ_iFf^U=s45V%A^$`#45^hjm__>I+KKDIvTC6wMX95 zX=!4(F`(mw=GjH6;q@C!(z$7M7yetc0^mW(&Bx0XeUYBEYBgR-Eae2v963vtQZc=@ zeeddA<96c50$j>`sidF^x`M-&o#hT2WYS93zS#7U%PH4n2I zoBghh8iJJo>`@H#5?5Quz_;whqNuEDox+tcE$KC_s8vl+Pci%25;WW0JvA8e;4G?X zEPG081tcDDiqkB%@u!G(x-Pe&`id0UbLfL2_JFI6OkL0L#`=d*(lipQnMJ^AA&h-M zo&Yx})9sAeKxHa4jb}6m3Ob1)*{V>T2!w{kvwjZ7Vus)+w&0CC=@NuT<_8 ztI%bmT1{GxG=Lb)qQ&9#z-m0T0MtM$zo-kfxSIX56atZl*0NZl()JP-b1nu*FKmJ8a%7?DWB> zO5~dMiRHspN^|2EH~9gY8St5nHv6`=wIuB) z;ohhks<+7J32I+}{n*>nW8My*E+DB-YzZkOgy$$pwc}iho9()lj`ZnVP*F0%_ftL$ zezyA~7Ok&Ry`a8@HxrA^eDJ}UMb74cs2VF%6&s2Gq%L5_qUy%c;D$BLa8gKd3O@`d zZj1X#!u4}e@I_sgyRAp)7M7I|jcjwALM|GzGL6$Ihn8p$wyM&OqED$v5WKDb0Cp9X zwYaD1G0#zhNm>1!B_eCPkj9!UL2~Azgd&?ynZZ6Y6|6HQa(kU3vGy2u6`e)Z8AXR% z8?Q*g?NA;XoFW(3bw?CyTObcz%*jfKY+riw-I3 zR_RCFLAX|ITny%zk(_5;)1_Eby8u1xpz*;AoX~&Sj9PR(LgLDdMhS=1WjPEq>vL6Oujp{y za^=}Cf;(4y*~sOJj>BEAC_IFM_ zA?6sB$_PO&-+T~AK%kMt7$g$u$mjJ#PMHi9d7uFT0)P);Uv6RyrloillF9&6EW6VP zGZQM9N>Bh6X~j@|I6?rKVxWmkDu^;&IOhnW;Xm!l01_dbm)?7f4q{38n=B9>=y2r) znG%F#apS@pFhNhmn4%0|0k|qkeeh;MLZj^e0CW)3sYW6~N#lg!5Ef+QaRoad0je6@ zWCW%Ud?h9GDcywO*cE95?Ir{f48f%(`*uSCHQ-Dj83UT*3EVtF@q_|_@e$vIBme?R zt`oz=z%&Iou50x|f`&)2Q4TtA?}Y##|JCA#P9zzhtG*wOsep9Jd2pQJyt=W^4$}{6 z0^q*o6XP8nNRDcv2_6^{AXIrgpmRuq3Z7N{@ED~R13wr~b9B_yQvzC(cm)e^mMapI z@(WWC%@%5L&N2ZYhUz#BWQK78A!Xi9UFIbm1k=4^HLC8Hc=R$T>Tzlg!zG&WhPf2X zB=I=HrtA`FG*jGR#iWT4N#)Nm+Zxhs2y5mBP7(lthzSlr8dEchAu{mJ3!0@>Oosv| z3CBq(F)VR`Dr3+FRXm`y91?s<#~8<{GF*s6gicJQ*rFk*pgUBh%Z&YzRqU7{uD}Fd z#1I++3`q*P06ZY+L94-XhP0u=U$%rl3sF%12d^|#t?lbwR;6!2 zrPXSI4P_4=c17P=cVm; zTlzbNCw;)Ak8jjL|{jYV@r7L6xh~;FC$R>(Z9B4 zn^qi<2}d*2q~99Ww28Ntsdjo+RV((^f2ZA?#tAS#PC(O5RD1MuxA2orR5hV> z0H|{@W36!7d%Y$C0B8-W<1nRV;L{M?TMKK^t*A*PE(B#LuH;tdN2$AycBwa>+I=P_ z-?Dy+%)NC~)B!ooxF2L`YSi>A66MgRyaySlvM!#Xb*nTUMvu-I6^gCxyJ^rwzq#ytSU0CH#B)% z@RsN67E1h{N`xW7YP;l@ ziEQeAuD7j_;kLT1M2mGdgG6urS>NW(pp?W4l@dxWf^rU1!!+GPJR(|GI*I+Jq7BGWW!!(pmNq+Uj$fw_Be` zZZ)aY9=_9b@@PDTXatk#KkUDo%@#X8qp>JOTGqqO$aOIfVOY$&r)PL7(XJZ$PiC5g z%qhc9Xo_`>jS5#4_S%k>HxW(8HYg#(iH!Ky3=PTKxgRpdIliUS{^YP~AL=p%IhfTS zL_jxsJB>uOs4A{5lRzW7G)JSYsvlLgPo%0_#arW?ZY=hDUA~<&Nb@#sLNf_JWDYZ? zE%hMbVJ|Cfnz0RnQr&zk1+Uyhgf9^#xwT(Mlx~7pp*GV%&7Mj(K zm9w1Lu$MSA;uPxo<4Ho<*L7WSUrjn|t;O<}XWoS;M3T(6jJBysx+*2!$w>yy?r@n5 z1BljMa9+sAV^!Ko=6bXMgeoY@vM0GA>}^orO|{Rc#+i`7G%8PQ6&fj zQfG!8k<~dO0HFXVDVh3UubW6AWiHi}E>km!OumT-BuZliD*y~~w32?w$KCY9(Q+~{ z)d3JoBZ)|QVj8%W{__A4fB^)}L5I7s7E(m|VHE^`2<<7912WJWd@=EW1u#(HCOlS9 zh$ENYa7jV{V!5Y;CrFYV{m{Tkj*_4ehrSxB2ybdTJYmiOWHg+xCV0Rvkq$k$B;XSk zktZ*D{SeXt1fuZq2MBw$3CuWlAY4xf024DNnFA$3o-hpvq97s7o|RGrH%V?%%`-@n zV5kHxD=VnNKgU)*5b)B;NarijEbK4D~hU- zfR`Q-x9Sao=co54+FUWln}X4K_?l*&;x!EbIzuHXGuW}d5>kqi$>NB#)1%OH1Ehj- zOn4qK9Z;wm>}2nzBx6L3nigHT$pVaFwVa9_fh7R#;IS!x4T3R8Gr%MOpcM*}i9u13 zU+nBL3B^Rl1B_$`6#=A_l5qt)VvAYGY?u({t_g#2mlcJp$_T-ESHNchhC~S&;4$|? zegp?%NY#-7c*eDn5fi3ZC=$hI#XBLT*d7N3!e{rxQ<$_&WHUlZhqAlr`i-Xl07{)| zew9!r9?ea8W0N9S*MAH{a1A=XDUh;Rk7PvY$b>SUd?D_b6PODz+s8RWS__;4na*Vf zIiAR_?CB0q}-X1>~Px9)G+bMpzhK+YE%_HgRE2#WH zpyDG-PTaoX^^~0rCB5VqDEuHs#^_h8PLAPaSktcRVXo2_QJ0yg%qI^u-s)Rv=$#r> zG8U&)a(mSm|$8#NyT?@s2p5F^)`{^ClWp$Fzd~)dEH*tUe#?7)k>6{Rb`7_ zBPO21TFR4{`ljKIn_hh1hfZ*oI2TDlB`wu&G>vc!Cx)hpWSnCzt4mr#W@*Ub!E=me zHF-;B(pp5+CKAdvY)h&_vsv8htP`S#P&7zn0dZ0SZn;oT8D{JI%c& zRc#{D_UaAMh>f=Exw4JBiEv#cIgL;!oT97Kdvmvo^SWG4(u0d=w5e;K_J}_V1X~qV z*s&~xcGgmD_nRHFdj65J+ETh}sZ@JiP_8FYB}_WoudFNA6uow?4hNM~WyUOfj@4zj z+kS@Wy6HOyAPyW0d*Z170G8WZoE+MZ(iD|Jh5{Pd_UgNcN=>X1@hq0zM{fNOc`1mV zTw`Y4{*&pox`-lU7@^&CrQX#H zY_%fXX^N@VYi~^&ovEc#QUK!`OWvszDZibJYY#D80pk?SxB9N_SJPwv04V@3AfhTo zh5a9GU6k2zQT|nONlLFMg|?5*#;McVVAJbryM^m4>}kHUi;z7NP%=QFTZ9m9tXjiWiZKninGz zj>hPf6Tx?lIjnGW&YGh8v~KOWpx;kRPyps1rZcPFfE*kHo=RwFj#1M)>wsu;@son) zi5E&&z_qIIRH1O~9Ader7fRHY$ht~7JWa)24rnMek(~Qt0ZhHcxpD+xq+H5QaMewU00_UnHddAvv?#Mk(i-PG zGIZc3V0A5Wjt(u+WQ}$CXt*o^0~*#f5EN)1MB^J!(i#~fHx(*dz^|4aS-{k*rvlaN$09}M4 z1_+2yx)h{`mml8%6<>0k;ebFQWa3jHI3=oQ<{WY<^}!(GNk>n|*#X1=f?x$g5S+W< z0RU1#1_=#wS{7xgf&y|sz7w1rpkNdLW|K&1CSWAbb`SvZfpQ69JdX@k0d_!y%Yt_Z zZUDFtyh4u*Axr=!1HXT0?S=tCi0m#vd$_`x2p1Iz?3@K6Mnq4x4xpUh4hM=r0gx(^ zL{(fN5?3VvEH+q~E*?GN99=X{3I`GW@C-A;Toc9=E^Sy1OOL%FKnMTV-}R45+w~^S zqSa`;z2>Re-zBG0lJ0#oZ_`^#Zf_sz-&_zIOkfmt zFP14)i7v-dfKn~}d%4uT-ra6qw5>u0s6gY{{4q~-+*1s|IUG<&N)BHlGMn>D7vg8R zP5V;8U__ufT`mSPB%!Cv#2KG%5snjD*Z9`g(-lZjd9X& zGCt18LTWI=eh?Hiqh*&6O8cP$mOM{{K`;SCRFLEQX9F-WId}2yhbCi&a39)xhiKyv zn9v^V8sVA4wG|^LZQAblYsTAIFua!#T#^Ae!}h(l<94pvR9mD1uXqw1XC04W*sOND zI$x(?HIMTj@fD2nxSXk6u`X7nw|iJ*Wpu|!?fVU#Hx`PwP8`p4(5GxzToJleJF|x} zRD|I@p#fN~Z#s=CItqYAOku7(uGPmL5j6k`$|HD`v0OOEI{yIhr}$;=Hd^?j-i$k* zXWF=sQQ$QFMmKi=y3Ob{I!MwpSkc@x13-j4!96fH#o8E&q$D|NrP6s8R`2`bv&ZB@%&gF5K|8=ss|}W@65(U zR)*0f{!r2r>-R>whUsfrtSx&>9!p6QoR%~ldDd2&MT@N{Rcr35qZ>_*1;IYl`CIs* zD7j#kIu|S+I7Mdr2R}f%c9%-2)YQxwMs2UA3u$T&Ad&)Q;T4Xa-XFWzP`i5tI_(-9 z1i;gQQ=8T-?H?4qMA8i9B?rbUnrclv%M_DUSgkvVt{B$UW0ZEda5MU&7V7^1(GO9M zMmARzK`nNW_P7mvaOIbTRJw}k^xIM3NzFzPIsX9SlqlzpMb@$KG&_yeUEb=B-)!Q2 zN9l9dP-{u%G0-;aDtmpc3KSXX)27l004Ypx%*8*Yw(Tu`?Nyqp+&KDR;?^XQKGa?< z9CWmXBec#_i)PZnX^AOI3X`^&bA?1ge5B$Rk7NYJ*c8&BVK~g9_IhPeTaRD|6B2qt zwWV)Yk=jWJqqZWkYqX)txcC^1qAF_E0TTgRse7YachmM&OI=ccs2zw5B~P=KEH<=s z6mdNAY}AEJBPrIji_lF;Bmz|cMG4;$N#=ufFwm^g+3DDk`f5U*DHDW}Xj8r+cyM!r zRMa2@7$7`i(EDbz>89rf&Ck^|Ps9vh*SDkUHk$&}abEM;5u~*wrkgbmqzFfHI-yiU zAOteCgPzSNHjqIQOe1SjfNNScb0I5;9AX=i8m(;0X6G3vwhGe`sbZI^+A%-kJ+W-vw(D)m)z+634Mw;fE&!`Zn_L~7#8S4~ zC(ymCWm%+FrBEyIR3tS=Xp0=w8>7dztBIVIje1 zG$S9TFn0SY*6T^gO1Q8L5a$~Puu}m{flx3w9zS|1)P>4Z0xI>M>P>Sd32~lN#w-?0 zcj;NF({pud6Pd;<*EK^ZJ<@Af!le%k{gE!o)O&U9sADb|>Xi8i3nC)Orq^+vYO4cc zTdL`e0sjTvbl6>zxw8Tes zNYZ_3b)G6#4RCgq(9!@_Gwo>2(LOz|nSR`5lblok02T#&i>lq3$vjjWOS>AH>969S za>Kb!GbH`);EX;zXB3TNyPyvm2TaP~q>%eW~Dl3ZpI7zmSsa#q#eEROaxqZkYzo`EJP#lV%sxX{RqmfV|`ST}Z{++sw!HhnQ)SsYBSOpJ? z#RQUEc#a9iJ4)i6n6=npoLpwp>TNn|(%$!Wr$*!xMr3x87tW>K=xqB-Y&Q0_m8!Q$ zBOshiQo2Hu>u*%_gUS@SxM7?wqN`q}@o?J8O)?t6@P?VGjFE=^MH}+LZl`Tal$0#) zS+_%4S4t76;-x_Zl4tm$B9rbyaKs$p<4?8*iuaRlQBNK}#nCWGoR1Otg>GN1>5WvE18pCW{D3Nf71>4M@M;xP98 zaH#|8HL0i<1sN2GUHD&dI4a63t4rF;F2{5IKIMnV$CY{liiw1D?WW?B+ELor;UG^Q z_~gz~*@PJ+UHUTj*Q4mUx~kIh&S}}kIVx{<>gsw+{CMn9ipd=AIt;IrNI<<#V(C&sx?FXC)FT&Ra{;q1_TvBsKt)@ z_MzOWxusa?IkUlXOvF{UN=20(<4vTzkQ@$;V3h&QIfU~@gzDOQw?Zy1E3mcglqz1( za1~i@_KhI5;qK-?5Ma{dFN!GLQ?pgFxphjFMP~x8cnacI98s~Ze(?Ksb-1TWtvTo_ zq=gEg@P|Kp-8Tw)9^+JL0l=A$13aTdElMpRg^@9d62)3vHv?(EbH1Dru5Nz;z6X!g3B$#FVENC2XYCmH!Fmn+p8 zxbJno)NO?^S?y?PkU)DzabD^Tr`L5WDh+XcL(fp3D?T?ceW!UR78@$X&kk&ZK`7bc zgSBHKr4^><2M5-!ONj#j!OK4R#&y)b5^-fx#4?4wCejT7jMM3dDQIZYhaM4{F6mp_ zjZFdUam5Bnl18f7Rs!ywrjqvQey5!}b+-pQwany=ZPv%VZv?XT{9Cot45ZaXSvk%i z^Gb!%k4#L^5CRn7o`NezG206Nf9%v?xZ*GEa}z; z>a|MD7&xe(;cRXl)4OA-mRi~BRB5SEp%zo-gwoXsyK6<@R-M-u1eU&&TF?;zQ;uXr zlseyUuHeef(xmD$AaIS*>1~SLo~2qWd)lrcpbrcojiqaB4LdtVvBMo85{I=Sxk?K{ zKf8D(^#YdRX?o96)~jAnOqm6jhA|bs$^Ay|w&_RnA55dDiE3*}!|&?mhW`LgX0vwM z29;(>wHgQ{kN{EfV;k*)n_Eh?%_pNvh-mmrd3pR?9S}`vlOZZaHy4Q`ZP;zwZ~<^A z8j>VA8jt=c#JP9@G?GDGBa3-RF|g1m(x%X$SEw8VrKIBFKIp=z=K78-a4jko%Yv!G z9;HpLP{I~!vmSbj@SqSkUPLRzybkWCaZ|b5di{ZCB_R# zkkzJRRK1XwCjdJUxvB_AgaEyeWEN91X~d`k5)@O1+AEh} zh67NdU{ZM|Fp1?DCLoo_Ld9@QbIZOKl_U=glz6}~x~8ZwUl>s2Zlu&veykD{0Du3{ z*z~^W&A+05S5eZZNS$AVEOBq^Ta_)wrs-{P`gb*6YProMHfn&kx;Ix{N?X!Ywkia+ ztB_R^9HL|rQ_U4OWYR>!u4$GWBHlfvADOv)Qhv&o+_?O^UMW<)SL@1q?%3_Gs4c;H zX$`I<6k(5&C=VfYu&7iZW->m=cqB}rUhz0%oaZ@k1T-vw4Ff<_my}{+ZV|Sch6&?M z2syb?y$kQkhFVtKuISe~YZ_qE1dig7f!FpX%Y6zH)nYFa)Bga)6>Yxu=We^FduH;= zv|ONxFBYE2xb+VEXVlvr8>)^J_bXBh*@1RW_eBtlqk1uQncZdSfZRt0&vIx zn5c-=8ZyL{PCbz;+o9TCp$a$<0SCq#_DKd=3<&UzuTwbHE5J<-M2K++fU1$~NMzI! zIi`sRj$wg zD2j7PuIuu?=38YqauHd%DjRdBNGK#q0o|N!I%fUbe&tQgc7{|sNhoG}<5=KBLQgZ< z3XXcTjnAgD>GNhv&{R+KW&IEC(+lIHH?!ZP3ZC-`OV z3riUHqZaMO%%pe_b!1a0*j&7RjLv#WCJ2$OtXNr0su19#8UENHRFSu-P;h&ZM&FjHwq1`3hAmO@rRZ8#g;u< zacVc?xn;R$>?ytJv2}LaPop_qp5o@!q~vLs0j>##Ofy*OTRqz<&|NF3jUfzzoqFkGU%+T%prLbPcu zI1pnh+WdeiS4wVhks?}t64!BzPuObPRb5Ov2Aw_AbEMj1L3O%2iZp?wMqNu`P%9kf zw+Dq&oP(nmZ>lag9mb_5fYu5ua|y-ZJx^lBQ|mg>$z7(NyjW(TIZ?t-SWOl-_joUH zjp_;n7PIo0fB>q$*%Pwv$@;xl3Ux4S#ynz5ev9cir~u$*7BDY%`+BsU`j5>5YSX08 zfr-YQMPCDNPjRuM{K|pgk_k}025}W`rL?ByqULZ{VzE}K(~hhb-s-a^NeijDNvIAH z7yZ9h+4`F2)~G@!F+%Jlk#$uw7kOi;)V8%%xR6ZnPWX`Z6P(76fRW`e= zVEQ!x0}{uPsX3NWJsJpg5Kb9_r|8u=*;JB;rS1)x9hfR5w`{^3?Sa#YvJZqR3UWwPkXmfurgQ->SK7 zvkIXHFpe^4@l3g*eWHn7b%69~1*XMKL^KzF2-6rgHlM7j`-!V@0L@7XiyJ}ehta0m zdJB-eyvRIx0n1Elirnzg7%odge%r;SvjZ17098??4jM~i8(yO4JxLRU0cbSKiD^QJ zr81lXIZ*e4G2tGIfyVh(_dkksZ}6NRW$&3fETe~rlmHYe=lh{AXp))A4AiMYay^jj zxrBm7=43=^$mI%TK@-LhRY|XmuNhMBQorsXs%mNS{N4gLJ9W*#hcH;<=8j3Sg zC;KE71O&w8KYS8hGUQgGrU?uN5*2Y=CY_L^LY#6i?o^PF#%R7BkN}uOK!lSiN)Q0x z2r|S70n(e0yN9|Q8g(vcqmQxykO26R*$iV<%B1rjZ?M7>B%wa+R}tX{5UYg5ppYa8 z!vzz<0RRwGEXpB)FNS0U08hRU6D>n7%q5H>IE0d()WQG=L=u(?6UAYvq!F1x!6d(2 zDLiT+GC5_!9ORm2O8X&z(JTt^Su7OPjUEsbfC#{%D3S5;fJ<7fmY|q$&H<81QBRxS z3K^haha3`q$WQ_Q(&BD&iBe05LLKAV1S-g;DZ(0MF)(vZFj__@A|=E|(jmw=`Ii+E z_QVcTfXFBk$|oeC04d-H5=wW(F8~xon3=?Jlvj5lp{*swF+2#)CIsm^N`r_Q-H#(!=2*7nB)Vezkv(=vs1T^19x<14;M;!;u{RcWRiv2L)8C2E^b+;t%i1Tz^{_|HC?RO!dJkYm795@_iOg^I@QyCGg zrx}hh4KiCsn~KPQdjiffs+Q>iteMv{F^KStQdE-fW}>as%vw}krx%d8rjK9! zZATL3$u-n$b~tfz<6)-aj$P<0NlHpBNhF?by$h?_=K8c*%;C*XRzu>_yc-#`u0XbZ zm!vDT?gHJFJf+#4QY;^9Ok-}8%@$I-Z#ZETs(v3V;M>=852)7yDaoz>0L;vO7*DA} zdUR&SjXPbMlWpk`UL0R==HpPAl1JHR!S^C}RHF8WT~k&O$ih$t{icsLuZlH{xD3Olw~gC6-jG>8oj2y^5L+on$6il!0$&X8Pf^ zgY{Aak-^AH%Urv8pt6%0RipwOFmN-H46s-%xVfou=xQ|CG8p5`;5^@ZR2Wyhc;iQ* zV5LHh7Kbr`jxr(5!Ab-z62mx`U3D!C0SX6V7wvNY0J9pu>P4g$lJLf(sXLAhF`6#j zML{hLa+WkM1kAX_l}`nQ3S3`*c{U*hE!{7FRIp>MR^F+ zNH}WLWf*_2sZzTsNCr5of}qr)UECuookmlq$^)2CYDFZT!vNZ%SQvG7dm6+zQ*AUT z*EP-pUSI$zID0Lq(?{Dg(|!)n(WEqE%6=%xt#MCDYYfcEA5w9Rbs{&F(gE(OP!X9VoU&|W@4JmXk7~2HuGP2wMK)bL zP0XcGIC_j97MHX-G;v~`>uPvbNfdJA)v37VlV-1MZRM>}!%vyTWI$;`RDLdBEN+be z>WwwL;i*hwqC|JVsZNW4btP~hE0|SVh;t12v}u3;05m?Enp-swToGJO9x>?b@NGhE zwT_3Jy~>=esyx;`N9ma&ZUhsNhX9}e2~jkfp@$$RmjNQ~9uR;DsH$Q|9AzC7olpRi za56sl+1F|E%2Lla~sVkaz;R!}0_Y?I)m#{eT zU)$dR0EzG`k70ldw5;3`a)-MB+=dH^_(N2Qx_y~_aKH%Wf^s;aa)x7o3xeS+j4&8T3Z_|!;vq_CF_Nk7 z;SbUTvdHl;6fCh%Wd8uN0gKa~oh{?1KKx-r(pqPNrg-BD06+iJ;(A)(T&jcuy}7|+ zW)2v(7!aZCN%2T~W&wBZ!Zx7@f*OM*C}lK=95YFFd=dm=WnsMwrFCzKpO1!N8$-e^M**Gv*a zr=CUt1rf)?7(fz6nHiu!B5t+s5acCD2T~}=9Fq>%*xK5)qtqcnD^(MOD@!J^p0!2R ze4`2Eq{e+z1-ei*ys|4XJiZ=i!+(*3ggjZ?^#1@(r7zx(;fo6?xUtg{$qD}e3^l~F zWyQU0?D|gEMSUW!Q>tgaFgB&9)eZqB!oJ9o!seaDy(*1Us4V!5;w`b_w^O{bewz-i z_{Hit@u%Q&?8cX7zeYyesoeJ3RV1h&0wSqV#+9Q33L{FcYZl(6plVsvEyN7snOdt~ zX5A%pgqA(A&-A9LKBco?y0_jtZhxs%ml@$X;DR!3*jsg3s-Vot6TUEZqU~Ue=BkqI zjhzPuSE;AXq@6e!@b*Q#>efGmvyQ$m10L3x6r5ZE8jO(S?h%=A$2qjsp$dQkIZj=X zgKMbLsz52N#F58Wx7iwgv+A0AJp;GeJUeu%(kdnv?Qr?ZsylZ>-*0yS zHtT<7Mp7n4melprq#C3BTAJ#)-y1zIvm6%I*0$ksLeAfs;^w0*=>vg%R|Zq&{7#Qg zb-nGcN_5?>*=iD%DonJrV37iwG_;;FZl_14g@|$ND{#?b`m@eT^!Yc`pI<8IjRi)F ziZ5kvElP7w?(-~DYu{GdZfaUwDJ&~Z+**zDreAt4t$1|`Z&2M#X+Rgu(CE?cdu<)w z)vc`5cm=epEZXH09sd9nb74f@>`~TkS*+4DRXUNrhKBqH&Y6>eK3utDWof&T=F|kC z#RAP2>xu>4h0U$S#j$Z{rl7RVQgZvhdL3(Zdv8!R`ipYJkdlorj~1&JnNQ{1Oa+DjbJt4#PoWCgHD^4x%xzsO5=%(KP@$&N0n*eQ>EJ9(_?=&u?`_v%$iS_)H%Nh z$f>+jq7l_yRSzr1cufRzN}W#DwXL@Z6zg%ta0kLLKkkdA=XiOh*|~JOXHS9-Z9(``oe5ZlMBU0_&=IW?ryGah@ zMHrLIr9LF(fm?6YRkV@MK4;_L7dW?cw!q6eY}~He4Z#P|W06=s(tpH@8@C0OSzT^P z*CcGZb7-$<#@4+)rFVem(=%-)2lD>&;|u0*rrw%osJg#bL;wanl}|U^i%qLey+zBV z36=@)n5TTxE;}8#B^&zIhZ*8&Rd%1@AMM0A^_BQxs}~`t*_*9=rE8_S)NQm|w(f!N zvS~^e44dgU>^ZB?NFOi9PntRvt;My~5p``*wP`r=RHi}1=fXH!NGuF2nvxxy$7zkm z&D6KM6>(CF&1y9zacDqoI*jE75ld=ImHz#u0u^J;rag-V0w0BQ#%Hn+9NL6{;rp== zzz8)0dwg(&btf~BG-oUl4yLJ;g!}%OKn0e59@K;aP*5!K_T>PBgF*~*k%SrvA-IWv zMDZLJ2{_AuJPe=#gn*)FKV&AN0^^jusSZaKlJ-MPu}pX@F@-(36-+xI5(`kwrZa@N znH~}l;xGv`P%u%P05r-+Ffd#Ph=$1Lkx!~d97sVmGepDpz~v~T9uoG$5WrkP3V0F6 z+YSV|RPu~Jbc6sGNL-hVuK`!19?;Zt|A!WsG%vBwjx>f zd<Wk--5~iL(=abWU1FCJqK*p!S?0?Ho$w zANt-!+oMR`W&#Tlor13uFitj?zf9KLJ zK+<%B+9P|?WYFqsw$w(Ks%02GMSrD#@u`mFHr~HYV^!i?_kiC*<*9%%%w15^vU>b3 zAbU)7t>UXa8?XwU1w!^mBHVX=>`O|OM?3zp!73J?_-w+TJ<%+yKlBVzTvI}nhk5nTv)9C0Cpg&I!2Hx81tld8B4x3-!88Weww7( zpc6v1dQX(cH>)rHtZgd>wyT2bYk(H`%1?5XbBf+sx;LH z5)mYyW-ysx#|I-@s5#AHRgnq~XPP(njSdf>M=Qd(e%xbUPfJHK2NvoI5dvv$IN=jg zeMh#s8Y|7Qm8qj?^E87jK~J;IE?BNzSk|P}VNhmt%N}X=eVFJ~nM~8DLsyCbvc)33 zaYxiHXaImv%?b%%G37|U)yAl2yJb4)Q_i4qI&wn*>jW;TnJD4?@wl(Iw^>ojWYi0h zI7S<-i#fERGvHMMs8br$(^WCb%#+oaY-``j3Fa~!Pkd$D*q78oe4|s9N}PM5N;a<6 z2^dW<65|_ND|(s=v~v*I1WilEAv?LzmfGXh)uTUV+nU;JC zXi{Xn5XG*WOIjEPqQxbkVIpT5-M)iqfcdlnbhR-6CaBp_rsC(+P<4>17EQ=Sb71o` z8(H*dk4P6i?$xXzGxC53naY|Q_M*A-n%&!9PW1NQRHenm&BevW1s>XLZlB<7O%c$# zUu$j6-}MTk@@v-fh~o zZDg^1=M)E56*Bfr!XXM`2t$rt(1}@MCI@jbB zs8BhlnUz{pX{gWTxw81V{={`&o9hj~s9GqUHk*Jz>qodF^ocs%&P{*$es=!=nmF4= zkg^~ENC<}*Qe%UgfSX-4<0CI|jTSjFw}x@){e^ceMr)($dKfjps-a&ok)MJ<(sbko z1QMw{y^+QC{;}#i4K%1;I^vKsqjHs+W`GAgdW`-Twl3Q*%#F76dOO84VOZf^r_`Wa zPIC1rYqKtgyFMV|u>GQ12d5;m%0-7c9FshM=}Yo)c^o2vjY@NGJ=?d#!^G$C_!@?cu8Tw zbk10X!26&9hD46k)QQ7{Fs?-t+Y=Q@;zY4RMB$2}VFFLS02+iLbi#!$9r(Zia=8^r z!b8FyMi`l5m_jN!0P(^a85$@ANcKT$q>1gz2yy^M8d7+w_(Jnc87G7QFsK*+MtP$4 zcfu|pCMBY3PaY8Tkr)G>Tp_uP+PJ53m;e(@LISJa6&J!3E@joIs$!WUVG000|JCAh z1m?3$1Qz0l85QiF0K#xBBse0-NerZ60iH5cIGN=eH4z(#R5Qp=F8MY9K#`+h|nwKFY%*-N3Qfp3!c!dijt=RE1il$26>pJsitMeVzJ zNFJhL-xsU8dWbSXBgQ7HYTDKnD3*tMjX<$YH&tqF;&NXe>@FvTabKcwI?O>bs+e#w zihSR^u5n@BHj3M2!#yQJGxD_Dq{4t?AiIxFz3V~v!n_Uk_b;3$FAnK)xBpz9b zk&hS;z?^%ai6L3`LtY0BAv42{FjEp$q@ZX_6E zZ*9Swi@0cPTd}o*XD)Z_tDb4aQ?$>*QcOG{;LHrLNY=Y`qR_3T=Wx`5G3)Mk6;v@P zvx9LW1!i*6gP*E#ZKp|ZdL&+YbEr0rimhs!rDDpC%{`6qn{KT(EaI1Z^ z+O?(182heb)slu&EojH>J}E2l0!x(4l1V>QG?G*jTu8*lzm>G(wrhIUxg35+souG9 z&NPKpFwEU<-B;9GQruYm_-_C!$UnbnA1q0A$Yk!VxG)$dF||F(Lv5x>F3Y z7^k-gL4=mbCM))mX*g3Mz>;%8grNW>fbB9^0K0ssbDP)uQ6L0m&u97-40peo)x{2x zyu2fplUnC-O+ZWA9(?&3l^OS`y^VWfHzJ%GdY0OgjH4y?S;GEUUV58#{jq3PTpNYVdV!5Y;K;dd z6dC9>pkk^(!&e_?EOXt8*88J6iuYmM8Uxu=PpT9(>_5w_&$&P7G2ns#t8vT+VCXbRv1 zi*WNF_FRzzDN_Wv6U@LQ6odr(u!_^Cfpr;M{oY;?u(zPvjL!khJxBqZ{ur^!iIGrE5SLW1Qf(hSeZ-cgGHm&E@@S1MHqrp>WZBHg6O1-gfy+S zj9V_l;~3V~<<$e~OcDt#0F_mY`mEB{Q<})TJR(XOeMY+`gUqXsV2G(v3Fc){L7)Sq zlO};APweK6t%bd6^%+K(BDH5FBa=oBsGo8*v?{ineM-qqwAyMt&IpZfHmJMwZr@wb z{-t`&73&6yG>HCH5&X9PH#eT7J67GvsQ9vuMa6CoU2sK?(%O}slP#$YEnrFFWM+Hg zhJX11asL4ByB(WP@~vyvBetzj&;J0W9r5H6MnPxU*&b*4)qPI$uI^}we^0fyww`V+ z_1vHRc9S{p%8kDku#pGzw%p z@rE@>`a9vkOeEGskOna&q60F*`u_lHtSjyH-)&Rk3?yn#w8E^P;ft=*^Bw-6qrKY) zw4eRWss|6pN$A%n+Gd}@6clMVWJ;nLKqbJS@~J{1=wG=Eo9<*~ZWCT)cWt+I+qZgt zuGMi);Xp2HPdhR5vD#kG;%sEKbcvi56(q62Sy9xg3bY*RH5eCNGFsu5pytmgvu%BS zz1aZE*Nx7IDWKWco_uD$2UDu*)Kk0wW^sa~5`=b7`y)xS z^Br@KG?nz;?Ae*=}xsGoK>j{&a@a3G}kVTh11r8O*5uW_BltQw%x%!FXL z%f1Q(S0OnPA!D8p695y0`w(`+>(PC^M^LH=bs|{caZwByCyM79)-<3= zQPGTWAk5Ah3Ms^A#vJD2oMv+DhAfZlB^6xM5!Jp?A-29lEplI~I+ZZ3El$|wpD2&y z-yry~$M(iLr71U3gTQ{>@rGX$UOz#q9}IhpN78eVW&xCJZKyTCY6+(f+#&}xbbu6N zkM4<0{e_(xm9H-7+tQ-a`o;Ac-7Rn03#-0?yr8=(=L4*BCV}{Mw+#kA5FjL()-`cd~-jET%|G3 z-xHOD^WPqTMIYc0z z96RAE6(Fb*5oB$~IU_CJUn1|vakbGpA-GSD0OJn$R|F{|#2h`4IkN6SI7^5HQz((0 z<|+Gvp!R(b2_%Iim~s%}Fh^mC{@72@2@LX*idJ3%0S!VHML|M%9uNs0$bPLV(>C@l zJZ6xBmnN{#B&Acu761lnvIERW`XH5A%sT`7p(}uDk^vH2yPyOvDxn4-`@CQncr+;B zc)*@vIG^r>WLHj<?L42Wlkd`A5t~_mS<~Fk&ic}s5ovYg( zzuRBxZQFIz0_&knuX{u%)3T+J%f^K_=I2em;?|!30BcQdtO;|2LTjL)y^eR?O>K(K z#k)6fwO|B#jQ|h|t4h!4j^%T0M{%@$y>@G-Ri%$VNWsaon^>yU>F{qRzErbW7J9+7-|8tXLehiG(u8{=DkLv6 zrixRD#~fuDe zis)GN1@HkIgT^T&0COxl(@83_(g_)XgoI&VR(tCfhY;6GQfdSVpvohmb9GwchR?E( zQ*pae*SHcXu$k6+;pw-_4{LZNSCXrAMWbo4{=K|9(XCCc$?bL9o8_rkX>~hx{n})D zwZFj~UX3AdNvr_?0zx$XE7R879@|FFg1E3N>O0K$nZBdX+IzX9d|M@&3^R_08qUbq zR&qI4zGSd?EQp-uwWakO8B7A_xX1>8u0P>e@xM^+j^8)E%XWOD8Oj2xyN~xl0btYza5#hEKiI^KCSgzm zG@by&bC5Gxj{gAciCXPV4Kou($#Y)RhJ+BLfE1G{W;hrmAV*Db!6%7?(h2~lPi(PC zLDGpy0<{o>J}9vvfPzR>RGiQ7_`#74l<*)(4&RH-05mk|(ynahbs?N14 z?L}HIYsZd~=yS_}R6&UJcX&1P6>Y6%|2LBw?IXHqV%wRnvt%!rrBY zM`KYR$meqvm1mlbxMhT+t9x@yYT zG|rb?5K!oF=4}tM-}NJ%kxU#VfLW9K7$HDt!fP+v?1)IyiqB>aVAg(}80w6q~sr_QFg6IN{?99w?s3KmwXf)B(n800SVC zNp?er04K*ZFaae31zbwZLL?T25&}cU7zpW5M9L{lCj>O9Ddh@yfN_B`CS(GDC*cVd zMtL4700e?^Bo8kSY#ksVZ*zySQWBEV0+BZ%C@{hckm@9XIW*(j-2e?z1d&D#%me+= z>)wvIrMUEt&vi?gwJn$zfoRg^_z@mN02zRd$vgi5ba%gz-A3BmJ=acLJUZgV=DoNj zNugCW0mIXKUH&*JkAW!VPTHN8+mrID)b(z*9mDFky=33-?bc`MTc%xSvGd~+?9G2%2D```m8Q%gU*r{ws;{? z;}!b$&o^}GG@zu0q6r1^{{Rz`E>-9`>gsJ*AZkeo$5*h%j8i7g6Pc&NHQX?AF!~H|TV&V!7G9Uno zpY2BO#jdAn9OPPnAXFp(ow2F2vUSB_Oag$bj%Ip8M}IVKH+$+kRkWIF*LpIIDH2>r z8d}jkPIe85$*QEYWb~uZ)NZ#Xhox^$k*|AM-krU>0n}XeYeEl-I-kq#ggXtPyT1*q zy4;4E^+dw}2--CGF@yZk2g}cwcNQ)^L)0w~-Y!8cr0M>dkDsP8^?uvqsIrS#14aV~ z+hWBMaygv7Rci=jW44e000mZ za!%gZA&LnLEYeVn$dMk?;RsDI37Iee=Crw}0+kB5=6QXP_8m1i5J(Cpkku(LsWa^` zSOFvuhGuIhVjk27OQxT{5d?ssJC(&j#3Ij8hO{E0X~dbLL!8&Oqy|FlPm>=H=s<|DOn~MLu5vh;0~2n84{T!03u{c^Kk9)f{jNhDi%OXEZ`ClrW3_h5klp`q|B-I z$^ZdJ5v!CckWiw=R^7R7J4I1-w%oqm!y(Ma%@Oq7%I1FzV}#VN#ZZ%vcNhgUtB7l` z08K$SONVSY$;H+#f@-RbSG?*SKi4*!rB>N)jcuLr0gQ6~O+%1tk273P`fo97b4wRc zMRseCjBqz2!;li`Oq9>Fwl-T|Q1073QCiaB?rA*qD?QGn@tAW?A9eAv-!n^##eHUb zoMiEXq1lQa#l-yy2+dQ50Im^*vu>8#-rYxWwgIiEODfV(O*)MT>TXQ_Kf>tDnkHd= z$s3kcSk%bVPLDy_MjEC2N)$B?DWC1x3IQMg(6e<0*GG5TZY^s}y`?6IW6t;16>P35 z+0|~B(k^J!QVE+K9PqT&*Pa5M=*z$QBF2i(t zQOh)t;z`k^Q9`LnCx6s&&h%<9q znavPJlja8UH@>I2sYtE0nf)EO#A&2?-aj2u_>-ZxF|wW4bzl9xql*Hf)gZEqe~O`O zRZgW+;^#VUc~vBpbj(nfo>K1j7fI6GS6chVi3CZRf;*sqKqxp*C~^|jAxOaHB!yuR zF_FkD@DX892qd*g=FCjv+X_I0nvC-U#K1V5vw~FPBZEpRX8-{~Z-3Nr5baaYrf3Nm z6f;;DOt(qJFbR;FK$-fjTPd=N@$R0=`uNC0>yK#o;11cFxsoLPD+cdfDYF5kH80-KyD z*JvOu7s<-j@QTdTU=RYWm_!jZ6Q-pnNx_Z^d||www3Kjby!LIQ!$?01fGsGJI7a4~ z1)?e48MuwtrndW~{J~69c;M!875@MXTJ35pG`kguyoW5>G7M0<`=Xt?fxm3b{{Z)k zW~E9X04}LF{UgG&f6{;cm&eO}ghGg} zYn|$9{%t?`jpuD1i?^uSf7EYWYEvM--qQ>av{iW3p#By|He73}dt$1Vx*c`Qu_&p@ z(z&IYWfHB83fGkD7D<3Lnq)o}upGHYs28@>*hNRZ#+?cdauFk+`J5cs=?A3tCI;!X zy?JRRuW53w93u+zVcD(A-Tf1DyVMcdUhIUC9N3%U-t5D)7dF*?)2CrXw`IQHPMB>o zQNo)Nh+%WxTCvpDv<**txp`R@{)p&HqqzS7-91l3P1_J{b`K%mgH!qSQCs?htCgSV z{{ZGYmZ7+O)9LPU4 zfb1ig{O0OE=shP^(vj15y1VqOIkuVRs^R=1I#oh}ct@8$a`hnIx}@3LygJ)!9+2Q_ zxeMLlgh>r`Ilut`L{2<1fus=QIimZ2vL2rZknPWj@gfEU6D9iA0SyQoN})r|`@}GU zLLk9Onna`m(8ZBVWKZm{(M>$UX@KpV0vZIu8BFl{VZbAZYl~Zi^ULdkjv-Kx0+bR1 zwilZ&Dq(;FR{`Zk6EPwIG&qo_!99@OC=_!ikJMq@LCR)**h4@i^A?~Nor3*rkfk^}^ z;;8_~0H9(B1UcA%g@4|FDgp5pgC{2u#~v^yRjNu%34&2O)&W3BP~v;YGX_wAb4UQs zQb=({$Q6d5Gss92gcg>P_qc%~iZOS>6D9j!3@{-mNGCqn0Flllf>vidVKfG~ggWMl zJTs8yh}8MB%ff&_&DMw0C($|njn;q?6mbN|u2G@_`NY+-J|T-nsMcAYf=3R40jEl`8m z7`I1JYes0XfjVS#sr~1k`iEtH$+$k<%IYsGN%-sPLvq+(p@7{q zYHP>}pcvt0j$_Ll5zXU|Z_yiLb95=HGS1plPe~FwtEt*LgJdg~mw}k3Nl$K=!~?WS zKxYzED(nd@8R6d%%wkMRDv4K87iGFlERx!XhP6mStB~UjlS&gXWKU)=i-f=`MsQj{ zQ<6z03gV}{YKaw%C8|auXuMByLzY@f<0;Do!VwNHM@~fI5p!R>u+@`7;8Y~ID#8G$ zToe$P=Y$0X5<~d?3OF4wcT-_x@&&upSg#Q2(b-E)>MD`f-XUW~*T5kr?#55Ax zjSe`6r8=qJ=2J20Rx}VC)8!#Mk`h=`p(*J4nP86?P9O#PAOf?F8}n!8>+Q{cm+32l zE?P}J$8(~8my2I-Z1MiKRV$C={2!UkO(F={{K@&nOJl2BVd+;>x!stw?ynuqM%m&< zn*_yshWt(bT^DjcTlLe9e^cjIUe&jv%2!s_=9^K~1qQkGB!JWv4MwVugG_Tx8d}-4 zyfK(KanJ2TfmWa?*eClWGFywS=gmI6x_k9+)tVfID6*9#2t2wN*90pXI{jDFS1fyO zH?6|6YN)K*msN{g4`#hT!=^N$x}*U!z%PN6DV!#`Hhs`kH!?=mK#ZmVzz{h1d{LLQ z@4K$SxX|79Ylm9iQ6vhD!R?VW>aeS7QgU-K(XSe1sicg6GhY~5kqS++2!VUxsH%F` z<dQAN3-n5ta>}FtUAwRx7@7JwcYu5wyLh7Sb3Q5yYbR@#>gp9?Ir7O zqNUSaIeog+c9+wpTR$AC$)@-PHd8@JApnSqb}K$R0^}ienibAixcC)i7kqOt`rr5`Y7l z@stMv*tm`d#t?^cxa1V%5&{t}$??ZHAyP#`Obol=Na2EDPEf!_ppdAf1@SOLLK0LU z`VM+wV~oOJGJsIXU=E;rPZ$t~1Hhh9j3$YHxa}F;t#C@`N113a%0YX*sDq@WdbhXe0ulY6gAKTpZw2 z%1Kt?fX!fs06`#1YEVxI1xQd;#z%y~LktF%5RNP%rcEXbsp%zzNKY+8_(r1umWSvP zG7k2h!Nv=VfW)Ct3UHPw!U2s9Dw0%EQ6-PI8ovlAsWM85!_fJND5S3(KB$ev5Q{!P zx)2Fak-|hb4$NV|@%DE^B&4}~IOPUK4le!BfO$gjgb+9c!WyKgmZU^8&I$&|DI@~n zB7~efA--T*GawYkMgS2COG*@);VkivztNY!uIZ~X;tg9u`)48l0ET%z;73!ZI`OGn`{r2V8&$zB$#v>C-b8 zb8^rvOp#VNXH(dl4c4L38p?|1wNT=5j^B6#o1{%-2k(wg^AWmkEyREz&Mmax%%@dB zNiUZs^H`#hsLpblrvnZyEkHpEIe|cZ^N7$Y83je*2~hUsHI1h@c`AZr4=9|V0IEvQ z6c83bkfRW?oB|}L%DBJ-oEqo5OxmYOB@`(|fV{eKm?@D+B}%ASNFS>R9%lfBa>poQ z2_yhQ5+EHAwJ3OeqpU1f;vjgaRa~AOXj|17;5aG#SB|LWjTW zfK;Ara0B)bz%F{ZKmiK(LuOBQaDf9kgEJ0sGI0WCM1vvU+Xx1YLGi4{XT81Rqp4H8_l+;3qVtB|hi?+H5<8O|N&fyrf7icw@BY*ANJg zVC)y|+&1fbru=P3ejria#|KA?0u)xLB^M7F$4jB}Qu}4-iaQS3Ro~Sv^#|QFkZq`v zX}Gk1{l#5Bm;V3{`M>bt)7(q!)3*ZU^K<62tc`w?>3ZpRI$N&cPZ|T_x1jKU_XcgN zQ~eJmKhFMF0rIw+K+*z8su|+})6_{-R~I-ET%~h{NC1jP0OiI440ougS`|QE!`VK_ zCZS$o%&2n3WME>2&LrRGv(0Sd4H3xEL-AjM{)Bxw#UL!{3fKV*nXfu>2p zaVHp&q<~2%CJ%%v08c3kJTSvKtRe{jPDrHT2ox#i0z=;L z9Y0o9jGyecAL5P^_0uz<4EqFpPBq8I36)5JA z2q0o7;ldNl29jC}ijK*BaE#Caq=7Ze!*t?X>^v7Zb`2nfU)Vy_dI$nsi^WD5K`L-g z9n#|g%IT5`1fe`KfRa*3amhWaFp!F-Dg>sBKIn1~q~<0efSTbw*ra6hUfKwT$hJ`%(KnezO;6HQ%6x0(u z&ji92j#5)jC|pRx5IwM(a#<7^9N|eqg#;aBV5*BK0BYcfu9V3${>Vka$i*e20p74? z0CFPWqD2;MhLT8YNyD%n9xx0_SkMl51Cc$R5Ef;p&48Liil8gYFw%h!aY>{!fv1<7 z>4>lr>1r8C2<_ryFK)c6w%*(B7W~yL-pb*KiW+&^b7q}Bfblc&|xpZR|bDCZy7+J$wxy1mCF;Lv z$@DfYvcGtU-E^@(4|z*jnX0GN00qO@;|PGNt0^bG68p0ewC9A0DVKh5nhetch$j`{ z2!IhMAWwRc1K~sqDV3HyAOMLJKnWiaL<1;OP>K?BJQ5qkr+#qeg$Is4$OHrfB^a(5 zK?I)O-v|f^LBV`r;L-q`iogPEON&|r@BwAou(HI0PBWKu8OWYEm;`x{U~+~6P+YQ- z8L0&ha)mNUArz?qOBe(RsQ%~xa`(*lLIKD^oEHHINlpbqcV`Hc3*m$;Fq({D02zUp z7%U*ICDR{z2~6aGAutO{g1kbR zssJZqG=N+ha0#96ZrBbV@+j5fUHH94fxpngY0E#O6R@Za9Zg$&Uu;;mcg7DSS2Q^3k z0O?Z>cPx6Fq_#_YN;IfjQ}nH^SSe9&4Knef~nvj z0D%YuOk$>8#Bqn|#lR5N7@o{hBm=ayzEyAyYPgKt&`W=X`r6lgP~lFZ@`6ez(g`SY zFwuiZmk^SMqA>)B3WWw9FboeN*q|%J4&{YW0~&`SMiW%Q0EokyU!cdd+}98A!w`d16(|P*JW0TRnb5!l zoJi6}7?~V2j6aGHG749)9XQJf1ty3{QfI`#0wiD%cLbI{??MBaT2e-07BUFX;*L29 zm>jF9fB`B%FaQX|j7}hZa88oYK$pcZg9IRB-wEZ~kc4(6L6G8*9DtA#QU-JO!s;-C zGB6I51v*g4aY#gHsQt+-opuWf#fr~lNuyMdo%V%45NWf=}CQz*r(Ovk!8?uS!T z4zL)4aX`J8#}ezh;H~tH0Q)uDH2_NQ;-1(FMW01x;A5zsf9Yea*U0?N8Sg9fy zbW7P6an>#LMO>Flbn7Eh2n3=I&e#Z3hq@QLx^PEtHh$RFVH1fiDa6hisO3x{-aXJ8 zjEVaF@BooO?K2-lDbIr81=DeHIQ>uoAj)Sr072NHAa-F0q>e`!WDFfBhY-Fp?D2!C zLJr0Ki~s{XCOycoP&heE=aPZ}5Rp`wWD4+!01azi;?fE7h182YaDWI%E>sB!&PkA8 zY(A!)ZZwp!PK|h>8T(;^U9*&+5HyYi5rjAu8d6^p98ef)xFI2UE^wq(6SxQfAWBLk zp>aTv@#e%GMMIBfFcn0Iz-mRyz5yhbtiT9^AgZ4DV*n@@0oe{m0Iml}kUjW7AvBSU z=aP0n01OH)W^f23s*#8j?0`zSBxVSobQDlQSx*cBg6A5OaT<$2kN`;%02`;93E}LA zO;LC%${GPujnowY#1|7J?T5@LCQ}8x6+dGD1RxC{qt;`%X0Y~&A_Ay~3>G76RWrl3 zC2JfSq)Px77?Q|BIpRhEeWDy()KF0X0%$vDlqUcYc|s|gcTC4Y=njLcx>l#W?sSN< zU=FjldJ|mY+K*}SP-mH+@g4Ud$Z9R%ZT?yGrF%d(d%7;AcXr0px1{0B$|Vl6J{MKx z{4|>1w;qDqEn97D#v%|0(Hbp}WHj8`^@^2|F=_)yL6le9)SQ~77J(3A5!=EEL@HEB4$;_5CL%Vd24E_K;vxE8iKMAXI3_R%MgjurRPG)E08LX; z0X?`v7)T+(c;^T;QG8My8%&TMO)!c|YAP@QOU(e69x7CU!T|$G)gy$_s|`ab{@yV5 zsuI+kfX}rcl|hdXKW^w~2DD2Uzi8EXv zU?P~}rz~V7urQIw6$ke~QcrFv2e|e`MiA2EmZCX}g!?AAVF0e427dSw=248&8~~U& z_!N=NS)npeC7*2*s)9X#pf> z+GCqjszgwoMDpgm265**+U>7*ad6OrwP(C02T>i9A4AK)?WI1pHwHRU{tug476BcZA z@~@)rXttwls8(e_qM)|7q|yk)ex9AbE502toSM-l-&=tZC87x&8@X7drD@j%j#TFYB%BaKT<$*{EX%Ex+&E<2cupBSzOcBC+Xd| z006{_w4Vt&=)#>OaiEjch{*&XtAYbOBd`Ae{{S!PN{;-yoyL(UK=mj+qFuC_a)=-D zcEX`cUGB9!^whRxL;fb#r4sOAwVazyNQ;Ew5Im?CDaTN)@>=}~v)-+0XZh5sQ}+61 z;x*O2QFUd%^6OaF2MQgC`9oK3Cf62fh7kM+zYtHy5r@SS-ilb{1h=ke1206QN9 zx|Tzs+H6o#9kYzc)9p7+eK9dU5w}o^u@{acTB8(TU+xs`X%Y;==L-99WeQ~vd5&Av zJx=PcbqjYa%Sv};160hEBUWc`9Zy1#t#@$nC{PY{LbDBQ);#ySZnd|~%I|Z!DN^bB zgK;gU(WaF#_I);KY;XF*b*x!V!&&t!2Gp#{NucsHr`uKj7^yo*ASNDZtotHSPL&Id zEII&^R2g6=l3|P>aVsMw>>O4IC0tQtpG2j`7*x0`vE^ty5K1IgJ&qxwXycGTxRoVm zj5LtCPb#AEKs;d=@|Kg4&p1GpIAP2%)r7wECNYOk#f$|2qDzA3+Y$oQLJ4pQB7i$o z&TxxKBg#Qggz&_b!|udD0b809W&&ja6yj8z0OJX*ILIoQl*1@+DC`hQSnv=4N;t@H zn9U%4l{5C|0$sob9v|HZkQ(DO01)m6v?nw=fVosV%s7cdOwSC-%_D7FruW-+&^H!t zq%SBk>f2MB&qksAr}>8~lL#lJ_qPpBqQp3dR;CF505@bCv@eenku_Lbl$MPnNUd{; zJ>(!0@Pav7b9pi)WCscM#^?Uj-iRmArh7eFYxKb#F}{(@L4~1z)u$yNi?Rx&OY$7A z`V;mVazJQI4{XWpoFk>~#7xYedPd{^*WW^cL8^#?tv~!wuR2?P+jh2ZH@1PBZCb|F zwf#modfH307O~pfCcm5R!>JpbTlV9+<+(*=ilL@Yr@9g+Y4%IL2?24L$jLkVVW+-* zivgT=w(3&FRFRmLIB|yHjV?l+zq$mM4CZl!xSY@-62Q$?6BwpLK=_7To)C!T)QOg? zrTy@wC7P9y?J|@#+C~f9umF71X`UIZhXDv26PPJf&ph};Nl>VXiAW_}RD@MBTucHG zbee@4Jfl2e4O(>oNF1t?1gXXXoO4_<5`xqKkw3fOivS1=Qyw2o5Ds1`IBu_mg|8C>QI5MDR}k0NoNck*@Yp zWZzZ7n3isG2w_Z4@zYi{>FpG+EM9>rC*^Die~()b))))mvST*|0sAzP-f@>eNAKZCguAhWeb(Mv?2bm;Ppm zrZT>!hd7NTNO78iQBMIEJ=3UmzKpiGtADxw09NWM!S1uNd0?*R)dSPEp>0Tdv@JDK zXY+OXUlmxPS<|?sZ$^#T6>b{VitlI*YpNgSpaViHoed?rso_Ddgx^uxw*l#~l$ht# z1sGlif=d>iRX>$i^QmZKx!vnFCZ|T_h+l5v3mH`aRi^j)Y(1LZG_{_1kZzPsT9sQ?h}(|rT2uBGZ#)UJDIywwssytOJ- zdw&x?%ne8go$E2rr(Tg|M#$2ba3*0#;Ye%iF%f6L5tM=ObUxnVlP+y)w^Qj_vYk3= z&`ZTIPNxQxX1ASMb#gb8;EA7l!)Er=f1vS5)AU%g$KO^a>tl-gbNKj{c_Ke@qID=3~GiwJGIMg!!2-#rER{K?w=t{gK}KkEg79 ziplo3Kr*OT+uSuC>MtfheX4GgDmA>Z!mlkWZTFWJR|slb)O8AlgO$<0pkf~a2kjpp8TQEh_vc6AQLc7ND>GW`$ZIX&nP9K zc}#)=gqirjac~1J-qgYX6HpQpH3z;VXe9fDu~=&ZR~aPFfQJrUE<8-QWep(`NQoTf zn*GvZ+4Adl8*fkxSD*TQ$-aSEB~X7M$G6PSX>tz z_FmKJ)o1Y5Yug!{=IQ8Aa1wDq5aOdKg2=VTtkv~a{`7Ud71ZfUbITa2(%T-}di$>A zO*?CLy1m6r5sVt8+wjKL%HMDOK(D-BQl(Yy<)>A{!B@)~Z=2ioosQjM(-s82&G{3sIPwCC)16~M_R=s*DNSa2!Wd<-Wf zkV;9~bASZlq$_5U0OL8pa3Fx}Wq|`2L>B_=mOzi}gyPXG6ETEEi~y?Sc(05uAfrU$R1M6MOr<1<>Jy;L zBINMSI+x4umwQ(K0J;8?cNJ?;7VoY@DjHz9ucpt3n>e=hx5~ba*tWZB`*n>^S@p{* z7hKkIv{9<`=X;nY!SKXXtCGyr_*Yf?!3#@4 zK11xT5&*#srqQUAB4jwm%VpShi{7BvHtN5rsY7wB@my3}b^Z|dreN}8Rd5JLTyx-6Ft@|%VYs5_bH zizEKy<9xSuTAeqjX`Gar{euKF5N1k^M3Izj`l^GS@_yays>*Dqy!w?I_XfG`TJ*g- zVHNMx=D$mS(q%{dF_W}(&e3)A3wGFUDOG;mYiV~{jlb$EH7Zf5KvQ`f=N3W>u?YP0YTbMy16^`gOH2uPk!3`EDzDq8xiOySj0v6i%k zQIeCCCZ3d(YQJpcvr-jP2MjUB0n`Z@<_<|6&vE&A(Ui8^%9h(>g<3Xr?&((b>zC>+ zIL?*6QnT)Lpz%u&f(evsQyYN*{{Xd5%@VbKHzc5wtF1f{r1&L~eVo@gvzh5Jyw1k`7`mCwlboXfUlZMeM} zUfAw6;1Va`IKtD)PL=K8xG91dxhWGtj(He5S`p2j1qh3ZhLAoIqYQ=^!+=0Z!wE?C zLy~$)nIUjV&nOs{jZy)7l)xcK1mleV0CXgB7bJpl@PHHv<$~rR!6ZP=MGh(74FD8j z@q3dPP~~nqJSUDEVL%7}*S3Q>;SklvS$M;!{{SqyN4g=b?qcGLkoHG2L#?UJersHE zZB}58XH~4>+#0}r?v81w1ed>$7`uGVxb=7HHo%8|p51{)MecjgwWYC0YKa0uOR_o^ z)gY12zm3Mbgmi*Iko&!pNGFs4*8)mRDg~aNXbm8V5)cv;=6iF1a$j=9%7+JOz@Xs( zf&|Kp7qS+l2nqrM_(07vPumz_pmBDUT}3g8DAqRh=~igf={>IvCQ8=_36H}O%?T<3 zg#&~oK_vshFn|^X>Ay1W3yZ34;$)W=RhpGgGa_1lFs++Py|pSgn{_Lq(|Fq&%%|%H z&qK{WOG{scM^keu=>Gsy&lDnnqvekne#$!R}{mx9dKe zkvoMZGz3ECk8pOcbZl+9pJTZzM{=VglXz%O3gnIeOpNj4KV$+MjGVonz8jEk>^&<_ zaE``-z9vy0=s=3EJk2<%y3o0MU>fe`Sn{7k8G(ZSHE(B@xuBxu5J9 zTZg%HJswM51DZ7hoiYfNHNnjqwCPZ1ikS1`ckSh;Q@0D3rW)!Rp6a@zDV$vsM|BII+~FU zZR1_|*r(|+T)3?Jiq$SEr>RlSd(a@%=NCH_!oD}sc_7X5HrG8NMk5CB15Oep;Hg%Yp%&?TYpzZfTDXPOtv}RyKWY)IN<$w5<<425JvEIGsP`Y;p~= za=O}gD{a!z-=A#pQHKpc$?SjJdkdQAvew+2eRERhQS_@FO$KDN_v01abW6O^W{(2T z)7KkKU54uKcDii`T575s<#X#ZLDfthT&Gr|6z1p>&EC{789O(cR9 zWWAm77ns0wtkzSQ3O%r!BdS>`NrIvVpoIBR4Etvr3vIRagKAfmJQ&0vm02DdqB5F- ztLakf9DA4UD|VvW`Vot5L{_NQz1moO&H>ITFo?mPn6+)Rw)=L5fbi1kkxHp2gu@Oo zbLndqHipoQ(g>#TCa7P#BTw+s5p>*3g^Sn;O>`PDm^G=4Ba_bcKb6R}$57q#HoDjE zJ*RVl#$0(-ZIt)8)vT#4m=(T(>mHrjH%gT^3x`|LHsy_|`c+X&+ThTPCkWkj{{UU| zw(quETkW@wx~kgc?qxbvWYb2gH7l40nej^M%FnC0Qg9|#md$V^nRo+VmmW;)dW`Nk+>~KGsd;`BSUPWVG`Ib?aI0l|N`q=r zeQNHst);9E6rA@R4n}&P6Ju@DRByKJ^Q<>5!)?8{w!9mottwQtula(tQ0gI^isJT; zR>pqSbzZK`X;toaw{7*xsh)>f+${d}M@Apa$~QGP*Hmp*vfkdx-lg3)nvV`;O4Vsn zf2PkbrBTCMsrjkv^*Ljq%$%h=E{eFN{sYNzp4WXmu16p7GXrY7yW97^pSj#Ny~^5! zwbQFP)hsI@XX&d`k!v}*>NAzR&4=vOuJ)bFXt>=r&Bn^o)Z4P9#akN9roz`ZUZbIB z%E6Swcq+*G4~Fj{T<~^vbDAC3O5w(Yt#>{MH+&RhSSs@=23gv+Kq~q;keXg zA#ZiH-%@?uHq*0xTY8Pw@)%i9NgT!cnWQ!?A1#H&I(IIp9?OHe`gPjp685-TRbXva zyf;hfvAo}oHGN@HxAkrno%=u2iyy~7yI`BeY~<2t(=gZOR<5-C+&HR+OmK$%5?)mzQCYVh6_I>Jr-`pT>&t2*5wsmhM3*%&o`={ws?2UNb zj<(sPInSOv!IF`*pTB>sY_82hlFU8Ky?kY({*S+zp1-nAiobn4ckPy^mf`4a74sTu zTo~{0hw+>z&D_l}Vt@60lc|WUz1MvvYVZ3V9jloP$)#Vj8mxEf8mKFM_$_b@UT-Ec zh)!MTk)52MtE`JL7XyN;z`!fa-I&SN9k+)J&6@Upz3^=DWx2*mamI5)qol_A=Bd-Y z!H;ef^BD{*UlwRH6W526l$#u{d`M z5uqkg`%o@I%oClkkUiwNVj2<*(>EiW8BVb z+d@l2c3rcRxHRdi&-0%4dah+jyVmLMx$xKNlOB`1sG}jE?x7P}MZS(*u@`PUJmW=M z+#8ctI_BH;O6^}ICL?fP>IWyS>CDhx-@fk+s1~ko9nN+nA?OpS;Y4}7P7fFk{|Bzq zrV0pM`<-1BuW)!WJ<@qtHr=!zB-Ux{0kHAa@={b-iW$cxnd4NLgug&n(@MqZp>?1r zvh+#Y16?3M>VTbpVUX5(ud3M4zXP6i`4sZsY~{j?QCXIMgZet)LP*0Ae-x5k(P_Q= zCmDc^^&q+sxW*Gx{JFZN!QhT{`3!nqBxl!X+37-j}oI?Kj?Da9-3 zzQ=i}usu~#N8-;MNoI}0U;x1gV@U_3iL^w`*r@Y&z}W+Z1($Gvr~-o?()uC$sxD(G z9g^0RR3tPI5y>ZE+lAoy3EX^8P&VO-l1ja<^AG15TV5`idiDIhh)Cx^VBT1Z{iAc$ zzO?q0JP-^pa*be26k2B<(YgSDgjF;`#?+2%df~}bHN}smQ~cqx20?sjpNi51%0*cG zafq`?PQ7-nCpKh$Io_{vEyD93#*NpPTE4t}HCK%b9;M;yggG8-9MpG3Ej0FUKio9G z{FWN^V!R?GG2dP;024{dv42aTo7&j&{KtejiRsW3fm=a^pI%d?_F-iy6MnwTrZGE8wDSsbtFOEK5QD~p)@`BdCg_7 zkzYown)|mYo@BMqcjL10MBC5i){>IfqM5UL=`WB~FFsVeayq4pT@{Pcy6?6>%iR0% zi$c=}5!ur1EJkgfF$iwqS=6fdrAFwY!AG)BHlFV5lT3XpclL>fxW`~ei!+!sohr|) zmfKSs?>!H{d^RN-7Z91(nG;FhijP)_YTkdnyyS6qE;(lBk(m9=L&`g@MnpfUUJ7qM zo}4dp60|bDUh()+OM(aB2}V>=By92}_4IT<&8V2_=$kgKX?Ks@%IhfUj|&_Bc9ZD4 zXZ7J#L~>({$H~_}TFfPt6?T;Q?fuM{7akji2(#vNzQ&}?mo2rwi^3_Ieso<)j-c6Y zOA-9mK7i){oA|aXX^RB>gw2?W+Cj(N#CxN^eQt|rdkHfk!Z(o*r>x$HzSkLE4+_ZJ zbLaj)aI@kcR*f&0jo5cD53pMgs~pZ^Zb&Z$^gmdA5_6=rY3jINPqW+g>Usin;`hCO zuOFHZ6Khf!Hf!TKL~c^y5IXwmc0`ukZ%SL!>yt-Vd%V>h9?cxO8RPf%;oXQ#k=HLj z5W6pO5t4r+koxRiJt-USfgr-eKl65#L zlyS?umoyKQ?r}jHE_ow{XF(0hH z`OpT|UshHj{_3`E$SBQIOJ&*p}Ex7GgxBaff-&A-C?r*<=y?B2Xg=n!Pz+4acOB<`Qh@{71 zQ*sd)EnrX>aDoWJcf3qkPrP%?!q0C++jSjJz?hSwfZWYS64s+(cqxHnS&ayRj)>M# zE=!ot7JOa67OrKQug#+Ag`|lM@`5-8+k69&IAxHHCs}USnxHfD%_VDyh)(oq5`%SG z+;G>5I)>;0AV-vWpnX>AJW|=Fbxh*A8;?};Q&Y3%aDq#UlVB<#%8ePS%W@~mj=oxn zuZsX{yrlOjOMzKPX9~8Rs&jU7NLg?DKzh(8t_E}pXWGLmofjo2Q8tf3R7}RQ)l&11 z#u70&IPR&ocRH1sxbsdBq`xNQOOg~=GLGS8=`b=gDbhDo)=gh>8IXP@O@iHPRk~z@ z%QZ>oJjp-;nnPX?z6d%X#n_MHgl05)7_gH5A#8T(&XrnN+^R_$E2+8CyC#O7Raqaz zlscI%N?B`q=bz}K5RRp!sH9ra(ib*WsOnD_y~-dU_=8Hvit9-Hjo0vQfl@>o#Pa)} z4^Z_$EG5!G#-IS^a?|Q(=EuTpg%_giMM|iafU2ee~ZElzp`Lb}!Z$sV3=yEvTjy--obJu+s!&Cmh z(f7r#M>V>t`zt$*Hup28CSKh-5nvf{Cy#c2y|AhsUWXj(RAqaPRr)mWnx>Io^uP7) zF`v=+7F!mw%i4pzsVB-ibWsglx6R)guUVHPj29v7=tbK5e!4hovzd{l(Zg~IIw(iy z$X}JZhS1Vf=wQDBJ(%R>u(hrL;WxEqdKms_L z17bW_=V=zTD5SiAQ`3aRhU)#{JpfEjU(3F*6Rogsrzg0^C+m!aNOizI-&YAAij>l% zb<0nwIP^n@gY&-IUE#)yoej(t6H`Fk_COG-WzdKu>--Mv1evigAyjEc@m0euEMu*a zfG&Q#v2d*0aykrhSGN_y1Hmt;~!zdd9MD(JA(rH?kGKRMo_ z19M5))vffa0+rChB^2?w3Q30f;w{K#5R?f?&RfyxRDp7i;9@~5$-i~nUAfg$ z-zbRm*knfjfn>dB7G&sZIUK;7KM~qC_`CM!w1~U*%|`yhb5>n$@f$qiF*yQVt_~(` z0dl+F_Tx7y(+OqGodvE(Ysl`AND_^B@nI`qc%eRj(0F@b&_c!>fFShA5~K3mGs9)U zN+O=FWoe)Y2T4Yx#V2!!V6bZ7j;!Fh@2FOukwK~;iE8^tz>ZSVz0X>QsLum)o&ncP-7LrU?pJXA{0amRNzTycvtRG$Abz*Yd2o5`9 zTu`!4&zJ;TAs0KT0Kfzy5rem|uenaz9`*)ENV=(|2$gyqx-bRr1#ghKmMm0a8Z%W9 z41VghU5&-FQG`n5ut0KaFP#@6sNMMrpVWEGl1Q*CJ1ShV>Ax7?t5nE$;^-UmpGPVe zN;}uyb&575T)#F@M=nBchrW2=xyV)J|DQK)S>X&|D-ZeLzEfa_f8;Z>G5!~*k6m)w zWuhn`Mj6B6I7RIGIT;MEE!20*XD8>`dqI$M)`}**8b3{;|3pH#05pftkR;2P$3Xq_ z{-2zWy_eHwdTiZJYPI+~`+Yie=(s7I-bW|x_13zB(VScJ=?Fg*Dao*eZfCb|{yF^X zpsso~i1z2_30G|BD|yGnnvi)wW*=!hSD;8}G*xjTe-5@CvtQ>r544xr;KOBcY)TJ^ zM`q*YL-6tYlzLUjaQc}((}Yz*D`Aud#<?4~5KkXa-y1TtI~Eh+M`&QuSMxKLLJ zO2z2T<2gzcDXq`isQ7aN-Z`*;tPSqzv~sc(SUhwh*{6Q zB%TSHwXU=_&rvO1=RmQF7_B*DL`cR@6@p#_Qx1v@;}f!V6FNag^+WoCb8-3*mjsgO z3JCx9Ak)Q>(Tj77CZXg?xL1zffQbNAo(NVq%&p=1o86~%6TP?qIm*6h2yUWc(0#k$ zM;F<2I73x22VO@8Z`QWEC?H7M{>sMi#(YLPDzh9Uc{AYMrR<$EGJ_W?n59rngb}(s zXzhxV-Kp)H&sX%$a6P6@eQR4B&h-zN?J@3MiC`ab{~MMxRX43=dCA!8lg^o`RxQ}O zSN}{}9T2s}C*K=rnttsr<94e*gz0`I&Rfg&=y}h;qo-ptCwwN?&gGr@c$ZS^S5oMr zXcYsM;1~X?4ZlfW8%=sR#ilk0-_T3^WJ?YXhF_@q<(IYe`eW3L+18$=XT$3zW?M(Y znqQYw?;37D`<8u1?Vyb3#qeW~%$5|3M;33K93Ec&H{9B3?|*>a#E#~~?bN^HRA*>7=7ODw3OSgGxE8y&uw|IgCsO^QbFHvEMf#J#-Fjw(Ix} zgMBSUAnB(UiZJOg3L>Pn199Ajam)s1kr!35KU%m>@&waZPO5`-k$*ne$CJ3IE3!-_ z@hwt*fJ)gvy3P?3ZiK#%#4u$Z`HwBI>|rPd;jj$Iaq6p5`rs9ukFaV%0a#fqoD^mDZ=O5 z1YRRl>bWaM&O!Xje6iw!klZ%-xdIGH+SIp;tf?IX>YyI@zR5VscrX-4D`;tbTw%6+ zPKR#?m%*E+ACq-2g3?`DkcSvRn!FxD)csVOs(~8!ve%4jC|xp0TjikoVlh9c7!*%O zO1+Ed059bACJ4Y^q;qjfP#KgPiF#aNyBdIU9SLb2%(NsIzSHVq?vkQB#$%r9)<9r# zl&Mrxu6Q?uoIPQXpmQPG3|K;500iqi6n4>_WB;83#fHRFBxGtbx^fJ!wK$u;uMlLc zf-jX}JqBcVbrNdbZwmeW^uV&wWxl*3GS?itBR;SI%BE87j`?S@RorzQ>=s^3q4xBU2lgicnsZ6n=z%5-}-Fg6U-iA5W z^X~zKVlJ9j0r_wIJrwB&M+f`|ymTx-FRUq%L9g!nF!g9)s>W{L8`>VNK!pSJCl%BRJf!+^M zjPE6Sx$3ZNMZ8G_mwIlxs<^U2({v-08mok`VS5e&tZ@hiVcVzj48!6V6(^NNqv zfz2up@IG@R^GPSfxH*Dl&8`G0R0a$GF$8@`SoxG__Wp=$CPRElY{g%Ov3G3-W#SR4 zeX!v-M8kkxPtDzKv7|9y7Q1F(!0a^Wlj_gTzp8lYchAx4GS$h2Ti4)SFmyYTic!Q(?b9fZGVgT+M$+bJ2z;*AE z$81xJD*x24-Y;?X{p12>|6UjhHhiFUX)~^w-7J{RGb`whwfXaYBw}QO)yc`qemG`f?ZSDE`cyHW|d-pFq zI`4DX=Im*a!;7oU^}4TY+kBqUYnMdR4+^ZJ$LqgKP<-Bf9>dwU_nxh!FWwJ5^6*^} z^Zn_e=@VmH2X($mkJkSB_553$`ssH!sur$EELgjrZ|IJ_?caV+ug}YB$?0WgRqlDc z9OjhU=y#PjBx2jcvF5e&a@*SdqL-((OHT-*)h-v`VsE+Js`+t$e9R+vU;z7j&xzUN zsdutHEE;2IZ%WRdb~=CB`>&Y&Ci8F0{i@HCbm}IHPbPcR{w@zFQUCq&W~+BWMp|aA z+ztZ=UESa%Coj8#r}-bhbku42%}!zL$E5UJ-&%9gx%f!luenBPwq;jqYpjyLkFib0~Hh~Axxarf3i4W7y{zjqH32Z67=hAV+hLcz#O6% zUlxi1CtZ71IO2*4tNR?qLWx?Z#zi`6%kqK7Gvv?7d6e#*r|_Jn!B*#m&59*e#Q;<8 zBE6e0W3ai-0aYBtKtHV}yj#$Til4T|A+?f^jQF=Ew3fXDg3*HD_DE-t9GIS& z^r#{G<@AB%ybjx<&ZxRPyT{5lcy%lGrf8$fW%mTo*{;ZGb(;a1YM@dsLM(QvO2!9z zC^$*dzotB?AeJFoVd!D;=cIwMeV;CY<+RK=tfy|L2*W9=gLITb$W|GtG|@%f&QczW z1T=zE1#^V8o%Uv*X|wp{Q9lI>^rI7J%=P)mlEGZGj7-Nhv#u#IaF~&Ub z`}JAfg#UT3ZorwgrlOmy&ZeT3tmdYokF5TtqKT|6OhpG-yO@e{SsR&(T3LIUib7f2 znTje|JDQ3TSx-%UeE*C;Ugi2tTyNdf{F~uOlZ7=kb3iOQS&ODRAd+PmaMjNnZUjJ9 zlUR+5Lc--_C1A-ayjS3rUX``c4B&cG-3=31RsxEw^1%(M0Zvu|%SvEb2`nptWhJnz z1eTScX5Ns^HuHuz#(W(4^W5BYWUP<8T=~wzvHVw=eSveE- z@vmU*u$W9x_LNA6ouqO*$_do4iT5CcyX{v$JfuD{;{F;oStf}>w4t0GG0jzx?!IRz zQ-)W1Qwl#ZaW_~0ufxGg%Bmjxm&rPqx3c N002ovPDHLkV1i6h2Xz1d literal 0 HcmV?d00001 diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 865b3656dbbdf..9d56c1169dec2 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -496,35 +496,20 @@ def run_word_language_model(self, model_name): # Only support CPU version, since tracer is not working in GPU RNN. self.run_test(model, (x, model.hidden)) - def get_image_from_url(self, url, size=(300, 200)): + def get_image(self, rel_path: str, size: Tuple[int, int]) -> torch.Tensor: import os - from urllib.parse import urlsplit - from urllib import request from PIL import Image from torchvision import transforms - from torch._utils_internal import get_writable_path - filename = os.path.basename(urlsplit(url)[2]) - data_dir = get_writable_path(os.path.join(os.path.dirname(__file__))) - path = os.path.join(data_dir, filename) - data = request.urlopen(url, timeout=15).read() - with open(path, "wb") as f: - f.write(data) - image = Image.open(path).convert("RGB") + data_dir = os.path.join(os.path.dirname(__file__), "assets") + path = os.path.join(data_dir, *rel_path.split("/")) + image = Image.open(path).convert("RGB").resize(size, Image.BILINEAR) - image = image.resize(size, Image.BILINEAR) + return transforms.ToTensor()(image) - to_tensor = transforms.ToTensor() - return to_tensor(image) - - def get_test_images(self): - image_url = "http://farm3.staticflickr.com/2469/3915380994_2e611b1779_z.jpg" - image = self.get_image_from_url(url=image_url, size=(100, 320)) - - image_url2 = "https://pytorch.org/tutorials/_static/img/tv_tutorial/tv_image05.png" - image2 = self.get_image_from_url(url=image_url2, size=(250, 380)) - - return [image], [image2] + def get_test_images(self) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + return ([self.get_image("grace_hopper_517x606.jpg", (100, 320))], + [self.get_image("rgb_pytorch.png", (250, 380))]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() # Faster RCNN model is not scriptable From 98449f5bbaed2ae8b2ac5f3e4d1cbe6445011961 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Mon, 23 Aug 2021 09:28:21 -0700 Subject: [PATCH 133/530] hotfix clone issue (#63770) Summary: This was discovered during https://github.com/pytorch/pytorch/issues/63408. For some reason only this checkout action is not correctly set fetch-depth Pull Request resolved: https://github.com/pytorch/pytorch/pull/63770 Reviewed By: malfet, janeyx99 Differential Revision: D30486110 Pulled By: walterddr fbshipit-source-id: a67395cca2487407ed0d49c8c89587935ca5f212 --- .github/templates/linux_ci_workflow.yml.j2 | 1 + .github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 1 + .github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml | 1 + .github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 1 + .github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml | 1 + .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml | 1 + .../generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml | 1 + 7 files changed, 7 insertions(+) diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index bceeba51f20bc..9b4ba87194065 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -316,6 +316,7 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v2 with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive - name: Pull docker image run: | diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 1e1aec057c7d4..7e6006ad300de 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -284,6 +284,7 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v2 with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive - name: Pull docker image run: | diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index 28180e3e98727..2cd316b2b8674 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -284,6 +284,7 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v2 with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive - name: Pull docker image run: | diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index ddb1522962dff..a88191469ebde 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -284,6 +284,7 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v2 with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive - name: Pull docker image run: | diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml index fb6d83a0f2432..264553f1877ee 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -284,6 +284,7 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v2 with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive - name: Pull docker image run: | diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index eabc42408fa91..7b100e6d1294d 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -284,6 +284,7 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v2 with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive - name: Pull docker image run: | diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml index 47ac9f73d422f..9d7da989e8b9a 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -282,6 +282,7 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v2 with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive - name: Pull docker image run: | From db1b27fa8db52402d23ff2f357856b1a5d08d258 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Mon, 23 Aug 2021 09:44:09 -0700 Subject: [PATCH 134/530] fix mpi ssh runtime error (#63580) Summary: should fix https://github.com/pytorch/pytorch/issues/60756. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63580 Test Plan: - this CI. - validated by running on the bionic_cuda container: https://app.circleci.com/pipelines/github/pytorch/pytorch/366632/workflows/478602fb-698f-4210-ac09-d9c61af5c62b/jobs/15472104 Reviewed By: malfet Differential Revision: D30486472 Pulled By: walterddr fbshipit-source-id: d83ab88d163d4a468f03961a13d891b658668a7f --- .circleci/docker/common/install_openmpi.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.circleci/docker/common/install_openmpi.sh b/.circleci/docker/common/install_openmpi.sh index 7bd32c71f16fb..8c45279b8b464 100644 --- a/.circleci/docker/common/install_openmpi.sh +++ b/.circleci/docker/common/install_openmpi.sh @@ -1,4 +1,10 @@ #!/bin/bash sudo apt-get update +# also install ssh to avoid error of: +# -------------------------------------------------------------------------- +# The value of the MCA parameter "plm_rsh_agent" was set to a path +# that could not be found: +# plm_rsh_agent: ssh : rsh +sudo apt-get install -y ssh sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev From 560cd881956bbf425251d63f0ff0f9085a759447 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 23 Aug 2021 12:05:51 -0700 Subject: [PATCH 135/530] Kill THCUNN (#63429) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63429 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D30441308 Pulled By: ngimel fbshipit-source-id: 3ae342a2f8d5c7f8827b637c4055c5d1b0a1be26 --- BUILD.bazel | 18 ----- CONTRIBUTING.md | 3 +- README.md | 2 +- aten/CMakeLists.txt | 7 -- aten/src/ATen/TensorUtils.cpp | 1 - aten/src/ATen/TensorUtils.h | 1 - aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp | 1 - aten/src/ATen/cuda/detail/KernelUtils.h | 3 - aten/src/README.md | 1 - aten/src/THCUNN/CMakeLists.txt | 10 --- aten/src/THCUNN/README.md | 26 ------ aten/src/THCUNN/SharedMem.cuh | 43 ---------- aten/src/THCUNN/THCHalfAutoNumerics.cuh | 38 --------- aten/src/THCUNN/THCUNN.h | 13 --- aten/src/THCUNN/common.h | 83 -------------------- aten/src/THCUNN/doc/api_reference.md | 26 ------ aten/src/THCUNN/doc/style_guidelines.md | 64 --------------- aten/src/THCUNN/generic/THCUNN.h | 29 ------- setup.py | 2 - tools/README.md | 4 - tools/amd_build/build_amd.py | 2 - torch/utils/hipify/hipify_python.py | 1 - 22 files changed, 2 insertions(+), 376 deletions(-) delete mode 100644 aten/src/THCUNN/CMakeLists.txt delete mode 100644 aten/src/THCUNN/README.md delete mode 100644 aten/src/THCUNN/SharedMem.cuh delete mode 100644 aten/src/THCUNN/THCHalfAutoNumerics.cuh delete mode 100644 aten/src/THCUNN/THCUNN.h delete mode 100644 aten/src/THCUNN/common.h delete mode 100644 aten/src/THCUNN/doc/api_reference.md delete mode 100644 aten/src/THCUNN/doc/style_guidelines.md delete mode 100644 aten/src/THCUNN/generic/THCUNN.h diff --git a/BUILD.bazel b/BUILD.bazel index dab227590072d..5acbe4082d38e 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -409,21 +409,6 @@ filegroup( ], ) -filegroup( - name = "thcunn_srcs_cu", - srcs = [ - "aten/src/THCUNN/BCECriterion.cu.cc", - "aten/src/THCUNN/ELU.cu.cc", - "aten/src/THCUNN/HardTanh.cu.cc", - "aten/src/THCUNN/LeakyReLU.cu.cc", - "aten/src/THCUNN/MultiMarginCriterion.cu.cc", - "aten/src/THCUNN/SoftMarginCriterion.cu.cc", - "aten/src/THCUNN/SoftPlus.cu.cc", - "aten/src/THCUNN/SoftShrink.cu.cc", - "aten/src/THCUNN/Tanh.cu.cc", - ], -) - filegroup( name = "aten_srcs_cu", srcs = [ @@ -573,8 +558,6 @@ cc_library( "aten/src/THC/**/*.cpp", "aten/src/THC/*.cuh", "aten/src/THC/generic/*.cu.cc", - "aten/src/THCUNN/*.cuh", - "aten/src/THCUNN/generic/*.cu.cc", ], exclude = [ "aten/src/ATen/Config.h", @@ -716,7 +699,6 @@ cu_library( srcs = [ ":aten_srcs_cu", ":thc_srcs_cu", - ":thcunn_srcs_cu", ], copts = ATEN_COPTS + torch_cuda_half_options, visibility = ["//visibility:public"], diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7d8659a8babff..e1a049cf9a979 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -242,8 +242,7 @@ into the repo directory. * [aten](aten) - C++ tensor library for PyTorch (no autograd support) * [src](aten/src) - [README](aten/src/README.md) * [TH](aten/src/TH) - [THC](aten/src/THC) - [THCUNN](aten/src/THCUNN) - Legacy library code from the original + [THC](aten/src/THC) - Legacy library code from the original Torch. Try not to add things here; we're slowly porting these to [native](aten/src/ATen/native). * generic - Contains actual implementations of operators, diff --git a/README.md b/README.md index 53ebfb1a4bec6..9b2a854ef3557 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ We hope you never spend hours debugging your code because of bad stack traces or PyTorch has minimal framework overhead. We integrate acceleration libraries such as [Intel MKL](https://software.intel.com/mkl) and NVIDIA ([cuDNN](https://developer.nvidia.com/cudnn), [NCCL](https://developer.nvidia.com/nccl)) to maximize speed. At the core, its CPU and GPU Tensor and neural network backends -(TH, THC, THNN, THCUNN) are mature and have been tested for years. +are mature and have been tested for years. Hence, PyTorch is quite fast – whether you run small or large neural networks. diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index 400b00f8e858a..7ba92a6decee7 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -80,21 +80,14 @@ if(USE_ROCM) # ATen proper) set(AT_CUDA_ENABLED 1) add_subdirectory(src/THH) - add_subdirectory(src/THHUNN) message("ROCm is enabled.") elseif(USE_CUDA) set(AT_CUDA_ENABLED 1) add_subdirectory(src/THC) - add_subdirectory(src/THCUNN) else() message("disabling CUDA because USE_CUDA is set false") set(AT_CUDA_ENABLED 0) endif() -if(NOT USE_CUDA) - # we still parse THCUNN even if cuda is disabled to make sure to - # install it - install(FILES src/THCUNN/generic/THCUNN.h DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THCUNN/generic") -endif() if(NOT USE_NNPACK) set(AT_NNPACK_ENABLED 0) diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index af9a8a1b22153..1ec9f9c291c0a 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -282,7 +282,6 @@ bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) { return contig_if_nonempty; } -// Correspond to THCUNN_check_dim_size/THNN_check_dim_size void check_dim_size( const Tensor& tensor, int64_t dim, diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index 8e84ecaa4a3a2..1417174a1f6d3 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -144,7 +144,6 @@ TORCH_API void* maybe_data_ptr(const TensorArg& tensor); // on whether a subgeometry is contiguous. TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides); -// Correspond to THCUNN_check_dim_size/THNN_check_dim_size TORCH_API void check_dim_size( const Tensor& tensor, int64_t dim, diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index 4ead51e6bd26e..0ad6dc8256ff0 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #undef THNN_ #undef THCIndexTensor_ #include diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h index 836504a729fea..91a61b04b8590 100644 --- a/aten/src/ATen/cuda/detail/KernelUtils.h +++ b/aten/src/ATen/cuda/detail/KernelUtils.h @@ -2,9 +2,6 @@ #include -// Contents of this file are copied from THCUNN/common.h for the ease of porting -// THCUNN functions into ATen. - namespace at { namespace cuda { namespace detail { // CUDA: grid stride looping diff --git a/aten/src/README.md b/aten/src/README.md index e3e01515afb0f..183ec09a97efd 100644 --- a/aten/src/README.md +++ b/aten/src/README.md @@ -7,7 +7,6 @@ multiple variants of the library, summarized here: * TH = TorcH * THC = TorcH Cuda * THCS = TorcH Cuda Sparse (now defunct) -* THCUNN = TorcH CUda Neural Network (see cunn) * THNN = TorcH Neural Network (now defunct) * THS = TorcH Sparse (now defunct) diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt deleted file mode 100644 index f84005e7e92f6..0000000000000 --- a/aten/src/THCUNN/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} -PARENT_SCOPE) - -set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} - "${CMAKE_CURRENT_SOURCE_DIR}" -PARENT_SCOPE) - -install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DESTINATION ${ATEN_INSTALL_INCLUDE_SUBDIR} - FILES_MATCHING PATTERN "*.h" PATTERN "*.cuh") diff --git a/aten/src/THCUNN/README.md b/aten/src/THCUNN/README.md deleted file mode 100644 index 5c4662322cbb5..0000000000000 --- a/aten/src/THCUNN/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# THCUNN - -THCUNN is a library that gathers nn's C implementations of neural network modules. It's entirely free of Lua dependency and therefore can be used in any application that has a C FFI. Please note that it only contains quite low level functions; most users will want to use ATen, which provides a C++ wrapper around these functions. - - -Looking to add an implementation? Consider writing an ATen native function -instead! See [../ATen/native](../ATen/native). - -## Links - -* [API reference](doc/api_reference.md) -* [Style guidelines](doc/style_guidelines.md) - -## API - -THCUNN is a purely functional library. It provides 2-3 functions for each module, that perform the most important operations: - -* **updateOutput** - applies the module to an input -* **updateGradInput** - accepts gradient w.r.t. output and previous module input, and computes a gradient w.r.t. that input -* **accGradParameters** - *(optional, only modules with parameters)* accepts gradient w.r.t. output and previous module input, and computes gradient w.r.t. the parameters - -For information on argument types please check the [API reference](doc/api_reference.md). - -## Developer docs - -* [Style guidelines](doc/style_guidelines.md) diff --git a/aten/src/THCUNN/SharedMem.cuh b/aten/src/THCUNN/SharedMem.cuh deleted file mode 100644 index 8d83d9f9a9c58..0000000000000 --- a/aten/src/THCUNN/SharedMem.cuh +++ /dev/null @@ -1,43 +0,0 @@ -// Based on the simpleTempltes CUDA example - -#ifndef THCUNN_SHAREDMEM_H -#define THCUNN_SHAREDMEM_H - -template -struct SharedMem { - __device__ T *getPointer() - { - extern __device__ void error(void); - error(); - return NULL; - } -}; - -template <> -struct SharedMem -{ - __device__ half *getPointer() { - extern __shared__ half s_half[]; - return s_half; - } -}; - -template <> -struct SharedMem -{ - __device__ float *getPointer() { - extern __shared__ float s_float[]; - return s_float; - } -}; - -template <> -struct SharedMem -{ - __device__ double *getPointer() { - extern __shared__ double s_double[]; - return s_double; - } -}; - -#endif diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh deleted file mode 100644 index 62691b9df7c21..0000000000000 --- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef THC_HALF_AUTO_NUMERICS_INC -#define THC_HALF_AUTO_NUMERICS_INC - -#include -#include - -// WARNING: THCNumerics is being deprecated. Read the comments and function usage -// in THCNumerics to learn about the deprecation -// -// Half numerics functions defined as free functions, so cunn code can be -// written generically, i.e. without excessive calling of THCNumerics functions. - -// these functions should move to THCNumerics - -inline __host__ __device__ THHalf fmaxType(THHalf x, THHalf y) { - return THCNumerics::ge(x, y) ? x : y; -} - -inline __host__ __device__ float fmaxType(float x, THHalf y) { - return fmaxf(x, ScalarConvert::to(y)); -} - -inline __host__ __device__ float fmaxType(float x, float y) { - return fmaxf(x, y); -} - -inline __host__ __device__ double fmaxType(double x, double y) { - return fmax(x, y); -} - - -// arithmetic functions - -inline __host__ __device__ THHalf pow(THHalf a, THHalf b) { - return THCNumerics::pow(a, b); -} - -#endif diff --git a/aten/src/THCUNN/THCUNN.h b/aten/src/THCUNN/THCUNN.h deleted file mode 100644 index a4392ddaba166..0000000000000 --- a/aten/src/THCUNN/THCUNN.h +++ /dev/null @@ -1,13 +0,0 @@ -#include - -#define THCIndexTensor THCudaLongTensor -#define THCIndexTensor_(NAME) THCudaLongTensor_ ## NAME -typedef int64_t THCIndex_t; - -#define THNN_(NAME) TH_CONCAT_3(THNN_, CReal, NAME) - -#include -#include - -#include -#include diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h deleted file mode 100644 index 69b7f3a4d3fa8..0000000000000 --- a/aten/src/THCUNN/common.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef THCUNN_COMMON_H -#define THCUNN_COMMON_H - -#define THCUNN_assertSameGPU(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \ - "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.") - -// Use 1024 threads per block, which requires cuda sm_2x or above -const int CUDA_NUM_THREADS = 1024; - -// CUDA: number of blocks for threads. -inline int GET_BLOCKS(const int64_t N) -{ - // Round up division for positive number - auto block_num = N / CUDA_NUM_THREADS + (N % CUDA_NUM_THREADS == 0 ? 0 : 1); - - constexpr int64_t max_int = std::numeric_limits::max(); - THAssertMsg(block_num <= max_int, "Can't schedule too many blocks on CUDA device"); - - return static_cast(block_num); -} - -#define THCUNN_resizeAs_indices(STATE, I1, I2) \ - if (!I1->sizes().equals(I2->sizes())) \ - { \ - THCudaLongTensor_resizeAs(STATE, I1, I2); \ - } - -#define THCUNN_check_shape(STATE, I1, I2) \ - if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2)) \ - { \ - THCDescBuff s1 = THCTensor_(sizeDesc)(STATE, I1); \ - THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \ - THError(#I1 " and " #I2 " shapes do not match: " \ - #I1 " %s, " #I2 " %s", s1.str, s2.str); \ - } - - -#define THCUNN_check_shape_indices(STATE, I1, I2) \ - if (!I1->sizes().equals(I2->sizes())) \ - { \ - THCDescBuff s1 = THCIndexTensor_(sizeDesc)(STATE, I1); \ - THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \ - THError(#I1 " and " #I2 " shapes do not match: " \ - #I1 " %s, " #I2 " %s", s1.str, s2.str); \ - } - -#define THCUNN_check_nElement(STATE, I1, I2) \ - if (I1 != NULL && I2 != NULL ) { \ - ptrdiff_t n1 = THCTensor_(nElement)(STATE, I1); \ - ptrdiff_t n2 = THCTensor_(nElement)(STATE, I2); \ - if (n1 != n2) \ - { \ - THCDescBuff s1 = THCTensor_(sizeDesc)(state, I1); \ - THCDescBuff s2 = THCTensor_(sizeDesc)(state, I2); \ - THError(#I1 " and " #I2 " have different number of elements: " \ - #I1 "%s has %ld elements, while " \ - #I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \ - } \ - } - -#define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \ - if (THCTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \ - THCTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \ - THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ - THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ - " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ - } - -#define THCUNN_check_dim_size_indices(STATE, T, DIM, DIM_SIZE, SIZE) \ - if (THCIndexTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \ - THCIndexTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \ - THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, T); \ - THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ - " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ - } - -#define THCUNN_argCheck(STATE, COND, ARG, T, FORMAT) \ - if (!(COND)) { \ - THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ - THArgCheck(COND, ARG, FORMAT, s1.str); \ - } - -#endif diff --git a/aten/src/THCUNN/doc/api_reference.md b/aten/src/THCUNN/doc/api_reference.md deleted file mode 100644 index 3f49b9b6d1ce6..0000000000000 --- a/aten/src/THCUNN/doc/api_reference.md +++ /dev/null @@ -1,26 +0,0 @@ -# API docs - -This document describes the conventions behind the THCUNN API. - -### The API - -All functions provided by THCUNN are stored in `aten/src/THCUNN/generic/THCUNN.h`. -Look at this file. - -### Note on function names - -Please remember, that because C doesn't support function overloading, functions taking different tensor types have different names. So e.g. for an Abs module, there are actually two updateOutput functions: - -* `void THNN_FloatAbs_updateOutput(...)` -* `void THNN_DoubleAbs_updateOutput(...)` - -In these docs such function will be referred to as `void THCUNN_Abs_updateOutput(...)`, and it's up to developer to add a type prefix. `real` is an alias for that type. - -### Argument types - -Some arguments have additional tags placed in square brackets in their header declarations: - -* **[OUT]** - This is the output argument. It will be reshaped if needed. -* **[OPTIONAL]** - This argument is optional and can be safely set to NULL -* **[BUFFER]** - A buffer. `updateGradInput` and `accGradParameters` should get the same buffers that were used in `updateOutput` call. -* **[MODIFIED]** - Some functions accept an `inplace` flag. If set to true, this argument might be modified (in addition to the output). diff --git a/aten/src/THCUNN/doc/style_guidelines.md b/aten/src/THCUNN/doc/style_guidelines.md deleted file mode 100644 index 086db8bcbe28a..0000000000000 --- a/aten/src/THCUNN/doc/style_guidelines.md +++ /dev/null @@ -1,64 +0,0 @@ -## API design guidelines - -Functions should return `void`. - -All functions should accept arguments in the following order. `...` represent any module-specific parameters or buffers, disregarding whether they are used for writing or reading. Arguments in `...` below should be ordered like this: -``` -[weight], [bias], [any buffers], [additional arguments], [optional arguments] -``` - -### Modules -``` -updateOutput: state, input, output, ... -updateGradInput: state, input, gradOutput, gradInput, ... -accGradParameters: state, input, gradOutput, [gradWeight], [gradBias], ... -``` - -e.g. -```C -void THNN_(ClassNLLCriterion_updateGradInput)( - THCState *state, - THCTensor *input, - THCIndexTensor *target, - THCTensor *gradOutput, - THCTensor *gradInput, - int64_t reduction, - THCTensor *weights, - THCTensor *total_weight, - int64_t ignore_index) -``` - -### Criterions -``` -updateOutput: state, input, target, output, ... -updateGradInput: state, input, target, gradInput, ... -``` - -e.g. - -```C -void THNN_(ClassNLLCriterion_updateOutput)( - THCState *state, - THCTensor *input, - THCIndexTensor *target, - THCTensor *output, - int64_t reduction, - THCTensor *weights, - THCTensor *total_weight, - int64_t ignore_index) -``` - -## Code style guide - -```C -void THNN_(GatedLinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int dim) -//<- 10 -> -``` - -All arguments should start on a new line after function name, and they should be indented using 10 spaces. - -Use 2 spaces for block indentation. diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h deleted file mode 100644 index d624fdd090177..0000000000000 --- a/aten/src/THCUNN/generic/THCUNN.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/THCUNN.h" -#else - -#include -#include - -TORCH_CUDA_CU_API void THNN_(MultiMarginCriterion_updateOutput)( - THCState* state, - THCTensor* input, - THCIndexTensor* target, - THCTensor* output, - int64_t reduction, - int p, - THCTensor* weights, // [OPTIONAL] - accreal margin); - -TORCH_CUDA_CU_API void THNN_(MultiMarginCriterion_updateGradInput)( - THCState* state, - THCTensor* input, - THCIndexTensor* target, - THCTensor* gradOutput, - THCTensor* gradInput, - int64_t reduction, - int p, - THCTensor* weights, // [OPTIONAL] - accreal margin); - -#endif diff --git a/setup.py b/setup.py index 8135e1e4c2f7c..a20098232af3c 100644 --- a/setup.py +++ b/setup.py @@ -1028,8 +1028,6 @@ def print_box(msg): 'include/THC/*.cuh', 'include/THC/*.h*', 'include/THC/generic/*.h', - 'include/THCUNN/*.cuh', - 'include/THCUNN/generic/*.h', 'include/THH/*.cuh', 'include/THH/*.h*', 'include/THH/generic/*.h', diff --git a/tools/README.md b/tools/README.md index a28affa5f30aa..e4aba38afd851 100644 --- a/tools/README.md +++ b/tools/README.md @@ -15,10 +15,6 @@ Modern infrastructure: to import arbitrary Python files in a script, without having to add them to the PYTHONPATH first. -Legacy infrastructure (we should kill this): -* [cwrap](cwrap) - Implementation of legacy code generation for THNN/THCUNN. - This is used by nnwrap. - Build system pieces: * [setup_helpers](setup_helpers) - Helper code for searching for diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 8cfecda82e328..70f7e7a83e1ec 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -81,12 +81,10 @@ "aten/src/ATen/native/sparse/cuda/*", "aten/src/ATen/native/quantized/cuda/*", "aten/src/THC/*", - "aten/src/THCUNN/*", "aten/src/ATen/test/*", # CMakeLists.txt isn't processed by default, but there are a few # we do want to handle, so explicitly specify them "aten/src/THC/CMakeLists.txt", - "aten/src/THCUNN/CMakeLists.txt", "torch/*", "tools/autograd/templates/python_variable_methods.cpp", ] diff --git a/torch/utils/hipify/hipify_python.py b/torch/utils/hipify/hipify_python.py index 6697f1e014cf7..ad2903f7ad655 100644 --- a/torch/utils/hipify/hipify_python.py +++ b/torch/utils/hipify/hipify_python.py @@ -750,7 +750,6 @@ def repl(m): or f.startswith("ATen/native/quantized/cuda") or f.startswith("ATen/native/sparse/cuda") or f.startswith("THC/") - or f.startswith("THCUNN/") or (f.startswith("THC") and not f.startswith("THCP")) ): return templ.format(get_hip_file_path(m.group(1), is_pytorch_extension)) From a709ab34a8d847cae506e221ad8c8efa1eadc828 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Mon, 23 Aug 2021 12:41:32 -0700 Subject: [PATCH 136/530] [nnc] Re-enable CPU fusion" (#63665) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63665 This reverts commit 125e2d02e575612eb427104e7c67f1c28f090db8. Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D30471646 Pulled By: bertmaher fbshipit-source-id: 4189869566f03b5f9ada78d78830f6a34946eed6 --- torch/_C/__init__.pyi.in | 2 ++ torch/csrc/jit/codegen/fuser/executor.cpp | 5 +++-- torch/csrc/jit/codegen/fuser/interface.cpp | 8 ++------ torch/csrc/jit/passes/graph_fuser.cpp | 12 +++++++++++- torch/csrc/jit/passes/graph_fuser.h | 3 +++ torch/csrc/jit/python/init.cpp | 2 ++ torch/testing/_internal/jit_utils.py | 2 ++ 7 files changed, 25 insertions(+), 9 deletions(-) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 4d0245c7786af..0b6bb6b64e0a4 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -208,6 +208,7 @@ def _jit_get_schemas_for_operator(name :str) -> List[FunctionSchema]: ... def _jit_check_alias_annotation(g: Graph, args: Tuple[Any, ...], unqualified_op_name: str): ... def _jit_can_fuse_on_cpu() -> _bool: ... def _jit_can_fuse_on_gpu() -> _bool: ... +def _jit_can_fuse_on_cpu_legacy() -> _bool: ... def _debug_get_fusion_group_inlining() -> _bool: ... def _debug_set_fusion_group_inlining(enable: _bool): ... def _jit_texpr_fuser_enabled() -> _bool: ... @@ -215,6 +216,7 @@ def _jit_nvfuser_enabled() -> _bool: ... def _llvm_enabled() -> _bool: ... def _jit_override_can_fuse_on_cpu(override: _bool): ... def _jit_override_can_fuse_on_gpu(override: _bool): ... +def _jit_override_can_fuse_on_cpu_legacy(override: _bool): ... def _jit_set_symbolic_shapes_test_mode(override: _bool): ... def _jit_symbolic_shapes_test_mode_enabled() -> _bool: ... def _jit_set_texpr_fuser_enabled(enable: _bool): ... diff --git a/torch/csrc/jit/codegen/fuser/executor.cpp b/torch/csrc/jit/codegen/fuser/executor.cpp index b260e48b16c3f..46f2f41d07e36 100644 --- a/torch/csrc/jit/codegen/fuser/executor.cpp +++ b/torch/csrc/jit/codegen/fuser/executor.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include // TODO: remove, debugging only @@ -327,7 +328,7 @@ void launchFusion( bool runFusion(const int64_t key, Stack& stack, std::string* code_out) { // Short-circuits if fusion isn't enabled - if (!canFuseOnCPU() && !canFuseOnGPU()) + if (!canFuseOnCPULegacy() && !canFuseOnGPU()) return false; // Acquires the FusionSpec @@ -362,7 +363,7 @@ bool runFusion(const int64_t key, Stack& stack, std::string* code_out) { // Attempts to run fallback if device fusion is disabled if (device.is_cuda() && !canFuseOnGPU()) return false; - if (device.is_cpu() && !canFuseOnCPU()) + if (device.is_cpu() && !canFuseOnCPULegacy()) return false; if (device.is_xpu()) return false; diff --git a/torch/csrc/jit/codegen/fuser/interface.cpp b/torch/csrc/jit/codegen/fuser/interface.cpp index ec67c4bd83773..ef7e9e0b629d5 100644 --- a/torch/csrc/jit/codegen/fuser/interface.cpp +++ b/torch/csrc/jit/codegen/fuser/interface.cpp @@ -8,15 +8,12 @@ #include #include -C10_DEFINE_bool(torch_jit_enable_cpu_fusion, false, "enable cpu fusion"); - namespace torch { namespace jit { namespace detail { -// Note: CPU fusion is currently disabled due to test flakiness -#if defined(FBCODE_CAFFE2) +#ifdef TORCH_ENABLE_LLVM bool cpu_fuser_enabled = true; #else bool cpu_fuser_enabled = false; @@ -37,8 +34,7 @@ void runFusion(const int64_t key, Stack& stack) { } bool canFuseOnCPU() { - return fuser::hasFusionBackend(DeviceType::CPU) && - (detail::cpu_fuser_enabled || FLAGS_torch_jit_enable_cpu_fusion); + return fuser::hasFusionBackend(DeviceType::CPU) && detail::cpu_fuser_enabled; } bool canFuseOnGPU() { diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index f7dd466de4ff4..653f9fec08b32 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -183,7 +183,7 @@ struct GraphFuser { return !strict_fuser_check; } if ((*device).is_cpu()) { - return canFuseOnCPU(); + return canFuseOnCPULegacy(); } else if ((*device).is_cuda()) { return canFuseOnGPU(); } else if ((*device).is_xpu()) { @@ -1244,6 +1244,16 @@ void PeepholeOptimizeShapeExpressions(Block* block, AliasDb* db) { } // anonymous namespace +static bool cpu_fuser_enabled_legacy = false; + +bool canFuseOnCPULegacy() { + return cpu_fuser_enabled_legacy; +} + +void overrideCanFuseOnCPULegacy(bool value) { + cpu_fuser_enabled_legacy = value; +} + void FuseGraph(std::shared_ptr& graph, bool strict_fuser_check) { AliasDb db(graph); GraphFuser(&db, graph->block(), strict_fuser_check).run(); diff --git a/torch/csrc/jit/passes/graph_fuser.h b/torch/csrc/jit/passes/graph_fuser.h index 0cdcc2e20f469..aafb442eafb6f 100644 --- a/torch/csrc/jit/passes/graph_fuser.h +++ b/torch/csrc/jit/passes/graph_fuser.h @@ -5,6 +5,9 @@ namespace torch { namespace jit { +TORCH_API bool canFuseOnCPULegacy(); +TORCH_API void overrideCanFuseOnCPULegacy(bool value); + // NB: Be sure to run DCE before fusion, because dead instructions // can prevent fusion opportunities from being exploited. // On Windows will noop, NYI diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index c92ab1b46e41c..baea47d63ed18 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -590,6 +590,8 @@ void initJITBindings(PyObject* module) { .def("_jit_override_can_fuse_on_gpu", &overrideCanFuseOnGPU) .def("_jit_can_fuse_on_cpu", &canFuseOnCPU) .def("_jit_can_fuse_on_gpu", &canFuseOnGPU) + .def("_jit_can_fuse_on_cpu_legacy", &canFuseOnCPULegacy) + .def("_jit_override_can_fuse_on_cpu_legacy", &overrideCanFuseOnCPULegacy) .def( "_jit_differentiate", [](Graph& g) { diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py index 6086572039033..7f9fb976934d3 100644 --- a/torch/testing/_internal/jit_utils.py +++ b/torch/testing/_internal/jit_utils.py @@ -668,11 +668,13 @@ def wrapper(func): def enable_cpu_fuser(fn): def wrapper(*args, **kwargs): + torch._C._jit_override_can_fuse_on_cpu_legacy(True) torch._C._jit_override_can_fuse_on_cpu(True) torch._C._jit_set_te_must_use_llvm_cpu(False) try: fn(*args, **kwargs) finally: + torch._C._jit_override_can_fuse_on_cpu_legacy(False) torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_set_te_must_use_llvm_cpu(True) return wrapper From 0bc7fef406447cbafe00b0337527ae37e315a064 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Mon, 23 Aug 2021 12:53:42 -0700 Subject: [PATCH 137/530] [Static Runtime] Remove unused fusion patterns (#63636) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63636 Reviewed By: d1jang Differential Revision: D30446573 fbshipit-source-id: 3abb7f697380f3b4e865b98c594de359b5e26b96 --- torch/csrc/jit/runtime/static/passes.cpp | 115 +++++------------------ 1 file changed, 23 insertions(+), 92 deletions(-) diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp index bbd7dd17f2feb..2e9eb5746d276 100644 --- a/torch/csrc/jit/runtime/static/passes.cpp +++ b/torch/csrc/jit/runtime/static/passes.cpp @@ -165,125 +165,57 @@ C10_UNUSED void ClipRangesGather(std::shared_ptr& graph) { fuse.runOnGraph(graph); } +C10_UNUSED void PrecomputeMultiplierShiftForSigridHash( + std::shared_ptr& graph) { + std::string pattern = R"IR( + graph(%a, %b, %c, %d): + %y0 : Tensor = fb::sigrid_hash(%a, %b, %c, %d) + return (%y0) + )IR"; + std::string split_pattern = R"IR( + graph(%a, %b, %c, %d): + %y0 : Tensor = fb::sigrid_hash_compute_multipler_shift(%c) + %y2 : Tensor = fb::sigrid_hash_precompute(%a, %b, %c, %y0, %d) + return (%y2) + )IR"; + SubgraphRewriter fuse; + fuse.RegisterRewritePattern(pattern, split_pattern); + fuse.runOnGraph(graph); +} + C10_UNUSED void ClipRangesGatherSigridHash(std::shared_ptr& graph) { // TODO:: check restrictions for inputs; outputs not used elsewhere - std::string pattern_1 = R"IR( - graph(%a, %b, %c, %d, %e, %f, %g): - %y0 : Tensor, %y1 : Tensor = fb::clip_ranges_gather_lengths_to_offsets(%a, %b, %c, %d) - %y2 : Tensor = fb::sigrid_hash(%y0, %e, %f, %g) - return (%y2, %y1))IR"; - std::string fused_pattern_1 = R"IR( - graph(%a, %b, %c, %d, %e, %f, %g): - %off : Tensor, %out : Tensor = fb::clip_ranges_gather_sigrid_hash_offsets(%b, %a, %c, %e, %f, %g, %d) - return (%out, %off))IR"; - - std::string pattern_2 = R"IR( + std::string pattern = R"IR( graph(%a, %b, %c, %d, %e, %f, %g, %h): %y0 : Tensor, %y1 : Tensor = fb::clip_ranges_gather_lengths_to_offsets(%a, %b, %c, %d) %y2 : Tensor = fb::sigrid_hash_precompute(%y0, %e, %f, %g, %h) return (%y2, %y1))IR"; - std::string fused_pattern_2 = R"IR( + std::string fused_pattern = R"IR( graph(%a, %b, %c, %d, %e, %f, %g, %h): %off : Tensor, %out : Tensor = fb::clip_ranges_gather_sigrid_hash_precompute_offsets(%b, %a, %c, %e, %f, %g, %h, %d) return (%out, %off))IR"; SubgraphRewriter fuse; - fuse.RegisterRewritePattern(pattern_1, fused_pattern_1); - fuse.runOnGraph(graph); - - fuse.RegisterRewritePattern(pattern_2, fused_pattern_2); + fuse.RegisterRewritePattern(pattern, fused_pattern); fuse.runOnGraph(graph); } C10_UNUSED void ClipRangesGatherRangesSigridHash( std::shared_ptr& graph) { - std::string pattern_1 = R"IR( - graph(%a, %b, %c, %d, %e, %f): - %y0 : Tensor = fb::clip_ranges(%b, %c) - %y1 : Tensor, %y2 : Tensor = fb::gather_ranges(%a, %y0) - %y3 : Tensor = fb::sigrid_hash(%y1, %d, %e, %f) - return (%y3, %y2))IR"; - std::string fused_pattern_1 = R"IR( - graph(%a, %b, %c, %d, %e, %f): - %off : Tensor, %out : Tensor = fb::clip_ranges_gather_sigrid_hash_v3(%b, %a, %c, %d, %e, %f) - return (%out, %off))IR"; - - std::string pattern_2 = R"IR( + std::string pattern = R"IR( graph(%a, %b, %c, %d, %e, %f, %g): %y0 : Tensor = fb::clip_ranges(%b, %c) %y1 : Tensor, %y2 : Tensor = fb::gather_ranges(%a, %y0) %y3 : Tensor = fb::sigrid_hash_precompute(%y1, %d, %e, %f, %g) return (%y3, %y2))IR"; - std::string fused_pattern_2 = R"IR( + std::string fused_pattern = R"IR( graph(%a, %b, %c, %d, %e, %f, %g): %off : Tensor, %out : Tensor = fb::clip_ranges_gather_sigrid_hash_precompute_v3(%b, %a, %c, %d, %e, %f, %g) return (%out, %off))IR"; - SubgraphRewriter fuse; - fuse.RegisterRewritePattern(pattern_1, fused_pattern_1); - fuse.runOnGraph(graph); - - fuse.RegisterRewritePattern(pattern_2, fused_pattern_2); - fuse.runOnGraph(graph); -} - -C10_UNUSED void PrecomputeMultiplierShiftForSigridHash( - std::shared_ptr& graph) { - std::string pattern = R"IR( - graph(%a, %b, %c, %d): - %y0 : Tensor = fb::sigrid_hash(%a, %b, %c, %d) - return (%y0) - )IR"; - std::string split_pattern = R"IR( - graph(%a, %b, %c, %d): - %y0 : Tensor = fb::sigrid_hash_compute_multipler_shift(%c) - %y2 : Tensor = fb::sigrid_hash_precompute(%a, %b, %c, %y0, %d) - return (%y2) - )IR"; - SubgraphRewriter fuse; - fuse.RegisterRewritePattern(pattern, split_pattern); - fuse.runOnGraph(graph); -} - -C10_UNUSED void ClipRangesGatherRangesX2SigridHash( - std::shared_ptr& graph) { - // Placeholder is a dummy op used to capture the first subgraph - std::string pattern = R"IR( - graph(%ranges, %values, %max_length, %salt, %max_value, %hash_into_int32): - %clipped : Tensor = fb::clip_ranges(%ranges, %max_length) - %output : Tensor, %unused : Tensor = fb::gather_ranges(%values, %clipped) - %sigrid_hash_out : Tensor = fb::sigrid_hash(%output, %salt, %max_value, %hash_into_int32) - return (%sigrid_hash_out, %clipped))IR"; - std::string fused_pattern = R"IR( - graph(%ranges, %values, %max_length, %salt, %max_value, %hash_into_int32): - %sigrid_hash_out : Tensor, %clipped : Tensor = fb::placeholder(%ranges, %values, %max_length, %salt, %max_value, %hash_into_int32) - return (%sigrid_hash_out, %clipped))IR"; - - // the second gather_ranges can be eliminated because the `lengths` is - // produces is identical to the lengths produced by - // clip_ranges_gather_sigrid_hash_v3 (caveat, the fused ops makes some - // simplifying assumptions about the ranges input) - std::string pattern2 = R"IR( - graph(%gather2_values, %ranges, %values, %max_length, %salt, %max_value, %hash_into_int32): - %sigrid_hash_out : Tensor, %clipped : Tensor = fb::placeholder(%ranges, %values, %max_length, %salt, %max_value, %hash_into_int32) - %unused : Tensor, %lengths : Tensor = fb::gather_ranges(%gather2_values, %clipped) - return (%lengths, %sigrid_hash_out))IR"; - - std::string fused_pattern2 = R"IR( - graph(%gather2_values, %ranges, %values, %max_length, %salt, %max_value, %hash_into_int32): - %lengths : Tensor, %sigrid_hash_out : Tensor = fb::clip_ranges_gather_sigrid_hash_v3(%ranges, %values, %max_length, %salt, %max_value, %hash_into_int32) - return (%lengths, %sigrid_hash_out))IR"; - SubgraphRewriter fuse; fuse.RegisterRewritePattern(pattern, fused_pattern); fuse.runOnGraph(graph); - - fuse.RegisterRewritePattern(pattern2, fused_pattern2); - fuse.runOnGraph(graph); - - // reverse the ops that got fused in step 1 but not in step2 - fuse.RegisterRewritePattern(fused_pattern, pattern); - fuse.runOnGraph(graph); } C10_UNUSED void ClipRangesGatherRangesX2SigridHashPrecompute( @@ -349,7 +281,6 @@ void FuseInferenceOpsForSparseNN(std::shared_ptr& graph) { ClipRangesGatherSigridHash(graph); ClipRangesGatherRangesSigridHash(graph); - ClipRangesGatherRangesX2SigridHash(graph); ClipRangesGatherRangesX2SigridHashPrecompute(graph); // prioritize clip_ranges+gather_ranges+sigrid_hash fusion over From a781340bf7f610c14c42bf2e0a5f06d9b7e67193 Mon Sep 17 00:00:00 2001 From: Zeina Migeed Date: Mon, 23 Aug 2021 14:09:10 -0700 Subject: [PATCH 138/530] Add equality constraints for some acc opeartions for symbolic inference (#63689) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63689 Test Plan: buck run mode/opt-clang caffe2/torch/fb/model_transform/experimental:fx_ir_lower_inline_cvr -- \ --action=lower_and_run \ --filename=inline_cvr_7x_dec_2020.model \ --print_glow_glog=True Reviewed By: jamesr66a Differential Revision: D30462113 fbshipit-source-id: 0b2a1ce9770561248527d47c07b80112491dc949 --- .../experimental/graph_gradual_typechecker.py | 12 +++++++++--- torch/fx/experimental/unify_refinements.py | 17 +++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py index 5ce53a7ff6896..6e05f918e810e 100644 --- a/torch/fx/experimental/graph_gradual_typechecker.py +++ b/torch/fx/experimental/graph_gradual_typechecker.py @@ -436,7 +436,7 @@ def adaptiveavgpool2d_inference_rule(n: Node, module_instance): def flatten_check(tensor_type, start_dim, end_dim): l = len(tensor_type.__args__) - start_dim = l if start_dim == -1 else start_dim + start_dim = l if start_dim == -1 else abs(start_dim) end_dim = l + end_dim + 1 if end_dim < 0 else end_dim + 1 if 0 <= start_dim <= (l - 1) and 0 <= end_dim <= l and start_dim < end_dim: @@ -668,6 +668,10 @@ def replace_dyn_with_fresh_var(self, typ): elif isinstance(typ, TensorType): new_args = [self.replace_dyn_with_fresh_var(a) for a in typ.__args__] return TensorType(tuple(new_args)) + elif isinstance(typ, list): + return [self.replace_dyn_with_fresh_var(t) for t in typ] + elif isinstance(typ, tuple): + return (self.replace_dyn_with_fresh_var(t) for t in typ) else: return typ @@ -698,8 +702,10 @@ def refine_node(self, n: Node): pass if n.op == 'output': - assert isinstance(n.args[0], Node) - n.type = n.args[0].type + def get_node_type(a): + return a.type + n.type = torch.fx.node.map_arg(n.args[0], get_node_type) + return n.type else: pass diff --git a/torch/fx/experimental/unify_refinements.py b/torch/fx/experimental/unify_refinements.py index c8561041472ae..5074377ebf2dc 100644 --- a/torch/fx/experimental/unify_refinements.py +++ b/torch/fx/experimental/unify_refinements.py @@ -52,6 +52,8 @@ def substitute_solution_one_type(mapping, t): if isinstance(t, Var): if t in mapping.keys(): return mapping[t] + else: + return t elif isinstance(t, TensorType): new_type = [] @@ -62,6 +64,21 @@ def substitute_solution_one_type(mapping, t): new_type.append(typ) return TensorType(tuple(new_type)) + elif isinstance(t, list): + new_type = [] + for typ in t: + new_type.append(substitute_solution_one_type(mapping, typ)) + return new_type + + elif isinstance(t, tuple): + new_type = [] + for typ in t: + new_type.append(substitute_solution_one_type(mapping, typ)) + return tuple(new_type) + + else: + return t + def substitute_all_types(graph, mapping): """ From 7946f8a9f6a020a89f534f4a2b921357935ee975 Mon Sep 17 00:00:00 2001 From: Erjia Guan Date: Mon, 23 Aug 2021 14:32:56 -0700 Subject: [PATCH 139/530] Rename DataPipe to Op-er (#63325) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63325 Rename each DataPipe to an operation name ending with er. Functional API should remain `verb` such as `read_from_tar` , `shuffle`, ... (Discussed in [here](https://github.com/facebookexternal/torchdata/pull/97#discussion_r688553905)) - Batch -> Batcher - Collate -> Collator - Concat -> Concater - GroupByKey - > ByKeyGrouper ? - ListDirFiles -> FileLister - LoadFilesFromDisk -> FileLoader - Map -> Mapper - ReadFilesFromTar -> TarArchiveReader - ReadFilesFromZip -> ZipArchiveReader - ReadLinesFromFile -> LineReader - Shuffle -> Shuffler - ToBytes -> StreamReader - Transforms -> Transformer - Zip -> Zipper Let me know if you have better name for each DataPipe Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D30466950 Pulled By: ejguan fbshipit-source-id: 72909dca7b3964ab83b965891f96cc1ecf62d049 --- test/test_datapipe.py | 127 ++++++------------ torch/utils/data/datapipes/iter/__init__.py | 74 +++++----- torch/utils/data/datapipes/iter/callable.py | 41 +----- .../data/datapipes/iter/combinatorics.py | 4 +- torch/utils/data/datapipes/iter/combining.py | 8 +- .../iter/{listdirfiles.py => filelister.py} | 4 +- .../{loadfilesfromdisk.py => fileloader.py} | 4 +- torch/utils/data/datapipes/iter/grouping.py | 12 +- torch/utils/data/datapipes/iter/linereader.py | 18 +++ .../data/datapipes/iter/readlinesfromfile.py | 19 --- torch/utils/data/datapipes/iter/selecting.py | 4 +- .../iter/{tobytes.py => streamreader.py} | 4 +- ...eadfilesfromtar.py => tararchivereader.py} | 4 +- ...eadfilesfromzip.py => ziparchivereader.py} | 4 +- torch/utils/data/datapipes/map/__init__.py | 7 +- torch/utils/data/datapipes/map/callable.py | 4 +- torch/utils/data/datapipes/map/combining.py | 4 +- 17 files changed, 128 insertions(+), 214 deletions(-) rename torch/utils/data/datapipes/iter/{listdirfiles.py => filelister.py} (93%) rename torch/utils/data/datapipes/iter/{loadfilesfromdisk.py => fileloader.py} (93%) create mode 100644 torch/utils/data/datapipes/iter/linereader.py delete mode 100644 torch/utils/data/datapipes/iter/readlinesfromfile.py rename torch/utils/data/datapipes/iter/{tobytes.py => streamreader.py} (85%) rename torch/utils/data/datapipes/iter/{readfilesfromtar.py => tararchivereader.py} (96%) rename torch/utils/data/datapipes/iter/{readfilesfromzip.py => ziparchivereader.py} (96%) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 9c2380112705d..86e53fa699142 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -34,7 +34,6 @@ import numpy as np import torch -import torch.nn as nn import torch.utils.data.backward_compatibility import torch.utils.data.datapipes as dp import torch.utils.data.graph @@ -54,13 +53,6 @@ basichandlers as decoder_basichandlers, ) -try: - import torchvision.transforms - HAS_TORCHVISION = True -except ImportError: - HAS_TORCHVISION = False -skipIfNoTorchVision = skipIf(not HAS_TORCHVISION, "no torchvision") - try: import dill # XXX: By default, dill writes the Pickler dispatch table to inject its @@ -177,7 +169,7 @@ def tearDown(self): def test_listdirfiles_iterable_datapipe(self): temp_dir = self.temp_dir.name - datapipe = dp.iter.ListDirFiles(temp_dir, '') + datapipe = dp.iter.FileLister(temp_dir, '') count = 0 for pathname in datapipe: @@ -186,7 +178,7 @@ def test_listdirfiles_iterable_datapipe(self): self.assertEqual(count, len(self.temp_files)) count = 0 - datapipe = dp.iter.ListDirFiles(temp_dir, '', recursive=True) + datapipe = dp.iter.FileLister(temp_dir, '', recursive=True) for pathname in datapipe: count = count + 1 self.assertTrue((pathname in self.temp_files) or (pathname in self.temp_sub_files)) @@ -195,13 +187,13 @@ def test_listdirfiles_iterable_datapipe(self): def test_loadfilesfromdisk_iterable_datapipe(self): # test import datapipe class directly from torch.utils.data.datapipes.iter import ( - ListDirFiles, - LoadFilesFromDisk, + FileLister, + FileLoader, ) temp_dir = self.temp_dir.name - datapipe1 = ListDirFiles(temp_dir, '') - datapipe2 = LoadFilesFromDisk(datapipe1) + datapipe1 = FileLister(temp_dir, '') + datapipe2 = FileLoader(datapipe1) count = 0 for rec in datapipe2: @@ -220,9 +212,9 @@ def test_readfilesfromtar_iterable_datapipe(self): tar.add(self.temp_files[0]) tar.add(self.temp_files[1]) tar.add(self.temp_files[2]) - datapipe1 = dp.iter.ListDirFiles(temp_dir, '*.tar') - datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1) - datapipe3 = dp.iter.ReadFilesFromTar(datapipe2) + datapipe1 = dp.iter.FileLister(temp_dir, '*.tar') + datapipe2 = dp.iter.FileLoader(datapipe1) + datapipe3 = dp.iter.TarArchiveReader(datapipe2) # read extracted files before reaching the end of the tarfile for rec, temp_file in itertools.zip_longest(datapipe3, self.temp_files): self.assertTrue(rec is not None and temp_file is not None) @@ -247,9 +239,9 @@ def test_readfilesfromzip_iterable_datapipe(self): myzip.write(self.temp_files[0]) myzip.write(self.temp_files[1]) myzip.write(self.temp_files[2]) - datapipe1 = dp.iter.ListDirFiles(temp_dir, '*.zip') - datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1) - datapipe3 = dp.iter.ReadFilesFromZip(datapipe2) + datapipe1 = dp.iter.FileLister(temp_dir, '*.zip') + datapipe2 = dp.iter.FileLoader(datapipe1) + datapipe3 = dp.iter.ZipArchiveReader(datapipe2) # read extracted files before reaching the end of the zipfile for rec, temp_file in itertools.zip_longest(datapipe3, self.temp_files): self.assertTrue(rec is not None and temp_file is not None) @@ -271,8 +263,8 @@ def test_routeddecoder_iterable_datapipe(self): temp_pngfile_pathname = os.path.join(temp_dir, "test_png.png") png_data = np.array([[[1., 0., 0.], [1., 0., 0.]], [[1., 0., 0.], [1., 0., 0.]]], dtype=np.single) np.save(temp_pngfile_pathname, png_data) - datapipe1 = dp.iter.ListDirFiles(temp_dir, ['*.png', '*.txt']) - datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1) + datapipe1 = dp.iter.FileLister(temp_dir, ['*.png', '*.txt']) + datapipe2 = dp.iter.FileLoader(datapipe1) def _png_decoder(extension, data): if extension != 'png': @@ -321,10 +313,10 @@ def test_groupbykey_iterable_datapipe(self): f.write('12345abcde') tar.add(file_pathname) - datapipe1 = dp.iter.ListDirFiles(temp_dir, '*.tar') - datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1) - datapipe3 = dp.iter.ReadFilesFromTar(datapipe2) - datapipe4 = dp.iter.GroupByKey(datapipe3, group_size=2) + datapipe1 = dp.iter.FileLister(temp_dir, '*.tar') + datapipe2 = dp.iter.FileLoader(datapipe1) + datapipe3 = dp.iter.TarArchiveReader(datapipe2) + datapipe4 = dp.iter.ByKeyGrouper(datapipe3, group_size=2) expected_result = [("a.png", "a.json"), ("c.png", "c.json"), ("b.png", "b.json"), ("d.png", "d.json"), ( "f.png", "f.json"), ("g.png", "g.json"), ("e.png", "e.json"), ("h.json", "h.txt")] @@ -447,13 +439,14 @@ def _get_data_from_tuple_fn(data, *args, **kwargs): create_temp_files_for_serving(tmpdir, test_file_count, test_file_size, file_url_template) - datapipe_dir_f = dp.iter.ListDirFiles(tmpdir, '*_list') - datapipe_f_lines = dp.iter.ReadLinesFromFile(datapipe_dir_f) + datapipe_dir_f = dp.iter.FileLister(tmpdir, '*_list') + datapipe_stream = dp.iter.FileLoader(datapipe_dir_f) + datapipe_f_lines = dp.iter.LineReader(datapipe_stream) datapipe_line_url: IterDataPipe[str] = \ - dp.iter.Map(datapipe_f_lines, _get_data_from_tuple_fn, (1,)) + dp.iter.Mapper(datapipe_f_lines, _get_data_from_tuple_fn, (1,)) datapipe_http = dp.iter.HttpReader(datapipe_line_url, timeout=timeout) - datapipe_tob = dp.iter.ToBytes(datapipe_http, chunk=chunk) + datapipe_tob = dp.iter.StreamReader(datapipe_http, chunk=chunk) for (url, data) in datapipe_tob: self.assertGreater(len(url), 0) @@ -539,18 +532,18 @@ class TestFunctionalIterDataPipe(TestCase): def _test_picklable(self): arr = range(10) picklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [ - (dp.iter.Map, IDP(arr), (), {}), - (dp.iter.Map, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}), - (dp.iter.Collate, IDP(arr), (), {}), - (dp.iter.Collate, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}), + (dp.iter.Mapper, IDP(arr), (), {}), + (dp.iter.Mapper, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}), + (dp.iter.Collator, IDP(arr), (), {}), + (dp.iter.Collator, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}), (dp.iter.Filter, IDP(arr), (_fake_filter_fn, (0, ), {'test': True}), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes: p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] unpicklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [ - (dp.iter.Map, IDP(arr), (lambda x: x, ), {}), - (dp.iter.Collate, IDP(arr), (lambda x: x, ), {}), + (dp.iter.Mapper, IDP(arr), (lambda x: x, ), {}), + (dp.iter.Collator, IDP(arr), (lambda x: x, ), {}), (dp.iter.Filter, IDP(arr), (lambda x: x >= 5, ), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in unpicklable_datapipes: @@ -566,10 +559,10 @@ def test_concat_datapipe(self): input_dp2 = IDP(range(5)) with self.assertRaisesRegex(ValueError, r"Expected at least one DataPipe"): - dp.iter.Concat() + dp.iter.Concater() with self.assertRaisesRegex(TypeError, r"Expected all inputs to be `IterDataPipe`"): - dp.iter.Concat(input_dp1, ()) # type: ignore[arg-type] + dp.iter.Concater(input_dp1, ()) # type: ignore[arg-type] concat_dp = input_dp1.concat(input_dp2) self.assertEqual(len(concat_dp), 15) @@ -913,59 +906,17 @@ def test_shuffle_datapipe(self): with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"): len(shuffle_dp_nl) - @skipIfNoTorchVision - def test_transforms_datapipe(self): - torch.set_default_dtype(torch.float) - # A sequence of numpy random numbers representing 3-channel images - w = h = 32 - inputs = [np.random.randint(0, 255, (h, w, 3), dtype=np.uint8) for i in range(10)] - tensor_inputs = [torch.tensor(x, dtype=torch.float).permute(2, 0, 1) / 255. for x in inputs] - - input_dp = IDP(inputs) - # Raise TypeError for python function - with self.assertRaisesRegex(TypeError, r"`transforms` are required to be"): - input_dp.legacy_transforms(_fake_fn) - - # transforms.Compose of several transforms - transforms = torchvision.transforms.Compose([ - torchvision.transforms.ToTensor(), - torchvision.transforms.Pad(1, fill=1, padding_mode='constant'), - ]) - tsfm_dp = input_dp.legacy_transforms(transforms) - self.assertEqual(len(tsfm_dp), len(input_dp)) - for tsfm_data, input_data in zip(tsfm_dp, tensor_inputs): - self.assertEqual(tsfm_data[:, 1:(h + 1), 1:(w + 1)], input_data) - - # nn.Sequential of several transforms (required to be instances of nn.Module) - input_dp = IDP(tensor_inputs) - transforms = nn.Sequential( - torchvision.transforms.Pad(1, fill=1, padding_mode='constant'), - ) - tsfm_dp = input_dp.legacy_transforms(transforms) - self.assertEqual(len(tsfm_dp), len(input_dp)) - for tsfm_data, input_data in zip(tsfm_dp, tensor_inputs): - self.assertEqual(tsfm_data[:, 1:(h + 1), 1:(w + 1)], input_data) - - # Single transform - input_dp = IDP_NoLen(inputs) # type: ignore[assignment] - transform = torchvision.transforms.ToTensor() - tsfm_dp = input_dp.legacy_transforms(transform) - with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"): - len(tsfm_dp) - for tsfm_data, input_data in zip(tsfm_dp, tensor_inputs): - self.assertEqual(tsfm_data, input_data) - def test_zip_datapipe(self): with self.assertRaises(TypeError): - dp.iter.Zip(IDP(range(10)), list(range(10))) # type: ignore[arg-type] + dp.iter.Zipper(IDP(range(10)), list(range(10))) # type: ignore[arg-type] - zipped_dp = dp.iter.Zip(IDP(range(10)), IDP_NoLen(range(5))) # type: ignore[var-annotated] + zipped_dp = dp.iter.Zipper(IDP(range(10)), IDP_NoLen(range(5))) # type: ignore[var-annotated] with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"): len(zipped_dp) exp = list((i, i) for i in range(5)) self.assertEqual(list(zipped_dp), exp) - zipped_dp = dp.iter.Zip(IDP(range(10)), IDP(range(5))) + zipped_dp = dp.iter.Zipper(IDP(range(10)), IDP(range(5))) self.assertEqual(len(zipped_dp), 5) self.assertEqual(list(zipped_dp), exp) # Reset @@ -979,8 +930,8 @@ def _test_picklable(self): picklable_datapipes: List[ Tuple[Type[MapDataPipe], MapDataPipe, Tuple, Dict[str, Any]] ] = [ - (dp.map.Map, MDP(arr), (), {}), - (dp.map.Map, MDP(arr), (_fake_fn, (0,), {'test': True}), {}), + (dp.map.Mapper, MDP(arr), (), {}), + (dp.map.Mapper, MDP(arr), (_fake_fn, (0,), {'test': True}), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes: p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] @@ -988,7 +939,7 @@ def _test_picklable(self): unpicklable_datapipes: List[ Tuple[Type[MapDataPipe], MapDataPipe, Tuple, Dict[str, Any]] ] = [ - (dp.map.Map, MDP(arr), (lambda x: x,), {}), + (dp.map.Mapper, MDP(arr), (lambda x: x,), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in unpicklable_datapipes: with warnings.catch_warnings(record=True) as wa: @@ -1005,10 +956,10 @@ def test_concat_datapipe(self): input_dp2 = MDP(range(5)) with self.assertRaisesRegex(ValueError, r"Expected at least one DataPipe"): - dp.map.Concat() + dp.map.Concater() with self.assertRaisesRegex(TypeError, r"Expected all inputs to be `MapDataPipe`"): - dp.map.Concat(input_dp1, ()) # type: ignore[arg-type] + dp.map.Concater(input_dp1, ()) # type: ignore[arg-type] concat_dp = input_dp1.concat(input_dp2) self.assertEqual(len(concat_dp), 15) diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py index bdaef95e9fa56..5af2ab661da40 100644 --- a/torch/utils/data/datapipes/iter/__init__.py +++ b/torch/utils/data/datapipes/iter/__init__.py @@ -1,38 +1,31 @@ from torch.utils.data.datapipes.iter.callable import ( - CollateIterDataPipe as Collate, - MapIterDataPipe as Map, - TransformsIterDataPipe as Transforms, + CollatorIterDataPipe as Collator, + MapperIterDataPipe as Mapper, ) from torch.utils.data.datapipes.iter.combinatorics import ( SamplerIterDataPipe as Sampler, - ShuffleIterDataPipe as Shuffle, + ShufflerIterDataPipe as Shuffler, ) from torch.utils.data.datapipes.iter.combining import ( - ConcatIterDataPipe as Concat, - ZipIterDataPipe as Zip, + ConcaterIterDataPipe as Concater, + ZipperIterDataPipe as Zipper, +) +from torch.utils.data.datapipes.iter.filelister import ( + FileListerIterDataPipe as FileLister, +) +from torch.utils.data.datapipes.iter.fileloader import ( + FileLoaderIterDataPipe as FileLoader, ) from torch.utils.data.datapipes.iter.grouping import ( - BatchIterDataPipe as Batch, + BatcherIterDataPipe as Batcher, BucketBatcherIterDataPipe as BucketBatcher, - GroupByKeyIterDataPipe as GroupByKey, + ByKeyGrouperIterDataPipe as ByKeyGrouper, ) from torch.utils.data.datapipes.iter.httpreader import ( HTTPReaderIterDataPipe as HttpReader, ) -from torch.utils.data.datapipes.iter.listdirfiles import ( - ListDirFilesIterDataPipe as ListDirFiles, -) -from torch.utils.data.datapipes.iter.loadfilesfromdisk import ( - LoadFilesFromDiskIterDataPipe as LoadFilesFromDisk, -) -from torch.utils.data.datapipes.iter.readfilesfromtar import ( - ReadFilesFromTarIterDataPipe as ReadFilesFromTar, -) -from torch.utils.data.datapipes.iter.readfilesfromzip import ( - ReadFilesFromZipIterDataPipe as ReadFilesFromZip, -) -from torch.utils.data.datapipes.iter.readlinesfromfile import ( - ReadLinesFromFileIterDataPipe as ReadLinesFromFile, +from torch.utils.data.datapipes.iter.linereader import ( + LineReaderIterDataPipe as LineReader, ) from torch.utils.data.datapipes.iter.routeddecoder import ( RoutedDecoderIterDataPipe as RoutedDecoder, @@ -40,33 +33,38 @@ from torch.utils.data.datapipes.iter.selecting import ( FilterIterDataPipe as Filter, ) -from torch.utils.data.datapipes.iter.tobytes import ( - ToBytesIterDataPipe as ToBytes, +from torch.utils.data.datapipes.iter.streamreader import ( + StreamReaderIterDataPipe as StreamReader, +) +from torch.utils.data.datapipes.iter.tararchivereader import ( + TarArchiveReaderIterDataPipe as TarArchiveReader, +) +from torch.utils.data.datapipes.iter.ziparchivereader import ( + ZipArchiveReaderIterDataPipe as ZipArchiveReader, ) from torch.utils.data.datapipes.iter.utils import ( IterableAsDataPipeIterDataPipe as IterableAsDataPipe, ) -__all__ = ['Batch', +__all__ = ['Batcher', 'BucketBatcher', - 'Collate', - 'Concat', + 'ByKeyGrouper', + 'Collator', + 'Concater', + 'FileLister', + 'FileLoader', 'Filter', - 'GroupByKey', 'HttpReader', 'IterableAsDataPipe', - 'ListDirFiles', - 'LoadFilesFromDisk', - 'Map', - 'ReadFilesFromTar', - 'ReadFilesFromZip', - 'ReadLinesFromFile', + 'LineReader', + 'Mapper', 'RoutedDecoder', 'Sampler', - 'Shuffle', - 'ToBytes', - 'Transforms', - 'Zip'] + 'Shuffler', + 'StreamReader', + 'TarArchiveReader', + 'ZipArchiveReader', + 'Zipper'] # Please keep this list sorted assert __all__ == sorted(__all__) diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py index cc0f9e13b3adf..18f6f17fff156 100644 --- a/torch/utils/data/datapipes/iter/callable.py +++ b/torch/utils/data/datapipes/iter/callable.py @@ -1,5 +1,4 @@ import warnings -import torch.nn as nn from torch.utils.data import IterDataPipe, _utils, functional_datapipe, DataChunk from typing import Callable, Dict, Iterator, Optional, Sized, Tuple, TypeVar @@ -26,8 +25,8 @@ def default_fn(data): @functional_datapipe('map') -class MapIterDataPipe(IterDataPipe[T_co]): - r""" :class:`MapIterDataPipe`. +class MapperIterDataPipe(IterDataPipe[T_co]): + r""" :class:`MapperIterDataPipe`. Iterable DataPipe to run a function over each item from the source DataPipe. The function can be any regular python function or partial object. Lambda @@ -108,8 +107,8 @@ def __setstate__(self, state): @functional_datapipe('collate') -class CollateIterDataPipe(MapIterDataPipe): - r""" :class:`CollateIterDataPipe`. +class CollatorIterDataPipe(MapperIterDataPipe): + r""" :class:`CollatorIterDataPipe`. Iterable DataPipe to collate samples from datapipe to Tensor(s) by `util_.collate.default_collate`, or customized Data Structure by collate_fn. @@ -153,35 +152,3 @@ def __init__(self, fn_kwargs: Optional[Dict] = None, ) -> None: super().__init__(datapipe, fn=collate_fn, fn_args=fn_args, fn_kwargs=fn_kwargs) - - -@functional_datapipe('legacy_transforms') -class TransformsIterDataPipe(MapIterDataPipe): - r""" :class:`TransformsIterDataPipe`. - - Iterable DataPipe to use transform(s) from torchvision or torchaudio to transform - data from datapipe. - args: - datapipe: Iterable DataPipe being transformed - transforms: A transform or a sequence of transforms from torchvision or torchaudio. - """ - - def __init__(self, - datapipe: IterDataPipe, - transforms: Callable, - ) -> None: - # Type checking for transforms - transforms_types: Tuple = (nn.Module, ) - try: - # Specific types of transforms other than `nn.Module` from torchvision - import torchvision.transforms as tsfm - transforms_types += (tsfm.Compose, tsfm.RandomChoice, tsfm.RandomOrder, - tsfm.ToPILImage, tsfm.ToTensor, tsfm.Lambda) - except ImportError: - pass - - if not isinstance(transforms, transforms_types): - raise TypeError("`transforms` are required to be a callable from " - "torchvision.transforms or torchaudio.transforms") - - super().__init__(datapipe, fn=transforms) diff --git a/torch/utils/data/datapipes/iter/combinatorics.py b/torch/utils/data/datapipes/iter/combinatorics.py index a8b1e3d9737fa..d1a7dd0368221 100644 --- a/torch/utils/data/datapipes/iter/combinatorics.py +++ b/torch/utils/data/datapipes/iter/combinatorics.py @@ -44,8 +44,8 @@ def __len__(self) -> int: @functional_datapipe('shuffle') -class ShuffleIterDataPipe(IterDataPipe[T_co]): - r""" :class:`ShuffleIterDataPipe` +class ShufflerIterDataPipe(IterDataPipe[T_co]): + r""" :class:`ShufflerIterDataPipe` Iterable DataPipe to shuffle the input DataPipe with a buffer. The buffer with `buffer_size` is filled with elements from the datapipe first. Then, diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py index 0693b1f0ad6de..4b28e0926c42b 100644 --- a/torch/utils/data/datapipes/iter/combining.py +++ b/torch/utils/data/datapipes/iter/combining.py @@ -7,8 +7,8 @@ @functional_datapipe('concat') -class ConcatIterDataPipe(IterDataPipe): - r""" :class:`ConcatIterDataPipe`. +class ConcaterIterDataPipe(IterDataPipe): + r""" :class:`ConcaterIterDataPipe`. Iterable DataPipe to concatenate multiple Iterable DataPipes. args: @@ -54,7 +54,7 @@ def __iter__(self): @functional_datapipe('fork') -class ForkIterDataPipe(IterDataPipe): +class ForkerIterDataPipe(IterDataPipe): def __new__(cls, datapipe, instances): result = [] @@ -96,7 +96,7 @@ def __iter__(self): @functional_datapipe('zip') -class ZipIterDataPipe(IterDataPipe[Tuple[T_co]]): +class ZipperIterDataPipe(IterDataPipe[Tuple[T_co]]): r""" :class:`ZipIterDataPipe`. Iterable DataPipe aggregates elements into a tuple from each of diff --git a/torch/utils/data/datapipes/iter/listdirfiles.py b/torch/utils/data/datapipes/iter/filelister.py similarity index 93% rename from torch/utils/data/datapipes/iter/listdirfiles.py rename to torch/utils/data/datapipes/iter/filelister.py index 91ef8a3b080a4..48fdce9f52ef5 100644 --- a/torch/utils/data/datapipes/iter/listdirfiles.py +++ b/torch/utils/data/datapipes/iter/filelister.py @@ -2,8 +2,8 @@ from torch.utils.data.datapipes.utils.common import get_file_pathnames_from_root from typing import List, Union, Iterator -class ListDirFilesIterDataPipe(IterDataPipe[str]): - r""" :class:`ListDirFilesIterDataPipe` +class FileListerIterDataPipe(IterDataPipe[str]): + r""" :class:`FileListerIterDataPipe` Iterable DataPipe to load file pathname(s) (path + filename), yield pathname from given disk root dir. args: diff --git a/torch/utils/data/datapipes/iter/loadfilesfromdisk.py b/torch/utils/data/datapipes/iter/fileloader.py similarity index 93% rename from torch/utils/data/datapipes/iter/loadfilesfromdisk.py rename to torch/utils/data/datapipes/iter/fileloader.py index c9dd5daf9a17a..2b73e4e156b70 100644 --- a/torch/utils/data/datapipes/iter/loadfilesfromdisk.py +++ b/torch/utils/data/datapipes/iter/fileloader.py @@ -5,8 +5,8 @@ from torch.utils.data.datapipes.utils.common import get_file_binaries_from_pathnames -class LoadFilesFromDiskIterDataPipe(IterDataPipe[Tuple[str, IOBase]]): - r""" :class:`LoadFilesFromDiskIterDataPipe`. +class FileLoaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]): + r""" :class:`FileLoaderIterDataPipe`. Iterable Datapipe to load file streams from given pathnames, yield pathname and file stream in a tuple. diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py index e6304c2de8217..5f449489ac756 100644 --- a/torch/utils/data/datapipes/iter/grouping.py +++ b/torch/utils/data/datapipes/iter/grouping.py @@ -32,8 +32,8 @@ def __iter__(self): @functional_datapipe('batch') -class BatchIterDataPipe(IterDataPipe[DataChunk[T_co]]): - r""" :class:`BatchIterDataPipe`. +class BatcherIterDataPipe(IterDataPipe[DataChunk[T_co]]): + r""" :class:`BatcherIterDataPipe`. Iterable DataPipe to create mini-batches of data. An outer dimension will be added as `batch_size` if `drop_last` is set to `True`, or `length % batch_size` for the @@ -93,8 +93,8 @@ def __len__(self) -> int: @functional_datapipe('unbatch') -class UnBatchIterDataPipe(IterDataPipe): - r""" :class:`UnBatchIterDataPipe`. +class UnBatcherIterDataPipe(IterDataPipe): + r""" :class:`UnBatcherIterDataPipe`. Iterable DataPipe to undo batching of data. In other words, it flattens the data up to the specified level within a batched DataPipe. @@ -255,7 +255,7 @@ def cmp_fn(a: Tuple[str, Any], b: Tuple[str, Any]): @functional_datapipe('groupby') -class GroupByIterDataPipe(IterDataPipe): +class GrouperIterDataPipe(IterDataPipe): # TODO(VtalyFedyunin): Add inline docs and tests (they are partially available in notebooks) def __init__(self, datapipe: IterDataPipe[T_co], @@ -329,7 +329,7 @@ def __iter__(self): @functional_datapipe('group_by_key') -class GroupByKeyIterDataPipe(IterDataPipe[list]): +class ByKeyGrouperIterDataPipe(IterDataPipe[list]): r""" :class:`GroupByKeyIterDataPipe`. Iterable datapipe to group data from input iterable by keys which are generated from `group_key_fn`, diff --git a/torch/utils/data/datapipes/iter/linereader.py b/torch/utils/data/datapipes/iter/linereader.py new file mode 100644 index 0000000000000..2b15b93c9c60a --- /dev/null +++ b/torch/utils/data/datapipes/iter/linereader.py @@ -0,0 +1,18 @@ +from typing import Tuple +from torch.utils.data import IterDataPipe + + +class LineReaderIterDataPipe(IterDataPipe[Tuple[str, str]]): + r""" :class:`LineReaderIterDataPipe` + + Iterable DataPipe to load file name and stream as source IterDataPipe + and yield filename and line(s). + """ + + def __init__(self, source_datapipe): + self.source_datapipe = source_datapipe + + def __iter__(self): + for file_name, stream in self.source_datapipe: + for line in stream: + yield file_name, line diff --git a/torch/utils/data/datapipes/iter/readlinesfromfile.py b/torch/utils/data/datapipes/iter/readlinesfromfile.py deleted file mode 100644 index c8366af3b475f..0000000000000 --- a/torch/utils/data/datapipes/iter/readlinesfromfile.py +++ /dev/null @@ -1,19 +0,0 @@ -from typing import Tuple -from torch.utils.data import IterDataPipe - - -class ReadLinesFromFileIterDataPipe(IterDataPipe[Tuple[str, str]]): - r""" :class:`ReadLinesFromFileDataPipe` - - Iterable DataPipe to load file names as source iter data pipe - and yield filename and line(s). - """ - - def __init__(self, source_datapipe): - self.source_datapipe = source_datapipe - - def __iter__(self): - for file_name in self.source_datapipe: - with open(file_name) as file: - for line in file: - yield (file_name, line) diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py index 46a613a7d91a4..83872cebdb53d 100644 --- a/torch/utils/data/datapipes/iter/selecting.py +++ b/torch/utils/data/datapipes/iter/selecting.py @@ -1,13 +1,13 @@ from torch.utils.data import IterDataPipe, functional_datapipe, DataChunk from typing import Callable, TypeVar, Iterator, Optional, Tuple, Dict -from .callable import MapIterDataPipe +from .callable import MapperIterDataPipe T_co = TypeVar('T_co', covariant=True) @functional_datapipe('filter') -class FilterIterDataPipe(MapIterDataPipe): +class FilterIterDataPipe(MapperIterDataPipe): r""" :class:`FilterIterDataPipe`. Iterable DataPipe to filter elements from datapipe according to filter_fn. diff --git a/torch/utils/data/datapipes/iter/tobytes.py b/torch/utils/data/datapipes/iter/streamreader.py similarity index 85% rename from torch/utils/data/datapipes/iter/tobytes.py rename to torch/utils/data/datapipes/iter/streamreader.py index 21fd82d381bcb..f74efe746a759 100644 --- a/torch/utils/data/datapipes/iter/tobytes.py +++ b/torch/utils/data/datapipes/iter/streamreader.py @@ -2,8 +2,8 @@ from torch.utils.data import IterDataPipe -class ToBytesIterDataPipe(IterDataPipe[Tuple[str, bytes]]): - r""" :class:`ToBytesIterDataPipe` +class StreamReaderIterDataPipe(IterDataPipe[Tuple[str, bytes]]): + r""" :class:`StreamReaderIterDataPipe` Iterable DataPipe to load IO stream with label name, and to yield bytes with label name in a tuple diff --git a/torch/utils/data/datapipes/iter/readfilesfromtar.py b/torch/utils/data/datapipes/iter/tararchivereader.py similarity index 96% rename from torch/utils/data/datapipes/iter/readfilesfromtar.py rename to torch/utils/data/datapipes/iter/tararchivereader.py index f4566021fcc7f..9145f5f1dbc11 100644 --- a/torch/utils/data/datapipes/iter/readfilesfromtar.py +++ b/torch/utils/data/datapipes/iter/tararchivereader.py @@ -7,8 +7,8 @@ import tarfile import warnings -class ReadFilesFromTarIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]): - r""":class:`ReadFilesFromTarIterDataPipe`. +class TarArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]): + r""" :class:`TarArchiveReaderIterDataPipe`. Iterable datapipe to extract tar binary streams from input iterable which contains tuples of pathname and tar binary stream, yields pathname and extracted binary stream in a tuple. diff --git a/torch/utils/data/datapipes/iter/readfilesfromzip.py b/torch/utils/data/datapipes/iter/ziparchivereader.py similarity index 96% rename from torch/utils/data/datapipes/iter/readfilesfromzip.py rename to torch/utils/data/datapipes/iter/ziparchivereader.py index edb8320aece9f..e98bd179760c8 100644 --- a/torch/utils/data/datapipes/iter/readfilesfromzip.py +++ b/torch/utils/data/datapipes/iter/ziparchivereader.py @@ -8,8 +8,8 @@ import zipfile import warnings -class ReadFilesFromZipIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]): - r""" :class:`ReadFilesFromZipIterDataPipe`. +class ZipArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]): + r""" :class:`ZipArchiveReaderIterDataPipe`. Iterable data pipe to extract zip binary streams from input iterable which contains tuples of pathname and zip binary stream, yields pathname and extracted binary stream in a tuple. diff --git a/torch/utils/data/datapipes/map/__init__.py b/torch/utils/data/datapipes/map/__init__.py index b7609957baaa8..5879165aff2eb 100644 --- a/torch/utils/data/datapipes/map/__init__.py +++ b/torch/utils/data/datapipes/map/__init__.py @@ -1,7 +1,6 @@ # Functional DataPipe -from torch.utils.data.datapipes.map.callable import MapMapDataPipe as Map -from torch.utils.data.datapipes.map.combining import \ - (ConcatMapDataPipe as Concat) +from torch.utils.data.datapipes.map.callable import MapperMapDataPipe as Mapper +from torch.utils.data.datapipes.map.combining import ConcaterMapDataPipe as Concater -__all__ = ['Map', 'Concat'] +__all__ = ['Concater', 'Mapper'] diff --git a/torch/utils/data/datapipes/map/callable.py b/torch/utils/data/datapipes/map/callable.py index 00457299316ae..8dbad957e069d 100644 --- a/torch/utils/data/datapipes/map/callable.py +++ b/torch/utils/data/datapipes/map/callable.py @@ -26,8 +26,8 @@ def default_fn(data): @functional_datapipe('map') -class MapMapDataPipe(MapDataPipe[T_co]): - r""":class:`MapMapDataPipe`. +class MapperMapDataPipe(MapDataPipe[T_co]): + r""":class:`MapperMapDataPipe`. Map DataPipe to run a function over each item from the source DataPipe. The function can be any regular python function or partial object. Lambda diff --git a/torch/utils/data/datapipes/map/combining.py b/torch/utils/data/datapipes/map/combining.py index 234d45382efe6..4743c3726b356 100644 --- a/torch/utils/data/datapipes/map/combining.py +++ b/torch/utils/data/datapipes/map/combining.py @@ -5,8 +5,8 @@ @functional_datapipe('concat') -class ConcatMapDataPipe(MapDataPipe): - r""" :class:`ConcatMapDataPipe`. +class ConcaterMapDataPipe(MapDataPipe): + r""" :class:`ConcaterMapDataPipe`. Map DataPipe to concatenate multiple Map DataPipes. The actual index of is the cumulative sum of source datapipes. From 49be16d50ab7d7fe38b241347ff72e381feba071 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Mon, 23 Aug 2021 15:02:10 -0700 Subject: [PATCH 140/530] .github: Add ec2 information as a step (#63784) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63784 Also creates the common.yml.j2 file as a place to store common code amongst the templates Should look like: ![image](https://user-images.githubusercontent.com/1700823/130495226-f18b8c0f-1ea7-4097-8bbb-e998fabb71f2.png) Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: malfet, driazati Differential Revision: D30490682 Pulled By: seemethere fbshipit-source-id: 18028b4acff938ef54cd6e4877561b2d830a11cf --- .github/scripts/display_ec2_information.sh | 14 +++++++++++++ .github/templates/common.yml.j2 | 6 ++++++ .github/templates/linux_ci_workflow.yml.j2 | 6 ++++++ .github/templates/windows_ci_workflow.yml.j2 | 4 ++++ ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml | 9 ++++++++ ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml | 9 ++++++++ ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 17 +++++++++++++++ ...rated-linux-bionic-py3.8-gcc9-coverage.yml | 17 +++++++++++++++ ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 17 +++++++++++++++ ...rated-linux-xenial-cuda11.1-py3.6-gcc7.yml | 17 +++++++++++++++ .../generated-linux-xenial-py3.6-gcc5.4.yml | 21 +++++++++++++++++++ ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml | 5 +++++ ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml | 9 ++++++++ ...iodic-linux-xenial-cuda11.3-py3.6-gcc7.yml | 17 +++++++++++++++ ...rated-periodic-win-vs2019-cuda11.3-py3.yml | 13 ++++++++++++ .../generated-win-vs2019-cpu-py3.yml | 13 ++++++++++++ .../generated-win-vs2019-cuda10.1-py3.yml | 13 ++++++++++++ .../generated-win-vs2019-cuda11.1-py3.yml | 13 ++++++++++++ 18 files changed, 220 insertions(+) create mode 100755 .github/scripts/display_ec2_information.sh create mode 100644 .github/templates/common.yml.j2 diff --git a/.github/scripts/display_ec2_information.sh b/.github/scripts/display_ec2_information.sh new file mode 100755 index 0000000000000..be47418966025 --- /dev/null +++ b/.github/scripts/display_ec2_information.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -euo pipefail + +function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" +} + +echo "ami-id: $(get_ec2_metadata ami-id)" +echo "instance-id: $(get_ec2_metadata instance-id)" +echo "instance-type: $(get_ec2_metadata instance-type)" diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 new file mode 100644 index 0000000000000..12108f1f95f46 --- /dev/null +++ b/.github/templates/common.yml.j2 @@ -0,0 +1,6 @@ +{%- macro display_ec2_information() -%} + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh +{%- endmacro -%} diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 9b4ba87194065..25099b46cbc8e 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -1,3 +1,4 @@ +{% import 'common.yml.j2' as common %} {# squid_proxy is an private ELB that only available for GHA custom runners #} {%- set squid_proxy = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%} {# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #} @@ -89,6 +90,7 @@ jobs: with: # deep clone, to allow use of git merge-base fetch-depth: 0 + !{{ common.display_ec2_information() }} - name: Calculate docker image tag id: calculate-tag run: | @@ -165,6 +167,7 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + !{{ common.display_ec2_information() }} - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -318,6 +321,7 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + !{{ common.display_ec2_information() }} - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -476,6 +480,7 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + !{{ common.display_ec2_information() }} - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: @@ -541,6 +546,7 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + !{{ common.display_ec2_information() }} - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index c1160fe32de60..f00f4b19a903d 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -1,3 +1,4 @@ +{% import 'common.yml.j2' as common %} {# squid_proxy is an private ELB that only available for GHA custom runners #} {%- set squid_proxy = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%} {# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #} @@ -82,6 +83,7 @@ jobs: path: pytorch-${{ github.run_id }} # deep clone, to allow use of git merge-base fetch-depth: 0 + !{{ common.display_ec2_information() }} - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -190,6 +192,7 @@ jobs: path: pytorch-${{ github.run_id }} # deep clone, to allow use of git merge-base fetch-depth: 0 + !{{ common.display_ec2_information() }} - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -290,6 +293,7 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + !{{ common.display_ec2_information() }} - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml index 780de8e1919e9..f45ed052e3838 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: libtorch-linux-xenial-cuda10.2-py3.6-gcc7 @@ -63,6 +64,10 @@ jobs: with: # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Calculate docker image tag id: calculate-tag run: | @@ -139,6 +144,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml index 3d586ae322e8e..ba59027969b7a 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: libtorch-linux-xenial-cuda11.1-py3.6-gcc7 @@ -63,6 +64,10 @@ jobs: with: # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Calculate docker image tag id: calculate-tag run: | @@ -139,6 +144,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 7e6006ad300de..67bb2064863bc 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-bionic-cuda10.2-py3.9-gcc7 @@ -63,6 +64,10 @@ jobs: with: # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Calculate docker image tag id: calculate-tag run: | @@ -139,6 +144,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -286,6 +295,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -436,6 +449,10 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index 2cd316b2b8674..59061662341a3 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-bionic-py3.8-gcc9-coverage @@ -63,6 +64,10 @@ jobs: with: # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Calculate docker image tag id: calculate-tag run: | @@ -139,6 +144,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -286,6 +295,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -440,6 +453,10 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index a88191469ebde..d897e28ace007 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-xenial-cuda10.2-py3.6-gcc7 @@ -63,6 +64,10 @@ jobs: with: # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Calculate docker image tag id: calculate-tag run: | @@ -139,6 +144,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -286,6 +295,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -436,6 +449,10 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml index 264553f1877ee..30514cab07ea4 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-xenial-cuda11.1-py3.6-gcc7 @@ -63,6 +64,10 @@ jobs: with: # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Calculate docker image tag id: calculate-tag run: | @@ -139,6 +144,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -286,6 +295,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -436,6 +449,10 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index 7b100e6d1294d..bd4d65027c066 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-xenial-py3.6-gcc5.4 @@ -63,6 +64,10 @@ jobs: with: # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Calculate docker image tag id: calculate-tag run: | @@ -139,6 +144,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -286,6 +295,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -436,6 +449,10 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: @@ -498,6 +515,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index dbfba5f1fa74d..1827249beae99 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/bazel_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-xenial-py3.6-gcc7-bazel-test @@ -63,6 +64,10 @@ jobs: with: # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Calculate docker image tag id: calculate-tag run: | diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index 8352b229f4fae..1f4c6d270ec54 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7 @@ -61,6 +62,10 @@ jobs: with: # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Calculate docker image tag id: calculate-tag run: | @@ -137,6 +142,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml index 9d7da989e8b9a..5fd0e99f002f7 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: periodic-linux-xenial-cuda11.3-py3.6-gcc7 @@ -61,6 +62,10 @@ jobs: with: # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Calculate docker image tag id: calculate-tag run: | @@ -137,6 +142,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -284,6 +293,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -434,6 +447,10 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml index 9487ea5a8fdb3..407aace6e4aee 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/windows_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: periodic-win-vs2019-cuda11.3-py3 @@ -57,6 +58,10 @@ jobs: path: pytorch-${{ github.run_id }} # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -154,6 +159,10 @@ jobs: path: pytorch-${{ github.run_id }} # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -248,6 +257,10 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index fb2a097c64452..35f9feccaf26c 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/windows_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: win-vs2019-cpu-py3 @@ -57,6 +58,10 @@ jobs: path: pytorch-${{ github.run_id }} # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -146,6 +151,10 @@ jobs: path: pytorch-${{ github.run_id }} # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -232,6 +241,10 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index 2fbc8650f43e1..26b703500b0d1 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/windows_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: win-vs2019-cuda10.1-py3 @@ -59,6 +60,10 @@ jobs: path: pytorch-${{ github.run_id }} # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -156,6 +161,10 @@ jobs: path: pytorch-${{ github.run_id }} # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -250,6 +259,10 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: diff --git a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml index 8b52a07055608..d4175aca5f02d 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/windows_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: win-vs2019-cuda11.1-py3 @@ -59,6 +60,10 @@ jobs: path: pytorch-${{ github.run_id }} # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -156,6 +161,10 @@ jobs: path: pytorch-${{ github.run_id }} # deep clone, to allow use of git merge-base fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -250,6 +259,10 @@ jobs: with: # deep clone, to allow tools/stats/print_test_stats.py to use Git commands fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - uses: actions/download-artifact@v2 name: Download PyTorch Test Reports with: From 55f8f95ad4b36d201b63bd4b53984d378cf3d672 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Mon, 23 Aug 2021 15:36:59 -0700 Subject: [PATCH 141/530] fix git diff issue (#63408) Summary: Fixes https://github.com/pytorch/pytorch/issues/60111, ideally we should merge this before https://github.com/pytorch/pytorch/issues/63360 but we can also test this with https://github.com/pytorch/pytorch/issues/63360 easily. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63408 Test Plan: - This is conform working with local test.sh run by setting PR_NUMBER - should be validated by GHA CI as well Concern: - currently GHA CI is running into proxy 403 rate-limit exceeded issue consistently. However the worst case is not generating any git diff files, which is going to be exactly the same as current behavior. - depends on https://github.com/pytorch/pytorch/issues/63770. Reviewed By: driazati, janeyx99 Differential Revision: D30489355 Pulled By: walterddr fbshipit-source-id: a638b7ae5820f29a7aca6cc40ff390ab253cb174 --- .jenkins/pytorch/common_utils.sh | 11 +++++++++++ .jenkins/pytorch/test.sh | 12 +++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index 49db051a0f484..cb7ef207af47c 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -49,6 +49,17 @@ function get_exit_code() { return $retcode } +function get_pr_change_files() { + # The fetch may fail on Docker hosts, this fetch is necessary for GHA + # accepts PR_NUMBER and extract filename as arguments + set +e + tmp_file=$(mktemp) + wget -O "$tmp_file" "https://api.github.com/repos/pytorch/pytorch/pulls/$1/files" + # this regex extracts the filename list according to the GITHUB REST API result. + sed -n "s/.*\"filename\": \"\(.*\)\",/\1/p" "$tmp_file" | tee "$2" + set -e +} + function file_diff_from_base() { # The fetch may fail on Docker hosts, this fetch is necessary for GHA set +e diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 9f3e378a45fb8..4fce9ab00dcc9 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -142,12 +142,14 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX512-* || $TEST_CONFIG == 'nogpu_NO_AVX export ATEN_CPU_CAPABILITY=avx2 fi -# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs -# see https://github.com/pytorch/pytorch/issues/60111 -# change it back to PR_NUMBER when issue is fixed. -if [ -n "$CIRCLE_PR_NUMBER" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then +if [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then + # if PR_NUMBER exist, use it to grab PR contents. DETERMINE_FROM=$(mktemp) - file_diff_from_base "$DETERMINE_FROM" + if [ -n "$PR_NUMBER" ]; then + get_pr_change_files "$PR_NUMBER" "$DETERMINE_FROM" + else + file_diff_from_base "$DETERMINE_FROM" + fi fi test_python_legacy_jit() { From 84890aae352ea6bf8dd2c501683a0da240ea7dec Mon Sep 17 00:00:00 2001 From: Don Jang Date: Mon, 23 Aug 2021 16:20:27 -0700 Subject: [PATCH 142/530] [Static Runtime] Add an out variant op for aten::abs (#63675) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63675 This change adds an out variant implementation for `aten::abs`. Test Plan: - Observed `V0820 14:14:08.880342 101788 impl.cpp:1394] Switch to out variant for node: %3 : Tensor = aten::abs(%a.1)` - Perf impact: TBD Reviewed By: hlu1 Differential Revision: D30461317 fbshipit-source-id: 0c0230bd40afe463ae1ccb222c2a1207ebcf4191 --- benchmarks/static_runtime/test_scripts.h | 5 +++++ .../static_runtime/test_static_runtime.cc | 9 +++++++++ torch/csrc/jit/runtime/static/ops.cpp | 17 +++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 73380129731ed..9e01d3b8d0b87 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -26,6 +26,11 @@ alias of the model output. */ +const auto abs_script = R"JIT( + def forward(self, a): + return a.abs().clone() +)JIT"; + const auto list_construct_script = R"JIT( def forward(self, a, b): return [a, b] diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index dfe2c14e9489b..71102215b4e2e 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -256,6 +256,15 @@ TEST(StaticRuntime, Addmm) { testStaticRuntime(addmm_script, args, args1); } +TEST(StaticRuntime, IndividualOps_Abs) { + auto a = at::randn({2, 3}); + auto b = at::randn({4, 2, 3}); + std::vector args{a}; + std::vector args2{b}; + testStaticRuntime(abs_script, args); + testStaticRuntime(abs_script, args, args2); +} + TEST(StaticRuntime, IndividualOps_Binary) { auto a = at::randn({2, 3}); auto b = at::ones({2, 3}); diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 2543182db138c..27f6e545ec7bc 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -301,6 +301,23 @@ REGISTER_OPERATOR_FUNCTOR( }; }); +REGISTER_OPERATOR_FUNCTOR(aten::abs, aten_abs, [](Node* n) -> SROperator { + if (!n->matches(torch::schema("aten::abs(Tensor self) -> Tensor"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto& in0_t = p_node->Input(0).toTensor(); + if (p_node->Output(0).isNone()) { + p_node->Output(0) = at::native::abs(in0_t); + } else { + auto& out_t = p_node->Output(0).toTensor(); + fastResizeToZero(out_t); + at::native::abs_out(in0_t, out_t); + } + }; +}); + REGISTER_OPERATOR_FUNCTOR(aten::mul, aten_mul, [](Node* n) -> SROperator { if (!n->matches(torch::schema( "aten::mul.Tensor(Tensor self, Tensor other) -> Tensor"))) { From 130549d61ba34f3d70167be3dc88631385112625 Mon Sep 17 00:00:00 2001 From: Akshit Khurana Date: Mon, 23 Aug 2021 16:33:07 -0700 Subject: [PATCH 143/530] Fix typo in NNAPI tests (#63797) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63797 nnapi memory format test has a typo Test Plan: pytest test/test_nnapi.py::TestNNAPI Imported from OSS Reviewed By: Amyh11325 Differential Revision: D30495473 fbshipit-source-id: 8edad7c01a080847a64a2797e077ec4d6077552a --- test/test_nnapi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_nnapi.py b/test/test_nnapi.py index d70bebf547a1e..f8db7e1a3df90 100644 --- a/test/test_nnapi.py +++ b/test/test_nnapi.py @@ -78,7 +78,7 @@ def check( # to get a nice message. self.assertEqual(eager_output, nnapi_output, atol=0, rtol=0) if expected_memory_format: - self.assertTrue(nnapi_out.is_contiguous(memory_format=expected_memory_format)) + self.assertTrue(nnapi_output.is_contiguous(memory_format=expected_memory_format)) def float_and_quant_and_nhwc(self, inp_float, scale, zero_point): torch.manual_seed(29) From fc6dd0bc008d1a1872626567506be6e9e5dcbae1 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Mon, 23 Aug 2021 17:26:27 -0700 Subject: [PATCH 144/530] [JIT] Move UseVariadicCat internals (#63577) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63577 Since other variadic ops will have an almost identical implementation, we can generalize the `UseVariadicCat` implementation and put it in a common folder. Also moved some test utilities that other variadic op tests will likely need. Test Plan: `buck test caffe2/test/cpp/jit:jit -- ConcatOptTest` Reviewed By: navahgar Differential Revision: D30409937 fbshipit-source-id: 925c11c27b58ce98cb8368d2a205e26ba66d3db9 --- test/cpp/jit/test_concat_opt.cpp | 64 ++++--------- test/cpp/jit/test_utils.cpp | 32 +++++++ test/cpp/jit/test_utils.h | 7 ++ tools/build_variables.bzl | 1 + torch/csrc/jit/passes/concat_opt.cpp | 90 ------------------ torch/csrc/jit/passes/concat_opt.h | 7 -- torch/csrc/jit/passes/variadic_ops.cpp | 126 +++++++++++++++++++++++++ torch/csrc/jit/passes/variadic_ops.h | 16 ++++ torch/csrc/jit/runtime/static/impl.cpp | 2 +- 9 files changed, 200 insertions(+), 145 deletions(-) create mode 100644 torch/csrc/jit/passes/variadic_ops.cpp create mode 100644 torch/csrc/jit/passes/variadic_ops.h diff --git a/test/cpp/jit/test_concat_opt.cpp b/test/cpp/jit/test_concat_opt.cpp index 03c0ce6a58dae..5cb73d234927e 100644 --- a/test/cpp/jit/test_concat_opt.cpp +++ b/test/cpp/jit/test_concat_opt.cpp @@ -1,45 +1,15 @@ #include +#include #include #include +#include #include #include namespace torch { namespace jit { -namespace { - -void checkOutputs( - const std::vector& out1, - const std::vector& out2) { - ASSERT_EQ(out1.size(), out2.size()); - for (size_t i = 0; i < out1.size(); ++i) { - ASSERT_EQ(out1[i].sizes(), out2[i].sizes()); - float max_diff = (out1[i] - out2[i]).abs().max().item(); - ASSERT_EQ(max_diff, 0); - } -} - -std::vector runGraph( - std::shared_ptr graph, - const std::vector inputs) { - std::vector stack = fmap(inputs); - Code code(graph, "test"); - InterpreterState(code).run(stack); - TORCH_INTERNAL_ASSERT(!stack.empty()); - // Graph outputs that are handled below: - // * A list of Tensors. - // * 1 Tensor. - if (stack.front().isTensorList()) { - return stack.front().toTensorVector(); - } - TORCH_INTERNAL_ASSERT(stack.front().isTensor()); - return {stack.front().toTensor()}; -} - -} // namespace - TEST(ConcatOptTest, SimpleCommonInputsEliminationPrefix) { auto graph = std::make_shared(); @@ -64,7 +34,7 @@ TEST(ConcatOptTest, SimpleCommonInputsEliminationPrefix) { ASSERT_TRUE(EliminateConcatCommonInputs(graph)); graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // Graph after EliminateConcatCommonInputs: // graph(%0 : ..., @@ -109,7 +79,7 @@ TEST(ConcatOptTest, SimpleCommonInputsEliminationSuffix) { ASSERT_TRUE(EliminateConcatCommonInputs(graph)); graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // Graph after EliminateConcatCommonInputs: // graph(%0 : ..., @@ -161,7 +131,7 @@ TEST(ConcatOptTest, CommonInputsEliminationWithDifferentOrderInputs) { graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // No optimizations should have happened in this case since the inputs // to the `cat` are in different order. @@ -198,7 +168,7 @@ TEST(ConcatOptTest, MoreCommonInputsElimination) { ASSERT_TRUE(EliminateConcatCommonInputs(graph)); graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); testing::FileCheck() .check_count("= prim::VarConcat(%0, %1, %5)", 1, /*exactly*/ true) @@ -233,7 +203,7 @@ TEST(ConcatOptTest, ExpandConcat) { graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // After full concat optimization we should have the following graph: // @@ -289,7 +259,7 @@ TEST(ConcatOptTest, ConcatWithoutResultShape) { graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // No optimizations should have happened in this case since the output // shape of `aten::cat` is not known. @@ -324,7 +294,7 @@ TEST(ConcatOptTest, ConcatWithoutInputShape) { graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // No optimizations should have happened in this case since the shape of %5, // which is an input to `aten::cat`, is not known. @@ -361,7 +331,7 @@ TEST(ConcatOptTest, UseVariadicCat) { graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // After replacing `aten::cat` with `prim::VarConcat` we should have the // following graph: @@ -406,7 +376,7 @@ TEST(OptimizeConcatTest, UseVariadicCatReplaceMultiple) { graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // After full concat optimization we should have the following graph: // @@ -446,7 +416,7 @@ TEST(ConcatOptTest, UseVariadicCatWithMultipleListUses) { graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // After replacing `aten::cat` with `prim::VarConcat` we should have the // following graph: @@ -488,7 +458,7 @@ TEST(ConcatOptTest, UseVariadicCatWithListMutationAfterCat) { ASSERT_TRUE(UseVariadicCat(graph)); graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // The input list to `aten::cat` is mutated only after `aten::cat` op. So, // it should have been replaced with `prim::VarConcat`. The transformed graph @@ -534,7 +504,7 @@ TEST(ConcatOptTest, UseVariadicCatWithListMutationBeforeCat) { ASSERT_FALSE(UseVariadicCat(graph)); graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // No transformation should have happened since the `prim::ListConstruct` is // mutated before `aten::cat`. @@ -549,7 +519,7 @@ TEST(ConcatOptTest, UseVariadicCatWithListMutationBeforeCat) { ASSERT_TRUE(RemoveListMutationAndUseVariadicCat(graph)); graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // The mutation of the list must be removed and the `aten::cat` op must // be replaced with the `prim::VarConcat` op in the graph. The transformed @@ -602,7 +572,7 @@ TEST(ConcatOptTest, UseVariadicCatWithMultipleListMutations) { ASSERT_TRUE(RemoveListMutationAndUseVariadicCat(graph)); graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // All the mutations of the list must be removed and the `aten::cat` ops must // be replaced with `prim::VarConcat` ops in the graph. The transformed graph @@ -659,7 +629,7 @@ TEST( ASSERT_TRUE(EliminateConcatCommonInputs(graph)); graph->lint(); auto opt_outputs = runGraph(graph, inputs); - checkOutputs(orig_outputs, opt_outputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); // After performing: // * Remove list mutation diff --git a/test/cpp/jit/test_utils.cpp b/test/cpp/jit/test_utils.cpp index 7750ba8f10fee..27667f068588b 100644 --- a/test/cpp/jit/test_utils.cpp +++ b/test/cpp/jit/test_utils.cpp @@ -198,6 +198,7 @@ bool checkRtol(const at::Tensor& diff, const std::vector inputs) { } return diff.abs().max().item() < 2e-6 * maxValue; } + bool almostEqual(const at::Tensor& a, const at::Tensor& b) { return checkRtol(a - b, {a, b}); } @@ -206,6 +207,20 @@ bool exactlyEqual(const at::Tensor& a, const at::Tensor& b) { return (a - b).abs().max().item() == 0.f; } +bool exactlyEqual( + const std::vector& a, + const std::vector& b) { + if (a.size() != b.size()) { + return false; + } + for (size_t i = 0; i < a.size(); ++i) { + if (!exactlyEqual(a[i], b[i])) { + return false; + } + } + return true; +} + std::pair lstm( at::Tensor input, at::Tensor hx, @@ -248,5 +263,22 @@ RegisterOperators reg({ }); } // namespace +std::vector runGraph( + std::shared_ptr graph, + const std::vector& inputs) { + std::vector stack = fmap(inputs); + Code code(graph, "test"); + InterpreterState(code).run(stack); + TORCH_INTERNAL_ASSERT(!stack.empty()); + // Graph outputs that are handled below: + // * A list of Tensors. + // * 1 Tensor. + if (stack.front().isTensorList()) { + return stack.front().toTensorVector(); + } + TORCH_INTERNAL_ASSERT(stack.front().isTensor()); + return {stack.front().toTensor()}; +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_utils.h b/test/cpp/jit/test_utils.h index 676759dca480f..5e640aed0114a 100644 --- a/test/cpp/jit/test_utils.h +++ b/test/cpp/jit/test_utils.h @@ -88,6 +88,13 @@ bool checkRtol(const at::Tensor& diff, const std::vector inputs); bool almostEqual(const at::Tensor& a, const at::Tensor& b); bool exactlyEqual(const at::Tensor& a, const at::Tensor& b); +bool exactlyEqual( + const std::vector& a, + const std::vector& b); + +std::vector runGraph( + std::shared_ptr graph, + const std::vector& inputs); std::pair lstm( at::Tensor input, diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index e20d97333c83e..2eabbd0a8b230 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -244,6 +244,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/passes/symbolic_shape_analysis.cpp", "torch/csrc/jit/passes/specialize_autogradzero.cpp", "torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp", + "torch/csrc/jit/passes/variadic_ops.cpp", "torch/csrc/jit/passes/subgraph_rewrite.cpp", "torch/csrc/jit/passes/tensorexpr_fuser.cpp", "torch/csrc/jit/passes/utils/memory_dag.cpp", diff --git a/torch/csrc/jit/passes/concat_opt.cpp b/torch/csrc/jit/passes/concat_opt.cpp index aa2573ebb42f2..81c8a6745007a 100644 --- a/torch/csrc/jit/passes/concat_opt.cpp +++ b/torch/csrc/jit/passes/concat_opt.cpp @@ -497,95 +497,5 @@ void ExpandConcatAndEliminateRedundancy(const std::shared_ptr& graph) { GRAPH_DUMP("After expanding Concat and eliminating redundancy", graph); } -namespace { - -class VariadicCatUpdater { - public: - explicit VariadicCatUpdater(std::shared_ptr graph) - : graph_(std::move(graph)) {} - - bool run() { - collectCatNodes(graph_->block()); - bool changed = false; - for (auto c : cat_nodes_) { - changed = replaceWithVariadicCat(c) || changed; - } - return changed; - } - - private: - void collectCatNodes(Block* block) { - for (auto node : block->nodes()) { - if (node->kind() == aten::cat) { - cat_nodes_.push_back(node); - } - for (Block* b : node->blocks()) { - collectCatNodes(b); - } - } - } - - bool replaceWithVariadicCat(Node* cat) { - if (cat->input(0)->node()->kind() != prim::ListConstruct) { - return false; - } - auto list = cat->input(0)->node(); - // We do not transform cat ops whose list input can not be moved to the - // position before cat. This in turn implies that there is some mutation - // of the input list before cat. - if (!getOrCreateAliasDb()->couldMoveBeforeTopologically(list, cat)) { - return false; - } - std::vector inputs = list->inputs().vec(); - inputs.push_back(cat->input(1)); - auto var_cat = cat->owningGraph()->create(prim::VarConcat, inputs); - GRAPH_UPDATE("Adding\n", *var_cat); - var_cat->insertBefore(cat); - GRAPH_UPDATE("Replacing\n", *cat, "with\n", *var_cat); - cat->output()->replaceAllUsesWith(var_cat->output()); - GRAPH_UPDATE("Deleting\n", *cat); - cat->destroy(); - if (!list->hasUses()) { - GRAPH_UPDATE("Deleting\n", *list); - list->destroy(); - } - return true; - } - - AliasDb* getOrCreateAliasDb() { - if (!aliasDb_) { - aliasDb_ = std::make_unique(graph_); - } - return aliasDb_.get(); - } - - std::shared_ptr graph_; - std::unique_ptr aliasDb_ = nullptr; - - std::vector cat_nodes_; -}; - -} // namespace - -bool UseVariadicCat(const std::shared_ptr& graph) { - GRAPH_DUMP("Before VariadicCat", graph); - bool changed = VariadicCatUpdater(graph).run(); - if (changed) { - GRAPH_DUMP("After VariadicCat", graph); - } - return changed; -} - -bool RemoveListMutationAndUseVariadicCat(const std::shared_ptr& graph) { - bool changed_in_last_iter = true; - bool changed = false; - while (changed_in_last_iter) { - changed_in_last_iter = RemoveListMutation(graph); - changed_in_last_iter = changed_in_last_iter || UseVariadicCat(graph); - changed = changed || changed_in_last_iter; - } - return changed; -} - } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/concat_opt.h b/torch/csrc/jit/passes/concat_opt.h index b82dc25e612a4..ef4d9432438e6 100644 --- a/torch/csrc/jit/passes/concat_opt.h +++ b/torch/csrc/jit/passes/concat_opt.h @@ -13,12 +13,5 @@ TORCH_API bool EliminateConcatCommonInputs(const std::shared_ptr& graph); TORCH_API void ExpandConcatAndEliminateRedundancy( const std::shared_ptr& graph); -// Replaces the `aten::cat` ops in the given graph with variadic cat ops. -// Returns true if the graph is modified. -TORCH_API bool UseVariadicCat(const std::shared_ptr& graph); - -TORCH_API bool RemoveListMutationAndUseVariadicCat( - const std::shared_ptr& graph); - } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/variadic_ops.cpp b/torch/csrc/jit/passes/variadic_ops.cpp new file mode 100644 index 0000000000000..aeb70747b3a57 --- /dev/null +++ b/torch/csrc/jit/passes/variadic_ops.cpp @@ -0,0 +1,126 @@ +#include + +#include +#include +#include + +namespace torch { +namespace jit { + +namespace { + +class VariadicUpdater { + public: + explicit VariadicUpdater( + std::shared_ptr graph, + NodeKind op, + NodeKind variadic_op) + : graph_(std::move(graph)), op_(op), variadic_op_(variadic_op) {} + + bool run() { + collectOpNodes(graph_->block()); + bool changed = false; + for (auto n : op_nodes_) { + changed |= replaceWithVariadicOp(n); + } + return changed; + } + + private: + void collectOpNodes(Block* block) { + for (auto node : block->nodes()) { + if (node->kind() == op_) { + op_nodes_.push_back(node); + } + for (Block* b : node->blocks()) { + collectOpNodes(b); + } + } + } + + bool replaceWithVariadicOp(Node* op_node) { + if (op_node->input(0)->node()->kind() != prim::ListConstruct) { + return false; + } + auto list = op_node->input(0)->node(); + // We do not transform ops whose list input can not be moved to the + // position before op. This in turn implies that there is some mutation + // of the input list before op. + if (!getOrCreateAliasDb()->couldMoveBeforeTopologically(list, op_node)) { + return false; + } + std::vector inputs = list->inputs().vec(); + // Add non-list inputs + for (size_t i = 1; i < op_node->inputs().size(); ++i) { + inputs.push_back(op_node->input(i)); + } + auto var_op_node = op_node->owningGraph()->create(variadic_op_, inputs); + GRAPH_UPDATE("Adding\n", *var_op_node); + var_op_node->insertBefore(op_node); + GRAPH_UPDATE("Replacing\n", *op_node, "with\n", *var_op_node); + op_node->output()->replaceAllUsesWith(var_op_node->output()); + GRAPH_UPDATE("Deleting\n", *op_node); + op_node->destroy(); + if (!list->hasUses()) { + GRAPH_UPDATE("Deleting\n", *list); + list->destroy(); + } + return true; + } + + AliasDb* getOrCreateAliasDb() { + if (!aliasDb_) { + aliasDb_ = std::make_unique(graph_); + } + return aliasDb_.get(); + } + + std::shared_ptr graph_; + std::unique_ptr aliasDb_ = nullptr; + + std::vector op_nodes_; + + NodeKind op_; + NodeKind variadic_op_; +}; + +} // namespace + +bool UseVariadicOp( + const std::shared_ptr& graph, + NodeKind op, + NodeKind variadic_op) { + const std::string pass_name = std::string("variadic ") + op.toQualString(); + GRAPH_DUMP("Before " + pass_name, graph); + bool changed = VariadicUpdater(graph, op, variadic_op).run(); + if (changed) { + GRAPH_DUMP("After " + pass_name, graph); + } + return changed; +} + +bool RemoveListMutationAndUseVariadicOp( + const std::shared_ptr& graph, + NodeKind op, + NodeKind variadic_op) { + bool changed_in_last_iter = true; + bool changed = false; + while (changed_in_last_iter) { + changed_in_last_iter = RemoveListMutation(graph); + changed_in_last_iter = + UseVariadicOp(graph, op, variadic_op) || changed_in_last_iter; + changed = changed || changed_in_last_iter; + } + return changed; +} + +bool UseVariadicCat(const std::shared_ptr& graph) { + return UseVariadicOp(graph, aten::cat, prim::VarConcat); +} + +bool RemoveListMutationAndUseVariadicCat(const std::shared_ptr& graph) { + return RemoveListMutationAndUseVariadicOp(graph, aten::cat, prim::VarConcat); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/passes/variadic_ops.h b/torch/csrc/jit/passes/variadic_ops.h new file mode 100644 index 0000000000000..1c52e9513ae2b --- /dev/null +++ b/torch/csrc/jit/passes/variadic_ops.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +namespace torch { +namespace jit { + +// Replaces the `aten::cat` ops in the given graph with variadic cat ops. +// Returns true if the graph is modified. +TORCH_API bool UseVariadicCat(const std::shared_ptr& graph); + +TORCH_API bool RemoveListMutationAndUseVariadicCat( + const std::shared_ptr& graph); + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 1ee69a642384f..4219be5040ba7 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -9,11 +9,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include From 172e5c76ab05f1a137eb065b7f221a20eaef514a Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Mon, 23 Aug 2021 17:28:33 -0700 Subject: [PATCH 145/530] Fix some memory bugs in onnx passes (#63754) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63754 Running onnx tests with ASAN uncovers several memory errors. These two are caused by: (1) iterating the uses list of a node after mutation, and (2) accessing the `blocks` attribute of a possibly deleted node. To reproduce (this is on a CentOS 7 box): ``` DEBUG=1 CFLAGS="-fsanitize=address" CXXFLAGS="-fsanitize=address" USE_LLVM=$(realpath ../llvm-project/install) CMAKE_PREFIX_PATH=$CONDA_PREFIX python setup.py install LD_PRELOAD=$(realpath /lib64/libasan.so.5) numactl -C3 pytest -v --cov --cov-report xml:test/coverage.xml --cov-append onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset11 -s ``` Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D30493939 Pulled By: bertmaher fbshipit-source-id: e16e19dc9b4c9896e102ca8bf04c8bedfdde87af --- .../jit/passes/onnx/list_model_parameters.cpp | 6 ++- .../passes/onnx/pattern_conversion/common.cpp | 4 +- .../onnx/remove_inplace_ops_for_onnx.cpp | 45 +++++++++++-------- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/torch/csrc/jit/passes/onnx/list_model_parameters.cpp b/torch/csrc/jit/passes/onnx/list_model_parameters.cpp index ccadf53713466..9c751bbae9e12 100644 --- a/torch/csrc/jit/passes/onnx/list_model_parameters.cpp +++ b/torch/csrc/jit/passes/onnx/list_model_parameters.cpp @@ -76,6 +76,7 @@ std::vector getParamAttributes( WithInsertPoint guard(m); std::vector parameterIValues = {}; + std::unordered_set nodesToDestroy; for (auto it = block->nodes().begin(); it != block->nodes().end();) { Node* n = *it; it++; // node n can be destroyed @@ -142,7 +143,7 @@ std::vector getParamAttributes( // This attr is constant for ONNX. auto attrVal = tryInsertConstant(*graph, attr); n->output()->replaceAllUsesWith(*attrVal); - n->destroy(); + nodesToDestroy.emplace(n); } } } @@ -156,6 +157,9 @@ std::vector getParamAttributes( std::end(nextParameterIValues)); } } + for (auto n : nodesToDestroy) { + n->destroy(); + } return parameterIValues; } diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp index 2854c3ab2fe2e..bc646308424b0 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp @@ -4,8 +4,8 @@ namespace torch { namespace jit { bool IndexingPatternFinder::IsSameSource(const Node* n, const Node* m) { - const auto& source_n = n->sourceRange().source(); - const auto& source_m = m->sourceRange().source(); + const auto source_n = n->sourceRange().source(); + const auto source_m = m->sourceRange().source(); return ( (source_n->text() == source_m->text()) && (source_n->starting_line_no() == source_m->starting_line_no())); diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp index 913f4dc2b6edb..2cef76a7391ae 100644 --- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp @@ -317,26 +317,33 @@ static void PrepareForRemoveMutations(MutationRemover& mr, Block* b) { } for (auto input : b->inputs()) { - for (auto use : input->uses()) { - Node* node = use.user; - if (!mr.inplaceOpVariant(node)) { - continue; - } - auto it = std::find(node->inputs().begin(), node->inputs().end(), input); - if (it != node->inputs().end()) { - int index = std::distance(node->inputs().begin(), it); - std::cerr << "Warning: ONNX Preprocess - Removing mutation from node " - << node->kind().toQualString() << " on block input: '" - << (*it)->debugName() << "'. This changes graph semantics." - << std::endl; - - Node* newNode = - addDummyClone(b->owningGraph(), input, false, b->return_node()); - TORCH_INTERNAL_ASSERT(nullptr != newNode); - node->replaceInput(index, newNode->output()); - input->replaceAllUsesAfterNodeWith(node, newNode->output()); + bool needsRestart = false; + do { + needsRestart = false; + for (auto use : input->uses()) { + Node* node = use.user; + if (!mr.inplaceOpVariant(node)) { + continue; + } + auto it = + std::find(node->inputs().begin(), node->inputs().end(), input); + if (it != node->inputs().end()) { + int index = std::distance(node->inputs().begin(), it); + std::cerr << "Warning: ONNX Preprocess - Removing mutation from node " + << node->kind().toQualString() << " on block input: '" + << (*it)->debugName() << "'. This changes graph semantics." + << std::endl; + + Node* newNode = + addDummyClone(b->owningGraph(), input, false, b->return_node()); + TORCH_INTERNAL_ASSERT(nullptr != newNode); + node->replaceInput(index, newNode->output()); + input->replaceAllUsesAfterNodeWith(node, newNode->output()); + needsRestart = true; + break; + } } - } + } while (needsRestart); } } From d9231dc3dfb9c218bbd62355f9a0349be2ceca75 Mon Sep 17 00:00:00 2001 From: driazati Date: Mon, 23 Aug 2021 17:30:51 -0700 Subject: [PATCH 146/530] Skip archiving useless build artifacts (#63785) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63785 We currently zip up everything in `build/` which includes a lot of cruft (`.o` files, random things copied in from dependencies, etc). This makes the artifact bigger (slower upload/download times, and takes about 1.5 minutes to archive). This change makes archiving instead take ~15 seconds and removes the 50 second upload to GitHub step that isn't as useful now that we have the HUD PR page that lists out all artifacts. Test Plan: Imported from OSS Reviewed By: seemethere, janeyx99 Differential Revision: D30494444 Pulled By: driazati fbshipit-source-id: 93202dba7387daeb4859a938110b02ff2dc2ccc4 --- .github/templates/linux_ci_workflow.yml.j2 | 13 +------------ .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 13 +------------ .../generated-linux-bionic-py3.8-gcc9-coverage.yml | 13 +------------ .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 13 +------------ .../generated-linux-xenial-cuda11.1-py3.6-gcc7.yml | 13 +------------ .../generated-linux-xenial-py3.6-gcc5.4.yml | 13 +------------ ...ed-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml | 13 +------------ 7 files changed, 7 insertions(+), 84 deletions(-) diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 25099b46cbc8e..767760bf24d25 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -220,18 +220,7 @@ jobs: {%- if not is_libtorch %} - name: Archive artifacts into zip run: | - zip -r artifacts.zip dist/ build/ .pytorch-test-times.json - # Upload to github so that people can click and download artifacts - - uses: actions/upload-artifact@v2 - # Don't fail on upload to GH since it's only for user convenience - continue-on-error: true - name: Store PyTorch Build Artifacts on Github - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 name: Store PyTorch Build Artifacts on S3 with: diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 67bb2064863bc..61a817ea64bc1 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -199,18 +199,7 @@ jobs: docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Archive artifacts into zip run: | - zip -r artifacts.zip dist/ build/ .pytorch-test-times.json - # Upload to github so that people can click and download artifacts - - uses: actions/upload-artifact@v2 - # Don't fail on upload to GH since it's only for user convenience - continue-on-error: true - name: Store PyTorch Build Artifacts on Github - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 name: Store PyTorch Build Artifacts on S3 with: diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index 59061662341a3..f07b8712b6ea1 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -199,18 +199,7 @@ jobs: docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Archive artifacts into zip run: | - zip -r artifacts.zip dist/ build/ .pytorch-test-times.json - # Upload to github so that people can click and download artifacts - - uses: actions/upload-artifact@v2 - # Don't fail on upload to GH since it's only for user convenience - continue-on-error: true - name: Store PyTorch Build Artifacts on Github - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 name: Store PyTorch Build Artifacts on S3 with: diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index d897e28ace007..cb8c6b55b1789 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -199,18 +199,7 @@ jobs: docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Archive artifacts into zip run: | - zip -r artifacts.zip dist/ build/ .pytorch-test-times.json - # Upload to github so that people can click and download artifacts - - uses: actions/upload-artifact@v2 - # Don't fail on upload to GH since it's only for user convenience - continue-on-error: true - name: Store PyTorch Build Artifacts on Github - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 name: Store PyTorch Build Artifacts on S3 with: diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml index 30514cab07ea4..4275cc31ebddd 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -199,18 +199,7 @@ jobs: docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Archive artifacts into zip run: | - zip -r artifacts.zip dist/ build/ .pytorch-test-times.json - # Upload to github so that people can click and download artifacts - - uses: actions/upload-artifact@v2 - # Don't fail on upload to GH since it's only for user convenience - continue-on-error: true - name: Store PyTorch Build Artifacts on Github - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 name: Store PyTorch Build Artifacts on S3 with: diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index bd4d65027c066..e3be43370a777 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -199,18 +199,7 @@ jobs: docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Archive artifacts into zip run: | - zip -r artifacts.zip dist/ build/ .pytorch-test-times.json - # Upload to github so that people can click and download artifacts - - uses: actions/upload-artifact@v2 - # Don't fail on upload to GH since it's only for user convenience - continue-on-error: true - name: Store PyTorch Build Artifacts on Github - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 name: Store PyTorch Build Artifacts on S3 with: diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml index 5fd0e99f002f7..0282b206a117c 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -197,18 +197,7 @@ jobs: docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Archive artifacts into zip run: | - zip -r artifacts.zip dist/ build/ .pytorch-test-times.json - # Upload to github so that people can click and download artifacts - - uses: actions/upload-artifact@v2 - # Don't fail on upload to GH since it's only for user convenience - continue-on-error: true - name: Store PyTorch Build Artifacts on Github - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 name: Store PyTorch Build Artifacts on S3 with: From fc474979055c99d9aa48e1ee8d0c7a33a2a67daf Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 23 Aug 2021 17:39:45 -0700 Subject: [PATCH 147/530] Simplify ccache instructions in CONTRIBUTING.md (#62549) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62549 When building CUDA files with native CMake support, it will respect the `CMAKE_CUDA_COMPILER_LAUNCHER` setting. So, there's no need for symlinks. Test Plan: Imported from OSS Reviewed By: bdhirsh Differential Revision: D30498488 Pulled By: malfet fbshipit-source-id: 71c2ae9d4570cfac2a64d777bc95cda3764332a0 --- CONTRIBUTING.md | 112 +++++++++++------------------------------------- 1 file changed, 24 insertions(+), 88 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e1a049cf9a979..baafcefdc59fe 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -734,111 +734,47 @@ succeed. #### Use CCache -Even when dependencies are tracked with file modification, -there are many situations where files get rebuilt when a previous -compilation was exactly the same. - -Using ccache in a situation like this is a real time-saver. The ccache manual -describes [two ways to use ccache](https://ccache.samba.org/manual/latest.html#_run_modes). -In the PyTorch project, currently only the latter method of masquerading as -the compiler via symlinks works for CUDA compilation. - -Here are the instructions for installing ccache from source (tested at commit -`3c302a7` of the `ccache` repo): +Even when dependencies are tracked with file modification, there are many +situations where files get rebuilt when a previous compilation was exactly the +same. Using ccache in a situation like this is a real time-saver. +Before building pytorch, install ccache from your package manager of choice: ```bash -#!/bin/bash - -if ! ls ~/ccache/bin/ccache -then - set -ex - sudo apt-get update - sudo apt-get install -y cmake - mkdir -p ~/ccache - pushd ~/ccache - rm -rf ccache - git clone https://github.com/ccache/ccache.git - mkdir -p ccache/build - pushd ccache/build - cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ccache -DENABLE_TESTING=OFF -DZSTD_FROM_INTERNET=ON .. - make -j$(nproc) install - popd - popd - - mkdir -p ~/ccache/lib - mkdir -p ~/ccache/cuda - ln -s ~/ccache/bin/ccache ~/ccache/lib/cc - ln -s ~/ccache/bin/ccache ~/ccache/lib/c++ - ln -s ~/ccache/bin/ccache ~/ccache/lib/gcc - ln -s ~/ccache/bin/ccache ~/ccache/lib/g++ - ln -s ~/ccache/bin/ccache ~/ccache/cuda/nvcc - - ~/ccache/bin/ccache -M 25Gi -fi - -export PATH=~/ccache/lib:$PATH -export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc +conda install ccache -f conda-forge +sudo apt install ccache +sudo yum install ccache +brew install ccache ``` -Alternatively, `ccache` provided by newer Linux distributions (e.g. Debian/sid) -also works, but the `nvcc` symlink to `ccache` as described above is still required. - -Note that the original `nvcc` binary (typically at `/usr/local/cuda/bin`) must -be on your `PATH`, otherwise `ccache` will emit the following error: - - ccache: error: Could not find compiler "nvcc" in PATH - -For example, here is how to install/configure `ccache` on Ubuntu: +You may also find the default cache size in ccache is too small to be useful. +The cache sizes can be increased from the command line: ```bash -# install ccache -sudo apt install ccache - -# update symlinks and create/re-create nvcc link -sudo /usr/sbin/update-ccache-symlinks -sudo ln -s /usr/bin/ccache /usr/lib/ccache/nvcc - # config: cache dir is ~/.ccache, conf file ~/.ccache/ccache.conf # max size of cache ccache -M 25Gi # -M 0 for unlimited # unlimited number of files ccache -F 0 - -# deploy (and add to ~/.bashrc for later) -export PATH="/usr/lib/ccache:$PATH" ``` -It is also possible to install `ccache` via `conda` by installing it from the -community-maintained `conda-forge` channel. Here is how to set up `ccache` this -way: +To check this is working, do two clean builds of pytorch in a row. The second +build should be substantially and noticeably faster than the first build. If +this doesn't seem to be the case, check the `CMAKE__COMPILER_LAUNCHER` +rules in `build/CMakeCache.txt`, where `` is `C`, `CXX` and `CUDA`. +Each of these 3 variables should contain ccache, e.g. +``` +//CXX compiler launcher +CMAKE_CXX_COMPILER_LAUNCHER:STRING=/usr/bin/ccache +``` +If not, you can define these variables on the command line before invoking `setup.py`. ```bash -# install ccache -conda install -c conda-forge ccache - -# set up ccache compiler symlinks -mkdir ~/ccache -mkdir ~/ccache/lib -mkdir ~/ccache/cuda -ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/cc -ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/c++ -ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/gcc -ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/g++ -ln -s $CONDA_PREFIX/bin/ccache ~/ccache/cuda/nvcc - -# update PATH to reflect symlink locations, consider -# adding this to your .bashrc -export PATH=~/ccache/lib:$PATH -export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc - -# increase ccache cache size to 25 GiB -ccache -M 25Gi +export CMAKE_C_COMPILER_LAUNCHER=ccache +export CMAKE_CXX_COMPILER_LAUNCHER=ccache +export CMAKE_CUDA_COMPILER_LAUNCHER=ccache +python setup.py develop ``` -To check this is working, do two clean builds of pytorch in a row. The second -build should be substantially and noticeably faster than the first build. If this doesn't seem to be the case, check that each of the symlinks above actually link to your installation of `ccache`. For example, if you followed the first option and installed `ccache` from source on a Linux machine, running `readlink -e $(which g++)` should return `~/ccache/bin/ccache`. - - #### Use a faster linker If you are editing a single file and rebuilding in a tight loop, the time spent linking will dominate. The system linker available in most Linux distributions From e4f44bec27bc458c5dc8021ed87f0e6fae904ef4 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 23 Aug 2021 17:39:50 -0700 Subject: [PATCH 148/530] Fix pocketfft include path in mobile build (#63714) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63714 PocketFFT was disabled for CMake < 3.9 but CMake 3.11 is the first version to support `INCLUDE_DIRECTORIES` as a target property. So updating to CMake 3.10 causes the mobile builds to fail. Instead of limiting the CMake support, this just adds the include directory to the entire target, Test Plan: Imported from OSS Reviewed By: bdhirsh Differential Revision: D30498369 Pulled By: malfet fbshipit-source-id: 83372e29c477c97e7015763b7c29d6d7e456bcef --- caffe2/CMakeLists.txt | 16 +++++++++++----- cmake/Dependencies.cmake | 7 ++++--- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 523fea8181cf8..67ab08f9b0fc5 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -529,11 +529,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/tensorexpr/llvm_codegen.cpp PROPERTIES COMPILE_FLAGS -Wno-init-list-lifetime) endif() - # Pass path to PocketFFT - if(AT_POCKETFFT_ENABLED) - set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/mkl/SpectralOps.cpp PROPERTIES INCLUDE_DIRECTORIES "${POCKETFFT_INCLUDE_DIR}") - endif() - if(NOT INTERN_DISABLE_MOBILE_INTERP) set(MOBILE_SRCS ${TORCH_SRC_DIR}/csrc/jit/mobile/function.cpp @@ -795,6 +790,17 @@ if(USE_PRECOMPILED_HEADERS) PROPERTIES SKIP_PRECOMPILE_HEADERS ON) endif() +# Pass path to PocketFFT +if(AT_POCKETFFT_ENABLED) + if(CMAKE_VERSION VERSION_LESS "3.11") + target_include_directories(torch_cpu PRIVATE "${POCKETFFT_INCLUDE_DIR}") + else() + set_source_files_properties( + "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/mkl/SpectralOps.cpp" + PROPERTIES INCLUDE_DIRECTORIES "${POCKETFFT_INCLUDE_DIR}") + endif() +endif() + if(CMAKE_COMPILER_IS_GNUCXX AND BUILD_LIBTORCH_CPU_WITH_DEBUG) # To enable debug fission we need to build libtorch_cpu with debug info on, # but this increases link time and peak memory usage if we use the diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 3e37c3538f6fd..b3cc23ccac8f4 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -242,14 +242,15 @@ endif() # --- [ PocketFFT set(AT_POCKETFFT_ENABLED 0) -if(NOT MKL_FOUND) +if(NOT AT_MKL_ENABLED) find_path(POCKETFFT_INCLUDE_DIR NAMES pocketfft_hdronly.h PATHS /usr/local/include - "$ENV{POCKETFFT_HOME}" + ENV POCKETFFT_HOME "${PROJECT_SOURCE_DIR}/third_party/pocketfft" ) - if(POCKETFFT_INCLUDE_DIR AND CMAKE_VERSION VERSION_GREATER "3.9") + if(POCKETFFT_INCLUDE_DIR) set(AT_POCKETFFT_ENABLED 1) + message(STATUS "Using pocketfft in directory: ${POCKETFFT_INCLUDE_DIR}") endif() endif() From 630ec2e190fe866e8b5c87844d6dd09bf134aac5 Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Mon, 23 Aug 2021 17:41:38 -0700 Subject: [PATCH 149/530] [fx_acc] Add mapper for torch.log1p (#63792) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63792 Map `torch.log1p` to `acc_ops.add` + `acc_ops.log`. Test Plan: buck test mode/opt glow/fb/fx/oss_acc_tracer:test_acc_tracer -- test_log1p Reviewed By: wushirong Differential Revision: D30491706 fbshipit-source-id: bcbeddf06131113185d2019cfd7cf5e9193a8a78 --- torch/fx/experimental/fx_acc/acc_ops.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py index 7c9520660ef77..0c0965a430afd 100644 --- a/torch/fx/experimental/fx_acc/acc_ops.py +++ b/torch/fx/experimental/fx_acc/acc_ops.py @@ -509,6 +509,21 @@ def div(*, input, other): def relu(*, input, inplace=False): return nn.functional.relu(**locals()) +@register_custom_acc_mapper_fn( + op_and_target=("call_function", torch.log1p), + arg_replacement_tuples=[ + ("input", "input"), + ], +) +def torch_log1p_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node: + with node.graph.inserting_before(node): + add_kwargs = {"input": node.kwargs["input"], "other": 1} + add_node = node.graph.call_function(add, kwargs=add_kwargs) + add_node.meta = node.meta.copy() + log_kwargs = {"input": add_node} + log_node = node.graph.call_function(log, kwargs=log_kwargs) + log_node.meta = node.meta.copy() + return log_node @register_custom_acc_mapper_fn( op_and_target=("call_method", "sum"), From 16a44344229e89ed8c275503580c7a6a4997b9bd Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Mon, 23 Aug 2021 17:45:39 -0700 Subject: [PATCH 150/530] [BE] Enable functional optim tests for windows (#63462) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63462 Now that `torch.distributed.optim` gates DistributedOptimizer on RPC availability, these tests can be run on windows. ghstack-source-id: 136437635 Test Plan: CI Reviewed By: SciPioneer Differential Revision: D30358923 fbshipit-source-id: 36739bdfe7214789f17de652d30c62c2bc124c73 --- test/distributed/test_c10d_nccl.py | 8 +++----- test/test_functional_optim.py | 16 +--------------- .../_internal/distributed/distributed_test.py | 19 ++++--------------- 3 files changed, 8 insertions(+), 35 deletions(-) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index e42c5c6be1759..1378aa07f0903 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -39,7 +39,6 @@ with_nccl_blocking_wait, ) from torch.testing._internal.common_utils import ( - IS_WINDOWS, TestCase, run_tests, retry_on_connect_failures, @@ -51,10 +50,9 @@ from torch.utils.checkpoint import checkpoint from torch.distributed.optim import functional_optim_map -if not IS_WINDOWS: - from torch.distributed.optim.functional_sgd import _FunctionalSGD - from torch.distributed.optim.functional_adam import _FunctionalAdam - from torch.distributed.optim.functional_adamw import _FunctionalAdamW +from torch.distributed.optim.functional_sgd import _FunctionalSGD +from torch.distributed.optim.functional_adam import _FunctionalAdam +from torch.distributed.optim.functional_adamw import _FunctionalAdamW if TEST_WITH_DEV_DBG_ASAN: print( diff --git a/test/test_functional_optim.py b/test/test_functional_optim.py index 98a3f06805dba..accc72058578d 100644 --- a/test/test_functional_optim.py +++ b/test/test_functional_optim.py @@ -1,10 +1,8 @@ -import unittest - import torch import torch.nn as nn import torch.nn.functional as F from torch.optim import SGD, Adam, AdamW -from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS +from torch.testing._internal.common_utils import TestCase, run_tests from torch.distributed.optim import functional_optim_map class MyModule(torch.nn.Module): @@ -80,24 +78,12 @@ def _test_functional_optim_parity(self, optim_cls, *args, **kwargs): self.assertNotEqual(old_module_optim_params[i], optim_param) self.assertNotEqual(old_module_functional_params[i], functional_param) - @unittest.skipIf( - IS_WINDOWS, - "Functional optimizer not support on windows, see https://github.com/pytorch/pytorch/issues/62137", - ) def test_functional_optim_parity_sgd(self): self._test_functional_optim_parity(SGD, 1e-2, momentum=0.9, weight_decay=0.01) - @unittest.skipIf( - IS_WINDOWS, - "Functional optimizer not support on windows, see https://github.com/pytorch/pytorch/issues/62137", - ) def test_functional_optim_parity_adam(self): self._test_functional_optim_parity(Adam, 1e-2, betas=(0.9, 0.999), eps=1e-6) - @unittest.skipIf( - IS_WINDOWS, - "Functional optimizer not support on windows, see https://github.com/pytorch/pytorch/issues/62137", - ) def test_functional_optim_parity_adam_w(self): self._test_functional_optim_parity(AdamW, 1e-2, betas=(0.9, 0.999), eps=1e-6) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 096b7182851c3..aa8841d3266bc 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -68,11 +68,12 @@ from torch.distributed.optim import functional_optim_map +from torch.distributed.optim.functional_sgd import _FunctionalSGD +from torch.distributed.optim.functional_adam import _FunctionalAdam +from torch.distributed.optim.functional_adamw import _FunctionalAdamW + if not IS_WINDOWS: import torch.distributed.optim.post_localSGD_optimizer as post_localSGD_optimizer - from torch.distributed.optim.functional_sgd import _FunctionalSGD - from torch.distributed.optim.functional_adam import _FunctionalAdam - from torch.distributed.optim.functional_adamw import _FunctionalAdamW from torch.utils.data.distributed import DistributedSampler @@ -4003,10 +4004,6 @@ def _test_ddp_hook_with_optimizer_parity( BACKEND != "nccl" and BACKEND != "gloo", "Only Nccl & Gloo backend support DistributedDataParallel", ) - @sandcastle_skip_if( - IS_WINDOWS, - "FunctionalAdam not yet supported with Windows, see https://github.com/pytorch/pytorch/issues/62137" - ) @skip_if_lt_x_gpu(2) @skip_if_rocm def test_ddp_hook_with_optimizer_parity_adamw(self): @@ -4029,10 +4026,6 @@ def test_ddp_hook_with_optimizer_parity_adamw(self): BACKEND != "nccl" and BACKEND != "gloo", "Only Nccl & Gloo backend support DistributedDataParallel", ) - @sandcastle_skip_if( - IS_WINDOWS, - "FunctionalAdam not yet supported with Windows, see https://github.com/pytorch/pytorch/issues/62137" - ) @skip_if_lt_x_gpu(2) @skip_if_rocm def test_ddp_hook_with_optimizer_parity_adam(self): @@ -4055,10 +4048,6 @@ def test_ddp_hook_with_optimizer_parity_adam(self): BACKEND != "nccl" and BACKEND != "gloo", "Only Nccl & Gloo backend support DistributedDataParallel", ) - @sandcastle_skip_if( - IS_WINDOWS, - "FunctionalSGD not yet supported with Windows, see https://github.com/pytorch/pytorch/issues/62137" - ) @skip_if_lt_x_gpu(2) @skip_if_rocm def test_ddp_hook_with_optimizer_parity_sgd(self): From fc07489ec52e628c10500f0ffeba0cda1cea1b49 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Mon, 23 Aug 2021 17:45:39 -0700 Subject: [PATCH 151/530] [BE] Enable PostLocalSGD tests on windows (#63463) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63463 Now that `torch.distributed.optim` gates DistributedOptimizer on RPC availability, local sgd optimizer can be used on windows. ghstack-source-id: 136437632 Test Plan: Ci Reviewed By: SciPioneer Differential Revision: D30358922 fbshipit-source-id: 9b56aebf1075f026637296d338805ad8851c9d40 --- torch/testing/_internal/distributed/distributed_test.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index aa8841d3266bc..1631983d32ec7 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -72,8 +72,7 @@ from torch.distributed.optim.functional_adam import _FunctionalAdam from torch.distributed.optim.functional_adamw import _FunctionalAdamW -if not IS_WINDOWS: - import torch.distributed.optim.post_localSGD_optimizer as post_localSGD_optimizer +import torch.distributed.optim.post_localSGD_optimizer as post_localSGD_optimizer from torch.utils.data.distributed import DistributedSampler @@ -4610,9 +4609,6 @@ def _test_DistributedDataParallel_SyncBatchNorm( BACKEND != "nccl" and BACKEND != "gloo", "Only NCCL and GLOO backend support DistributedDataParallel", ) - @sandcastle_skip_if( - IS_WINDOWS, "PostLocalSGDOptimizer not yet supported with Windows." - ) def test_post_localSGD_optimizer_parity(self, grad_is_view=False): learning_rate = 0.03 period = 4 From e1bdebf6858e16127673d1a362144089f99e6ec9 Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Mon, 23 Aug 2021 18:07:37 -0700 Subject: [PATCH 152/530] Adding DataLoader2 class as future replacement of DataLoader (#63742) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63742 Supports sharding and batching on loader level** Supports sharding and batching on loader level Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30494506 Pulled By: VitalyFedyunin fbshipit-source-id: 6648e09d955055ac38e3a4e3973f701acefca762 --- test/test_dataloader.py | 37 ++++++++- torch/utils/data/__init__.py | 6 +- torch/utils/data/dataloader_experimental.py | 89 +++++++++++++++++++++ 3 files changed, 129 insertions(+), 3 deletions(-) create mode 100644 torch/utils/data/dataloader_experimental.py diff --git a/test/test_dataloader.py b/test/test_dataloader.py index c68d7e2e14b33..01136b9e4bb07 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -13,9 +13,20 @@ import warnings import tempfile from torch import multiprocessing as mp -from torch.utils.data import _utils, Dataset, IterableDataset, TensorDataset, DataLoader, ConcatDataset, ChainDataset, Subset +from torch.utils.data import ( + ChainDataset, + ConcatDataset, + DataLoader, + DataLoader2, + Dataset, + IterableDataset, + Subset, + TensorDataset, + _utils +) from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL from torch.utils.data.dataset import random_split +from torch.utils.data.datapipes.iter import IterableAsDataPipe from torch._utils import ExceptionWrapper from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS, IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest, @@ -33,6 +44,17 @@ else: warnings.warn(err_msg) +try: + import dill + # XXX: By default, dill writes the Pickler dispatch table to inject its + # own logic there. This globally affects the behavior of the standard library + # pickler for any user who transitively depends on this module! + # Undo this extension to avoid altering the behavior of the pickler globally. + dill.extend(use_dill=False) + HAS_DILL = True +except ImportError: + HAS_DILL = False +skipIfNoDill = unittest.skipIf(not HAS_DILL, "no dill") # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -1934,6 +1956,19 @@ def test_excessive_thread_creation_warning(self): dataloader = DataLoader(self.dataset, batch_size=2, num_workers=1000) +@unittest.skipIf( + TEST_WITH_TSAN, + "Fails with TSAN with the following error: starting new threads after multi-threaded " + "fork is not supported. Dying (set die_after_fork=0 to override)") +class TestDataLoader2(TestCase): + @skipIfNoDill + def test_basics(self): + dp = IterableAsDataPipe(list(range(10))) + dl = DataLoader(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2) + dl2 = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2) + self.assertEquals(list(dl), list(dl2)) + + class StringDataset(Dataset): def __init__(self): self.s = '12345' diff --git a/torch/utils/data/__init__.py b/torch/utils/data/__init__.py index 1d18b7b030894..0af9e6193af3d 100644 --- a/torch/utils/data/__init__.py +++ b/torch/utils/data/__init__.py @@ -11,9 +11,9 @@ from torch.utils.data.dataset import ( ChainDataset, ConcatDataset, + DataChunk, Dataset, Dataset as MapDataPipe, - DataChunk, IterableDataset, IterableDataset as IterDataPipe, Subset, @@ -34,11 +34,14 @@ runtime_validation, runtime_validation_disabled, ) +from torch.utils.data.dataloader_experimental import DataLoader2 + __all__ = ['BatchSampler', 'ChainDataset', 'ConcatDataset', 'DataLoader', + 'DataLoader2', 'Dataset', 'DistributedSampler', 'IterDataPipe', @@ -68,4 +71,3 @@ ################################################################################ # import subpackage ################################################################################ -from torch.utils.data import datapipes diff --git a/torch/utils/data/dataloader_experimental.py b/torch/utils/data/dataloader_experimental.py new file mode 100644 index 0000000000000..85028afd22124 --- /dev/null +++ b/torch/utils/data/dataloader_experimental.py @@ -0,0 +1,89 @@ + +import functools + +import torch.utils.data.backward_compatibility +from torch.utils.data import DataLoader, IterDataPipe +from torch.utils.data.datapipes.iter import IterableAsDataPipe + +class DataLoader2: + def __new__(cls, + dataset, + batch_size=1, + shuffle=False, + sampler=None, + batch_sampler=None, + num_workers=0, + collate_fn=None, + pin_memory=False, + drop_last=False, + timeout=0, + worker_init_fn=None, + *, + prefetch_factor=2, + persistent_workers=False, + batch_outside_worker=False): + if isinstance(dataset, IterDataPipe): + datapipe = dataset + if batch_sampler is not None: + raise Exception( + 'batch_sampler is not yet supported for DataPipes') + if sampler is not None: + raise Exception( + 'sampler is not yet supported for DataPipes') + if shuffle: + datapipe = datapipe.shuffle() + if batch_outside_worker and pin_memory: + raise Exception( + 'pin_memory is not yet compatible with batch_outside_worker') + if not batch_outside_worker: + if batch_size is not None: + datapipe = datapipe.batch(batch_size, drop_last=drop_last) + if collate_fn is None: + collate_fn = torch.utils.data._utils.collate.default_collate + + def sharding_worker_init_fn(worker_init_fn, worker_id): + if worker_init_fn is not None: + worker_init_fn(worker_id) + torch.utils.data.backward_compatibility.worker_init_fn( + worker_id) + + my_worker_init_fn = functools.partial( + sharding_worker_init_fn, worker_init_fn) + + data_loader = DataLoader(datapipe, + batch_size=None, # Replaced by .batch DataPipe + shuffle=False, # Replaced by .shuffle DataPipe + sampler=None, + batch_sampler=None, + num_workers=num_workers, + collate_fn=collate_fn, + pin_memory=pin_memory, + drop_last=False, # Replaced by .batch DataPipe + timeout=timeout, + worker_init_fn=my_worker_init_fn, + prefetch_factor=prefetch_factor, + persistent_workers=persistent_workers) + + if not batch_outside_worker: + return data_loader + else: + if collate_fn is None: + collate_fn = torch.utils.data._utils.collate.default_collate + datapipe = IterableAsDataPipe(data_loader).batch( + batch_size, drop_last=drop_last).map(collate_fn) + return datapipe + + else: + return DataLoader(dataset, + batch_size=batch_size, + shuffle=shuffle, + sampler=sampler, + batch_sampler=batch_sampler, + num_workers=num_workers, + collate_fn=collate_fn, + pin_memory=pin_memory, + drop_last=drop_last, + timeout=timeout, + worker_init_fn=worker_init_fn, + prefetch_factor=prefetch_factor, + persistent_workers=persistent_workers) From 195c60d84460d16311ad606e504eda17b795a820 Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Mon, 23 Aug 2021 18:17:20 -0700 Subject: [PATCH 153/530] [fx2trt] Add acc op and converter for torch.pow (#63795) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63795 att Test Plan: buck run mode/opt caffe2/torch/fb/fx2trt:test_binary_ops Reviewed By: jackm321, wushirong Differential Revision: D30492488 fbshipit-source-id: 6d615770567b13720316f06fd2f866ea2fdc2995 --- .../fx/experimental/fx2trt/converters/acc_ops_converters.py | 5 +++++ torch/fx/experimental/fx_acc/acc_ops.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index eddb079afcac5..566359bf2af0d 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -764,6 +764,11 @@ def acc_ops_mul(network, target, args, kwargs, name): network, kwargs["input"], kwargs["other"], trt.ElementWiseOperation.PROD, name ) +@tensorrt_converter(acc_ops.pow) +def acc_ops_pow(network, target, args, kwargs, name): + return add_binary_elementwise_layer( + network, kwargs["input"], kwargs["exponent"], trt.ElementWiseOperation.POW, name + ) @tensorrt_converter(acc_ops.min_two_tensors_input) def acc_ops_min_two_tensors_input(network, target, args, kwargs, name): diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py index 0c0965a430afd..95fffaa479c9e 100644 --- a/torch/fx/experimental/fx_acc/acc_ops.py +++ b/torch/fx/experimental/fx_acc/acc_ops.py @@ -496,6 +496,12 @@ def div(*, input, other): return input / other +@register_acc_op_mapping(op_and_target=("call_function", torch.pow)) +@register_acc_op +def pow(*, input, exponent): + return torch.pow(input, exponent) + + @register_acc_op_mapping(op_and_target=("call_function", nn.functional.relu)) @register_acc_op_mapping( op_and_target=("call_function", torch.relu), From d96ef8c1b1860185f0bd91699f71a087cf9e9efe Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Mon, 23 Aug 2021 18:43:17 -0700 Subject: [PATCH 154/530] [Static Runtime] SR clones graph input (#63704) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63704 Previously SR did not clone the graph. This was leading to subtle bugs in `testStaticRuntime`; static runtime would modify its graph, and the graph used by the JIT interpreter would change as well. The JIT interpreter would then crash if SR-only ops were added! Cloning the graph is more consistent with the behavior of the `Module` ctor. Test Plan: `buck test caffe2/benchmarks/static_runtime/...` Reviewed By: hlu1 Differential Revision: D30463294 fbshipit-source-id: b771551a1f55f95fde79373b23babcf3e5ddf726 --- torch/csrc/jit/runtime/static/impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 4219be5040ba7..1b5ee724b45a4 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -548,7 +548,7 @@ PrepareForStaticModule( StaticModule::StaticModule( std::shared_ptr g, const StaticModuleOptions& opts) - : StaticModule(PrepareForStaticModule(g, opts), opts) {} + : StaticModule(PrepareForStaticModule(g->copy(), opts), opts) {} StaticModule::StaticModule( const torch::jit::Module& m, From f5d585391d13287250e85cbd55a17c5e0b8ac2a8 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Mon, 23 Aug 2021 18:44:46 -0700 Subject: [PATCH 155/530] Add ROCm as a platform for which tests can be disabled (#63813) Summary: Realized we were missing ROCm as a platform on which one could disable a flaky test. (like how this issue specifies windows https://github.com/pytorch/pytorch/issues/61655) cc jeffdaily sunway513 jithunnair-amd ROCmSupport Pull Request resolved: https://github.com/pytorch/pytorch/pull/63813 Reviewed By: seemethere Differential Revision: D30498478 Pulled By: janeyx99 fbshipit-source-id: f1abe8677e1ddd01de3291e1618272ad8e287dc4 --- torch/testing/_internal/common_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index a16056cd55cf7..b8e5b097bd6c0 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -906,8 +906,10 @@ def check_if_enable(test: unittest.TestCase): platform_to_conditional: Dict = { "mac": IS_MACOS, "macos": IS_MACOS, + "win": IS_WINDOWS, "windows": IS_WINDOWS, - "linux": IS_LINUX + "linux": IS_LINUX, + "rocm": TEST_WITH_ROCM } if platforms == [] or any([platform_to_conditional[platform] for platform in platforms]): raise unittest.SkipTest( From 5b7cdc5a3ddb9a1a3d46d05b2925b5b4713b0025 Mon Sep 17 00:00:00 2001 From: mingfeima Date: Mon, 23 Aug 2021 22:53:35 -0700 Subject: [PATCH 156/530] add channels last for GroupNorm (#49821) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49821 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D26007053 Pulled By: VitalyFedyunin fbshipit-source-id: 34a48d5d3b66a159febf3c3d96748fbaba1b9e31 --- .../src/ATen/native/cpu/group_norm_kernel.cpp | 162 ++++++++++++++++-- aten/src/ATen/native/group_norm.cpp | 17 +- test/test_nn.py | 34 ++++ 3 files changed, 193 insertions(+), 20 deletions(-) diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp index 290a6315da445..fb8db7e61800f 100644 --- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp @@ -74,6 +74,136 @@ void GroupNormKernelImplInternal( }); } +template +void GroupNormKernelImplChannelsLastInternal( + const Tensor& X, + const Tensor& gamma, + const Tensor& beta, + int64_t N, + int64_t C, + int64_t HxW, + int64_t group, + T eps, + Tensor& Y, + Tensor& mean, + Tensor& rstd) { + TORCH_CHECK(X.numel() == N * C * HxW); + TORCH_CHECK(!gamma.defined() || gamma.numel() == C); + TORCH_CHECK(!beta.defined() || beta.numel() == C); + const int64_t G = group; + const int64_t D = C / G; + const T* X_data = X.data_ptr(); + const T* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; + const T* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + T* Y_data = Y.data_ptr(); + T* mean_data = mean.data_ptr(); + T* rstd_data = rstd.data_ptr(); + const T s = T(1) / static_cast(D * HxW); + const bool gamma_null = (gamma_data == nullptr); + const bool beta_null = beta_data == nullptr; + + // temp buffer holding x and x2 + Tensor buffer = at::empty({N, 2 * C}, X.options()).zero_(); + T* buffer_data = buffer.data_ptr(); + + using Vec = vec::Vectorized; + at::parallel_for(0, N, 1, [&](int64_t start, int64_t end) { + constexpr int64_t K = Vec::size(); + const int64_t inner_size = C / K * K; + for (int64_t n = start; n < end; ++n) { + T* mean_ptr = buffer_data + n * 2 * C; + T* rstd_ptr = mean_ptr + C; + for (int64_t i = 0; i < HxW; ++i) { + const T* X_ptr = X_data + n * HxW * C + i * C; + for (int64_t j = 0; j < inner_size; j += K) { + const Vec x_vec = Vec::loadu(X_ptr + j); + Vec mean_vec = Vec::loadu(mean_ptr + j) + x_vec; + Vec rstd_vec = Vec::loadu(rstd_ptr + j) + x_vec * x_vec; + mean_vec.store(mean_ptr + j); + rstd_vec.store(rstd_ptr + j); + } + for (int64_t j = inner_size; j < C; ++j) { + mean_ptr[j] += X_ptr[j]; + rstd_ptr[j] += X_ptr[j] * X_ptr[j]; + } + } + + for (int64_t g = 0; g < G; ++g) { + T mean_val = T(0); + T rstd_val = T(0); + for (int64_t d = 0; d < D; ++d) { + mean_val += mean_ptr[g * D + d]; + rstd_val += rstd_ptr[g * D + d]; + } + mean_val *= s; + rstd_val = std::max(rstd_val * s - mean_val * mean_val, T(0)); + rstd_val = T(1) / std::sqrt(rstd_val + eps); + + // continue to use the temp buffer for mean and rstd value, + // so that we can vectorize the following math on entire C dimension. + for (int64_t d = 0; d < D; ++d) { + mean_ptr[g * D + d] = mean_val; + rstd_ptr[g * D + d] = rstd_val; + } + + mean_data[n * G + g] = mean_val; + rstd_data[n * G + g] = rstd_val; + } + + // expand gamma_null and beta_null to reduce if-else on critial path. + if (!gamma_null && !beta_null) { + for (int64_t i = 0; i < HxW; ++i) { + const T* X_ptr = X_data + n * HxW * C + i * C; + T* Y_ptr = Y_data + n * HxW * C + i * C; + for (int64_t j = 0; j < inner_size; j += K) { + Vec scale_vec = Vec::loadu(rstd_ptr + j) * Vec::loadu(gamma_data + j); + Vec bias_vec = Vec::loadu(beta_data + j) - scale_vec * Vec::loadu(mean_ptr + j); + Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) + bias_vec; + y_vec.store(Y_ptr + j); + } + for (int64_t j = inner_size; j < C; ++j) { + T scale = rstd_ptr[j] * gamma_data[j]; + T bias = -scale * mean_ptr[j] + beta_data[j]; + Y_ptr[j] = scale * X_ptr[j] + bias; + } + } + } else if (gamma_null && beta_null) { + for (int64_t i = 0; i < HxW; ++i) { + const T* X_ptr = X_data + n * HxW * C + i * C; + T* Y_ptr = Y_data + n * HxW * C + i * C; + for (int64_t j = 0; j < inner_size; j += K) { + Vec scale_vec = Vec::loadu(rstd_ptr + j); + Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) - scale_vec * Vec::loadu(mean_ptr + j); + y_vec.store(Y_ptr + j); + } + for (int64_t j = inner_size; j < C; ++j) { + T scale = rstd_ptr[j]; + Y_ptr[j] = scale * X_ptr[j] -scale * mean_ptr[j]; + } + } + } else { + for (int64_t i = 0; i < HxW; ++i) { + const T* X_ptr = X_data + n * HxW * C + i * C; + T* Y_ptr = Y_data + n * HxW * C + i * C; + for (int64_t j = 0; j < inner_size; j += K) { + Vec gamma_vec = gamma_null ? Vec(1) : Vec::loadu(gamma_data + j); + Vec beta_vec = beta_null ? Vec(0) : Vec::loadu(beta_data + j); + Vec scale_vec = Vec::loadu(rstd_ptr + j) * gamma_vec; + Vec bias_vec = beta_vec - scale_vec * Vec::loadu(mean_ptr + j); + Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) + bias_vec; + y_vec.store(Y_ptr + j); + } + for (int64_t j = inner_size; j < C; ++j) { + T scale = rstd_ptr[j] * (gamma_null ? T(1) : gamma_data[j]); + T bias = -scale * mean_ptr[j] + (beta_null ? T(0) : beta_data[j]); + Y_ptr[j] = scale * X_ptr[j] + bias; + } + } + } + } + }); +} + void GroupNormKernelImpl( const Tensor& X, const Tensor& gamma, @@ -86,20 +216,24 @@ void GroupNormKernelImpl( Tensor& Y, Tensor& mean, Tensor& rstd) { - AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GroupNormKernelImpl", [&]() { - GroupNormKernelImplInternal( - X, - gamma, - beta, - N, - C, - HxW, - group, - static_cast(eps), - Y, - mean, - rstd); - }); + switch (X.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GroupNormKernelImpl", [&]() { + GroupNormKernelImplInternal( + X, gamma, beta, N, C, HxW, group, static_cast(eps), Y, mean, rstd); + }); + break; + } + case at::MemoryFormat::ChannelsLast: { + AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GroupNormKernelImpl", [&]() { + GroupNormKernelImplChannelsLastInternal( + X, gamma, beta, N, C, HxW, group, static_cast(eps), Y, mean, rstd); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } } template diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 3a60d19959f83..5533780a4547e 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -31,7 +31,10 @@ std::tuple native_group_norm( const Tensor& gamma = *gamma_maybe_owned; const Tensor& beta = c10::value_or_else(beta_opt, [] { return Tensor(); }); - TORCH_CHECK(X.is_contiguous()); + auto memory_format = X.device().is_cpu() ? + X.suggest_memory_format() : at::MemoryFormat::Contiguous; + + TORCH_CHECK(X.is_contiguous(memory_format)); Tensor Y = at::native::empty_like( X, @@ -39,7 +42,7 @@ std::tuple native_group_norm( c10::nullopt /* layout */, c10::nullopt /* device */, c10::nullopt /* pin_memory */, - LEGACY_CONTIGUOUS_MEMORY_FORMAT); + memory_format); Tensor mean = at::empty({N, group}, X.options()); Tensor rstd = at::empty({N, group}, X.options()); GroupNormKernel( @@ -73,7 +76,7 @@ std::tuple native_group_norm_backward( c10::nullopt /* layout */, c10::nullopt /* device */, c10::nullopt /* pin_memory */, - LEGACY_CONTIGUOUS_MEMORY_FORMAT); + at::MemoryFormat::Contiguous); } if (grad_input_mask[1]) { dgamma = at::native::empty_like( @@ -82,7 +85,7 @@ std::tuple native_group_norm_backward( c10::nullopt /* layout */, c10::nullopt /* device */, c10::nullopt /* pin_memory */, - LEGACY_CONTIGUOUS_MEMORY_FORMAT); + at::MemoryFormat::Contiguous); } if (grad_input_mask[2]) { dbeta = at::native::empty_like( @@ -91,7 +94,7 @@ std::tuple native_group_norm_backward( c10::nullopt /* layout */, c10::nullopt /* device */, c10::nullopt /* pin_memory */, - LEGACY_CONTIGUOUS_MEMORY_FORMAT); + at::MemoryFormat::Contiguous); } GroupNormBackwardKernel( X.device().type(), @@ -153,7 +156,9 @@ Tensor group_norm( c10::multiply_integers(input_shape.cbegin() + 2, input_shape.cend()); const Tensor kEmpty; - const auto& X = input.is_contiguous() ? input : input.contiguous(); + auto memory_format = input.suggest_memory_format(); + const auto& X = input.device().is_cpu() ? + input.contiguous(memory_format) : input.contiguous(); const auto& gamma = weight.defined() ? weight.contiguous() : kEmpty; const auto& beta = bias.defined() ? bias.contiguous() : kEmpty; TORCH_CHECK(!gamma.defined() || gamma.numel() == C); diff --git a/test/test_nn.py b/test/test_nn.py index bb109cf20e459..f4691e6a5fa03 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -13026,6 +13026,40 @@ def test_GroupNorm_empty(self, device): with torch.backends.cudnn.flags(enabled=False): self._test_module_empty_input(mod, inp) + @onlyCPU + @dtypes(torch.float, torch.double) + def test_groupnorm_nhwc(self, device, dtype): + def helper(self, size, groups): + channels = size[1] + input = torch.randn(size, dtype=dtype, device=device, requires_grad=True) + input = input.contiguous(memory_format=torch.channels_last) + input.retain_grad() + grad = torch.randn(size, dtype=dtype, device=device) + grad = grad.contiguous(memory_format=torch.channels_last) + gn = nn.GroupNorm(groups, channels).to(device).to(dtype) + gn.weight.data.uniform_() + gn.bias.data.uniform_() + + ref_input = input.detach().clone().contiguous().requires_grad_(True) + ref_grad = grad.detach().clone().contiguous() + ref_gn = nn.GroupNorm(groups, channels).to(device).to(dtype) + ref_gn.load_state_dict(gn.state_dict()) + + out = gn(input) + out.backward(grad) + ref_out = ref_gn(ref_input) + ref_out.backward(ref_grad) + + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_out.is_contiguous()) + self.assertEqual(out, ref_out) + self.assertEqual(gn.weight.grad, ref_gn.weight.grad) + self.assertEqual(gn.bias.grad, ref_gn.bias.grad) + self.assertEqual(input.grad, ref_input.grad) + + helper(self, (4, 8, 10, 10), 4) + helper(self, (2, 30, 9, 9), 3) + @onlyOnCPUAndCUDA def test_GroupNorm_numeric(self, device): def group_norm_ref(X, gamma, beta, groups, channels, eps): From dd96c26066fd8e31dc768002e207477c38f86b7a Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Tue, 24 Aug 2021 00:29:22 -0700 Subject: [PATCH 157/530] [TensorExpr] More NFC changes like Expr* -> ExprPtr. (#63778) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63778 This is a preparation for a switch from raw pointers to shared pointers as a memory model for TE expressions and statements. Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D30487425 Pulled By: ZolotukhinM fbshipit-source-id: 9cbe817b7d4e5fc2f150b29bb9b3bf578868f20c --- benchmarks/cpp/tensorexpr/bench_approx.cpp | 26 +++--- benchmarks/cpp/tensorexpr/bench_batchnorm.cpp | 4 +- benchmarks/cpp/tensorexpr/bench_compile.cpp | 4 +- benchmarks/cpp/tensorexpr/bench_concat.cpp | 42 +++++----- benchmarks/cpp/tensorexpr/bench_gemm.cpp | 80 +++++++++---------- benchmarks/cpp/tensorexpr/bench_parallel.cpp | 4 +- benchmarks/cpp/tensorexpr/bench_reduce.cpp | 32 ++++---- test/cpp/tensorexpr/test_llvm.cpp | 2 +- test/cpp/tensorexpr/test_loopnest.cpp | 70 ++++++++-------- test/cpp/tensorexpr/test_memdependency.cpp | 6 +- test/cpp/tensorexpr/test_reductions.cpp | 24 +++--- test/cpp/tensorexpr/tutorial.cpp | 4 +- torch/csrc/jit/tensorexpr/cuda_codegen.cpp | 29 +++---- torch/csrc/jit/tensorexpr/half_support.h | 3 +- .../jit/tensorexpr/mem_dependency_checker.h | 4 +- .../csrc/jit/tensorexpr/operators/reduction.h | 6 +- 16 files changed, 172 insertions(+), 168 deletions(-) diff --git a/benchmarks/cpp/tensorexpr/bench_approx.cpp b/benchmarks/cpp/tensorexpr/bench_approx.cpp index 1f09b1dbac5c1..6e31697d586dd 100644 --- a/benchmarks/cpp/tensorexpr/bench_approx.cpp +++ b/benchmarks/cpp/tensorexpr/bench_approx.cpp @@ -12,19 +12,19 @@ using namespace torch::jit::tensorexpr; void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target, int width) { auto loops = ln->getLoopStmtsFor(target); - For *inner, *tail; + ForPtr inner, tail; ln->splitWithTail(loops[0], width, &inner, &tail); ln->vectorize(inner); } void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target) { - std::vector loops = ln->getLoopStmtsFor(target); - For *inner, *tail; + std::vector loops = ln->getLoopStmtsFor(target); + ForPtr inner, tail; ln->splitWithTail(loops[0], 16 * 8, &inner, &tail); - For* outer = loops[0]; + ForPtr outer = loops[0]; ln->vectorize(inner); ln->splitWithTail(outer, 8, &inner, &tail); - Stmt* unrolled; + StmtPtr unrolled; LoopNest::unroll(inner, &unrolled); } @@ -44,7 +44,7 @@ static void relu_nnc(benchmark::State& state) { LoopNest ln({B}); optimizePointwise(&ln, B); ln.prepareForCodegen(); - Stmt* s = ln.root_stmt(); + StmtPtr s = ln.root_stmt(); s = torch::jit::tensorexpr::IRSimplifier::simplify(s); std::vector args; args.emplace_back(B); @@ -74,7 +74,7 @@ static void log_nnc_sleef(benchmark::State& state) { LoopNest ln({B}); ln.prepareForCodegen(); vectorize(&ln, B, 8); - Stmt* s = ln.root_stmt(); + StmtPtr s = ln.root_stmt(); s = torch::jit::tensorexpr::IRSimplifier::simplify(s); std::vector args; args.emplace_back(B); @@ -104,7 +104,7 @@ static void log_nnc_fast(benchmark::State& state) { LoopNest ln({B}); optimizePointwise(&ln, B); ln.prepareForCodegen(); - Stmt* s = ln.root_stmt(); + StmtPtr s = ln.root_stmt(); s = torch::jit::tensorexpr::IRSimplifier::simplify(s); std::vector args; args.emplace_back(B); @@ -134,7 +134,7 @@ static void log_nnc_vml(benchmark::State& state) { LoopNest ln({B}); vectorize(&ln, B, 8); ln.prepareForCodegen(); - Stmt* s = ln.root_stmt(); + StmtPtr s = ln.root_stmt(); s = torch::jit::tensorexpr::IRSimplifier::simplify(s); std::vector args; args.emplace_back(B); @@ -181,7 +181,7 @@ static void logit_nnc_sleef(benchmark::State& state) { LoopNest ln({B}); ln.prepareForCodegen(); optimizePointwise(&ln, B); - Stmt* s = ln.root_stmt(); + StmtPtr s = ln.root_stmt(); s = torch::jit::tensorexpr::IRSimplifier::simplify(s); std::vector args; args.emplace_back(B); @@ -218,7 +218,7 @@ static void logit_nnc_fast(benchmark::State& state) { LoopNest ln({B}); ln.prepareForCodegen(); optimizePointwise(&ln, B); - Stmt* s = ln.root_stmt(); + StmtPtr s = ln.root_stmt(); s = torch::jit::tensorexpr::IRSimplifier::simplify(s); std::vector args; args.emplace_back(B); @@ -255,7 +255,7 @@ static void logit_nnc_vml(benchmark::State& state) { LoopNest ln({B}); ln.prepareForCodegen(); vectorize(&ln, B, 16); - Stmt* s = ln.root_stmt(); + StmtPtr s = ln.root_stmt(); s = torch::jit::tensorexpr::IRSimplifier::simplify(s); std::vector args; args.emplace_back(B); @@ -326,7 +326,7 @@ static void tanh_nnc_fast(benchmark::State& state) { LoopNest ln({B}); optimizePointwise(&ln, B); ln.prepareForCodegen(); - Stmt* s = ln.root_stmt(); + StmtPtr s = ln.root_stmt(); s = torch::jit::tensorexpr::IRSimplifier::simplify(s); std::vector args; args.emplace_back(B); diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp index 434cd6bfdbb8e..872594ec286b7 100644 --- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp @@ -105,7 +105,7 @@ BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) { loops = nest.getLoopStmtsFor(output); loops[0]->set_parallel(); nest.prepareForCodegen(); - Stmt* s = IRSimplifier::simplify(nest.root_stmt()); + StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps}); std::vector args; @@ -163,7 +163,7 @@ BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) { }); LoopNest nest({output}); nest.prepareForCodegen(); - Stmt* s = IRSimplifier::simplify(nest.root_stmt()); + StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps}); std::vector args; diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp index cc84e65a545b2..245d5d8b203c5 100644 --- a/benchmarks/cpp/tensorexpr/bench_compile.cpp +++ b/benchmarks/cpp/tensorexpr/bench_compile.cpp @@ -33,7 +33,7 @@ static void BM_CompileSwish(benchmark::State& state) { nest.computeInline(tensor->buf()); } nest.prepareForCodegen(); - te::Stmt* s = te::IRSimplifier::simplify(nest.root_stmt()); + te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt()); te::LLVMCodeGen cg(s, {A, sixth, n}); } } @@ -63,7 +63,7 @@ static void BM_CompileSwishLLVMOnly(benchmark::State& state) { nest.computeInline(tensor->buf()); } nest.prepareForCodegen(); - te::Stmt* s = te::IRSimplifier::simplify(nest.root_stmt()); + te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt()); for (auto _ : state) { te::LLVMCodeGen cg(s, {A, sixth, n}); } diff --git a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp index a437967a09497..cb9aa84150e88 100644 --- a/benchmarks/cpp/tensorexpr/bench_concat.cpp +++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp @@ -83,7 +83,7 @@ class ConcatBench : public benchmark::Fixture { }); LoopNest nest({output}); nest.prepareForCodegen(); - Stmt* s = IRSimplifier::simplify(nest.root_stmt()); + StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); std::vector buf_args(inputs.begin(), inputs.end()); buf_args.push_back(output); LLVMCodeGen cg(s, buf_args); @@ -108,47 +108,51 @@ class ConcatBench : public benchmark::Fixture { TORCH_INTERNAL_ASSERT(concat_dim_ == 1); - auto output_buf = new Buf( - new Var("aten_cat", kHandle), - {new IntImm(output_size_[0]), new IntImm(output_size_[1])}, + auto output_buf = alloc( + alloc("aten_cat", kHandle), + std::vector( + {alloc(output_size_[0]), alloc(output_size_[1])}), kFloat); std::vector inputs; - std::vector for_stmts(num_inputs); + std::vector for_stmts(num_inputs); int cumulative_input_sizes = 0; for (size_t i = 0; i < num_inputs; ++i) { inputs.emplace_back(Placeholder( "input" + std::to_string(i), kFloat, {input_sizes_[i][0], input_sizes_[i][1]})); - std::vector for_vars(num_inputs); + std::vector for_vars(num_inputs); for (size_t d = 0; d < num_dims; ++d) { for_vars[d] = - new Var("i" + std::to_string(i) + "_" + std::to_string(d), kInt); + alloc("i" + std::to_string(i) + "_" + std::to_string(d), kInt); } - auto store = new Store( + auto store = alloc( output_buf, - {for_vars[0], - new Add(for_vars[1], new IntImm(cumulative_input_sizes))}, - new Load(inputs[i].data(), {for_vars[0], for_vars[1]})); - auto for_st = new For( + std::vector( + {for_vars[0], + alloc(for_vars[1], alloc(cumulative_input_sizes))}), + alloc( + inputs[i].data(), + std::vector({for_vars[0], for_vars[1]}))); + auto for_st = alloc( for_vars[0], - new IntImm(0), - new IntImm(input_sizes_[i][0]), - new For( + alloc(0), + alloc(input_sizes_[i][0]), + alloc( for_vars[1], - new IntImm(0), - new IntImm(input_sizes_[i][1]), + alloc(0), + alloc(input_sizes_[i][1]), store)); for_stmts[i] = for_st; cumulative_input_sizes += input_sizes_[i][1]; } - auto output = new Tensor(output_buf, new Block(for_stmts)); + auto output = new Tensor(output_buf, alloc(for_stmts)); LoopNest nest({output}); nest.prepareForCodegen(); nest.vectorizeInnerLoops(); - Stmt* s = IRSimplifier::simplify(nest.root_stmt()); + StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); std::vector buf_args(inputs.begin(), inputs.end()); buf_args.push_back(output); LLVMCodeGen cg(s, buf_args); diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp index 792d457c2f23a..7ebaa87781514 100644 --- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_gemm.cpp @@ -54,7 +54,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) { {{K, "K"}}); te::LoopNest loop({CT}); loop.prepareForCodegen(); - te::Stmt* s = loop.root_stmt(); + te::StmtPtr s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT}); @@ -80,41 +80,41 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) { { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* m = loops[0]; + te::ForPtr m = loops[0]; loop.splitWithMask(m, 32); } { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* n = loops[2]; + te::ForPtr n = loops[2]; loop.splitWithMask(n, 32); } // mo, mi, no, ni, k -> // mo, no, mi, ni, k { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* mi = loops[1]; - te::For* no = loops[2]; + te::ForPtr mi = loops[1]; + te::ForPtr no = loops[2]; loop.reorderAxis(mi, no); } // mo, no, mi, ni, k -> // mo, no, mi, k, ni { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* ni = loops[3]; - te::For* k = loops[4]; + te::ForPtr ni = loops[3]; + te::ForPtr k = loops[4]; loop.reorderAxis(ni, k); } // mo, no, mi, k, ni -> // mo, no, k, mi, ni { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* mi = loops[2]; - te::For* k = loops[3]; + te::ForPtr mi = loops[2]; + te::ForPtr k = loops[3]; loop.reorderAxis(mi, k); } loop.prepareForCodegen(); - te::Stmt* s = loop.root_stmt(); + te::StmtPtr s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT}); @@ -140,41 +140,41 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) { { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* m = loops[0]; + te::ForPtr m = loops[0]; loop.splitWithMask(m, 4); } { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* n = loops[2]; + te::ForPtr n = loops[2]; loop.splitWithMask(n, 16); } // mo, mi, no, ni, k -> // mo, no, mi, ni, k { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* mi = loops[1]; - te::For* no = loops[2]; + te::ForPtr mi = loops[1]; + te::ForPtr no = loops[2]; loop.reorderAxis(mi, no); } // mo, no, mi, ni, k -> // mo, no, mi, k, ni { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* ni = loops[3]; - te::For* k = loops[4]; + te::ForPtr ni = loops[3]; + te::ForPtr k = loops[4]; loop.reorderAxis(ni, k); } // mo, no, mi, k, ni -> // mo, no, k, mi, ni { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* mi = loops[2]; - te::For* k = loops[3]; + te::ForPtr mi = loops[2]; + te::ForPtr k = loops[3]; loop.reorderAxis(mi, k); } loop.prepareForCodegen(); - te::Stmt* s = loop.root_stmt(); + te::StmtPtr s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT}); @@ -200,49 +200,49 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) { { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* m = loops[0]; + te::ForPtr m = loops[0]; loop.splitWithMask(m, 4); } { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* n = loops[2]; + te::ForPtr n = loops[2]; loop.splitWithMask(n, 16); } // mo, mi, no, ni, k -> // mo, no, mi, ni, k { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* mi = loops[1]; - te::For* no = loops[2]; + te::ForPtr mi = loops[1]; + te::ForPtr no = loops[2]; loop.reorderAxis(mi, no); } // mo, no, mi, ni, k -> // mo, no, mi, k, ni { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* ni = loops[3]; - te::For* k = loops[4]; + te::ForPtr ni = loops[3]; + te::ForPtr k = loops[4]; loop.reorderAxis(ni, k); } // mo, no, mi, k, ni -> // mo, no, k, mi, ni { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* mi = loops[2]; - te::For* k = loops[3]; + te::ForPtr mi = loops[2]; + te::ForPtr k = loops[3]; loop.reorderAxis(mi, k); } { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* mi = loops[3]; - te::For* ni = loops[4]; - te::Stmt* unrolled; + te::ForPtr mi = loops[3]; + te::ForPtr ni = loops[4]; + te::StmtPtr unrolled; loop.vectorize(ni); loop.unroll(mi, &unrolled); } loop.prepareForCodegen(); - te::Stmt* s = loop.root_stmt(); + te::StmtPtr s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT}); @@ -268,36 +268,36 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) { { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* m = loops[0]; + te::ForPtr m = loops[0]; loop.splitWithMask(m, 4); } { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* n = loops[2]; + te::ForPtr n = loops[2]; loop.splitWithMask(n, 16); } // mo, mi, no, ni, k -> // mo, no, mi, ni, k { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* mi = loops[1]; - te::For* no = loops[2]; + te::ForPtr mi = loops[1]; + te::ForPtr no = loops[2]; loop.reorderAxis(mi, no); } // mo, no, mi, ni, k -> // mo, no, mi, k, ni { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* ni = loops[3]; - te::For* k = loops[4]; + te::ForPtr ni = loops[3]; + te::ForPtr k = loops[4]; loop.reorderAxis(ni, k); } // mo, no, mi, k, ni -> // mo, no, k, mi, ni { auto const& loops = loop.getLoopStmtsFor(CT); - te::For* mi = loops[2]; - te::For* k = loops[3]; + te::ForPtr mi = loops[2]; + te::ForPtr k = loops[3]; loop.reorderAxis(mi, k); } { @@ -306,7 +306,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) { } loop.prepareForCodegen(); - te::Stmt* s = loop.root_stmt(); + te::StmtPtr s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT}); diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp index fee326cdd4bd4..966c9e2a6853d 100644 --- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp +++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp @@ -44,10 +44,10 @@ BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) { }); LoopNest loop_nest({c_tensor}); auto const& loops = loop_nest.getLoopStmtsFor(c_tensor); - For* m = loops[0]; + ForPtr m = loops[0]; m->set_parallel(); loop_nest.prepareForCodegen(); - Stmt* stmt = loop_nest.root_stmt(); + StmtPtr stmt = loop_nest.root_stmt(); LLVMCodeGen cg(stmt, {c_tensor, a_buf, b_buf}); float* a_ptr = A.data_ptr(); diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp index acd46ac1de410..be5dcc815bc68 100644 --- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp +++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp @@ -233,7 +233,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) { te::LoopNest loop({BT}); loop.prepareForCodegen(); - te::Stmt* s = loop.root_stmt(); + te::StmtPtr s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT}); @@ -269,12 +269,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) { { auto const& loops = loop.getLoopStmtsFor(BT); - te::For* m = loops[1]; + te::ForPtr m = loops[1]; loop.splitWithTail(m, kChunkSize); } loop.prepareForCodegen(); - te::Stmt* s = loop.root_stmt(); + te::StmtPtr s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT}); @@ -310,12 +310,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) { { auto const& loops = loop.getLoopStmtsFor(BT); - te::For* m = loops[1]; + te::ForPtr m = loops[1]; loop.splitWithMask(m, kChunkSize); } loop.prepareForCodegen(); - te::Stmt* s = loop.root_stmt(); + te::StmtPtr s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT}); @@ -349,17 +349,17 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { {{M, "M"}}); te::LoopNest loop({BT}); - te::Buf* rfac_buf; + te::BufPtr rfac_buf; auto loops = loop.getLoopStmtsFor(BT); TORCH_CHECK(loops.size() == 1); - te::For* mi; + te::ForPtr mi; loop.splitWithMask(loops.at(0), kChunkSize, &mi); - te::For* mo = loops.at(0); + te::ForPtr mo = loops.at(0); loop.reorderAxis(mo, mi); loops = loop.getLoopStmtsFor(BT); - auto bt_body = const_cast(loop.getAllWritesToBuf(BT->buf())[1]); + auto bt_body = loop.getAllWritesToBuf(BT->buf())[1]; TORCH_CHECK(loop.rfactor(bt_body, loops.at(0), &rfac_buf)); loop.reorderAxis(loops.at(0), loops.at(1)); @@ -368,7 +368,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { loop.vectorize(loops.at(1)); loop.prepareForCodegen(); - te::Stmt* s = loop.root_stmt(); + te::StmtPtr s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT}); @@ -394,8 +394,8 @@ BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) { te::LoopNest nest({b}); auto loops = nest.getLoopStmtsFor(b); - te::For *mi, *mo; - te::Buf *rf; + te::ForPtr mi, mo; + te::BufPtr rf; nest.splitWithMask(loops[0], kChunkSize, &mi); loops = nest.reorder({loops[0], mi}, {1, 0}); nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf); @@ -566,8 +566,8 @@ BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) { auto sch = state.range(2); if (sch == 1) { auto loops = nest.getLoopStmtsFor(b); - te::For *mi, *mo; - te::Buf *rf; + te::ForPtr mi, mo; + te::BufPtr rf; nest.splitWithMask(loops[1], kChunkSize, &mi); loops = nest.reorder({loops[1], mi}, {1, 0}); TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf)); @@ -583,8 +583,8 @@ BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) { nest.reorderAxis(loops[1], loops[2]); } else if (sch == 3) { auto loops = nest.getLoopStmtsFor(b); - te::For *mi, *mo; - te::Buf *rf; + te::ForPtr mi, mo; + te::BufPtr rf; nest.splitWithMask(loops[1], kChunkSize, &mi); loops = nest.reorder({loops[1], mi}, {1, 0}); TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf)); diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 3776329a86a51..75e6a064d1ac5 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -1642,7 +1642,7 @@ TEST(LLVM, CompositeParallel) { [=](const VarHandle& m, const VarHandle& n) { return t3->load(m, n) + m + n; }); - LoopNest loop_nest({t4}, {t1, t2, t3, t4}); + LoopNest loop_nest(std::vector({t4}), {t1, t2, t3, t4}); std::vector loop_list; { auto const& loops = loop_nest.getLoopStmtsFor(t1); diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 898ee5293edab..c80dd5f492d95 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -1011,7 +1011,7 @@ TEST(LoopNest, ScheduleFunctionCall01) { return c->load(m, n, k) + 1; }); - LoopNest l({d}, {c, d}); + LoopNest l(std::vector({d}), {c, d}); l.prepareForCodegen(); StmtPtr stmt = l.root_stmt(); std::ostringstream oss; @@ -1071,7 +1071,7 @@ TEST(LoopNest, ScheduleInlineSimple) { return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k); }); - LoopNest l1({y}, {x, y}); + LoopNest l1(std::vector({y}), {x, y}); LoopNest l2(l1); l2.computeInline(x->buf()); @@ -1158,7 +1158,7 @@ void InlineFunc01Helper(const std::vector& inline_order) { return x->load(m, n, k) + y->load(m, n, k); }); - LoopNest l({z}, {x, y, z}); + LoopNest l(std::vector({z}), {x, y, z}); for (const std::string& order : inline_order) { if (order == "x") { l.computeInline(x->buf()); @@ -1267,7 +1267,7 @@ TEST(LoopNest, ScheduleInlineRandom) { return x->load(m, n, k) + x->load(m, n, k); }); - LoopNest l1({y}, {x, y}); + LoopNest l1(std::vector({y}), {x, y}); l1.computeInline(x->buf()); // would normally compare results but Rand isn't implemented in the @@ -1304,7 +1304,7 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) { Intrinsics::make(kRand, kInt); }); - LoopNest l1({y}, {x, y}); + LoopNest l1(std::vector({y}), {x, y}); l1.computeInline(x->buf()); // would normally compare results but Rand isn't implemented in the @@ -1337,7 +1337,7 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) { return x->load(m) + x->load(m); }); - LoopNest l1({y}, {x, y}); + LoopNest l1(std::vector({y}), {x, y}); l1.computeInline(x->buf()); // would normally compare results but Rand isn't implemented in the @@ -1389,7 +1389,7 @@ TEST(LoopNest, ScheduleInlineIntrinsics) { } } - LoopNest l1({y}, {x, y}); + LoopNest l1(std::vector({y}), {x, y}); LoopNest l2(l1); l2.computeInline(x->buf()); @@ -1434,7 +1434,7 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) { return Intrinsics::make(kSqrt, x->load(m, n, k)); }); - LoopNest l1({y}, {x, y}); + LoopNest l1(std::vector({y}), {x, y}); l1.computeInline(x->buf()); StmtPtr stmt1 = IRSimplifier::simplify(l1.root_stmt()); @@ -1457,7 +1457,7 @@ TEST(LoopNest, ScheduleSplitAThenInline) { return a->load(j + ExprHandle(8)); }); - LoopNest l({b}, {a, b}); + LoopNest l(std::vector({b}), {a, b}); std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); LoopNest::splitWithMask(loops[0], 4); ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices"); @@ -1472,7 +1472,7 @@ TEST(LoopNest, ScheduleSplitBThenInline) { return a->load(j + ExprHandle(8)); }); - LoopNest l({b}, {a, b}); + LoopNest l(std::vector({b}), {a, b}); std::vector loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0); LoopNest::splitWithMask(loops[0], 3); l.computeInline(a->buf()); @@ -1499,7 +1499,7 @@ TEST(LoopNest, ScheduleSplitTwiceThenInline) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr i_inner; - LoopNest l({b}, {a, b}); + LoopNest l(std::vector({b}), {a, b}); std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); LoopNest::splitWithMask(loops[0], 4, &i_inner); LoopNest::splitWithMask(i_inner, 2); @@ -1515,7 +1515,7 @@ TEST(LoopNest, ScheduleInlineThenSplit) { return a->load(j + ExprHandle(8)); }); - LoopNest l({b}, {a, b}); + LoopNest l(std::vector({b}), {a, b}); l.computeInline(a->buf()); std::vector loops = NodeFinder::find(l.root_stmt()); @@ -1540,7 +1540,7 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) { return a->load(j + ExprHandle(8)); }); - LoopNest l({b}, {a, b}); + LoopNest l(std::vector({b}), {a, b}); auto loops = NodeFinder::find(l.root_stmt()); LoopNest::splitWithMask(loops.back(), 2); l.computeInline(a->buf()); @@ -1568,7 +1568,7 @@ TEST(LoopNest, ScheduleSplitInlineSimplify) { return a->load(j) - ExprHandle(1); }); - LoopNest l({b}, {a, b}); + LoopNest l(std::vector({b}), {a, b}); std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); LoopNest::splitWithMask(loops[0], 4); ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices"); @@ -1587,7 +1587,7 @@ TEST(LoopNest, ScheduleInlineThreeMixedOnce) { return a->load(k) * b->load(l); }); - LoopNest l({c}, {a, b, c}); + LoopNest l(std::vector({c}), {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); l.computeInline(a->buf()); l.prepareForCodegen(); @@ -1617,7 +1617,7 @@ TEST(LoopNest, ScheduleInlineThreeMixedTwice) { return a->load(k) * b->load(l); }); - LoopNest l({c}, {a, b, c}); + LoopNest l(std::vector({c}), {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); l.computeInline(a->buf()); l.computeInline(b->buf()); @@ -1648,7 +1648,7 @@ TEST(LoopNest, ScheduleInlineThreeMixedInner) { return a->load(k) * b->load(l); }); - LoopNest l({c}, {a, b, c}); + LoopNest l(std::vector({c}), {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); l.computeInline(b->buf()); l.prepareForCodegen(); @@ -1678,7 +1678,7 @@ TEST(LoopNest, ScheduleInlineThreeMixedSplit) { return a->load(k) * b->load(l); }); - LoopNest l({c}, {a, b, c}); + LoopNest l(std::vector({c}), {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); LoopNest::splitWithMask(loops[0], 4); loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0); @@ -1782,7 +1782,7 @@ TEST(LoopNest, ScheduleFuserThreeArg) { return f->load(i) + d.load(i); }); - LoopNest l({g}, {e, f, g}); + LoopNest l(std::vector({g}), {e, f, g}); l.computeInline(l.getLoopBodyFor(e)); l.computeInline(l.getLoopBodyFor(f)); l.prepareForCodegen(); @@ -1846,7 +1846,7 @@ TEST(LoopNest, LoopNestComputeAt_1) { "A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; }); Tensor* B = Compute( "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A->load(i_b); }); - LoopNest l({B}, {A, B}); + LoopNest l(std::vector({B}), {A, B}); std::vector loops = l.getAllLoopNestsWritingToBuf(B->buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); l.prepareForCodegen(); @@ -1909,7 +1909,7 @@ TEST(LoopNest, LoopNestComputeAt_2) { c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1); } } - LoopNest orig_loopnest({c}, {p, c}); + LoopNest orig_loopnest(std::vector({c}), {p, c}); { // First let's try to compute P at axis cy (the outer loop) @@ -2009,7 +2009,7 @@ TEST(LoopNest, LoopNestComputeAt_3) { } } - LoopNest orig_loopnest({D}, {A, B, C, D}); + LoopNest orig_loopnest(std::vector({D}), {A, B, C, D}); { // First let's try to compute A at axis dy (the outer loop) LoopNest l(orig_loopnest); @@ -2100,7 +2100,7 @@ TEST(LoopNest, Reduce2dComputeAt) { c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1); } } - LoopNest orig_loopnest({c}, {p, c}); + LoopNest orig_loopnest(std::vector({c}), {p, c}); checkIR(orig_loopnest.root_stmt(), R"IR( # CHECK: for (int py = 0; py < H + 1; py++) { # CHECK: for (int px = 0; px < W + 1; px++) { @@ -2771,7 +2771,7 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { return x->load(m, n, k) + y->load(m, n, k); }); - LoopNest l({z}, {x, y, z}); + LoopNest l(std::vector({z}), {x, y, z}); ForPtr a = nullptr; ForPtr b = nullptr; auto fors = NodeFinder::find(l.root_stmt()); @@ -2983,7 +2983,7 @@ TEST(LoopNest, UnrollMultipleStatements) { Block::make( {Store::make(a_buf, {x}, x * 2), Store::make(b_buf, {x}, Load::make(a_buf, {x}))})); - Block::make({f}); + auto parent_block = Block::make({f}); StmtPtr unrolled = nullptr; LoopNest::unroll(f, &unrolled); checkIR(unrolled, R"IR( @@ -3069,7 +3069,7 @@ TEST(LoopNest, UnrollWithLet) { {Let::make(e, 7), Store::make(a_buf, {x}, e), Store::make(b_buf, {x}, e + 1)})); - Block::make({f}); + auto parent_block = Block::make({f}); StmtPtr unrolled = nullptr; LoopNest::unroll(f, &unrolled); std::ostringstream oss; @@ -3680,7 +3680,7 @@ TEST(LoopNest, DetectInlineRankMismatch) { "reshape", {{kTotalSize / 2, "i"}, {2, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return a->load(i, j); }); - LoopNest l({reshape}, {a, reshape}); + LoopNest l(std::vector({reshape}), {a, reshape}); ASSERT_THROWS_WITH( l.computeInline(l.getLoopBodyFor(a)), "Placeholder indexed access is inconsistent with its rank"); @@ -3702,7 +3702,7 @@ TEST(LoopNest, CacheReadsSimple) { return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); }); - LoopNest l({B, C}, {A, B, C}); + LoopNest l(std::vector({B, C}), {A, B, C}); StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1]; LoopNest::cacheAccesses(A->buf(), "A_local", j_loop); @@ -3770,7 +3770,7 @@ TEST(LoopNest, CacheReadsOuter) { return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); }); - LoopNest l({B, C}, {A, B, C}); + LoopNest l(std::vector({B, C}), {A, B, C}); StmtPtr i_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][0]; LoopNest::cacheAccesses(A->buf(), "A_local", i_loop); @@ -3818,7 +3818,7 @@ TEST(LoopNest, CacheReadsInternal) { return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); }); - LoopNest l({B, C}, {A, B, C}); + LoopNest l(std::vector({B, C}), {A, B, C}); StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1]; LoopNest::cacheAccesses(A->buf(), "A_local", j_loop); l.prepareForCodegen(); @@ -3866,7 +3866,7 @@ TEST(LoopNest, CacheReadsInner) { return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); }); - LoopNest l({B, C}, {A, B, C}); + LoopNest l(std::vector({B, C}), {A, B, C}); StmtPtr body = l.getLoopBodyFor(B); LoopNest::cacheAccesses(A->buf(), "A_local", body); l.prepareForCodegen(); @@ -3913,7 +3913,7 @@ TEST(LoopNest, CacheWritesSimple) { return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); }); - LoopNest l({B, C}, {A, B, C}); + LoopNest l(std::vector({B, C}), {A, B, C}); StmtPtr a_loop = l.getAllLoopNestsWritingToBuf(A->buf())[0][1]; LoopNest::cacheAccesses(A->buf(), "A_local", a_loop); @@ -4093,7 +4093,7 @@ TEST(LoopNest, InlineConstantIndex) { return y->load(m, n, o); }); - LoopNest l({z}, {y, z}); + LoopNest l(std::vector({z}), {y, z}); l.simplify(); ASSERT_TRUE(l.computeInline(y->buf())); } @@ -4121,7 +4121,7 @@ TEST(LoopNest, CompoundTensorUsed) { return A->load(i, j + 1) + A->load(i, j + 2); }); - LoopNest l({B}, {A, B}); + LoopNest l(std::vector({B}), {A, B}); ASSERT_FALSE(l.computeInline(A->buf())); l.prepareForCodegen(); @@ -4897,7 +4897,7 @@ TEST(LoopNest, VectorizeUse) { "b", {{N, "n"}}, [&](const VarHandle& n) { return a.load(n) + 1.0f; }); Tensor* c = Compute( "c", {{N, "n"}}, [&](const VarHandle& n) { return b->load(n) + 2.0f; }); - LoopNest nest({c}, {b, c}); + LoopNest nest(std::vector({c}), {b, c}); auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[0]; ASSERT_TRUE(LoopNest::vectorize(loops[0])); loops = nest.getAllLoopNestsWritingToBuf(c->buf())[0]; diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp index 7f844c5ba4cf4..9503f9d57b726 100644 --- a/test/cpp/tensorexpr/test_memdependency.cpp +++ b/test/cpp/tensorexpr/test_memdependency.cpp @@ -2739,7 +2739,7 @@ TEST(MemDependency, MemDependencyCheckerComputeAPI) { return c->load(m, n, k) + 1; }); - LoopNest l({d}, {c, d}); + LoopNest l(std::vector({d}), {c, d}); MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()}); @@ -2786,7 +2786,7 @@ TEST(MemDependency, MemDependencyCheckerComputeInline) { return c->load(m, n, k) + 1; }); - LoopNest l({d}, {c, d}); + LoopNest l(std::vector({d}), {c, d}); l.computeInline(c->buf()); MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()}); @@ -2935,7 +2935,7 @@ TEST(MemDependency, MemDependencyCheckerComputeReduce) { return b.load(l, n, m) * a.load(l, n, m); }); Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}}); - LoopNest l({d}, {c, d}); + LoopNest l(std::vector({d}), {c, d}); MemDependencyChecker analyzer({a.data(), b.data()}, {d->buf()}); diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index 0d033e0bd8a1f..449edac19823f 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -533,7 +533,7 @@ TEST(Reductions, ReduceAsProducer) { [&](const VarHandle& l, const VarHandle& n) { return c->load(l, n) * a.load(l, n); }); - LoopNest loop({d}, {c, d}); + LoopNest loop(std::vector({d}), {c, d}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); s = IRSimplifier::simplify(s); @@ -578,7 +578,7 @@ TEST(Reductions, ReduceAsConsumer) { return b.load(l, n, m) * a.load(l, n, m); }); Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}}); - LoopNest loop({d}, {c, d}); + LoopNest loop(std::vector({d}), {c, d}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); s = IRSimplifier::simplify(s); @@ -1201,7 +1201,7 @@ TEST(Reductions, ReduceInlineReduction) { } } - LoopNest l1({y}, {x, y}); + LoopNest l1(std::vector({y}), {x, y}); // Cannot inline a reduction computation ASSERT_FALSE(l1.computeInline(x->buf())); } @@ -1235,7 +1235,7 @@ TEST(Reductions, ReduceInlineConsumer) { } } - LoopNest l1({y}, {x, y}); + LoopNest l1(std::vector({y}), {x, y}); LoopNest l2(l1); l2.computeInline(x->buf()); @@ -1293,7 +1293,7 @@ TEST(Reductions, ReduceInlineReducerInternal) { } } - LoopNest l1({y}, {x, y}); + LoopNest l1(std::vector({y}), {x, y}); LoopNest l2(l1); l2.computeInline(x->buf()); @@ -1340,7 +1340,7 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) { return b.load(0, 0, l) * d->load(l); }); - LoopNest l({e}, {c, d, e}); + LoopNest l(std::vector({e}), {c, d, e}); LoopNest l_before(l); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); @@ -1417,7 +1417,7 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { return b.load(0, 0, l) * d->load(l); }); - LoopNest l({e}, {c, d, e}); + LoopNest l(std::vector({e}), {c, d, e}); LoopNest l_before(l); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); @@ -1492,7 +1492,7 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { return b.load(0, 0, l) * d->load(l); }); - LoopNest l({e}, {c, d, e}); + LoopNest l(std::vector({e}), {c, d, e}); LoopNest l_before(l); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); @@ -1563,7 +1563,7 @@ TEST(Reductions, ReductionCacheBodyAccess) { return b.load(0, 0, l) * d->load(l); }); - LoopNest l({e}, {c, d, e}); + LoopNest l(std::vector({e}), {c, d, e}); StmtPtr d_loop = l.getLoopStmtsFor(d)[1]; l.cacheAccesses(c->buf(), "scale_local", d_loop); @@ -1604,7 +1604,7 @@ TEST(Reductions, ReductionCacheConsumerAccess) { return b.load(0, 0, l) * d->load(l); }); - LoopNest l({e}, {c, d, e}); + LoopNest l(std::vector({e}), {c, d, e}); LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4); @@ -1645,7 +1645,7 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { return b.load(0, 0, l) * d->load(l); }); - LoopNest l({e}, {c, d, e}); + LoopNest l(std::vector({e}), {c, d, e}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr inner; @@ -1693,7 +1693,7 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { return b.load(0, 0, l) * d->load(l); }); - LoopNest l({e}, {c, d, e}); + LoopNest l(std::vector({e}), {c, d, e}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr inner; diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp index 9320f47bfb3d8..5a6f257d6a79b 100644 --- a/test/cpp/tensorexpr/tutorial.cpp +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -256,7 +256,9 @@ int main(int argc, char* argv[]) { // Creating a loop nest is as quite simple, we just need to specify a list // of all and a list of output tensors: // NOLINTNEXTLINE(bugprone-argument-comment) - LoopNest loopnest(/*outputs=*/{Y}, /*all=*/{X, Y}); + std::vector outputs = {Y}; + std::vector all = {X, Y}; + LoopNest loopnest(outputs, all); // An IR used in LoopNest is based on tensor statements, represented by // `Stmt` class. Statements are used to specify the loop nest structure, and diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index 2d00b1e4ab481..b342f1464b0c2 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -389,34 +389,33 @@ class AtomicAddFuser : public IRMutator { StmtPtr mutate(StorePtr v) override { BufPtr buf = v->buf(); - StorePtr orig = const_cast(v); // NOLINT // Thread locals never need to be atomic. if (thread_local_bufs_.count(buf->base_handle()) != 0) { - return orig; + return v; } ScalarType dtype = v->value()->dtype().scalar_type(); if (dtype != ScalarType::Float && dtype != ScalarType::Double) { - return orig; + return v; } AddPtr add_v = to(v->value()); if (!add_v) { - return orig; + return v; } LoadPtr load_v = to(add_v->lhs()); if (!load_v) { - return orig; + return v; } if (v->base_handle() != load_v->base_handle()) { - return orig; + return v; } if (v->indices().empty() && load_v->indices().empty()) { - return orig; + return v; } bool index_equal = CheckEqual(v->flat_index(), load_v->flat_index()); if (!index_equal) { - return orig; + return v; } // TODO: this checks that the metavars occur directly as an index, but this @@ -431,7 +430,7 @@ class AtomicAddFuser : public IRMutator { if (vars_to_find.empty()) { // All metavars accounted for. - return orig; + return v; } return alloc(buf, v->indices(), add_v->rhs()); @@ -609,23 +608,21 @@ class PrioritizeLoad : public IRMutator { } StmtPtr mutate(BlockPtr v) override { - BlockPtr v1 = const_cast(v); // NOLINT - assert(v1); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - std::list stmts = v1->stmts(); + std::list stmts = v->stmts(); for (StmtPtr stmt : stmts) { PushList(); StmtPtr stmt_new = stmt->accept_mutator(this); - AddMemLoadsFromList(v1, stmt); + AddMemLoadsFromList(v, stmt); PopList(); if (stmt_new == stmt) { continue; } - v1->replace_stmt(stmt, stmt_new); + v->replace_stmt(stmt, stmt_new); } - return v1; + return v; } ExprPtr mutate(IfThenElsePtr v) override { @@ -821,7 +818,7 @@ StmtPtr GPUMetaVarRewriter::mutate(BlockPtr v) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector stmts; for (auto& v : innerSegments) { - for (auto* s : v.stmts()) { + for (auto s : v.stmts()) { stmts.push_back(s); } } diff --git a/torch/csrc/jit/tensorexpr/half_support.h b/torch/csrc/jit/tensorexpr/half_support.h index 15d48cd8952e0..eaf74d3c79d82 100644 --- a/torch/csrc/jit/tensorexpr/half_support.h +++ b/torch/csrc/jit/tensorexpr/half_support.h @@ -72,7 +72,8 @@ class HalfRewriter : public IRMutator { inserted_half_casts_.insert(new_val); } - return alloc(v->buf(), v->indices(), new_val); + v->set_value(new_val); + return v; } ExprPtr mutate(HalfImmPtr v) override { diff --git a/torch/csrc/jit/tensorexpr/mem_dependency_checker.h b/torch/csrc/jit/tensorexpr/mem_dependency_checker.h index 5363d2fc5ae93..1965b05009125 100644 --- a/torch/csrc/jit/tensorexpr/mem_dependency_checker.h +++ b/torch/csrc/jit/tensorexpr/mem_dependency_checker.h @@ -299,7 +299,7 @@ class TORCH_API MemDependencyChecker : public IRVisitor { DependencySet getAllReadsWithin(StmtOrExprPtr v) { DependencySet reads; auto insertAllReads = [&](const auto& nodes) { - for (auto* l : nodes) { + for (auto l : nodes) { auto bound = exprToAccess_.equal_range(l); for (auto it = bound.first; it != bound.second; ++it) { if (it->second->isRead()) { @@ -324,7 +324,7 @@ class TORCH_API MemDependencyChecker : public IRVisitor { // writes just Store currently. auto stores = NodeFinder::find(v); - for (auto* s : stores) { + for (auto s : stores) { auto bound = stmtToAccess_.equal_range(s); for (auto it = bound.first; it != bound.second; ++it) { if (it->second->isWrite()) { diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.h b/torch/csrc/jit/tensorexpr/operators/reduction.h index 29f051f323b28..4335d7b3bd7f7 100644 --- a/torch/csrc/jit/tensorexpr/operators/reduction.h +++ b/torch/csrc/jit/tensorexpr/operators/reduction.h @@ -6,14 +6,14 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeSum( +TORCH_API Tensor* computeSum( const std::vector& inputs, const c10::optional& outputType); -Tensor* computeMean( +TORCH_API Tensor* computeMean( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); -Tensor* computeAdaptiveAvgPool2d( +TORCH_API Tensor* computeAdaptiveAvgPool2d( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); From 4e15a6f495ac7f42927a175261238b91632e8494 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Tue, 24 Aug 2021 00:29:22 -0700 Subject: [PATCH 158/530] [TensorExpr] Switch Exprs and Stmt from kernel-arena to shared_ptr. (#63216) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63216 Currently there are three classes managed by KernelArena: Expr, Stmt, and Tensor (and derived classes). KernelArena has been a long standing painpoint for NNC devs and we're moving away from that memory management model to ref-count based memory model (using shared_ptr). This commit switches Expr and Stmt to shared_ptr and is the biggest change in this transition. Later commits will detach Tensor from KernelArena and kill the arena + scope altogether. Differential Revision: D30353195 D30353195 Test Plan: Imported from OSS Reviewed By: navahgar Pulled By: ZolotukhinM fbshipit-source-id: 9575225ada3d0fb65087ae40435f3dfea4792cae --- test/test_tensorexpr_pybind.py | 3 - torch/csrc/jit/tensorexpr/eval.cpp | 8 +- torch/csrc/jit/tensorexpr/expr.h | 12 ++- torch/csrc/jit/tensorexpr/fwd_decls.h | 9 +- torch/csrc/jit/tensorexpr/hash_provider.cpp | 7 ++ torch/csrc/jit/tensorexpr/hash_provider.h | 23 ++++-- torch/csrc/jit/tensorexpr/ir.h | 6 ++ torch/csrc/jit/tensorexpr/ir_cloner.cpp | 8 +- torch/csrc/jit/tensorexpr/ir_mutator.cpp | 8 +- torch/csrc/jit/tensorexpr/ir_printer.cpp | 8 +- torch/csrc/jit/tensorexpr/ir_simplifier.cpp | 81 ++++++++++++++++++ torch/csrc/jit/tensorexpr/ir_simplifier.h | 82 ++----------------- torch/csrc/jit/tensorexpr/ir_verifier.cpp | 15 +++- torch/csrc/jit/tensorexpr/ir_visitor.cpp | 8 +- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 5 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 2 +- torch/csrc/jit/tensorexpr/stmt.h | 20 +++-- torch/csrc/jit/tensorexpr/tensorexpr_init.cpp | 55 +++++-------- 18 files changed, 213 insertions(+), 147 deletions(-) diff --git a/test/test_tensorexpr_pybind.py b/test/test_tensorexpr_pybind.py index d838892975c0c..0ae59e1c56484 100644 --- a/test/test_tensorexpr_pybind.py +++ b/test/test_tensorexpr_pybind.py @@ -394,9 +394,6 @@ def f(a): np.testing.assert_allclose(res1.numpy(), correct.numpy(), atol=2e-3) np.testing.assert_allclose(res2.numpy(), correct.numpy(), atol=2e-3) - def test_forgot_kernel_arena(self): - self.assertRaises(RuntimeError, lambda: torch._C._te.VarHandle("n", torch._C._te.Dtype.Int)) - @unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled") def test_alloc_in_loop(self): with kernel_arena_scope(): diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp index c7a28bdbb23ac..05c3ff8245221 100644 --- a/torch/csrc/jit/tensorexpr/eval.cpp +++ b/torch/csrc/jit/tensorexpr/eval.cpp @@ -281,8 +281,12 @@ class SimpleIREvaluatorImpl : public IRVisitor { return Value(result_v); } - template - void visit_binary_op(BinaryOpNode* v, bool option = false) { + template < + typename D, + typename std::enable_if())), + void>::value>::type* = nullptr> + void visit_binary_op(NodePtr v, bool option = false) { v->lhs()->accept(this); Value lhs_v = value_; v->rhs()->accept(this); diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index fae24ec34be28..1b942eaf353fc 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -36,10 +36,11 @@ enum IRNodeType { }; // The common base between all expression node. -class TORCH_API Expr : public KernelScopedObject { +class TORCH_API Expr : public std::enable_shared_from_this { public: explicit Expr(Dtype dtype, IRNodeType expr_type = kOther) : dtype_(dtype), expr_type_(expr_type) {} + virtual ~Expr() = default; Dtype dtype() const { return dtype_; } @@ -66,6 +67,11 @@ class TORCH_API Expr : public KernelScopedObject { */ static ExprPtr clone(ExprPtr s); + protected: + std::shared_ptr getptr() { + return shared_from_this(); + } + private: Dtype dtype_; IRNodeType expr_type_; @@ -78,7 +84,7 @@ class ExprNode : public Base { public: using ExprNodeBase = ExprNode; void accept(IRVisitor* visitor) override { - visitor->visit(static_to(this)); + visitor->visit(static_to(Base::getptr())); } ExprPtr accept_mutator(IRMutator* mutator) override; // pass the constructor to the base class @@ -335,7 +341,7 @@ class TORCH_API VarHandle : public ExprHandle { template ExprPtr ExprNode::accept_mutator(IRMutator* mutator) { - return mutator->mutate(static_to(this)); + return mutator->mutate(static_to(Base::getptr())); } inline bool same_node(const ExprHandle& expr1, const ExprHandle& expr2) { diff --git a/torch/csrc/jit/tensorexpr/fwd_decls.h b/torch/csrc/jit/tensorexpr/fwd_decls.h index 01a767067f620..1b3dde560b427 100644 --- a/torch/csrc/jit/tensorexpr/fwd_decls.h +++ b/torch/csrc/jit/tensorexpr/fwd_decls.h @@ -1,26 +1,27 @@ #pragma once #include +#include namespace torch { namespace jit { namespace tensorexpr { template -using NodePtr = Node*; +using NodePtr = std::shared_ptr; template NodePtr to(NodePtr x) { - return dynamic_cast>(x); + return std::dynamic_pointer_cast(x); } template NodePtr static_to(NodePtr x) { - return static_cast>(x); + return std::static_pointer_cast(x); } template NodePtr alloc(Args&&... args) { - return new Node(std::forward(args)...); + return std::make_shared(std::forward(args)...); } class Buf; diff --git a/torch/csrc/jit/tensorexpr/hash_provider.cpp b/torch/csrc/jit/tensorexpr/hash_provider.cpp index fbc257d1988df..dce25669bf323 100644 --- a/torch/csrc/jit/tensorexpr/hash_provider.cpp +++ b/torch/csrc/jit/tensorexpr/hash_provider.cpp @@ -63,6 +63,13 @@ void HashProvider::visit(ModPtr v) { putHash(v, hash_combine(hashOf(v->lhs()), "%", hashOf(v->rhs()))); } +void HashProvider::visit(RoundOffPtr v) { + CACHE_GUARD(); + v->lhs()->accept(this); + v->rhs()->accept(this); + putHash(v, hash_combine(hashOf(v->lhs()), "rof", hashOf(v->rhs()))); +} + void HashProvider::visit(MaxPtr v) { CACHE_GUARD(); v->lhs()->accept(this); diff --git a/torch/csrc/jit/tensorexpr/hash_provider.h b/torch/csrc/jit/tensorexpr/hash_provider.h index 5a33f048fec84..91ce269edeb5c 100644 --- a/torch/csrc/jit/tensorexpr/hash_provider.h +++ b/torch/csrc/jit/tensorexpr/hash_provider.h @@ -59,12 +59,16 @@ class TORCH_API HashProvider : public IRVisitor { return hashOf(e); } - bool cachedHash(const KernelScopedObject* e) { + bool cachedHash(ExprPtr e) { return exprToHash_.find(e) != exprToHash_.end(); } + bool cachedHash(StmtPtr s) { + return stmtToHash_.find(s) != stmtToHash_.end(); + } void clearCache() { exprToHash_.clear(); + stmtToHash_.clear(); } void visit(AddPtr v) override; @@ -72,6 +76,7 @@ class TORCH_API HashProvider : public IRVisitor { void visit(MulPtr v) override; void visit(DivPtr v) override; void visit(ModPtr v) override; + void visit(RoundOffPtr v) override; void visit(MaxPtr v) override; void visit(MinPtr v) override; void visit(AndPtr v) override; @@ -133,8 +138,8 @@ class TORCH_API HashProvider : public IRVisitor { } SimplifierHashType hashOf(StmtPtr s) { - auto it = exprToHash_.find(s); - if (it != exprToHash_.end()) { + auto it = stmtToHash_.find(s); + if (it != stmtToHash_.end()) { return it->second; } @@ -182,15 +187,23 @@ class TORCH_API HashProvider : public IRVisitor { _hash_combine(seed, args...); } - void putHash(const KernelScopedObject* e, SimplifierHashType h) { + void putHash(ExprPtr e, SimplifierHashType h) { auto res = exprToHash_.emplace(e, h); if (res.second == false) { // This is always a logic bug since we should check the cache first. throw std::runtime_error("hash collision"); } } + void putHash(StmtPtr s, SimplifierHashType h) { + auto res = stmtToHash_.emplace(s, h); + if (res.second == false) { + // This is always a logic bug since we should check the cache first. + throw std::runtime_error("hash collision"); + } + } - std::unordered_map exprToHash_; + std::unordered_map exprToHash_; + std::unordered_map stmtToHash_; UniqueNameManager name_manager_; size_t te_hash(SimplifierHashType val) { diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h index 761b233fe8375..f9fc7dcfc4246 100644 --- a/torch/csrc/jit/tensorexpr/ir.h +++ b/torch/csrc/jit/tensorexpr/ir.h @@ -178,6 +178,12 @@ class BinaryOpNode : public ExprNode { ExprPtr rhs_; }; +namespace detail { +template +void bin_op_deducer(BinaryOpNode); +bool bin_op_deducer(...); +} // namespace detail + class TORCH_API Add : public BinaryOpNode { public: Add(ExprPtr lhs, ExprPtr rhs) : BinaryOpNode(lhs, rhs, IRNodeType::kAdd) {} diff --git a/torch/csrc/jit/tensorexpr/ir_cloner.cpp b/torch/csrc/jit/tensorexpr/ir_cloner.cpp index f724f2cbeb16f..e225826df66e2 100644 --- a/torch/csrc/jit/tensorexpr/ir_cloner.cpp +++ b/torch/csrc/jit/tensorexpr/ir_cloner.cpp @@ -10,9 +10,13 @@ namespace torch { namespace jit { namespace tensorexpr { -template +template < + typename Op, + typename std::enable_if())), + void>::value>::type* = nullptr> static ExprPtr mutate_binary_op( - NodePtr> v, + NodePtr v, IRCloner* cloner, bool option = false) { ExprPtr lhs_new = v->lhs()->accept_mutator(cloner); diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp index 96635acab8c90..45121581eebf0 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.cpp +++ b/torch/csrc/jit/tensorexpr/ir_mutator.cpp @@ -11,9 +11,13 @@ namespace torch { namespace jit { namespace tensorexpr { -template +template < + typename Op, + typename std::enable_if())), + void>::value>::type* = nullptr> static ExprPtr mutate_binary_op( - BinaryOpNode* v, + NodePtr v, IRMutator* mutator, bool option = false) { ExprPtr lhs = v->lhs(); diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index 23466f39160c8..f885246e24d2b 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -28,9 +28,13 @@ void IRPrinter::print(Stmt& stmt) { // TODO: change whether to include the parenthesis to the parent expression, // we need to look at the operator precedence to make the output simpler. -template +template < + typename Op, + typename std::enable_if())), + void>::value>::type* = nullptr> void visitBinaryOp( - BinaryOpNode* v, + NodePtr v, const std::string& op_str, IRPrinter* printer, bool parens = true) { diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp index cb731d2525e71..23216dd4002f7 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp @@ -6,6 +6,70 @@ namespace torch { namespace jit { namespace tensorexpr { +// Creates a new Expr of the given type with the provided lhs and rhs. +inline ExprPtr newBinaryOpOfType( + IRNodeType expr_type, + ExprPtr lhs, + ExprPtr rhs, + bool option) { + switch (expr_type) { + // NOLINTNEXTLINE(bugprone-branch-clone) + case IRNodeType::kAdd: + return alloc(lhs, rhs); + case IRNodeType::kSub: + return alloc(lhs, rhs); + case IRNodeType::kMul: + return alloc(lhs, rhs); + case IRNodeType::kDiv: + return alloc
(lhs, rhs); + case IRNodeType::kMod: + return alloc(lhs, rhs); + case IRNodeType::kMax: + return alloc(lhs, rhs, option); + case IRNodeType::kMin: + return alloc(lhs, rhs, option); + case IRNodeType::kAnd: + return alloc(lhs, rhs); + case IRNodeType::kXor: + return alloc(lhs, rhs); + case IRNodeType::kLshift: + return alloc(lhs, rhs); + case IRNodeType::kRshift: + return alloc(lhs, rhs); + default: + LOG(FATAL) << "unsupported expr_type: " << static_cast(expr_type); + return nullptr; + } +} + +template < + typename Op, + typename std::enable_if())), + void>::value>::type* = nullptr> +static ExprPtr mutateBinaryOp( + NodePtr v, + IRMutator* mutator, + bool option = false) { + ExprPtr lhs = v->lhs(); + ExprPtr rhs = v->rhs(); + ExprPtr lhs_new = lhs->accept_mutator(mutator); + ExprPtr rhs_new = rhs->accept_mutator(mutator); + + ExprPtr node = v; + + if (lhs != lhs_new || rhs != rhs_new) { + node = newBinaryOpOfType(v->expr_type(), lhs_new, rhs_new, option); + } + + // Can only fold if both sides are constant. + if (!lhs_new->isConstant() || !rhs_new->isConstant()) { + return node; + } + + return evaluateOp(node); +} + // Simple recursive GCD. template T gcd(T a, T b) { @@ -1499,6 +1563,22 @@ ExprPtr PolynomialTransformer::mutate(IfThenElsePtr v) { return alloc(condition_new, true_value_new, false_value_new); } +ExprPtr PolynomialTransformer::mutate(AndPtr v) { + return mutateBinaryOp(v, this); +} + +ExprPtr PolynomialTransformer::mutate(XorPtr v) { + return mutateBinaryOp(v, this); +} + +ExprPtr PolynomialTransformer::mutate(LshiftPtr v) { + return mutateBinaryOp(v, this); +} + +ExprPtr PolynomialTransformer::mutate(RshiftPtr v) { + return mutateBinaryOp(v, this); +} + StmtPtr PolynomialBase::mutate(CondPtr v) { ExprPtr cond_old = v->condition(); StmtPtr true_old = v->true_stmt(); @@ -1904,6 +1984,7 @@ c10::optional isModRound(TermPtr e) { scalar = getImmediateByType(multiplier->dtype(), 1); } + // TODO: this leaks memory! return new ModRound(scalar, denom, divisor, mod_divisor); } diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.h b/torch/csrc/jit/tensorexpr/ir_simplifier.h index 87c476242e8de..1df8b5d8f3501 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.h +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.h @@ -55,7 +55,7 @@ Dtype promoteTypesVec(std::vector& v) { template Dtype promoteTypesMap( ExprPtr s, - std::unordered_map& m) { + std::unordered_map& m) { Dtype t = s->dtype(); bool first = true; for (auto& e : m) { @@ -69,12 +69,12 @@ Dtype promoteTypesMap( } template -Dtype promoteTypesVar(ExprType* e) { +Dtype promoteTypesVar(ExprType e) { return e->dtype(); } template -Dtype promoteTypesVar(ExprType* e, Args... es) { +Dtype promoteTypesVar(ExprType e, Args... es) { Dtype lhs = e->dtype(); Dtype rhs = promoteTypesVar(es...); if (e->isConstant()) { @@ -84,42 +84,6 @@ Dtype promoteTypesVar(ExprType* e, Args... es) { return promoteTypes(lhs, rhs); } -// Creates a new Expr of the given type with the provided lhs and rhs. -inline ExprPtr newBinaryOpOfType( - IRNodeType expr_type, - ExprPtr lhs, - ExprPtr rhs, - bool option) { - switch (expr_type) { - // NOLINTNEXTLINE(bugprone-branch-clone) - case IRNodeType::kAdd: - return alloc(lhs, rhs); - case IRNodeType::kSub: - return alloc(lhs, rhs); - case IRNodeType::kMul: - return alloc(lhs, rhs); - case IRNodeType::kDiv: - return alloc
(lhs, rhs); - case IRNodeType::kMod: - return alloc(lhs, rhs); - case IRNodeType::kMax: - return alloc(lhs, rhs, option); - case IRNodeType::kMin: - return alloc(lhs, rhs, option); - case IRNodeType::kAnd: - return alloc(lhs, rhs); - case IRNodeType::kXor: - return alloc(lhs, rhs); - case IRNodeType::kLshift: - return alloc(lhs, rhs); - case IRNodeType::kRshift: - return alloc(lhs, rhs); - default: - LOG(FATAL) << "unsupported expr_type: " << static_cast(expr_type); - return nullptr; - } -} - // Uses the evaluator to fold an Expression with constant terms. // E.g. evaluateOp(Add(3, 4)) => 7. // Expr v must not have any unbound Vars. @@ -498,21 +462,13 @@ class TORCH_API PolynomialTransformer : public PolynomialBase { ExprPtr mutate(ModPtr v) override; - ExprPtr mutate(AndPtr v) override { - return mutateBinaryOp(v, this); - } + ExprPtr mutate(AndPtr v) override; - ExprPtr mutate(XorPtr v) override { - return mutateBinaryOp(v, this); - } + ExprPtr mutate(XorPtr v) override; - ExprPtr mutate(LshiftPtr v) override { - return mutateBinaryOp(v, this); - } + ExprPtr mutate(LshiftPtr v) override; - ExprPtr mutate(RshiftPtr v) override { - return mutateBinaryOp(v, this); - } + ExprPtr mutate(RshiftPtr v) override; ExprPtr mutate(MaxPtr v) override; @@ -526,30 +482,6 @@ class TORCH_API PolynomialTransformer : public PolynomialBase { ExprPtr mutate(IfThenElsePtr v) override; - template - static ExprPtr mutateBinaryOp( - BinaryOpNode* v, - IRMutator* mutator, - bool option = false) { - ExprPtr lhs = v->lhs(); - ExprPtr rhs = v->rhs(); - ExprPtr lhs_new = lhs->accept_mutator(mutator); - ExprPtr rhs_new = rhs->accept_mutator(mutator); - - ExprPtr node = v; - - if (lhs != lhs_new || rhs != rhs_new) { - node = newBinaryOpOfType(v->expr_type(), lhs_new, rhs_new, option); - } - - // Can only fold if both sides are constant. - if (!lhs_new->isConstant() || !rhs_new->isConstant()) { - return node; - } - - return evaluateOp(node); - } - static ExprPtr simplify(ExprPtr e); static ExprHandle simplify(const ExprHandle& e); static StmtPtr simplify(StmtPtr e); diff --git a/torch/csrc/jit/tensorexpr/ir_verifier.cpp b/torch/csrc/jit/tensorexpr/ir_verifier.cpp index c88e92c9a7a82..f7adbdee93992 100644 --- a/torch/csrc/jit/tensorexpr/ir_verifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_verifier.cpp @@ -9,8 +9,19 @@ namespace torch { namespace jit { namespace tensorexpr { -template -void verifyBitwiseOp(const BitwiseOpNode* v, IRVerifier* verifier) { +namespace detail { +template +void deducer(BinaryOpNode); + +bool deducer(...); +} // namespace detail + +template < + typename D, + typename std::enable_if())), + void>::value>::type* = nullptr> +void verifyBitwiseOp(NodePtr v, IRVerifier* verifier) { if (!v->lhs()->dtype().is_integral()) { throw unsupported_dtype(); } diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.cpp b/torch/csrc/jit/tensorexpr/ir_visitor.cpp index 9066544bd2291..eb2a4280c4f88 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.cpp +++ b/torch/csrc/jit/tensorexpr/ir_visitor.cpp @@ -11,8 +11,12 @@ namespace torch { namespace jit { namespace tensorexpr { -template -static void visit_binary_op(BinaryOpNode* v, IRVisitor* visitor) { +template < + typename Op, + typename std::enable_if())), + void>::value>::type* = nullptr> +static void visit_binary_op(NodePtr v, IRVisitor* visitor) { v->lhs()->accept(visitor); v->rhs()->accept(visitor); } diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index eac1f82f25c4b..4ab2d53cc4942 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -488,12 +488,13 @@ class LLVMIntrinsicsExpander : public GenericIntrinsicsExpander { if (v->op_type() == kTanh) { ScalarType stype = v->dtype().scalar_type(); if (stype == ScalarType::Float) { - return fast_tanh(v->param(0)->accept_mutator(this)).node(); + return fast_tanh(ExprHandle(v->param(0)->accept_mutator(this))).node(); } } else if (v->op_type() == kSigmoid) { ScalarType stype = v->dtype().scalar_type(); if (stype == ScalarType::Float) { - return fast_sigmoid(v->param(0)->accept_mutator(this)).node(); + return fast_sigmoid(ExprHandle(v->param(0)->accept_mutator(this))) + .node(); } } // TODO: fast exp diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index a296d8c7af79b..d9d20736057fb 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -2380,7 +2380,7 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) { void LoopNest::compressAllBuffers(StmtPtr stmt) { for (auto buf : BufFinder::find(stmt)) { - compressBuffer(const_cast(buf), stmt); + compressBuffer(buf, stmt); } } diff --git a/torch/csrc/jit/tensorexpr/stmt.h b/torch/csrc/jit/tensorexpr/stmt.h index 0b4a2e4c5361c..7e4914fbc4aa7 100644 --- a/torch/csrc/jit/tensorexpr/stmt.h +++ b/torch/csrc/jit/tensorexpr/stmt.h @@ -14,14 +14,15 @@ namespace tensorexpr { class Placeholder; // The common base between all statement node. -class TORCH_API Stmt : public KernelScopedObject { +class TORCH_API Stmt : public std::enable_shared_from_this { public: Stmt() = default; + virtual ~Stmt() = default; virtual void accept(IRVisitor* visitor) = 0; virtual StmtPtr accept_mutator(IRMutator* mutator) = 0; StmtPtr get_parent() const { - return parent_; + return parent_ ? parent_->getptr() : nullptr; } /* @@ -34,12 +35,15 @@ class TORCH_API Stmt : public KernelScopedObject { static StmtPtr clone(StmtPtr s); protected: - static void set_parent(StmtPtr s, StmtPtr new_parent) { + static void set_parent(StmtPtr s, Stmt* new_parent) { s->parent_ = new_parent; } + std::shared_ptr getptr() { + return shared_from_this(); + } private: - StmtPtr parent_ = nullptr; + Stmt* parent_ = nullptr; }; template @@ -47,7 +51,7 @@ class StmtNode : public Stmt { public: using StmtNodeBase = StmtNode; void accept(IRVisitor* visitor) override { - visitor->visit(static_to(this)); + visitor->visit(static_to(getptr())); } StmtPtr accept_mutator(IRMutator* mutator) override; StmtNode() = default; @@ -55,7 +59,7 @@ class StmtNode : public Stmt { template StmtPtr StmtNode::accept_mutator(IRMutator* mutator) { - return mutator->mutate(static_to(this)); + return mutator->mutate(static_to(getptr())); } // Concrete Stmt classes @@ -193,7 +197,7 @@ class TORCH_API Block : public StmtNode { } void clear() { - for (auto* s : stmts_) { + for (auto s : stmts_) { set_parent(s, nullptr); } stmts_.clear(); @@ -281,7 +285,7 @@ class TORCH_API Block : public StmtNode { // returns the immediate child containing statement s. StmtPtr getEnclosedRoot(StmtPtr s) const { - while (s && s->get_parent() != this) { + while (s && s->get_parent().get() != this) { s = s->get_parent(); } return s; diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp index 304a317076c05..4e1618a8745d7 100644 --- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp +++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp @@ -184,10 +184,7 @@ void initTensorExprBindings(PyObject* module) { [](Placeholder& self, const std::vector& args, const ExprHandle& val) { return self.store(args, val); }) - .def( - "data", - [](Placeholder& self) { return BufHandle(self.data()); }, - py::return_value_policy::reference); + .def("data", [](Placeholder& self) { return BufHandle(self.data()); }); py::class_>(te, "Tensor") .def(py::init( [](BufHandle& b, StmtPtr s) { return new Tensor(b.node(), s); })) @@ -197,8 +194,9 @@ void initTensorExprBindings(PyObject* module) { return self.load(v); }) .def("buf", [](Tensor& self) { return BufHandle(self.buf()); }) - .def("stmt", &Tensor::stmt, py::return_value_policy::reference); - py::class_(te, "Cast").def_static("make", &Cast::make); + .def("stmt", &Tensor::stmt); + py::class_>(te, "Cast") + .def_static("make", &Cast::make); py::class_(te, "DimArg") .def(py::init()) @@ -321,7 +319,7 @@ void initTensorExprBindings(PyObject* module) { }, py::return_value_policy::reference); - py::class_>(te, "Stmt") + py::class_>(te, "Stmt") .def(py::init([](const std::vector& stmts) { return tensorexpr::Block::make(stmts); })) @@ -330,22 +328,18 @@ void initTensorExprBindings(PyObject* module) { ss << self; return ss.str(); }); - py::class_>(te, "Store") + py::class_>(te, "Store") .def_static( "make", [](const BufHandle& buf, std::vector& indices, const ExprHandle& value) { return Store::make(buf, indices, value); - }, - py::return_value_policy::reference); + }); - py::class_>(te, "For") - .def( - "index_var", - [](For& self) { return VarHandle(self.var()); }, - py::return_value_policy::reference) - .def("body", &For::body, py::return_value_policy::reference) + py::class_>(te, "For") + .def("index_var", [](For& self) { return VarHandle(self.var()); }) + .def("body", &For::body) .def("set_parallel", &For::set_parallel) .def( "set_gpu_block_index", @@ -362,35 +356,28 @@ void initTensorExprBindings(PyObject* module) { [](const VarHandle& var, const ExprHandle& start, const ExprHandle& stop, - StmtPtr body) { return For::make(var, start, stop, body); }, - py::return_value_policy::reference); + StmtPtr body) { return For::make(var, start, stop, body); }); - py::class_>(te, "Cond") + py::class_>(te, "Cond") .def_static( "make", [](const ExprHandle& condition, StmtPtr true_stmt, StmtPtr false_stmt) { - return alloc(condition.node(), true_stmt, false_stmt); - }, - py::return_value_policy::reference) - .def("true_stmt", &Cond::true_stmt, py::return_value_policy::reference) - .def("false_stmt", &Cond::false_stmt, py::return_value_policy::reference); + return Cond::make(condition, true_stmt, false_stmt); + }) + .def("true_stmt", &Cond::true_stmt) + .def("false_stmt", &Cond::false_stmt); - py::class_< - tensorexpr::Block, - Stmt, - std::unique_ptr>(te, "Block") + py::class_>( + te, "Block") .def(py::init([](const std::vector& stmts) { return tensorexpr::Block::make(stmts); })) - .def( - "stmts", - &tensorexpr::Block::stmts, - py::return_value_policy::reference); - py::class_>( + .def("stmts", &tensorexpr::Block::stmts); + py::class_>( te, "ExternalCall") - .def(py::init(&ExternalCall::make), py::return_value_policy::reference); + .def(py::init(&ExternalCall::make)); py::class_(te, "LoopNest") .def(py::init&>()) From 62d02f2b577c223f94d0b190df3f158cd985c221 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Tue, 24 Aug 2021 00:29:22 -0700 Subject: [PATCH 159/530] [TensorExpr] Make 'Tensor' a value type. (#63586) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63586 This is another commit in transition from KernelArena memory management. Tensor is essentially just a pair of and we don't need to dynamically allocate it at all - it's cheap to pass it by value, and that's what we're switching to in this commit. After this change nothing uses KernelScope/KernelArena and they can be safely removed. Differential Revision: D30429114 D30429114 Test Plan: Imported from OSS Reviewed By: navahgar Pulled By: ZolotukhinM fbshipit-source-id: f90b859cfe863692b7beffbe9bd0e4143df1e819 --- benchmarks/cpp/tensorexpr/bench_approx.cpp | 20 +- benchmarks/cpp/tensorexpr/bench_batchnorm.cpp | 4 +- benchmarks/cpp/tensorexpr/bench_compile.cpp | 40 +- benchmarks/cpp/tensorexpr/bench_concat.cpp | 4 +- benchmarks/cpp/tensorexpr/bench_gemm.cpp | 12 +- benchmarks/cpp/tensorexpr/bench_parallel.cpp | 2 +- benchmarks/cpp/tensorexpr/bench_reduce.cpp | 16 +- test/cpp/tensorexpr/test_approx.cpp | 4 +- test/cpp/tensorexpr/test_boundsinference.cpp | 190 +++--- test/cpp/tensorexpr/test_conv.cpp | 8 +- test/cpp/tensorexpr/test_cuda.cpp | 68 +- test/cpp/tensorexpr/test_external_calls.cpp | 48 +- test/cpp/tensorexpr/test_ir_printer.cpp | 14 +- test/cpp/tensorexpr/test_kernel.cpp | 2 +- test/cpp/tensorexpr/test_llvm.cpp | 64 +- test/cpp/tensorexpr/test_loopnest.cpp | 645 +++++++++--------- test/cpp/tensorexpr/test_memdependency.cpp | 68 +- test/cpp/tensorexpr/test_ops.cpp | 4 +- test/cpp/tensorexpr/test_reductions.cpp | 230 +++---- test/cpp/tensorexpr/test_simplify.cpp | 2 +- test/cpp/tensorexpr/tutorial.cpp | 30 +- torch/csrc/jit/runtime/static/ops.cpp | 14 +- torch/csrc/jit/tensorexpr/codegen.h | 2 +- torch/csrc/jit/tensorexpr/ir_printer.cpp | 17 +- torch/csrc/jit/tensorexpr/ir_printer.h | 4 +- torch/csrc/jit/tensorexpr/kernel.cpp | 62 +- torch/csrc/jit/tensorexpr/kernel.h | 10 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 22 +- torch/csrc/jit/tensorexpr/loopnest.h | 14 +- .../csrc/jit/tensorexpr/operators/conv2d.cpp | 18 +- torch/csrc/jit/tensorexpr/operators/conv2d.h | 8 +- .../csrc/jit/tensorexpr/operators/matmul.cpp | 8 +- torch/csrc/jit/tensorexpr/operators/matmul.h | 4 +- torch/csrc/jit/tensorexpr/operators/norm.cpp | 2 +- torch/csrc/jit/tensorexpr/operators/norm.h | 2 +- .../jit/tensorexpr/operators/reduction.cpp | 10 +- .../csrc/jit/tensorexpr/operators/reduction.h | 6 +- .../csrc/jit/tensorexpr/operators/softmax.cpp | 31 +- torch/csrc/jit/tensorexpr/operators/softmax.h | 2 +- torch/csrc/jit/tensorexpr/tensor.cpp | 30 +- torch/csrc/jit/tensorexpr/tensor.h | 39 +- torch/csrc/jit/tensorexpr/tensorexpr_init.cpp | 20 +- 42 files changed, 881 insertions(+), 919 deletions(-) diff --git a/benchmarks/cpp/tensorexpr/bench_approx.cpp b/benchmarks/cpp/tensorexpr/bench_approx.cpp index 6e31697d586dd..92c26401f1617 100644 --- a/benchmarks/cpp/tensorexpr/bench_approx.cpp +++ b/benchmarks/cpp/tensorexpr/bench_approx.cpp @@ -10,14 +10,14 @@ using namespace torch::jit; using namespace torch::jit::tensorexpr; -void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target, int width) { +void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor target, int width) { auto loops = ln->getLoopStmtsFor(target); ForPtr inner, tail; ln->splitWithTail(loops[0], width, &inner, &tail); ln->vectorize(inner); } -void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target) { +void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor target) { std::vector loops = ln->getLoopStmtsFor(target); ForPtr inner, tail; ln->splitWithTail(loops[0], 16 * 8, &inner, &tail); @@ -33,7 +33,7 @@ static void relu_nnc(benchmark::State& state) { auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 0; - torch::jit::tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i){ + torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i){ auto A_elem = [&]() { auto elem = A.load(i); auto min = FloatImm::make(clamp); @@ -67,7 +67,7 @@ static void log_nnc_sleef(benchmark::State& state) { KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - torch::jit::tensorexpr::Tensor* B = + torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return log(A.load(i)); }); @@ -97,7 +97,7 @@ static void log_nnc_fast(benchmark::State& state) { KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - torch::jit::tensorexpr::Tensor* B = + torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return fast_log(A.load(i)); }); @@ -127,7 +127,7 @@ static void log_nnc_vml(benchmark::State& state) { KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - torch::jit::tensorexpr::Tensor* B = + torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return log_vml(A.load(i)); }); @@ -168,7 +168,7 @@ static void logit_nnc_sleef(benchmark::State& state) { auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 1e-6f; - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto A_elem = [&]() { auto elem = A.load(i); auto min = FloatImm::make(clamp); @@ -205,7 +205,7 @@ static void logit_nnc_fast(benchmark::State& state) { auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 1e-6f; - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto A_elem = [&]() { auto elem = A.load(i); auto min = FloatImm::make(clamp); @@ -242,7 +242,7 @@ static void logit_nnc_vml(benchmark::State& state) { auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 1e-6f; - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto A_elem = [&]() { auto elem = A.load(i); auto min = FloatImm::make(clamp); @@ -319,7 +319,7 @@ static void tanh_nnc_fast(benchmark::State& state) { KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - torch::jit::tensorexpr::Tensor* B = + torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return fast_tanh(A.load(i)); }); diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp index 872594ec286b7..85bf9d326ffac 100644 --- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp @@ -84,7 +84,7 @@ BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) { VarHandle eps("eps", kFloat); using axis = const VarHandle&; - Tensor* output = Compute( + Tensor output = Compute( "output", {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}}, [&](axis n, axis c, axis h, axis w) { @@ -147,7 +147,7 @@ BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) { VarHandle eps("eps", kFloat); using axis = const VarHandle&; - Tensor* output = Compute( + Tensor output = Compute( "output", {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}}, [&](axis n, axis c, axis h, axis w) { diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp index 245d5d8b203c5..50d54e57f889a 100644 --- a/benchmarks/cpp/tensorexpr/bench_compile.cpp +++ b/benchmarks/cpp/tensorexpr/bench_compile.cpp @@ -13,24 +13,24 @@ static void BM_CompileSwish(benchmark::State& state) { te::KernelScope ks; te::VarHandle n("n", te::kInt); te::Placeholder A(te::BufHandle("A", {N}, te::kFloat)); - te::Tensor* relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { + te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { return te::Max::make(A.load(i), 0.f, false); }); - te::Tensor* min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Min::make(relu->load(i), 6.f, false); + te::Tensor min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { + return te::Min::make(relu.load(i), 6.f, false); }); - te::Tensor* plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { - return min6->load(i) + 3.f; + te::Tensor plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { + return min6.load(i) + 3.f; }); - te::Tensor* times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { - return A.load(i) * plus3->load(i); + te::Tensor times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { + return A.load(i) * plus3.load(i); }); - te::Tensor* sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { - return times->load(i) * 1.f / 6.f; + te::Tensor sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { + return times.load(i) * 1.f / 6.f; }); te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth}); for (auto tensor : {relu, min6, plus3, times}) { - nest.computeInline(tensor->buf()); + nest.computeInline(tensor.buf()); } nest.prepareForCodegen(); te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt()); @@ -43,24 +43,24 @@ static void BM_CompileSwishLLVMOnly(benchmark::State& state) { te::KernelScope ks; te::VarHandle n("n", te::kInt); te::Placeholder A(te::BufHandle("A", {N}, te::kFloat)); - te::Tensor* relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { + te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { return te::Max::make(A.load(i), 0.f, false); }); - te::Tensor* min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Min::make(relu->load(i), 6.f, false); + te::Tensor min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { + return te::Min::make(relu.load(i), 6.f, false); }); - te::Tensor* plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { - return min6->load(i) + 3.f; + te::Tensor plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { + return min6.load(i) + 3.f; }); - te::Tensor* times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { - return A.load(i) * plus3->load(i); + te::Tensor times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { + return A.load(i) * plus3.load(i); }); - te::Tensor* sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { - return times->load(i) * 1.f / 6.f; + te::Tensor sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { + return times.load(i) * 1.f / 6.f; }); te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth}); for (auto tensor : {relu, min6, plus3, times}) { - nest.computeInline(tensor->buf()); + nest.computeInline(tensor.buf()); } nest.prepareForCodegen(); te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt()); diff --git a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp index cb9aa84150e88..856065d6e789f 100644 --- a/benchmarks/cpp/tensorexpr/bench_concat.cpp +++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp @@ -60,7 +60,7 @@ class ConcatBench : public benchmark::Fixture { {input_sizes_[i][0], input_sizes_[i][1]})); } - Tensor* output = Compute( + Tensor output = Compute( "aten_cat", {{output_size_[0], "M"}, {output_size_[1], "N"}}, [&](const VarHandle& m, const VarHandle& n) { @@ -147,7 +147,7 @@ class ConcatBench : public benchmark::Fixture { for_stmts[i] = for_st; cumulative_input_sizes += input_sizes_[i][1]; } - auto output = new Tensor(output_buf, alloc(for_stmts)); + auto output = Tensor(output_buf, alloc(for_stmts)); LoopNest nest({output}); nest.prepareForCodegen(); diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp index 7ebaa87781514..8646e97b756c2 100644 --- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_gemm.cpp @@ -44,7 +44,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) { te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); - te::Tensor* CT = te::Reduce( + te::Tensor CT = te::Reduce( "gemm", {{M, "M"}, {N, "N"}}, te::Sum(), @@ -68,7 +68,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) { te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); - te::Tensor* CT = te::Reduce( + te::Tensor CT = te::Reduce( "gemm", {{M, "M"}, {N, "N"}}, te::Sum(), @@ -128,7 +128,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) { te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); - te::Tensor* CT = te::Reduce( + te::Tensor CT = te::Reduce( "gemm", {{M, "M"}, {N, "N"}}, te::Sum(), @@ -188,7 +188,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) { te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); - te::Tensor* CT = te::Reduce( + te::Tensor CT = te::Reduce( "gemm", {{M, "M"}, {N, "N"}}, te::Sum(), @@ -256,7 +256,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) { te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); - te::Tensor* CT = te::Reduce( + te::Tensor CT = te::Reduce( "gemm", {{M, "M"}, {N, "N"}}, te::Sum(), @@ -302,7 +302,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) { } { auto const& loops = loop.getLoopStmtsFor(CT); - loop.cacheAccesses(CT->buf(), "C_regs", loops[2]); + loop.cacheAccesses(CT.buf(), "C_regs", loops[2]); } loop.prepareForCodegen(); diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp index 966c9e2a6853d..847b66d9ee58b 100644 --- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp +++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp @@ -38,7 +38,7 @@ BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) { KernelScope kernel_scope; Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); - Tensor* c_tensor = Compute( + Tensor c_tensor = Compute( "c", {{M, "m"}}, [&](const VarHandle& m) { return a_buf.load(m) + b_buf.load(m); }); diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp index be5dcc815bc68..9d3570197414d 100644 --- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp +++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp @@ -222,7 +222,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) { int M = A.numel(); te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat)); - te::Tensor* BT = te::Reduce( + te::Tensor BT = te::Reduce( "reduce_full", {{1, "N"}}, te::Sum(), @@ -255,7 +255,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) { int M = A.numel(); te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat)); - te::Tensor* BT = te::Reduce( + te::Tensor BT = te::Reduce( "reduce_full", {{1, "N"}}, te::Sum(), @@ -296,7 +296,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) { int M = A.numel(); te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat)); - te::Tensor* BT = te::Reduce( + te::Tensor BT = te::Reduce( "reduce_full", {{1, "N"}}, te::Sum(), @@ -339,7 +339,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { TORCH_CHECK(M % kChunkSize == 0); te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat)); - te::Tensor* BT = te::Reduce( + te::Tensor BT = te::Reduce( "reduce_full", {}, te::Sum(), @@ -359,7 +359,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { loop.reorderAxis(mo, mi); loops = loop.getLoopStmtsFor(BT); - auto bt_body = loop.getAllWritesToBuf(BT->buf())[1]; + auto bt_body = loop.getAllWritesToBuf(BT.buf())[1]; TORCH_CHECK(loop.rfactor(bt_body, loops.at(0), &rfac_buf)); loop.reorderAxis(loops.at(0), loops.at(1)); @@ -390,7 +390,7 @@ BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) { const int kChunkSize = 8; te::Placeholder a("A", te::kFloat, {M}); - te::Tensor* b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat); + te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat); te::LoopNest nest({b}); auto loops = nest.getLoopStmtsFor(b); @@ -453,7 +453,7 @@ BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) { te::KernelScope ks; constexpr int kCacheSize = 1 << 12; te::Placeholder a("A", te::kFloat, {M, N}); - te::Tensor* b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat); + te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat); te::LoopNest nest({b}); auto sch = state.range(2); @@ -560,7 +560,7 @@ BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) { te::KernelScope ks; constexpr int kChunkSize = 8; te::Placeholder a("A", te::kFloat, {M, N}); - te::Tensor* b = te::computeSum({a.handle(), te::IntList({1}), false}, at::kFloat); + te::Tensor b = te::computeSum({a.handle(), te::IntList({1}), false}, at::kFloat); te::LoopNest nest({b}); auto sch = state.range(2); diff --git a/test/cpp/tensorexpr/test_approx.cpp b/test/cpp/tensorexpr/test_approx.cpp index d761645b25b3f..2a4ce9485acde 100644 --- a/test/cpp/tensorexpr/test_approx.cpp +++ b/test/cpp/tensorexpr/test_approx.cpp @@ -11,7 +11,7 @@ using namespace torch::indexing; namespace te = torch::jit::tensorexpr; -static void vectorize(te::LoopNest* ln, te::Tensor* target, int width) { +static void vectorize(te::LoopNest* ln, te::Tensor target, int width) { auto loops = ln->getLoopStmtsFor(target); te::ForPtr inner, tail; ln->splitWithTail(loops[0], width, &inner, &tail); @@ -33,7 +33,7 @@ TEST(Approx, log_vml) { te::KernelScope ks; te::VarHandle N("N", te::kInt); te::Placeholder A("A", te::kFloat, {N}); - te::Tensor* B = te::Compute( + te::Tensor B = te::Compute( "B", {N}, [&](const te::VarHandle& i) { return log_vml(A.load(i)); }); te::LoopNest ln({B}); diff --git a/test/cpp/tensorexpr/test_boundsinference.cpp b/test/cpp/tensorexpr/test_boundsinference.cpp index fcfa8cec4bc49..d038665fad75b 100644 --- a/test/cpp/tensorexpr/test_boundsinference.cpp +++ b/test/cpp/tensorexpr/test_boundsinference.cpp @@ -49,7 +49,7 @@ TEST(BoundsInference, _1) { KernelScope kernel_scope; ExprHandle n(100); Placeholder a(BufHandle("a", {n}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -60,9 +60,9 @@ TEST(BoundsInference, _1) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 99}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 99}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 99}}); } TEST(BoundsInference, _2) { @@ -74,7 +74,7 @@ TEST(BoundsInference, _2) { KernelScope kernel_scope; VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -85,9 +85,9 @@ TEST(BoundsInference, _2) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, -1}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, -1}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, -1}}); } TEST(BoundsInference, _3) { @@ -99,7 +99,7 @@ TEST(BoundsInference, _3) { KernelScope kernel_scope; ExprHandle n(100); Placeholder a(BufHandle("a", {n + 10}, kFloat)); - Tensor* b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { + Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i) * a.load(i + 10); }); LoopNest l({b}); @@ -111,9 +111,9 @@ TEST(BoundsInference, _3) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 109}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 99}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 99}}); } TEST(BoundsInference, _4) { @@ -129,13 +129,13 @@ TEST(BoundsInference, _4) { ExprHandle W(320); ExprHandle H(200); Placeholder a(BufHandle("a", {H, W}, kFloat)); - Tensor* b = Compute( + Tensor b = Compute( "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { return x * y; }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return a.load(y, x) * b->load(y, x); + return a.load(y, x) * b.load(y, x); }); LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); @@ -149,13 +149,13 @@ TEST(BoundsInference, _4) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 199}, {0, 319}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 199}, {0, 319}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 199}, {0, 319}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 199}, {0, 319}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 199}, {0, 319}}); } { // Infer bounds on the inner loop scope @@ -166,13 +166,13 @@ TEST(BoundsInference, _4) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {0, 319}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {0, 319}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {0, 319}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {0, 319}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {0, 319}}); } { // Infer bounds on the inner loop body's scope @@ -183,13 +183,13 @@ TEST(BoundsInference, _4) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {-1, -1}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {-1, -1}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {-1, -1}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {-1, -1}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {-1, -1}}); } } @@ -208,7 +208,7 @@ TEST(BoundsInference, _5) { KernelScope kernel_scope; ExprHandle n(100); Placeholder a(BufHandle("a", {n}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); @@ -229,9 +229,9 @@ TEST(BoundsInference, _5) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 95}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 95}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 95}}); } { // Verify inferred bounds for the tail loop @@ -242,9 +242,9 @@ TEST(BoundsInference, _5) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{96, 99}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{96, 99}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{96, 99}}); } } @@ -263,13 +263,13 @@ TEST(BoundsInference, _6) { ExprHandle CW(32); ExprHandle CH(20); Placeholder a(BufHandle("a", {H, W}, kFloat)); - Tensor* b = Compute( + Tensor b = Compute( "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { return x * y; }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{CH, "y"}, {CW, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return a.load(y + 100, x + 100) * b->load(y * 2, x * 5); + return a.load(y + 100, x + 100) * b.load(y * 2, x * 5); }); LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); @@ -283,13 +283,13 @@ TEST(BoundsInference, _6) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{100, 119}, {100, 131}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 38}, {0, 155}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 38}, {0, 155}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 19}, {0, 31}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 19}, {0, 31}}); } { // Infer bounds on the inner loop scope @@ -300,13 +300,13 @@ TEST(BoundsInference, _6) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {100, 131}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {0, 155}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {0, 155}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {0, 31}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {0, 31}}); } { // Infer bounds on the inner loop body's scope @@ -317,13 +317,13 @@ TEST(BoundsInference, _6) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {-1, -1}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {-1, -1}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {-1, -1}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {-1, -1}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {-1, -1}}); } } @@ -331,9 +331,9 @@ TEST(BoundsInference, Adjacent) { KernelScope kernel_scope; ExprHandle H(6); Placeholder a(BufHandle("a", {20}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x + H); }); LoopNest l({b, c}); std::vector loops = NodeFinder::find(l.root_stmt()); @@ -348,9 +348,9 @@ TEST(BoundsInference, Adjacent) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 5}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 5}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 5}}); } { // Infer bounds on the inner loop scope @@ -362,9 +362,9 @@ TEST(BoundsInference, Adjacent) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{6, 11}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 5}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 5}}); } { // Infer bounds on the high level program. @@ -377,24 +377,24 @@ TEST(BoundsInference, Adjacent) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 11}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 5}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 5}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 5}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 5}}); } } TEST(BoundsInference, MultipleTopLoopLoad) { KernelScope kernel_scope; Placeholder a(BufHandle("a", {100}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{64, "x"}}, [&](const VarHandle& x) { return a.load(x); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{32, "x"}}, [&](const VarHandle& x) { return a.load(x + 10); }); - Tensor* d = Compute( + Tensor d = Compute( "d", {{96, "x"}}, [&](const VarHandle& x) { return a.load(x + 2); }); LoopNest l({b, c, d}); @@ -418,7 +418,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) { // b, c, d only written. { - auto bounds = bounds_info[b->buf()]; + auto bounds = bounds_info[b.buf()]; ASSERT_EQ(bounds.size(), 1); auto bound = bounds[0]; ASSERT_EQ(bound.kind, TensorAccessKind::kStore); @@ -426,7 +426,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) { verifyConstBounds(bound, {{0, 63}}); } { - auto bounds = bounds_info[c->buf()]; + auto bounds = bounds_info[c.buf()]; ASSERT_EQ(bounds.size(), 1); auto bound = bounds[0]; ASSERT_EQ(bound.kind, TensorAccessKind::kStore); @@ -434,7 +434,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) { verifyConstBounds(bound, {{0, 31}}); } { - auto bounds = bounds_info[d->buf()]; + auto bounds = bounds_info[d.buf()]; ASSERT_EQ(bounds.size(), 1); auto bound = bounds[0]; ASSERT_EQ(bound.kind, TensorAccessKind::kStore); @@ -506,24 +506,24 @@ TEST(BoundsInference, MultipleTopLoopStore) { TEST(BoundsInference, CacheReads) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 30, j + 3); + return A.load(i + 30, j + 3); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); LoopNest l({B, C}); auto bounds_info_before = inferBounds(l.root_stmt()); StmtPtr j_loop = l.getLoopStmtsFor(B)[1]; - LoopNest::cacheAccesses(A->buf(), "A_local", j_loop); + LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); auto bounds_info_after = inferBounds(l.root_stmt()); @@ -571,7 +571,7 @@ TEST(BoundsInference, CacheReads) { TEST(BoundsInference, Flattened) { KernelScope kernel_scope; - Tensor* b = Compute( + Tensor b = Compute( "b", {{3, "z"}, {4, "y"}, {5, "x"}}, [&](const VarHandle& z, const VarHandle& y, const VarHandle& x) { @@ -585,7 +585,7 @@ TEST(BoundsInference, Flattened) { // There's only one buffer. ASSERT_EQ(bounds_info.size(), 1); - auto& TABI = bounds_info[b->buf()][0]; + auto& TABI = bounds_info[b.buf()][0]; ASSERT_EQ(TABI.kind, TensorAccessKind::kStore); // Flattened bounds should have a single dimension. ASSERT_EQ(TABI.start.size(), 1); @@ -651,11 +651,11 @@ TEST(BoundsInference, GetPotentialHazards) { TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return (i + 1) * (j + 1); }); @@ -679,13 +679,13 @@ TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) { TEST(BoundsInference, GetPotentialHazardsLoopCall) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{64, "i"}, {64, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i, j) + 5; + return A.load(i, j) + 5; }); LoopNest l({A, B}); @@ -706,7 +706,7 @@ TEST(BoundsInference, GetPotentialHazardsLoopCall) { TEST(BoundsInference, GetPotentialHazardsLoopSplit) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); diff --git a/test/cpp/tensorexpr/test_conv.cpp b/test/cpp/tensorexpr/test_conv.cpp index 63881d0d33cae..293fbe248f176 100644 --- a/test/cpp/tensorexpr/test_conv.cpp +++ b/test/cpp/tensorexpr/test_conv.cpp @@ -30,7 +30,7 @@ TEST(Conv, DepthwiseConv2D) { te::Placeholder input("input", te::kFloat, {N, C, H, W}); te::Placeholder weight("weight", te::kFloat, {K, CperG, R, S}); te::Placeholder bias("bias", te::kFloat, {K}); - te::Tensor* output = te::conv2d_depthwise( + te::Tensor output = te::conv2d_depthwise( input.handle(), weight.handle(), bias.handle(), kStride, kPad, kGroups); te::LoopNest loop({output}); @@ -61,7 +61,7 @@ TEST(Conv, DepthwiseConv2DNoBias) { te::Placeholder input("input", te::kFloat, {N, C, H, W}); te::Placeholder weight("weight", te::kFloat, {K, CperG, R, S}); - te::Tensor* output = te::conv2d_depthwise( + te::Tensor output = te::conv2d_depthwise( input.handle(), weight.handle(), kStride, kPad, kGroups); te::LoopNest loop({output}); @@ -96,7 +96,7 @@ TEST(Conv, DepthwiseConv2DDynamicShapes) { te::Placeholder input("input", te::kFloat, {N_var, C_var, H_var, W_var}); te::Placeholder weight( "weight", te::kFloat, {K_var, CperG_var, R_var, S_var}); - te::Tensor* output = te::conv2d_depthwise( + te::Tensor output = te::conv2d_depthwise( input.handle(), weight.handle(), N_var, @@ -195,7 +195,7 @@ TEST(Conv, Conv2D) { te::Placeholder inputB(te::BufHandle("input", {N, C, H, W}, te::kFloat)); te::Placeholder filterB(te::BufHandle("filter", {K, C, R, S}, te::kFloat)); - te::Tensor* conv = te::Reduce( + te::Tensor conv = te::Reduce( "conv", {{N, "n"}, {K, "k"}, {OH, "oh"}, {OW, "ow"}}, te::Sum(), diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp index e36e17ad432f9..ed5c070ea8689 100644 --- a/test/cpp/tensorexpr/test_cuda.cpp +++ b/test/cpp/tensorexpr/test_cuda.cpp @@ -34,7 +34,7 @@ static void testCudaTestVectorAdd01_impl() { Dtype dtype = ToDtype(); Placeholder a_buf("a", dtype, {num_iter, block_count, block_size}); Placeholder b_buf("b", dtype, {num_iter, block_count, block_size}); - Tensor* c = Compute( + Tensor c = Compute( "c", { {num_iter, "n"}, @@ -99,7 +99,7 @@ TEST(Cuda, Sigmoid_CUDA) { const int block_size = 128; Dtype dtype = ToDtype(); Placeholder a_buf("a", dtype, {num_iter, block_count, block_size}); - Tensor* c = Compute( + Tensor c = Compute( "c", { {num_iter, "n"}, @@ -165,7 +165,7 @@ static void testCudaTestVectorAdd02_impl(int N, int block_size) { KernelScope kernel_scope; Placeholder a_buf("a", kFloat, {N}); Placeholder b_buf("b", kFloat, {N}); - Tensor* c = Compute( + Tensor c = Compute( "c", { {N, "N"}, @@ -225,7 +225,7 @@ TEST(Cuda, HalfCast_CUDA) { KernelScope ks; auto half = ToDtype(); Placeholder a("a", half, {4}); - Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { + Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { return Cast::make(kFloat, a.load(i)); }); @@ -267,7 +267,7 @@ TEST(Cuda, DynamicShape2D_CUDA) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {m, n}, kFloat)); Placeholder b(BufHandle("b", {m, n}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); @@ -328,7 +328,7 @@ TEST(Cuda, TestRand01_CUDA) { const int num_iter = 3; const int block_count = 16; const int block_size = 128; - Tensor* c = Compute( + Tensor c = Compute( "c", { {num_iter, "n"}, @@ -387,7 +387,7 @@ TEST(Cuda, DynamicShapeSplit_CUDA) { constexpr int N = 4096; VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); - Tensor* b = Compute( + Tensor b = Compute( "b", {{n, "n"}}, [&](const VarHandle& i) { return a.load(i) * 2.0f; }); LoopNest l({b}); ForPtr inner; @@ -928,16 +928,16 @@ TEST(Cuda, HalfSupport_CUDA) { KernelScope ks; auto half = ToDtype(); Placeholder a("a", half, {4}); - Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { + Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { return Cast::make(half, ExprHandle(2.0f) * a.load(i)); }); - Tensor* c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) { - return Cast::make(kFloat, Cast::make(half, ExprHandle(42)) + b->load(i)); + Tensor c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) { + return Cast::make(kFloat, Cast::make(half, ExprHandle(42)) + b.load(i)); }); - Tensor* d = Compute("d", {{4, "n"}}, [&](const VarHandle& i) { - return Cast::make(half, c->load(i)); + Tensor d = Compute("d", {{4, "n"}}, [&](const VarHandle& i) { + return Cast::make(half, c.load(i)); }); LoopNest l({b, c, d}); @@ -986,7 +986,7 @@ TEST(Cuda, HalfPropagation_CUDA) { KernelScope kernel_scope; auto half = ToDtype(); Placeholder a("a", half, {4}); - Tensor* relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { + Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { return Max::make(a.load(i), ExprHandle(alloc(0)), true); }); @@ -1036,7 +1036,7 @@ TEST(Cuda, UnusedHalfArgument_CUDA) { Placeholder a("a", kFloat, {4}); auto half = ToDtype(); Placeholder b("b", half, {4}); - Tensor* relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { + Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { return Max::make(a.load(i), ExprHandle(alloc(0)), true); }); @@ -1168,10 +1168,10 @@ TEST(Cuda, MaskBlockDim_CUDA) { int B_SIZE = 50; Placeholder a_buf("a", kFloat, {A_SIZE}); Placeholder b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); - Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1261,10 +1261,10 @@ TEST(Cuda, MaskThreadDim_CUDA) { int B_SIZE = 100; Placeholder a_buf("a", kFloat, {A_SIZE}); Placeholder b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); - Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i / 2) + b_buf.load(i); }); @@ -1356,10 +1356,10 @@ TEST(Cuda, MaskMultiBlockDim_CUDA) { int B_SIZE = 50; Placeholder a_buf("a", kFloat, {A_SIZE}); Placeholder b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); - Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1450,10 +1450,10 @@ TEST(Cuda, MaskBlockAndThreadDim_CUDA) { int B_SIZE = 50; Placeholder a_buf("a", kFloat, {A_SIZE}); Placeholder b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); - Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1543,17 +1543,17 @@ TEST(Cuda, MaskMultiDim_CUDA) { int B_SIZE = 50; Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); - Tensor* c = Compute( + Tensor c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); - Tensor* d = Compute( + Tensor d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->load(i, j * 2) + b_buf.load(i, j); + return c.load(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -1673,17 +1673,17 @@ TEST(Cuda, MaskMultiDimSymbolic_CUDA) { VarHandle B_SIZE("B_SIZE", kInt); Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); - Tensor* c = Compute( + Tensor c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); - Tensor* d = Compute( + Tensor d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->load(i, j * 2) + b_buf.load(i, j); + return c.load(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -2087,17 +2087,17 @@ TEST(Cuda, MaskMultiDimMultiAxis_CUDA) { int B_SIZE = 15; Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); - Tensor* c = Compute( + Tensor c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); - Tensor* d = Compute( + Tensor d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->load(i, j * 2) + b_buf.load(i, j); + return c.load(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -2218,17 +2218,17 @@ TEST(Cuda, MaskMultiDimMultiLevel_CUDA) { int B_SIZE = 15; Placeholder a_buf("a", kFloat, {OUTER_A_SIZE, A_SIZE}); Placeholder b_buf("b", kFloat, {OUTER_B_SIZE, B_SIZE}); - Tensor* c = Compute( + Tensor c = Compute( "C", {{OUTER_A_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); - Tensor* d = Compute( + Tensor d = Compute( "D", {{OUTER_B_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->load(i, j * 2) + b_buf.load(i, j); + return c.load(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp index 24ddfbf095ab3..a170e530fa98f 100644 --- a/test/cpp/tensorexpr/test_external_calls.cpp +++ b/test/cpp/tensorexpr/test_external_calls.cpp @@ -31,7 +31,7 @@ TEST(ExternalCall, Conv2d_float) { int64_t dilation = 1; int64_t groups = 1; - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -95,7 +95,7 @@ TEST(ExternalCall, Conv2d_int) { int64_t dilation = 1; int64_t groups = 1; - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -153,7 +153,7 @@ TEST(ExternalCall, Conv2d_nobias_noargs) { Placeholder Weight("Weight", kFloat, {16, 16, 1, 1}); BufHandle ResultBuf("Result", {1, 16, 112, 112}, kFloat); - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -203,7 +203,7 @@ TEST(ExternalCall, Addmm_float) { int64_t beta = 2; int64_t alpha = 2; - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -283,7 +283,7 @@ TEST(ExternalCall, Prepacked_Linear_float) { weight, bias, c10::optional(), c10::optional()); Placeholder DummyPrepacked("DummyPrepacked", kFloat, {1}); - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -370,7 +370,7 @@ TEST(ExternalCall, Prepacked_Conv2d_float) { c10::optional()); Placeholder DummyPrepacked("DummyPrepacked", kFloat, {1}); - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -431,7 +431,7 @@ TEST(ExternalCall, BinaryFloat) { Placeholder B("", kFloat, toExprHandleVec(bShape)); BufHandle ResultBuf("Result", toExprHandleVec(resShape), kFloat); - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -516,7 +516,7 @@ TEST(ExternalCall, UnaryFloat) { Placeholder A("A", kFloat, toExprHandleVec(aShape)); BufHandle ResultBuf("Result", toExprHandleVec(resShape), kFloat); - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, externCallName, {BufHandle(A.data())}, externCallArgs)); @@ -566,14 +566,14 @@ TEST(ExternalCall, ComputeInterop) { BufHandle ConvResultBuf("ConvResult", {1, 16, 32, 32}, kFloat); BufHandle MatmulResultBuf("MatmulResult", {1, 16, 32, 32}, kFloat); - Tensor* Input = Compute( + Tensor Input = Compute( "Input", {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, const VarHandle& w) { return FloatImm::make(5.0f); }); - Tensor* Weight = Compute( + Tensor Weight = Compute( "Weight", {{16, "n"}, {16, "c"}, {1, "kh"}, {1, "kw"}}, [&](const VarHandle& n, @@ -581,28 +581,28 @@ TEST(ExternalCall, ComputeInterop) { const VarHandle& h, const VarHandle& w) { return FloatImm::make(6.0f); }); - Tensor* ConvResult = new Tensor( + Tensor ConvResult = Tensor( ConvResultBuf.node(), ExternalCall::make( ConvResultBuf, "nnc_aten_conv2d", - {BufHandle(Input->buf()), BufHandle(Weight->buf())}, + {BufHandle(Input.buf()), BufHandle(Weight.buf())}, {})); - Tensor* MatmulResult = new Tensor( + Tensor MatmulResult = Tensor( MatmulResultBuf.node(), ExternalCall::make( MatmulResultBuf, "nnc_aten_matmul", - {BufHandle(ConvResult->buf()), BufHandle(ConvResult->buf())}, + {BufHandle(ConvResult.buf()), BufHandle(ConvResult.buf())}, {})); - Tensor* Result = Compute( + Tensor Result = Compute( "Result", {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, const VarHandle& w) { - return ConvResult->load(n, c, h, w) + MatmulResult->load(n, c, h, w); + return ConvResult.load(n, c, h, w) + MatmulResult.load(n, c, h, w); }); LoopNest l({Input, Weight, ConvResult, MatmulResult, Result}); @@ -658,31 +658,31 @@ TEST(ExternalCall, Inlining) { BufHandle MatmulResultBuf("MatmulResult", {8, 8}, kFloat); - Tensor* A = Compute( + Tensor A = Compute( "A", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return FloatImm::make(5.0f); }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return FloatImm::make(4.0f); }); - Tensor* MatmulResult = new Tensor( + Tensor MatmulResult = Tensor( MatmulResultBuf.node(), ExternalCall::make( MatmulResultBuf, "nnc_aten_matmul", - {BufHandle(A->buf()), BufHandle(B->buf())}, + {BufHandle(A.buf()), BufHandle(B.buf())}, {})); - Tensor* Result = Compute( + Tensor Result = Compute( "Result", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return MatmulResult->load(i, j) + FloatImm::make(3.0f); + return MatmulResult.load(i, j) + FloatImm::make(3.0f); }); StmtPtr root_stmt = alloc(std::vector( - {A->stmt(), B->stmt(), MatmulResult->stmt(), Result->stmt()})); - LoopNest l(root_stmt, {Result->buf()}); + {A.stmt(), B.stmt(), MatmulResult.stmt(), Result.stmt()})); + LoopNest l(root_stmt, {Result.buf()}); // Inlining should not inline anything here since all Bufs are either // defined or used in ExternalCalls diff --git a/test/cpp/tensorexpr/test_ir_printer.cpp b/test/cpp/tensorexpr/test_ir_printer.cpp index 76d9247579d7c..e11ba06740181 100644 --- a/test/cpp/tensorexpr/test_ir_printer.cpp +++ b/test/cpp/tensorexpr/test_ir_printer.cpp @@ -56,30 +56,30 @@ TEST(IRPrinter, FunctionName) { int M = 4; int N = 20; - Tensor* producer = Compute( + Tensor producer = Compute( "producer", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { return m * n; }); - Tensor* chunk_0 = Compute( + Tensor chunk_0 = Compute( "chunk", {{M, "m"}, {N / 2, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { - return producer->load(m, n); + return producer.load(m, n); }); - Tensor* chunk_1 = Compute( + Tensor chunk_1 = Compute( "chunk", {{M, "m"}, {N / 2, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { - return producer->load(m, n + ExprHandle(N / 2)); + return producer.load(m, n + ExprHandle(N / 2)); }); - Tensor* consumer = Compute( + Tensor consumer = Compute( "consumer", {{M, "i"}, {N / 2, "j"}}, [&](const ExprHandle& i, const ExprHandle& j) { - return i * chunk_1->load(i, j); + return i * chunk_1.load(i, j); }); LoopNest l({chunk_0, chunk_1, consumer}); diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 8f36f54395f49..765522ecf6cd4 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -1329,7 +1329,7 @@ TEST_F(Kernel, CodegenInspection) { #endif } -Tensor* lowerNanToNum( +Tensor lowerNanToNum( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType, diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 75e6a064d1ac5..6081403c25650 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -593,10 +593,10 @@ TEST(LLVM, VectorizerLoadStoreTest) { KernelScope kernel_scope; Placeholder a(BufHandle("A", {1}, kInt)); - Tensor* c = + Tensor c = Compute("c", {{4, "i"}}, [&](const VarHandle& i) { return a.load(i); }); - Placeholder c_buf(BufHandle(c->buf())); + Placeholder c_buf(BufHandle(c.buf())); LoopNest l({c}); StmtPtr s = l.root_stmt(); ASSERT_TRUE(LoopNest::vectorize(to(to(s)->front()))); @@ -616,11 +616,11 @@ TEST(LLVM, VectorizeBitCast) { KernelScope kernel_scope; Placeholder a(BufHandle("A", {128}, kInt)); - Tensor* c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) { return bitcast(a.load(i)); }); - Placeholder c_buf(BufHandle(c->buf())); + Placeholder c_buf(BufHandle(c.buf())); LoopNest l({c}); StmtPtr s = l.root_stmt(); ASSERT_TRUE(LoopNest::vectorize(to(to(s)->front()))); @@ -1217,12 +1217,12 @@ TEST(LLVM, StoreFloat) { TEST(LLVM, SimpleMath01) { KernelScope kernel_scope; const int N = 1024; - Tensor* tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) { + Tensor tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) { return cast(i * i + 1); }); LoopNest l({tensor}); StmtPtr stmt = l.root_stmt(); - Placeholder f_buf(BufHandle(tensor->buf())); + Placeholder f_buf(BufHandle(tensor.buf())); LLVMCodeGen cg(stmt, {f_buf}); PaddedBuffer f_v(N, "f_v"); @@ -1241,11 +1241,11 @@ TEST(LLVM, ComputeMul) { const int N = 1024; Placeholder a(BufHandle("a", {N}, kFloat)); Placeholder b(BufHandle("b", {N}, kFloat)); - Tensor* c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) { return a.load(i) * b.load(i); }); - Placeholder c_buf(BufHandle(c->buf())); + Placeholder c_buf(BufHandle(c.buf())); LoopNest l({c}); StmtPtr s = l.root_stmt(); @@ -1265,12 +1265,12 @@ TEST(LLVM, BroadcastAdd) { const int N = 1024; Placeholder a(BufHandle("a", {M, N}, kFloat)); Placeholder b(BufHandle("b", {N}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(j); }); - Placeholder c_buf(BufHandle(c->buf())); + Placeholder c_buf(BufHandle(c.buf())); LoopNest l({c}); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -1372,7 +1372,7 @@ TEST(LLVM, TensorDynamicShapeAdd) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); Placeholder b(BufHandle("b", {n}, kFloat)); - Tensor* c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) { return a.load(i) + b.load(i); }); LoopNest l({c}); @@ -1396,7 +1396,7 @@ TEST(LLVM, DynamicShape2D) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {m, n}, kFloat)); Placeholder b(BufHandle("b", {m, n}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); @@ -1428,7 +1428,7 @@ TEST(LLVM, EliminatedStmt) { KernelScope kernel_scope; Placeholder a(BufHandle("a", {1}, kFloat)); - Tensor* c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; }); + Tensor c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; }); LoopNest l({c}); l.prepareForCodegen(); @@ -1452,7 +1452,7 @@ TEST(LLVM, SimpleReduction) { // TODO: why doesn't implicit vector work? std::vector axis = {DimArg(1)}; std::vector reduce_axis = {DimArg(M), DimArg(N)}; - Tensor* b = Reduce("sum", axis, Sum(), a, reduce_axis); + Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis); LoopNest loop({b}); loop.prepareForCodegen(); @@ -1491,7 +1491,7 @@ TEST(LLVM, RFactorReduction) { // TODO: why doesn't implicit vector work? std::vector axis = {DimArg(1)}; std::vector reduce_axis = {DimArg(M), DimArg(N)}; - Tensor* b = Reduce("sum", axis, Sum(), a, reduce_axis); + Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis); LoopNest loop({b}); std::vector loops = loop.getLoopStmtsFor(b); @@ -1502,7 +1502,7 @@ TEST(LLVM, RFactorReduction) { loops = loop.getLoopStmtsFor(b); loop_m = loops.at(2); loop_n = loops.at(1); - auto b_body = loop.getAllWritesToBuf(b->buf())[1]; + auto b_body = loop.getAllWritesToBuf(b.buf())[1]; ASSERT_TRUE(loop.rfactor(b_body, loop_n)); loop.prepareForCodegen(); @@ -1538,13 +1538,13 @@ TEST(LLVM, RFactorVectorizedReduction) { Placeholder a("a", kFloat, {1, M, N}); - Tensor* b = Reduce("sum", {{1, "K"}}, Sum(), a, {{M, "M"}, {N, "N"}}); + Tensor b = Reduce("sum", {{1, "K"}}, Sum(), a, {{M, "M"}, {N, "N"}}); LoopNest loopnest({b}); std::vector loops = loopnest.getLoopStmtsFor(b); // Reorder n and m loops loopnest.reorderAxis(loops.at(1), loops.at(2)); - auto b_body = loopnest.getAllWritesToBuf(b->buf()).at(1); - auto all_loops = loopnest.getAllLoopNestsWritingToBuf(b->buf()); + auto b_body = loopnest.getAllWritesToBuf(b.buf()).at(1); + auto all_loops = loopnest.getAllLoopNestsWritingToBuf(b.buf()); ASSERT_TRUE(all_loops.size() == 2 && all_loops[1].size() == 3); ASSERT_TRUE(loopnest.rfactor(b_body, all_loops[1][1])); auto distributed_loops = loopnest.distributeLoop(all_loops[1][1]); @@ -1585,7 +1585,7 @@ TEST(LLVM, SimpleParallel) { KernelScope kernel_scope; const int M = 4; const int N = 6; - Tensor* f = Compute( + Tensor f = Compute( "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) { return cast(m + n); }); @@ -1626,23 +1626,23 @@ TEST(LLVM, CompositeParallel) { KernelScope kernel_scope; int M = 5; int N = 7; - Tensor* t1 = + Tensor t1 = Compute("t1", {{M, "M"}}, [](const VarHandle& m) { return m + 1.f; }); - Tensor* t2 = + Tensor t2 = Compute("t2", {{N, "N"}}, [](const VarHandle& n) { return n + 2.f; }); - Tensor* t3 = Compute( + Tensor t3 = Compute( "t3", {{M, "M"}, {N, "N"}}, [=](const VarHandle& m, const VarHandle& n) { - return t1->load(m) * t2->load(n); + return t1.load(m) * t2.load(n); }); - Tensor* t4 = Compute( + Tensor t4 = Compute( "t4", {{M, "M"}, {N, "N"}}, [=](const VarHandle& m, const VarHandle& n) { - return t3->load(m, n) + m + n; + return t3.load(m, n) + m + n; }); - LoopNest loop_nest(std::vector({t4}), {t1, t2, t3, t4}); + LoopNest loop_nest({t4}, {t1, t2, t3, t4}); std::vector loop_list; { auto const& loops = loop_nest.getLoopStmtsFor(t1); @@ -1695,7 +1695,7 @@ TEST(LLVM, VectorizedGEMM) { Placeholder AP(BufHandle("A", {M, K}, kFloat)); Placeholder BP(BufHandle("B", {K, N}, kFloat)); - Tensor* CT = Reduce( + Tensor CT = Reduce( "gemm", {{M, "M"}, {N, "N"}}, Sum(), @@ -1776,7 +1776,7 @@ TEST(LLVM, CallRaw) { VarHandle N("N", kInt); Placeholder a(BufHandle("a", {M, N}, kFloat)); Placeholder b(BufHandle("b", {N}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(j); }); @@ -1793,7 +1793,7 @@ TEST(LLVM, CallRaw) { std::vector cv(M * N_value, 0); std::vector args({av.data(), bv.data(), cv.data(), &N_value}); - LLVMCodeGen cg(s, {a, b, BufHandle(c->buf()), N}); + LLVMCodeGen cg(s, {a, b, BufHandle(c.buf()), N}); cg.call_raw(args); for (int i = 0; i < M; i++) { @@ -1802,7 +1802,7 @@ TEST(LLVM, CallRaw) { } } - SimpleIREvaluator eval(s, {a, b, BufHandle(c->buf()), N}); + SimpleIREvaluator eval(s, {a, b, BufHandle(c.buf()), N}); eval.call_raw(args); for (int i = 0; i < M; i++) { @@ -1818,7 +1818,7 @@ TEST(LLVM, CustomTarget) { Placeholder a("a", kFloat, {M}); Placeholder b("b", kFloat, {M}); Placeholder c("c", kFloat, {M}); - Tensor* d = Compute("d", {{M, "m"}}, [&](const VarHandle& m) { + Tensor d = Compute("d", {{M, "m"}}, [&](const VarHandle& m) { return a.load(m) * b.load(m) + c.load(m); }); LoopNest nest({d}); diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index c80dd5f492d95..7c3eefaab3b1c 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -42,13 +42,12 @@ void checkExprIR(const ExprHandle& e, const std::string& pattern) { TEST(LoopNest, ExprSimple01) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{16, "X"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithTail(loops[0], 2); LoopNest::splitWithTail(loops[0], 2); @@ -56,7 +55,7 @@ TEST(LoopNest, ExprSimple01) { TEST(LoopNest, ExprLower01) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{16, "x"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); @@ -73,10 +72,9 @@ TEST(LoopNest, ExprSimple02) { auto func = [](const ExprHandle& x, const ExprHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }; - Tensor* tensor = Compute("f", {{26, "x"}, {5, "y"}}, func); + Tensor tensor = Compute("f", {{26, "x"}, {5, "y"}}, func); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithTail(loops[0], 4); @@ -168,14 +166,13 @@ TEST(LoopNest, ExprSliceHeadWithLoopOptions) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); loops[0]->set_gpu_block_index(LoopOptions::IDX_Y); LoopNest::sliceHead(loops[0], 2, &head, &tail); @@ -193,14 +190,13 @@ TEST(LoopNest, ExprSliceTailWithLoopOptions) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceTail(loops[0], 4, &head, &tail); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -227,14 +223,13 @@ TEST(LoopNest, ExprSliceHeadWhenFactorEqualsSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceHead(loops[0], 10, &head, &tail); ASSERT_EQ(head, loops[0]); @@ -249,14 +244,13 @@ TEST(LoopNest, ExprSliceHeadWhenFactorLargerThanSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceHead(loops[0], 100, &head, &tail); ASSERT_EQ(head, loops[0]); @@ -271,14 +265,13 @@ TEST(LoopNest, ExprSliceHead) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceHead(loops[0], 4, &head, &tail); ASSERT_NE(head, nullptr); @@ -295,10 +288,9 @@ TEST(LoopNest, ExprSliceHeadWithNonZeroStart) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -323,14 +315,13 @@ TEST(LoopNest, ExprSliceTailWhenFactorEqualsSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceTail(loops[0], 10, &head, &tail); ASSERT_EQ(head, nullptr); @@ -347,14 +338,13 @@ TEST(LoopNest, ExprSliceTailWhenFactorLargerThanSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceTail(loops[0], 100, &head, &tail); ASSERT_EQ(head, nullptr); @@ -369,14 +359,13 @@ TEST(LoopNest, ExprSliceTail) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceTail(loops[0], 4, &head, &tail); ASSERT_NE(head, nullptr); @@ -396,15 +385,14 @@ TEST(LoopNest, ExprSplitAndSlice) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{100, "x"}}, func); + Tensor tensor = Compute("f", {{100, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr inner; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // outer: [0, 4) // inner: [0, 21) // tail: [84, 100) @@ -450,10 +438,9 @@ TEST(LoopNest, ExprSliceAndNormalize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -482,11 +469,11 @@ TEST(LoopNest, ExprSliceWithVariableDimension) { const std::vector>& expected_for_ranges) { KernelScope kernel_scope; VarHandle dim("dim", kInt); - Tensor* tensor = + Tensor tensor = Compute("f", {{dim, "x"}}, [](const ExprHandle& x) { return x; }); LoopNest l({tensor}); std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -521,10 +508,9 @@ TEST(LoopNest, ExprSplitWithTail) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{199, "x"}}, func); + Tensor tensor = Compute("f", {{199, "x"}}, func); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) LoopNest::splitWithTail(loops[0], 17); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) @@ -552,10 +538,9 @@ TEST(LoopNest, ExprSplitWithTailNone) { auto func = [](const ExprHandle& x, const ExprHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }; - Tensor* tensor = Compute("f", {{24, "x"}, {5, "y"}}, func); + Tensor tensor = Compute("f", {{24, "x"}, {5, "y"}}, func); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithTail(loops[0], 4); StmtPtr stmt = l.root_stmt(); @@ -612,14 +597,13 @@ TEST(LoopNest, ExprSplitWithMask01) { const int N = 5; Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {M, N}); - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load(m, n) + b_buf.load(m, n) + 1.0f; }); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithMask(loops[1], 4); StmtPtr stmt = l.root_stmt(); @@ -648,13 +632,12 @@ TEST(LoopNest, ExprSplitWithMaskRepeatedNoMask) { const int M = 64; Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); - Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { return a_buf.load(m) + b_buf.load(m) + 1.0f; }); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithMask(loops[0], 4); LoopNest::splitWithMask(loops[0], 4); @@ -736,14 +719,13 @@ TEST(LoopNest, TileSimple) { const int M = 64, N = 64; Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {M, N}); - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f; }); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) l.tile(loops[0], loops[1], 4, 8); @@ -783,14 +765,13 @@ TEST(LoopNest, TileWithTails) { const int M = 64, N = 64; Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {M, N}); - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f; }); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) l.tile(loops[0], loops[1], 5, 9); @@ -831,7 +812,7 @@ TEST(LoopNest, TileInMiddle) { const int M = 8, N = 8, L = 8, K = 8; Placeholder a_buf("a", kFloat, {M, N, L, K}); Placeholder b_buf("b", kFloat, {M, N, L, K}); - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{M, "m"}, {N, "n"}, {L, "l"}, {K, "k"}}, [&](const ExprHandle& m, @@ -843,7 +824,7 @@ TEST(LoopNest, TileInMiddle) { LoopNest nest({tensor}); std::vector loops = - nest.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + nest.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) nest.tile(loops[1], loops[2], 3, 3); @@ -891,7 +872,7 @@ TEST(LoopNest, SplitWithTailWithLoopOptions) { const int M = 21; Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); - Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { return a_buf.load(m) + b_buf.load(m) + 1.0f; }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -922,7 +903,7 @@ TEST(LoopNest, SplitWithMaskWithLoopOptions) { const int M = 21; Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); - Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { return a_buf.load(m) + b_buf.load(m) + 1.0f; }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -949,7 +930,7 @@ TEST(LoopNest, ScheduleBroadcastAddBuffer) { const int K = 6; Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {N, K}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{M, "m"}, {N, "n"}, {K, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { @@ -998,20 +979,20 @@ TEST(LoopNest, ScheduleFunctionCall01) { const int K = 6; Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {N, K}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{M, "m"}, {N, "n"}, {K, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); - Tensor* d = Compute( + Tensor d = Compute( "d", {{M, "m"}, {N, "n"}, {K, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c->load(m, n, k) + 1; + return c.load(m, n, k) + 1; }); - LoopNest l(std::vector({d}), {c, d}); + LoopNest l({d}, {c, d}); l.prepareForCodegen(); StmtPtr stmt = l.root_stmt(); std::ostringstream oss; @@ -1058,22 +1039,22 @@ TEST(LoopNest, ScheduleInlineSimple) { Placeholder c_buf("c", kFloat, {M, N}); Placeholder d_buf("d", kFloat, {M, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); - LoopNest l1(std::vector({y}), {x, y}); + LoopNest l1({y}, {x, y}); LoopNest l2(l1); - l2.computeInline(x->buf()); + l2.computeInline(x.buf()); l1.prepareForCodegen(); l2.prepareForCodegen(); @@ -1139,31 +1120,31 @@ void InlineFunc01Helper(const std::vector& inline_order) { Placeholder c_buf("c", kFloat, {M, N}); Placeholder d_buf("d", kFloat, {M, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); - Tensor* z = Compute( + Tensor z = Compute( "z", {{M, "m3"}, {N, "n3"}, {K, "k3"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m, n, k) + y->load(m, n, k); + return x.load(m, n, k) + y.load(m, n, k); }); - LoopNest l(std::vector({z}), {x, y, z}); + LoopNest l({z}, {x, y, z}); for (const std::string& order : inline_order) { if (order == "x") { - l.computeInline(x->buf()); + l.computeInline(x.buf()); } else if (order == "y") { - l.computeInline(y->buf()); + l.computeInline(y.buf()); } else { throw std::runtime_error("Invalid order: " + order); } @@ -1218,7 +1199,7 @@ void InlineFunc01Helper(const std::vector& inline_order) { } if (inline_order.size() == 2) { - Tensor* z2 = Compute( + Tensor z2 = Compute( "z", {{M, "m3"}, {N, "n3"}, {K, "k3"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { @@ -1254,21 +1235,21 @@ TEST(LoopNest, ScheduleInlineRandom) { const int N = 5; const int K = 6; - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Mod::make(Intrinsics::make(kRand, kInt), 5); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m, n, k) + x->load(m, n, k); + return x.load(m, n, k) + x.load(m, n, k); }); - LoopNest l1(std::vector({y}), {x, y}); - l1.computeInline(x->buf()); + LoopNest l1({y}, {x, y}); + l1.computeInline(x.buf()); // would normally compare results but Rand isn't implemented in the // SimpleIREvaluator, even if we could seed it. @@ -1290,22 +1271,22 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) { const int N = 5; const int K = 6; - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return m * n * k; }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m, n, k) + Intrinsics::make(kRand, kInt) + + return x.load(m, n, k) + Intrinsics::make(kRand, kInt) + Intrinsics::make(kRand, kInt); }); - LoopNest l1(std::vector({y}), {x, y}); - l1.computeInline(x->buf()); + LoopNest l1({y}, {x, y}); + l1.computeInline(x.buf()); // would normally compare results but Rand isn't implemented in the // SimpleIREvaluator, even if we could seed it. @@ -1327,18 +1308,18 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) { const int N = 5; const int K = 6; - Tensor* x = Compute("x", {{M, "m1"}}, [&](const VarHandle& m) { + Tensor x = Compute("x", {{M, "m1"}}, [&](const VarHandle& m) { return Mod::make(Intrinsics::make(kRand, kInt), 5); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m) + x->load(m); + return x.load(m) + x.load(m); }); - LoopNest l1(std::vector({y}), {x, y}); - l1.computeInline(x->buf()); + LoopNest l1({y}, {x, y}); + l1.computeInline(x.buf()); // would normally compare results but Rand isn't implemented in the // SimpleIREvaluator, even if we could seed it. @@ -1362,17 +1343,17 @@ TEST(LoopNest, ScheduleInlineIntrinsics) { Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {N, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return Intrinsics::make(kSqrt, x->load(m, n, k)); + return Intrinsics::make(kSqrt, x.load(m, n, k)); }); PaddedBuffer a_v(M, N); @@ -1389,9 +1370,9 @@ TEST(LoopNest, ScheduleInlineIntrinsics) { } } - LoopNest l1(std::vector({y}), {x, y}); + LoopNest l1({y}, {x, y}); LoopNest l2(l1); - l2.computeInline(x->buf()); + l2.computeInline(x.buf()); l1.prepareForCodegen(); l2.prepareForCodegen(); @@ -1421,21 +1402,21 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) { const int N = 5; const int K = 6; - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Intrinsics::make(kRand, kFloat); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return Intrinsics::make(kSqrt, x->load(m, n, k)); + return Intrinsics::make(kSqrt, x.load(m, n, k)); }); - LoopNest l1(std::vector({y}), {x, y}); - l1.computeInline(x->buf()); + LoopNest l1({y}, {x, y}); + l1.computeInline(x.buf()); StmtPtr stmt1 = IRSimplifier::simplify(l1.root_stmt()); @@ -1451,31 +1432,31 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) { // Split a Compute then inline it into another compute. TEST(LoopNest, ScheduleSplitAThenInline) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - LoopNest l(std::vector({b}), {a, b}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); + LoopNest l({b}, {a, b}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); LoopNest::splitWithMask(loops[0], 4); - ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices"); + ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices"); } // Split a Compute then inline another Compute into it. TEST(LoopNest, ScheduleSplitBThenInline) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - LoopNest l(std::vector({b}), {a, b}); - std::vector loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0); + LoopNest l({b}, {a, b}); + std::vector loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0); LoopNest::splitWithMask(loops[0], 3); - l.computeInline(a->buf()); + l.computeInline(a.buf()); l.prepareForCodegen(); StmtPtr s = IRSimplifier::simplify(l.root_stmt()); @@ -1491,32 +1472,32 @@ TEST(LoopNest, ScheduleSplitBThenInline) { // Split a Compute twice then inline it. TEST(LoopNest, ScheduleSplitTwiceThenInline) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr i_inner; - LoopNest l(std::vector({b}), {a, b}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); + LoopNest l({b}, {a, b}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); LoopNest::splitWithMask(loops[0], 4, &i_inner); LoopNest::splitWithMask(i_inner, 2); - ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices"); + ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices"); } // Inline a Compute, then split. TEST(LoopNest, ScheduleInlineThenSplit) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - LoopNest l(std::vector({b}), {a, b}); - l.computeInline(a->buf()); + LoopNest l({b}, {a, b}); + l.computeInline(a.buf()); std::vector loops = NodeFinder::find(l.root_stmt()); LoopNest::splitWithMask(loops.back(), 3); @@ -1534,16 +1515,16 @@ TEST(LoopNest, ScheduleInlineThenSplit) { // Split a Compute, inline it, then split the result. TEST(LoopNest, ScheduleSplitInlineThenSplit) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - LoopNest l(std::vector({b}), {a, b}); + LoopNest l({b}, {a, b}); auto loops = NodeFinder::find(l.root_stmt()); LoopNest::splitWithMask(loops.back(), 2); - l.computeInline(a->buf()); + l.computeInline(a.buf()); loops = NodeFinder::find(l.root_stmt()); LoopNest::splitWithMask(loops.front(), 2); @@ -1561,35 +1542,35 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) { // Oversplit a loop that is simplified out after inlining. TEST(LoopNest, ScheduleSplitInlineSimplify) { KernelScope kernel_scope; - Tensor* a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return ExprHandle(4) * i - ExprHandle(2) * i; }); - Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a->load(j) - ExprHandle(1); + Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { + return a.load(j) - ExprHandle(1); }); - LoopNest l(std::vector({b}), {a, b}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); + LoopNest l({b}, {a, b}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); LoopNest::splitWithMask(loops[0], 4); - ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices"); + ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices"); } // Inline a Compute with two consumers. TEST(LoopNest, ScheduleInlineThreeMixedOnce) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a->load(k) * b->load(l); + return a.load(k) * b.load(l); }); - LoopNest l(std::vector({c}), {a, b, c}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); - l.computeInline(a->buf()); + LoopNest l({c}, {a, b, c}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); + l.computeInline(a.buf()); l.prepareForCodegen(); StmtPtr s = IRSimplifier::simplify(l.root_stmt()); @@ -1607,20 +1588,20 @@ TEST(LoopNest, ScheduleInlineThreeMixedOnce) { // Inline Compute A into B, then inline B into C. TEST(LoopNest, ScheduleInlineThreeMixedTwice) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a->load(k) * b->load(l); + return a.load(k) * b.load(l); }); - LoopNest l(std::vector({c}), {a, b, c}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); - l.computeInline(a->buf()); - l.computeInline(b->buf()); + LoopNest l({c}, {a, b, c}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); + l.computeInline(a.buf()); + l.computeInline(b.buf()); l.prepareForCodegen(); StmtPtr s = IRSimplifier::simplify(l.root_stmt()); @@ -1638,19 +1619,19 @@ TEST(LoopNest, ScheduleInlineThreeMixedTwice) { // Inline a Compute that is both a producer and consumer. TEST(LoopNest, ScheduleInlineThreeMixedInner) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a->load(k) * b->load(l); + return a.load(k) * b.load(l); }); - LoopNest l(std::vector({c}), {a, b, c}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); - l.computeInline(b->buf()); + LoopNest l({c}, {a, b, c}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); + l.computeInline(b.buf()); l.prepareForCodegen(); StmtPtr s = IRSimplifier::simplify(l.root_stmt()); @@ -1668,25 +1649,25 @@ TEST(LoopNest, ScheduleInlineThreeMixedInner) { // Split 3 Computes, then inline the first two into the last. TEST(LoopNest, ScheduleInlineThreeMixedSplit) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a->load(k) * b->load(l); + return a.load(k) * b.load(l); }); - LoopNest l(std::vector({c}), {a, b, c}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); + LoopNest l({c}, {a, b, c}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); LoopNest::splitWithMask(loops[0], 4); - loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0); LoopNest::splitWithMask(loops[0], 3); - loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::splitWithMask(loops[0], 2); - ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices"); + ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices"); } // Check that inlining works for output tensors too @@ -1696,21 +1677,21 @@ TEST(LoopNest, ScheduleInlineOutputTensors) { const int N = 5; const int K = 6; - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return m * n * k; }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m, n, k) + m; + return x.load(m, n, k) + m; }); LoopNest l1({x, y}); - l1.computeInline(x->buf()); + l1.computeInline(x.buf()); // would normally compare results but Rand isn't implemented in the // SimpleIREvaluator, even if we could seed it. @@ -1736,14 +1717,14 @@ TEST(LoopNest, ScheduleFuserStyle) { Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Tensor* b = Compute( + Tensor b = Compute( "f", {{kTotalSize, "i"}}, [&](const std::vector& axes) { return a_buf.load(axes[0]) + 11.0f; }); - Tensor* c = Compute( + Tensor c = Compute( "g", {{kTotalSize, "i"}}, [&](const std::vector& axes) { - return b->load(axes[0]) + 1.0f; + return b.load(axes[0]) + 1.0f; }); LoopNest l({b, c}); @@ -1772,17 +1753,17 @@ TEST(LoopNest, ScheduleFuserThreeArg) { Placeholder c(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); Placeholder d(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); - Tensor* e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + Tensor e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) { return a.load(i) + b.load(i); }); - Tensor* f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return e->load(i) + c.load(i); + Tensor f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + return e.load(i) + c.load(i); }); - Tensor* g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return f->load(i) + d.load(i); + Tensor g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + return f.load(i) + d.load(i); }); - LoopNest l(std::vector({g}), {e, f, g}); + LoopNest l({g}, {e, f, g}); l.computeInline(l.getLoopBodyFor(e)); l.computeInline(l.getLoopBodyFor(f)); l.prepareForCodegen(); @@ -1807,7 +1788,7 @@ TEST(LoopNest, ScheduleDynamicShape2D) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {m, n}, kFloat)); Placeholder b(BufHandle("b", {m, n}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); @@ -1842,12 +1823,12 @@ TEST(LoopNest, LoopNestComputeAt_1) { // and the temp should be used in B. KernelScope kernel_scope; VarHandle N("N", kInt); - Tensor* A = Compute( + Tensor A = Compute( "A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; }); - Tensor* B = Compute( - "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A->load(i_b); }); - LoopNest l(std::vector({B}), {A, B}); - std::vector loops = l.getAllLoopNestsWritingToBuf(B->buf()).at(0); + Tensor B = Compute( + "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A.load(i_b); }); + LoopNest l({B}, {A, B}); + std::vector loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -1891,16 +1872,16 @@ TEST(LoopNest, LoopNestComputeAt_2) { const int kW = 16, kH = 16; VarHandle W("W", kInt); VarHandle H("H", kInt); - Tensor* p = Compute( + Tensor p = Compute( "prod", {{H + 1, "py"}, {W + 1, "px"}}, [&](const VarHandle& py, const VarHandle& px) { return px * py; }); - Tensor* c = Compute( + Tensor c = Compute( "cons", {{H, "cy"}, {W, "cx"}}, [&](const VarHandle& y, const VarHandle& x) { - return p->load(y, x) + p->load(y + 1, x) + p->load(y, x + 1) + - p->load(y + 1, x + 1); + return p.load(y, x) + p.load(y + 1, x) + p.load(y, x + 1) + + p.load(y + 1, x + 1); }); std::vector c_ref(kW * kH, 0); @@ -1909,12 +1890,12 @@ TEST(LoopNest, LoopNestComputeAt_2) { c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1); } } - LoopNest orig_loopnest(std::vector({c}), {p, c}); + LoopNest orig_loopnest({c}, {p, c}); { // First let's try to compute P at axis cy (the outer loop) LoopNest l(orig_loopnest); - std::vector loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[0]); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -1940,7 +1921,7 @@ TEST(LoopNest, LoopNestComputeAt_2) { { // Now let's try to compute P at axis cx (the inner loop) LoopNest l(orig_loopnest); - std::vector loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[1]); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -1979,27 +1960,25 @@ TEST(LoopNest, LoopNestComputeAt_3) { const int kW = 16, kH = 16; VarHandle W("W", kInt); VarHandle H("H", kInt); - Tensor* A = Compute( + Tensor A = Compute( "A", {{H + 1, "ay"}, {W + 1, "ax"}}, [&](const VarHandle& ay, const VarHandle& ax) { return ax * ay; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{H + 1, "by"}, {W + 1, "bx"}}, - [&](const VarHandle& by, const VarHandle& bx) { - return A->load(by, bx); - }); - Tensor* C = Compute( + [&](const VarHandle& by, const VarHandle& bx) { return A.load(by, bx); }); + Tensor C = Compute( "C", {{H, "cy"}, {W, "cx"}}, [&](const VarHandle& cy, const VarHandle& cx) { - return B->load(cy, cx + 1); + return B.load(cy, cx + 1); }); - Tensor* D = Compute( + Tensor D = Compute( "D", {{H, "dy"}, {W, "dx"}}, [&](const VarHandle& dy, const VarHandle& dx) { - return A->load(dy + 1, dx) + C->load(dy, dx); + return A.load(dy + 1, dx) + C.load(dy, dx); }); std::vector c_ref(kW * kH, 0); @@ -2009,11 +1988,11 @@ TEST(LoopNest, LoopNestComputeAt_3) { } } - LoopNest orig_loopnest(std::vector({D}), {A, B, C, D}); + LoopNest orig_loopnest({D}, {A, B, C, D}); { // First let's try to compute A at axis dy (the outer loop) LoopNest l(orig_loopnest); - std::vector loops = l.getAllLoopNestsWritingToBuf(D->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(D.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -2044,7 +2023,7 @@ TEST(LoopNest, LoopNestComputeAt_3) { { // Now let's try to compute A at axis dx (the inner loop) LoopNest l(orig_loopnest); - std::vector loops = l.getAllLoopNestsWritingToBuf(D->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(D.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[1]); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -2083,15 +2062,15 @@ TEST(LoopNest, Reduce2dComputeAt) { VarHandle W("W", kInt); VarHandle H("H", kInt); - Tensor* p = + Tensor p = Compute("prod", {{H + 1, "py"}, {W + 1, "px"}}, [&](Axis py, Axis px) { return px * py; }); - Tensor* c = Reduce( + Tensor c = Reduce( "cons", {{H, "cy"}, {W, "cx"}}, Sum(), - [&](Axis y, Axis x, Axis r, Axis s) { return p->load(y + r, x + s); }, + [&](Axis y, Axis x, Axis r, Axis s) { return p.load(y + r, x + s); }, {{2, "r"}, {2, "s"}}); std::vector c_ref(kW * kH, 0); @@ -2100,7 +2079,7 @@ TEST(LoopNest, Reduce2dComputeAt) { c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1); } } - LoopNest orig_loopnest(std::vector({c}), {p, c}); + LoopNest orig_loopnest({c}, {p, c}); checkIR(orig_loopnest.root_stmt(), R"IR( # CHECK: for (int py = 0; py < H + 1; py++) { # CHECK: for (int px = 0; px < W + 1; px++) { @@ -2122,7 +2101,7 @@ TEST(LoopNest, Reduce2dComputeAt) { { // First let's try to compute P at axis cy (the outer loop) LoopNest l(orig_loopnest); - auto loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[0]); // FIXME: Calling simplify here breaks the IR: // MALFORMED INPUT: could not find base node in Load - temp[...] @@ -2159,7 +2138,7 @@ TEST(LoopNest, Reduce2dComputeAt) { { // Now let's try to compute P at axis cx (the inner loop) LoopNest l(orig_loopnest); - std::vector loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[1]); l.simplify(); l.eliminateDeadStores(); @@ -2205,17 +2184,17 @@ TEST(LoopNest, DISABLED_Conv1d_NH) { int Pad = 1; Placeholder IP("input", kFloat, {H}); - Tensor* A = + Tensor A = Compute("A", {{N, "np"}, {H + 2 * Pad, "hp"}}, [&](Axis n, Axis h) { auto cond = CompareSelect::make(h, Pad, 1, 0, kLT); cond = CompareSelect::make(h, H + Pad, 1, cond, kGE); return ifThenElse(cond, 0.f, IP.load(n, h - Pad)); }); - Tensor* B = Reduce( + Tensor B = Reduce( "B", {{N, "n"}, {H, "h"}}, Sum(), - [&](Axis n, Axis h, Axis r) { return A->load(n, h + r); }, + [&](Axis n, Axis h, Axis r) { return A.load(n, h + r); }, {{R, "r"}}); LoopNest l({B}); checkIR(l.root_stmt(), R"IR( @@ -2233,7 +2212,7 @@ TEST(LoopNest, DISABLED_Conv1d_NH) { # CHECK: } # CHECK: } )IR"); - std::vector loops = l.getAllLoopNestsWritingToBuf(B->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); // FIXME: The current IR is totally broken. The body of the inlined loop is: @@ -2292,7 +2271,7 @@ class LoopOrderHelper : public IRVisitor { TEST(LoopNest, LoopNestReorderAxis1) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); @@ -2303,7 +2282,7 @@ TEST(LoopNest, LoopNestReorderAxis1) { SimpleIREvaluator cg(stmt1, {tensor}); cg.call({stmt1_output}); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[1]); StmtPtr stmt2 = Stmt::clone(l.root_stmt()); @@ -2324,7 +2303,7 @@ TEST(LoopNest, LoopNestReorderAxis1) { } // Reorder them back. - loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[1]); StmtPtr stmt3 = l.root_stmt(); @@ -2341,7 +2320,7 @@ TEST(LoopNest, LoopNestReorderAxis1) { TEST(LoopNest, LoopNestReorderPartialAxes) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}, {4, "z"}}, [](const VarHandle& x, const VarHandle& y, const VarHandle& z) { @@ -2358,7 +2337,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { SimpleIREvaluator cg(stmt1, {tensor}); cg.call({stmt1_output}); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[1]); ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,x,z,"); @@ -2372,7 +2351,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { ASSERT_EQ(stmt1_output[i], stmt2_output[i]); } - loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[1], loops[2]); ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,z,x,"); @@ -2389,7 +2368,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { TEST(LoopNest, LoopNestReorderInternalAxis) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, [](const VarHandle& w, @@ -2409,7 +2388,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) { SimpleIREvaluator cg(stmt1, {tensor}); cg.call({stmt1_output}); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[2], loops[1]); ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "w,y,x,z,"); @@ -2426,7 +2405,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) { TEST(LoopNest, LoopNestReorderEnclosingAxis) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, [](const VarHandle& w, @@ -2445,7 +2424,7 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) { SimpleIREvaluator cg(stmt1, {tensor}); cg.call({stmt1_output}); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[3]); ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "z,x,y,w,"); @@ -2462,14 +2441,14 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) { TEST(LoopNest, LoopNestReorderSameAxis) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); StmtPtr stmt1 = Stmt::clone(l.root_stmt()); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[1], loops[1]); StmtPtr stmt2 = Stmt::clone(l.root_stmt()); @@ -2492,7 +2471,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}, {4, "z"}}, [](const VarHandle& x, const VarHandle& y, const VarHandle& z) { @@ -2503,7 +2482,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { Placeholder extra(BufHandle("res", {6, 3}, kFloat)); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); VarHandle i = VarHandle(loops[0]->var()); @@ -2589,7 +2568,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { * * */ - loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[2]); StmtPtr stmt3 = Stmt::clone(l.root_stmt()); @@ -2628,7 +2607,7 @@ void LoopNestReorderTestHelper( int index2) { KernelScope kernel_scope; - Tensor* c = Compute( + Tensor c = Compute( "5d", {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}}, [](const std::vector&) { return -1; }); @@ -2636,7 +2615,7 @@ void LoopNestReorderTestHelper( Placeholder extra(BufHandle("extra", {5}, kInt)); - auto loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); int j = 0; for (auto l : loops) { // Add an increment at each layer of the loop which counts the number of @@ -2677,7 +2656,7 @@ void LoopNestReorderTestHelper( ASSERT_EQ(extra1[i], expected_loops); } - loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::reorderAxis(loops[index1], loops[index2]); StmtPtr stmt2 = Stmt::clone(l.root_stmt()); @@ -2752,26 +2731,26 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { Placeholder c_buf("c", kFloat, {M, N}); Placeholder d_buf("d", kFloat, {M, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); - Tensor* z = Compute( + Tensor z = Compute( "z", {{M, "m3"}, {N, "n3"}, {K, "k3"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m, n, k) + y->load(m, n, k); + return x.load(m, n, k) + y.load(m, n, k); }); - LoopNest l(std::vector({z}), {x, y, z}); + LoopNest l({z}, {x, y, z}); ForPtr a = nullptr; ForPtr b = nullptr; auto fors = NodeFinder::find(l.root_stmt()); @@ -2845,14 +2824,14 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { TEST(LoopNest, OuterLoopVectorization) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{8, "X"}, {8, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); ASSERT_TRUE( - LoopNest::vectorize(l.getAllLoopNestsWritingToBuf(tensor->buf())[0][0])); + LoopNest::vectorize(l.getAllLoopNestsWritingToBuf(tensor.buf())[0][0])); StmtPtr root_stmt = l.root_stmt(); BlockPtr outer_block = to(root_stmt); @@ -2899,10 +2878,10 @@ namespace { std::string constantUpperBoundLoopIR(int upper_bound_val) { KernelScope kernel_scope; ExprHandle upper_bound(upper_bound_val); - Tensor* A = Compute( + Tensor A = Compute( "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); LoopNest l({A}); - std::vector loops = l.getAllLoopNestsWritingToBuf(A->buf())[0]; + std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; LoopNest::unroll(loops[0], &unrolled); std::ostringstream oss; @@ -2927,12 +2906,12 @@ TEST(LoopNest, UnrollOuter) { KernelScope kernel_scope; ExprHandle outer_bound(3); ExprHandle inner_bound(4); - Tensor* A = Compute( + Tensor A = Compute( "A", {{outer_bound, "x"}, {inner_bound, "y"}}, [&](const VarHandle& x, const VarHandle& y) { return x + y; }); LoopNest l({A}); - std::vector loops = l.getAllLoopNestsWritingToBuf(A->buf())[0]; + std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; LoopNest::unroll(loops[0], &unrolled); checkIR(unrolled, R"IR( @@ -2951,12 +2930,12 @@ TEST(LoopNest, UnrollInner) { KernelScope kernel_scope; ExprHandle outer_bound(3); ExprHandle inner_bound(4); - Tensor* A = Compute( + Tensor A = Compute( "A", {{outer_bound, "x"}, {inner_bound, "y"}}, [&](const VarHandle& x, const VarHandle& y) { return x + y; }); LoopNest l({A}); - std::vector loops = l.getAllLoopNestsWritingToBuf(A->buf())[0]; + std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; LoopNest::unroll( static_to(loops[0]->body()->stmts().front()), &unrolled); @@ -3044,10 +3023,10 @@ TEST(LoopNest, UnrollEmpty) { TEST(LoopNest, NoUnroll) { KernelScope kernel_scope; VarHandle upper_bound("N", kInt); - Tensor* A = Compute( + Tensor A = Compute( "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); LoopNest l({A}); - std::vector loops = l.getAllLoopNestsWritingToBuf(A->buf())[0]; + std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; ASSERT_THROWS_WITH( LoopNest::unroll(loops[0], &unrolled), "non-constant loop"); @@ -3326,7 +3305,7 @@ TEST(LoopNest, NormalizeAndSplitWithTail) { // Create a dummy tensor to construct LoopNest. ExprHandle n(100); Placeholder a(BufHandle("a", {n}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); @@ -3615,12 +3594,12 @@ TEST(LoopNest, FlattenReductionLoopNestFromTensor) { VarHandle m("m", kInt); VarHandle n("n", kInt); Placeholder b(BufHandle("b", {m, n}, kFloat)); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); LoopNest loop({c}); HashProvider hasher; auto hash_before = hasher.hash(loop.root_stmt()); - auto loops = loop.getAllLoopNestsWritingToBuf(c->buf())[1]; + auto loops = loop.getAllLoopNestsWritingToBuf(c.buf())[1]; ForPtr flattened = nullptr; ASSERT_FALSE(LoopNest::flatten(loops, &flattened)); ASSERT_EQ(flattened, nullptr); @@ -3673,14 +3652,14 @@ TEST(LoopNest, DetectInlineRankMismatch) { const int kTotalSize = 8; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Tensor* a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + Tensor a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) { return a_buf.load(i); }); - Tensor* reshape = Compute( + Tensor reshape = Compute( "reshape", {{kTotalSize / 2, "i"}, {2, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { return a->load(i, j); }); - LoopNest l(std::vector({reshape}), {a, reshape}); + [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j); }); + LoopNest l({reshape}, {a, reshape}); ASSERT_THROWS_WITH( l.computeInline(l.getLoopBodyFor(a)), "Placeholder indexed access is inconsistent with its rank"); @@ -3689,22 +3668,22 @@ TEST(LoopNest, DetectInlineRankMismatch) { TEST(LoopNest, CacheReadsSimple) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 30, j + 3); + return A.load(i + 30, j + 3); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); - LoopNest l(std::vector({B, C}), {A, B, C}); - StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1]; - LoopNest::cacheAccesses(A->buf(), "A_local", j_loop); + LoopNest l({B, C}, {A, B, C}); + StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][1]; + LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -3757,22 +3736,22 @@ TEST(LoopNest, CacheReadsSimple) { TEST(LoopNest, CacheReadsOuter) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 30, j + 40) + A->load(i + 31, j + 41); + return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); - LoopNest l(std::vector({B, C}), {A, B, C}); - StmtPtr i_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][0]; - LoopNest::cacheAccesses(A->buf(), "A_local", i_loop); + LoopNest l({B, C}, {A, B, C}); + StmtPtr i_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][0]; + LoopNest::cacheAccesses(A.buf(), "A_local", i_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -3805,22 +3784,22 @@ TEST(LoopNest, CacheReadsOuter) { TEST(LoopNest, CacheReadsInternal) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 30, j + 40) + A->load(i + 31, j + 41); + return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); - LoopNest l(std::vector({B, C}), {A, B, C}); - StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1]; - LoopNest::cacheAccesses(A->buf(), "A_local", j_loop); + LoopNest l({B, C}, {A, B, C}); + StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][1]; + LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -3852,23 +3831,23 @@ TEST(LoopNest, CacheReadsInternal) { TEST(LoopNest, CacheReadsInner) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); // note im changing the offset of the first arg of the first call to A. - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 34, j + 40) + A->load(i + 30, j + 41); + return A.load(i + 34, j + 40) + A.load(i + 30, j + 41); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); - LoopNest l(std::vector({B, C}), {A, B, C}); + LoopNest l({B, C}, {A, B, C}); StmtPtr body = l.getLoopBodyFor(B); - LoopNest::cacheAccesses(A->buf(), "A_local", body); + LoopNest::cacheAccesses(A.buf(), "A_local", body); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -3900,22 +3879,22 @@ TEST(LoopNest, CacheReadsInner) { TEST(LoopNest, CacheWritesSimple) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 30, j + 40) + A->load(i + 31, j + 41); + return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); - LoopNest l(std::vector({B, C}), {A, B, C}); - StmtPtr a_loop = l.getAllLoopNestsWritingToBuf(A->buf())[0][1]; - LoopNest::cacheAccesses(A->buf(), "A_local", a_loop); + LoopNest l({B, C}, {A, B, C}); + StmtPtr a_loop = l.getAllLoopNestsWritingToBuf(A.buf())[0][1]; + LoopNest::cacheAccesses(A.buf(), "A_local", a_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -4054,7 +4033,7 @@ TEST(LoopNest, CompoundTensorSimple) { auto outer_for2 = For::make(x, 0, 10, inner_for2); BlockPtr body = Block::make({outer_for1, outer_for2}); - Tensor* A = new Tensor(a_buf.node(), body); + Tensor A = Tensor(a_buf.node(), body); LoopNest l({A}); l.prepareForCodegen(); @@ -4080,22 +4059,22 @@ TEST(LoopNest, InlineConstantIndex) { KernelScope kernel_scope; const int N = 10; Placeholder x_buf("a", kFloat, {1, N, 1}); - Tensor* y = Compute( + Tensor y = Compute( "f", {{1, "m"}, {N, "n"}, {1, "o"}}, [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) { return x_buf.load(m, n, o); }); - Tensor* z = Compute( + Tensor z = Compute( "f", {{1, "m"}, {N, "n"}, {1, "o"}}, [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) { - return y->load(m, n, o); + return y.load(m, n, o); }); - LoopNest l(std::vector({z}), {y, z}); + LoopNest l({z}, {y, z}); l.simplify(); - ASSERT_TRUE(l.computeInline(y->buf())); + ASSERT_TRUE(l.computeInline(y.buf())); } TEST(LoopNest, CompoundTensorUsed) { @@ -4115,14 +4094,14 @@ TEST(LoopNest, CompoundTensorUsed) { auto outer_for2 = For::make(x, 0, 10, inner_for2); BlockPtr body = Block::make({outer_for1, outer_for2}); - Tensor* A = new Tensor(a_buf.node(), body); - Tensor* B = Compute( + Tensor A = Tensor(a_buf.node(), body); + Tensor B = Compute( "B", {{10, "i"}, {3, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i, j + 1) + A->load(i, j + 2); + return A.load(i, j + 1) + A.load(i, j + 2); }); - LoopNest l(std::vector({B}), {A, B}); - ASSERT_FALSE(l.computeInline(A->buf())); + LoopNest l({B}, {A, B}); + ASSERT_FALSE(l.computeInline(A.buf())); l.prepareForCodegen(); std::vector a_data(50, 0); @@ -4707,12 +4686,10 @@ TEST(LoopNest, OptimizeConditionalsNotNormalized) { ASSERT_EQ(hash_before, hash_after); } -static std::pair, Tensor*> colReduce( - int M, - int N) { +static std::pair, Tensor> colReduce(int M, int N) { auto a = std::make_unique("a", kFloat, std::vector{M, N}); - Tensor* t = Reduce( + Tensor t = Reduce( "b", {{N, "n"}}, Sum(), @@ -4721,10 +4698,10 @@ static std::pair, Tensor*> colReduce( return {std::move(a), t}; } -static StmtPtr splitTailReorder(Tensor* b) { +static StmtPtr splitTailReorder(Tensor b) { constexpr int kVectorWidth = 8; LoopNest nest({b}); - auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[0]; + auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[0]; nest.splitWithTail(loops[0], kVectorWidth); // Now the loopnests will look like: // @@ -4745,24 +4722,24 @@ static StmtPtr splitTailReorder(Tensor* b) { // Write #2: "b[n_outer * 8 + n_inner] = ReduceOp(...)" // Loopnest #2: {n_outer, n_inner, m}; // We will have to reorder n_inner and m. - auto loopnests = nest.getAllLoopNestsWritingToBuf(b->buf()); + auto loopnests = nest.getAllLoopNestsWritingToBuf(b.buf()); LoopNest::reorderAxis(loopnests[1][1], loopnests[1][2]); nest.prepareForCodegen(); return nest.root_stmt(); } -static StmtPtr splitMaskReorder(Tensor* b) { +static StmtPtr splitMaskReorder(Tensor b) { constexpr int kVectorWidth = 8; LoopNest nest({b}); - auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[1]; + auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[1]; nest.splitWithMask(loops[0], kVectorWidth); - loops = nest.getAllLoopNestsWritingToBuf(b->buf())[1]; + loops = nest.getAllLoopNestsWritingToBuf(b.buf())[1]; LoopNest::reorderAxis(loops[1], loops[2]); nest.prepareForCodegen(); return nest.root_stmt(); } -static void checkColReduce(StmtPtr s, Placeholder& p, Tensor* t) { +static void checkColReduce(StmtPtr s, Placeholder& p, Tensor t) { int M = immediateAs(p.dim(0)); int N = immediateAs(p.dim(1)); PaddedBuffer a(M, N); @@ -4893,14 +4870,14 @@ TEST(LoopNest, VectorizeUse) { KernelScope kernel_scope; constexpr int N = 8; Placeholder a("a", kFloat, {N}); - Tensor* b = Compute( + Tensor b = Compute( "b", {{N, "n"}}, [&](const VarHandle& n) { return a.load(n) + 1.0f; }); - Tensor* c = Compute( - "c", {{N, "n"}}, [&](const VarHandle& n) { return b->load(n) + 2.0f; }); - LoopNest nest(std::vector({c}), {b, c}); - auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[0]; + Tensor c = Compute( + "c", {{N, "n"}}, [&](const VarHandle& n) { return b.load(n) + 2.0f; }); + LoopNest nest({c}, {b, c}); + auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[0]; ASSERT_TRUE(LoopNest::vectorize(loops[0])); - loops = nest.getAllLoopNestsWritingToBuf(c->buf())[0]; + loops = nest.getAllLoopNestsWritingToBuf(c.buf())[0]; ASSERT_TRUE(LoopNest::vectorize(loops[0])); nest.prepareForCodegen(); // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) @@ -4939,7 +4916,7 @@ TEST(LoopNest, Int64Compute) { constexpr int64_t N = 12; Placeholder a("a", kLong, {N}); - Tensor* b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) { + Tensor b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) { return a.load(n) + LongImm::make(1l); }); LoopNest nest({b}); diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp index 9503f9d57b726..db37b66876976 100644 --- a/test/cpp/tensorexpr/test_memdependency.cpp +++ b/test/cpp/tensorexpr/test_memdependency.cpp @@ -2726,28 +2726,28 @@ TEST(MemDependency, MemDependencyCheckerComputeAPI) { // Can determine if 2 loops created by Compute are dependent. Placeholder a_buf("a", kFloat, {4, 5}); Placeholder b_buf("b", kFloat, {5, 6}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); - Tensor* d = Compute( + Tensor d = Compute( "d", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c->load(m, n, k) + 1; + return c.load(m, n, k) + 1; }); - LoopNest l(std::vector({d}), {c, d}); + LoopNest l({d}, {c, d}); - MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()}); + MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d.buf()}); l.root_stmt()->accept(&analyzer); // Sanity test: Output depends on input. - ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a_buf.data())); - ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b_buf.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a_buf.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b_buf.data())); // Second loop depends on first loop. auto c_loop = l.getLoopStmtsFor(c)[0]; @@ -2773,32 +2773,32 @@ TEST(MemDependency, MemDependencyCheckerComputeInline) { Placeholder a_buf("a", kFloat, {4, 5}); Placeholder b_buf("b", kFloat, {5, 6}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); - Tensor* d = Compute( + Tensor d = Compute( "d", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c->load(m, n, k) + 1; + return c.load(m, n, k) + 1; }); - LoopNest l(std::vector({d}), {c, d}); - l.computeInline(c->buf()); + LoopNest l({d}, {c, d}); + l.computeInline(c.buf()); - MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()}); + MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d.buf()}); l.root_stmt()->accept(&analyzer); // Sanity test: Output depends on input. - ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a_buf.data())); - ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b_buf.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a_buf.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b_buf.data())); // broadcast_add tensor should not appear in trace at all. for (auto& wi : analyzer.getHistory()) { - ASSERT_NE(wi->var(), c->buf()->base_handle()); + ASSERT_NE(wi->var(), c.buf()->base_handle()); } } @@ -2810,7 +2810,7 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) { Placeholder a_buf("a", kFloat, {4, 5}); Placeholder b_buf("b", kFloat, {5, 6}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { @@ -2819,13 +2819,12 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) { LoopNest l({c}); - MemDependencyChecker analyzer_before( - {a_buf.data(), b_buf.data()}, {c->buf()}); + MemDependencyChecker analyzer_before({a_buf.data(), b_buf.data()}, {c.buf()}); l.root_stmt()->accept(&analyzer_before); l.splitWithTail(l.getLoopStmtsFor(c)[0], 2); - MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c->buf()}); + MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c.buf()}); StmtPtr stmt = IRSimplifier::simplify(l.root_stmt()); stmt->accept(&analyzer_after); @@ -2859,7 +2858,7 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) { Placeholder a_buf("a", kFloat, {4, 5}); Placeholder b_buf("b", kFloat, {5, 6}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { @@ -2868,14 +2867,13 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) { LoopNest l({c}); - MemDependencyChecker analyzer_before( - {a_buf.data(), b_buf.data()}, {c->buf()}); + MemDependencyChecker analyzer_before({a_buf.data(), b_buf.data()}, {c.buf()}); l.root_stmt()->accept(&analyzer_before); auto loops = l.getLoopStmtsFor(c); l.reorderAxis(loops[0], loops[1]); - MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c->buf()}); + MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c.buf()}); StmtPtr stmt = IRSimplifier::simplify(l.root_stmt()); stmt->accept(&analyzer_after); @@ -2928,22 +2926,22 @@ TEST(MemDependency, MemDependencyCheckerComputeReduce) { Placeholder a(BufHandle("a", {2, 3, 6}, kFloat)); Placeholder b(BufHandle("b", {2, 3, 6}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{2, "l2"}, {3, "n1"}, {6, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}}); - LoopNest l(std::vector({d}), {c, d}); + Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}}); + LoopNest l({d}, {c, d}); - MemDependencyChecker analyzer({a.data(), b.data()}, {d->buf()}); + MemDependencyChecker analyzer({a.data(), b.data()}, {d.buf()}); l.root_stmt()->accept(&analyzer); // Sanity test: Output depends on input. - ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a.data())); - ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b.data())); // Second loop depends on first loop. auto c_loop = l.getLoopStmtsFor(c)[0]; @@ -2965,7 +2963,7 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) { Placeholder AP(BufHandle("A", {M, K}, kFloat)); Placeholder BP(BufHandle("B", {K, N}, kFloat)); - Tensor* CT = Reduce( + Tensor CT = Reduce( "gemm", {{M, "M"}, {N, "N"}}, Sum(), @@ -3011,7 +3009,7 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) { } { auto const& loops = loop.getLoopStmtsFor(CT); - loop.cacheAccesses(CT->buf(), "C_regs", loops[2]); + loop.cacheAccesses(CT.buf(), "C_regs", loops[2]); } MemDependencyChecker analyzer_unlowered( @@ -3026,12 +3024,12 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) { stmt->accept(&analyzer_unlowered); // Outputs depend on inputs. - ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT->buf(), AP.data())); - ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT->buf(), BP.data())); + ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT.buf(), AP.data())); + ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT.buf(), BP.data())); // The last write to gemm should cover the total bound of the output. std::shared_ptr outputAccess = - analyzer_unlowered.output(CT->buf()); + analyzer_unlowered.output(CT.buf()); // A single dependency. ASSERT_EQ(outputAccess->dependencies().size(), 1); diff --git a/test/cpp/tensorexpr/test_ops.cpp b/test/cpp/tensorexpr/test_ops.cpp index 674dbd9cb0199..122a498276f24 100644 --- a/test/cpp/tensorexpr/test_ops.cpp +++ b/test/cpp/tensorexpr/test_ops.cpp @@ -6,7 +6,7 @@ using namespace torch::jit::tensorexpr; -using Tensors = std::vector; +using Tensors = std::vector; using Args = std::vector; std::unique_ptr compile( const Args& inputs, @@ -28,7 +28,7 @@ TEST(Ops, Sum) { constexpr int N = 16; Placeholder a("a", kFloat, {M, N}); - Tensor* b = computeSum({a.handle(), dims, false}, c10::kFloat); + Tensor b = computeSum({a.handle(), dims, false}, c10::kFloat); auto cg = compile({a}, {b}); auto at = at::arange(M * N, at::kFloat).view({M, N}); diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index 449edac19823f..6620ef2686a94 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -35,7 +35,7 @@ TEST(Reductions, ReduceSum0D_1) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -59,7 +59,7 @@ TEST(Reductions, ReduceSum0D_2) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {}); + Tensor c = Reduce("sum", {}, Sum(), b, {}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -83,7 +83,7 @@ TEST(Reductions, ReduceSum1D) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{10, "m"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{10, "m"}}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -114,7 +114,7 @@ TEST(Reductions, ReduceSum2D) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -145,7 +145,7 @@ TEST(Reductions, ReduceSum3D) { Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); - Tensor* c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}}); + Tensor c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -175,7 +175,7 @@ TEST(Reductions, ReduceSum3D) { ASSERT_EQ(cData[i], expected); } - Tensor* d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}}); + Tensor d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}}); LoopNest loop2({d}); loop2.prepareForCodegen(); StmtPtr s2 = loop2.root_stmt(); @@ -192,8 +192,8 @@ TEST(Reductions, ReduceSum3D) { } // This is the same as just reducing the original result across that axis. - Placeholder c_buf(BufHandle(c->buf())); - Tensor* e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}}); + Placeholder c_buf(BufHandle(c.buf())); + Tensor e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}}); LoopNest loop3({e}); loop3.prepareForCodegen(); StmtPtr s3 = loop3.root_stmt(); @@ -219,7 +219,7 @@ TEST(Reductions, ReduceSum10D) { std::vector in(InputSize, 1.f); std::vector out(OutputSize, -1.f); - Tensor* c = Reduce( + Tensor c = Reduce( "sum", {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}}, Sum(), @@ -261,7 +261,7 @@ TEST(Reductions, ReduceProduct) { Reducer product( ExprHandle(1.f), [](ExprHandle a, ExprHandle b) { return a * b; }); - Tensor* c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}}); + Tensor c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -294,7 +294,7 @@ TEST(Reductions, ReduceMax) { in[j] = j; } - Tensor* dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}}); + Tensor dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}}); LoopNest loop({dm1}); loop.prepareForCodegen(); @@ -309,7 +309,7 @@ TEST(Reductions, ReduceMax) { Placeholder in2_(BufHandle("b", {2, 5}, kFloat)); std::vector out2(2, -1.f); - Tensor* m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}}); + Tensor m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}}); LoopNest loop2({m2d}); loop2.prepareForCodegen(); @@ -336,7 +336,7 @@ TEST(Reductions, ReduceMinCustomInitializer) { in[j] = 10 + j; } - Tensor* min = Reduce( + Tensor min = Reduce( "min", {}, Minimum(ExprHandle(minInit)), @@ -372,7 +372,7 @@ TEST(Reductions, ReduceAnyAll) { return CompareSelect::make(a, 1, 1, b, kEQ); }); - Tensor* any = Reduce( + Tensor any = Reduce( "anyEqual", {{4, "i"}}, anyEqSV, @@ -415,7 +415,7 @@ TEST(Reductions, ReduceAnyAll) { return CompareSelect::make(a, 0, 0, b, kEQ); }); - Tensor* allGreaterThan = Reduce( + Tensor allGreaterThan = Reduce( "allGreaterThan", {{4, "i"}}, allGTSV, @@ -465,7 +465,7 @@ TEST(Reductions, ReduceMatmul2D) { } } - Tensor* mm = Reduce( + Tensor mm = Reduce( "mm", {{3, "m"}, {3, "n"}}, Sum(), @@ -501,10 +501,10 @@ TEST(Reductions, ReduceRfactorLike) { std::vector in_rf_(10, -2.f); std::vector out(1, -1.f); - Tensor* l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}}); - Placeholder in_rf(BufHandle(l1->buf())); + Tensor l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}}); + Placeholder in_rf(BufHandle(l1.buf())); - Tensor* l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}}); + Tensor l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}}); LoopNest loop({l1, l2}); loop.prepareForCodegen(); @@ -526,14 +526,14 @@ TEST(Reductions, ReduceAsProducer) { Placeholder a(BufHandle("a", {2, 3}, kFloat)); Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); - Tensor* c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}}); - Tensor* d = Compute( + Tensor c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}}); + Tensor d = Compute( "scale", {{2, "l2"}, {3, "n1"}}, [&](const VarHandle& l, const VarHandle& n) { - return c->load(l, n) * a.load(l, n); + return c.load(l, n) * a.load(l, n); }); - LoopNest loop(std::vector({d}), {c, d}); + LoopNest loop({d}, {c, d}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); s = IRSimplifier::simplify(s); @@ -571,14 +571,14 @@ TEST(Reductions, ReduceAsConsumer) { Placeholder a(BufHandle("a", {2, 3, m}, kFloat)); Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{2, "l2"}, {3, "n1"}, {m, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}}); - LoopNest loop(std::vector({d}), {c, d}); + Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}}); + LoopNest loop({d}, {c, d}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); s = IRSimplifier::simplify(s); @@ -626,7 +626,7 @@ TEST(Reductions, SplitReduceAxis) { } std::vector out(16, -1.f); - Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); LoopNest l({tensor}); std::vector loops = l.getLoopStmtsFor(tensor); LoopNest::splitWithTail(loops[1], 2); @@ -656,7 +656,7 @@ TEST(Reductions, SplitNonReduceAxis) { } } std::vector out(16, -1.f); - Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); LoopNest l({tensor}); std::vector loops = l.getLoopStmtsFor(tensor); LoopNest::splitWithTail(loops[0], 2); @@ -687,14 +687,14 @@ TEST(Reductions, ReorderedReductionInitializer) { Placeholder in(BufHandle("in", {1, 12, 6}, kFloat)); std::vector in_(12 * 6, 1.f); - Tensor* tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); + Tensor tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); LoopNest l_({tensor_}); l_.prepareForCodegen(); StmtPtr s_ = Stmt::clone(l_.root_stmt()); s_ = IRSimplifier::simplify(s_); - Tensor* tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); + Tensor tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); LoopNest l({tensor}); auto loops = l.getLoopStmtsFor(tensor); @@ -741,10 +741,10 @@ TEST(Reductions, ReduceRfactor) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; ASSERT_TRUE(loop.rfactor(c_body, loops.at(0))); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); @@ -776,10 +776,10 @@ TEST(Reductions, Reduce3DRfactorInner) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; ASSERT_FALSE(loop.rfactor(c_body, loops.at(2))); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 1); @@ -811,10 +811,10 @@ TEST(Reductions, Reduce3DRfactorOuter) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; ASSERT_TRUE(loop.rfactor(c_body, loops.at(0))); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); @@ -837,7 +837,7 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) { std::vector out(1, -1.f); std::vector ref(1, -1.f); - Tensor* c = Reduce( + Tensor c = Reduce( "sum", {}, Sum(), @@ -854,7 +854,7 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) { IRSimplifier::simplify(refloop.root_stmt()), {in_, c}); ref_cg.call({in, ref}); - BufPtr tmp_buf = c->buf(); + BufPtr tmp_buf = c.buf(); for (int idx = 0; idx < rfac_number; idx++) { auto reduce = loop.getAllWritesToBuf(tmp_buf)[1]; @@ -890,7 +890,7 @@ TEST(Reductions, ReduceSplitTail) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 8); @@ -922,7 +922,7 @@ TEST(Reductions, ReduceSplitNoTail) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 5); @@ -956,7 +956,7 @@ TEST(Reductions, ReduceOverSplitTail) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 16); @@ -989,7 +989,7 @@ TEST(Reductions, ReduceSplitMask) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 8); @@ -1021,7 +1021,7 @@ TEST(Reductions, ReduceSplitNoMask) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 5); @@ -1054,7 +1054,7 @@ TEST(Reductions, ReduceOverSplitMask) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 16); @@ -1090,16 +1090,16 @@ TEST(Reductions, ReduceSplitRfactor) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[2], SPLIT_FACTOR); - auto c_body = loop.getAllWritesToBuf(c->buf())[2]; - auto all_loops = loop.getAllLoopNestsWritingToBuf(c->buf()); + auto c_body = loop.getAllWritesToBuf(c.buf())[2]; + auto all_loops = loop.getAllLoopNestsWritingToBuf(c.buf()); ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(2).size() == 3); LoopNest::reorderAxis(all_loops[2][1], all_loops[2][2]); - all_loops = loop.getAllLoopNestsWritingToBuf(c->buf()); + all_loops = loop.getAllLoopNestsWritingToBuf(c.buf()); ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(2).size() == 3); ASSERT_TRUE(loop.rfactor(c_body, all_loops[2][1])); loop.prepareForCodegen(); @@ -1131,7 +1131,7 @@ TEST(Reductions, ReduceOverSplitRfactor) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -1139,9 +1139,9 @@ TEST(Reductions, ReduceOverSplitRfactor) { LoopNest::splitWithTail(loops[1], SPLIT_FACTOR, &i, &t); LoopNest::reorderAxis(loops[0], i); - auto all_loops = loop.getAllLoopNestsWritingToBuf(c->buf()); + auto all_loops = loop.getAllLoopNestsWritingToBuf(c.buf()); ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(1).size() == 3); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; ASSERT_TRUE(loop.rfactor(c_body, all_loops[1][0])); LoopNest::reorderAxis(all_loops[1][0], all_loops[1][2]); @@ -1182,9 +1182,9 @@ TEST(Reductions, ReduceInlineReduction) { Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M, N, K}); - Tensor* x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}}); - Tensor* y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) { - return a_buf.load(m) + x->load(m); + Tensor x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}}); + Tensor y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) { + return a_buf.load(m) + x.load(m); }); PaddedBuffer a_v(M); @@ -1201,9 +1201,9 @@ TEST(Reductions, ReduceInlineReduction) { } } - LoopNest l1(std::vector({y}), {x, y}); + LoopNest l1({y}, {x, y}); // Cannot inline a reduction computation - ASSERT_FALSE(l1.computeInline(x->buf())); + ASSERT_FALSE(l1.computeInline(x.buf())); } TEST(Reductions, ReduceInlineConsumer) { @@ -1215,13 +1215,13 @@ TEST(Reductions, ReduceInlineConsumer) { Placeholder a_buf("a", kFloat, {M, N, K}); Placeholder b_buf("b", kFloat, {M, N, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n, k) + b_buf.load(m, n, k); }); - Tensor* y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}}); + Tensor y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}}); PaddedBuffer a_v(M, N, K); PaddedBuffer b_v(M, N, K); @@ -1235,9 +1235,9 @@ TEST(Reductions, ReduceInlineConsumer) { } } - LoopNest l1(std::vector({y}), {x, y}); + LoopNest l1({y}, {x, y}); LoopNest l2(l1); - l2.computeInline(x->buf()); + l2.computeInline(x.buf()); l1.prepareForCodegen(); l2.prepareForCodegen(); @@ -1269,7 +1269,7 @@ TEST(Reductions, ReduceInlineReducerInternal) { Placeholder a_buf("a", kFloat, {M, N, K}); Placeholder b_buf("b", kFloat, {M, N, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { @@ -1279,7 +1279,7 @@ TEST(Reductions, ReduceInlineReducerInternal) { Reducer minimum(ExprHandle(0.f), [&](ExprHandle a, ExprHandle b) { return Add::make(ExprHandle(1.f), Min::make(a, b, false)); }); - Tensor* y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}}); + Tensor y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}}); PaddedBuffer a_v(M, N, K); PaddedBuffer b_v(M, N, K); @@ -1293,9 +1293,9 @@ TEST(Reductions, ReduceInlineReducerInternal) { } } - LoopNest l1(std::vector({y}), {x, y}); + LoopNest l1({y}, {x, y}); LoopNest l2(l1); - l2.computeInline(x->buf()); + l2.computeInline(x.buf()); l1.prepareForCodegen(); l2.prepareForCodegen(); @@ -1328,25 +1328,25 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) { Placeholder a(BufHandle("a", {L, N, M}, kFloat)); Placeholder b(BufHandle("b", {L, N, M}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{L, "l2"}, {N, "n1"}, {M, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); - Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); LoopNest l_before(l); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); StmtPtr d_loop = l.getLoopStmtsFor(d)[0]; - l.cacheAccesses(d->buf(), "d_local", d_loop); + l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1405,25 +1405,25 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { Placeholder a(BufHandle("a", {L, N, M}, kFloat)); Placeholder b(BufHandle("b", {L, N, M}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{L, "l2"}, {N, "n1"}, {M, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); - Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); LoopNest l_before(l); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); StmtPtr d_loop = l.getLoopStmtsFor(d)[1]; - l.cacheAccesses(d->buf(), "d_local", d_loop); + l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1480,25 +1480,25 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { Placeholder a(BufHandle("a", {L, N, M}, kFloat)); Placeholder b(BufHandle("b", {L, N, M}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{L, "l2"}, {N, "n1"}, {M, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); - Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); LoopNest l_before(l); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); StmtPtr d_loop = l.getLoopStmtsFor(d)[2]; - l.cacheAccesses(d->buf(), "d_local", d_loop); + l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1551,22 +1551,22 @@ TEST(Reductions, ReductionCacheBodyAccess) { Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{24, "l2"}, {32, "n1"}, {12, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); - Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); StmtPtr d_loop = l.getLoopStmtsFor(d)[1]; - l.cacheAccesses(c->buf(), "scale_local", d_loop); + l.cacheAccesses(c.buf(), "scale_local", d_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1592,24 +1592,24 @@ TEST(Reductions, ReductionCacheConsumerAccess) { Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{24, "l2"}, {32, "n1"}, {12, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); - Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4); StmtPtr e_loop = l.getLoopStmtsFor(e)[1]; - l.cacheAccesses(d->buf(), "sum_local", e_loop); + l.cacheAccesses(d.buf(), "sum_local", e_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1633,19 +1633,19 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{24, "l2"}, {32, "n1"}, {12, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); - Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr inner; @@ -1656,7 +1656,7 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { // Split reduction consumer. LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4, &inner); - l.cacheAccesses(d->buf(), "sum_local", inner); + l.cacheAccesses(d.buf(), "sum_local", inner); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1681,19 +1681,19 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{24, "l2"}, {32, "n1"}, {12, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); - Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr inner; @@ -1705,7 +1705,7 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { // Split reduction consumer. LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4, &inner); - l.cacheAccesses(d->buf(), "sum_local", inner); + l.cacheAccesses(d.buf(), "sum_local", inner); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1742,13 +1742,13 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::reorderAxis(loops.at(0), loops.at(1)); loops = loop.getLoopStmtsFor(c); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) BufPtr rfac_buf; ASSERT_TRUE(loop.rfactor(c_body, loops.at(0), &rfac_buf)); @@ -1811,10 +1811,10 @@ TEST(Reductions, ReductionRfactorCacheTempInner) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; LoopNest::reorderAxis(loops.at(0), loops.at(1)); loops = loop.getLoopStmtsFor(c); @@ -1871,7 +1871,7 @@ TEST(Reductions, ReductionVectorize) { Placeholder in(BufHandle("in", {8, 8}, kFloat)); - Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); LoopNest l_before({tensor}); LoopNest l(l_before); l_before.prepareForCodegen(); @@ -1909,7 +1909,7 @@ TEST(Reductions, ReductionVectorizeInner) { Placeholder in(BufHandle("in", {8, 8}, kFloat)); - Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); LoopNest l({tensor}); ASSERT_FALSE(LoopNest::vectorize(l.getLoopStmtsFor(tensor)[1])); @@ -1929,7 +1929,7 @@ TEST(Reductions, ReductionVectorizeRfactor) { Placeholder in(BufHandle("in", {8, 8}, kFloat)); - Tensor* tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}}); + Tensor tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}}); LoopNest l_before({tensor}); LoopNest l(l_before); @@ -1944,7 +1944,7 @@ TEST(Reductions, ReductionVectorizeRfactor) { std::vector loops = l.getLoopStmtsFor(tensor); LoopNest::reorderAxis(loops[0], loops[1]); loops = l.getLoopStmtsFor(tensor); - auto tensor_body = l.getAllWritesToBuf(tensor->buf())[1]; + auto tensor_body = l.getAllWritesToBuf(tensor.buf())[1]; BufPtr rfac_buf = nullptr; ASSERT_TRUE(LoopNest::rfactor(tensor_body, loops.at(0), &rfac_buf)); @@ -1988,7 +1988,7 @@ TEST(Reductions, InitFunction) { constexpr int N = 16; Placeholder A("A", kFloat, {M, N}); Placeholder B("B", kFloat, {N}); - Tensor* C = Reduce( + Tensor C = Reduce( "C", {{N, "n"}}, Sum(), diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index c25ae4f68a1fc..0df9e9242e198 100644 --- a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -3932,7 +3932,7 @@ TEST(Simplify, SimplifyForCleansUp) { { Placeholder a("a", kFloat, {1, 12, 1}); VarHandle x("x", kInt); - Tensor* b = Compute( + Tensor b = Compute( // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) "x", {{1, "i"}, {12, "m"}, {1, "n"}}, diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp index 5a6f257d6a79b..0f0277e37292e 100644 --- a/test/cpp/tensorexpr/tutorial.cpp +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -155,8 +155,8 @@ int main(int argc, char* argv[]) { ExprPtr body = alloc(i, j); // Finally, we pass all these pieces together to Tensor constructor: - Tensor* X = new Tensor(buf, args, body); - std::cout << "Tensor computation: " << *X << std::endl; + Tensor X = Tensor(buf, args, body); + std::cout << "Tensor computation: " << X << std::endl; // Prints: // Tensor computation: Tensor X[64, 32]: // for (int i = 0; i < 64; i++) { @@ -171,11 +171,11 @@ int main(int argc, char* argv[]) { // constructing Exprs, Tensors also have a more convenient API for // construction. It is based on Compute API, which takes a name, // dimensions, and a lambda specifying the computation body: - Tensor* Z = Compute( + Tensor Z = Compute( "Z", {{64, "i"}, {32, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i / j; }); - std::cout << "Tensor computation: " << *Z << std::endl; + std::cout << "Tensor computation: " << Z << std::endl; // Prints: // Tensor computation: Tensor Z[64, 32]: // for (int i = 0; i < 64; i++) { @@ -187,13 +187,13 @@ int main(int argc, char* argv[]) { // Tensors might access other tensors and external placeholders in their // expressions. It can be done like so: Placeholder P("P", kInt, {64, 32}); - Tensor* R = Compute( + Tensor R = Compute( "R", {{64, "i"}, {32, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return Z->load(i, j) * P.load(i, j); + return Z.load(i, j) * P.load(i, j); }); - std::cout << "Tensor computation: " << *R << std::endl; + std::cout << "Tensor computation: " << R << std::endl; // Prints: // Tensor computation: Tensor R[64, 32]: // for (int i = 0; i < 64; i++) { @@ -224,20 +224,20 @@ int main(int argc, char* argv[]) { // Let's create a simple tensor expression and construct a loop nest for it. Placeholder A("A", kFloat, {64, 32}); Placeholder B("B", kFloat, {64, 32}); - Tensor* X = Compute( + Tensor X = Compute( "X", {{64, "i"}, {32, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i, j) + B.load(i, j); }); - Tensor* Y = Compute( + Tensor Y = Compute( "Y", {{64, "i"}, {32, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return sigmoid(X->load(i, j)); + return sigmoid(X.load(i, j)); }); - std::cout << "Tensor computation X: " << *X - << "Tensor computation Y: " << *Y << std::endl; + std::cout << "Tensor computation X: " << X << "Tensor computation Y: " << Y + << std::endl; // Prints: // Tensor computation X: Tensor X[64, 32]: // for (int i = 0; i < 64; i++) { @@ -256,9 +256,7 @@ int main(int argc, char* argv[]) { // Creating a loop nest is as quite simple, we just need to specify a list // of all and a list of output tensors: // NOLINTNEXTLINE(bugprone-argument-comment) - std::vector outputs = {Y}; - std::vector all = {X, Y}; - LoopNest loopnest(outputs, all); + LoopNest loopnest(/*outputs=*/{Y}, /*all=*/{X, Y}); // An IR used in LoopNest is based on tensor statements, represented by // `Stmt` class. Statements are used to specify the loop nest structure, and @@ -357,7 +355,7 @@ int main(int argc, char* argv[]) { // Let's start by constructing a simple computation for us to work with: Placeholder A("A", kInt, {64, 32}); Placeholder B("B", kInt, {64, 32}); - Tensor* X = Compute( + Tensor X = Compute( "X", {{64, "i"}, {32, "j"}}, [&](const VarHandle& i, const VarHandle& j) { diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 27f6e545ec7bc..0a34f476b0d3b 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -500,7 +500,7 @@ struct TEWrapper { void optimizePointwise( tensorexpr::LoopNest* ln, - tensorexpr::Tensor* target, + tensorexpr::Tensor target, int width) { using namespace torch::jit::tensorexpr; std::vector loops = ln->getLoopStmtsFor(target); @@ -513,7 +513,7 @@ void optimizePointwise( std::shared_ptr wrapTECompute( std::shared_ptr wrap, tensorexpr::Placeholder& in, - tensorexpr::Tensor* out, + tensorexpr::Tensor out, tensorexpr::VarHandle& dim, int width = kVectorWidth) { using namespace torch::jit::tensorexpr; @@ -553,7 +553,7 @@ struct TEWrapper { std::shared_ptr wrapTECompute( std::shared_ptr wrap, tensorexpr::Placeholder& in, - tensorexpr::Tensor* out, + tensorexpr::Tensor out, tensorexpr::VarHandle& dim, int width = kVectorWidth) { return wrap; @@ -593,7 +593,7 @@ std::shared_ptr createLogit(c10::optional clamp) { auto wrap = std::make_shared(); auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto A_elem = [&]() { if (!clamp) { return A.load(i); @@ -619,7 +619,7 @@ std::shared_ptr createRelu() { wrap = std::make_shared(); auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto zero = FloatImm::make(0.f); auto a = A.load(i); return ifThenElse(a < zero, zero, a); @@ -638,7 +638,7 @@ std::shared_ptr createTanh() { wrap = std::make_shared(); auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto a = A.load(i); return fast_tanh(a); }); @@ -656,7 +656,7 @@ std::shared_ptr createSigmoid() { wrap = std::make_shared(); auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - Tensor* B = + Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return sigmoid(A.load(i)); }); // NNC uses sleef for vectorizing sigmoid, which comes in an 8-wide flavor // (Sleef_expf8). diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h index 77ba8e173631e..d7cfe783fab8f 100644 --- a/torch/csrc/jit/tensorexpr/codegen.h +++ b/torch/csrc/jit/tensorexpr/codegen.h @@ -104,7 +104,7 @@ class TORCH_API CodeGen { class CodeGen::BufferArg { public: BufferArg(const Placeholder& buffer) : buf_(buffer.data()) {} - BufferArg(Tensor* tensor) : buf_(tensor->buf()) {} + BufferArg(Tensor tensor) : buf_(tensor.buf()) {} BufferArg(const VarHandle& var) : var_(var.node()), isVar_(true) {} BufferArg(const BufHandle& buf) : buf_(buf.node()) {} diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index f885246e24d2b..27b56e2f58146 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -545,7 +545,7 @@ std::ostream& operator<<(std::ostream& stream, const Stmt& stmt) { } std::ostream& operator<<(std::ostream& stream, const Tensor& t) { - stream << std::to_string(&t); + stream << std::to_string(t); return stream; } @@ -568,7 +568,7 @@ void print(StmtPtr stmt) { } } -void print(const Tensor* t) { +void print(const Tensor& t) { std::cout << std::to_string(t); } @@ -589,20 +589,17 @@ std::string to_string(StmtPtr stmt) { return oss.str(); } -std::string to_string(const Tensor* t) { - if (!t) { - return "(null tensor)\n"; - } +std::string to_string(const Tensor& t) { std::ostringstream oss; // TODO: move this to Buf printer - oss << "Tensor " << t->buf()->name_hint() << "["; - for (const auto i : c10::irange(t->buf()->ndim())) { + oss << "Tensor " << t.buf()->name_hint() << "["; + for (const auto i : c10::irange(t.buf()->ndim())) { if (i != 0) { oss << ", "; } - oss << *t->buf()->dim(i); + oss << *t.buf()->dim(i); } - oss << "]:\n" << *t->stmt() << "\n"; + oss << "]:\n" << *t.stmt() << "\n"; return oss.str(); } } // namespace std diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h index e76dccab846a1..321d1efe55457 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.h +++ b/torch/csrc/jit/tensorexpr/ir_printer.h @@ -103,7 +103,7 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Tensor&); TORCH_API void print(ExprPtr expr); TORCH_API void print(StmtPtr stmt); -TORCH_API void print(const Tensor* t); +TORCH_API void print(const Tensor& t); } // namespace tensorexpr } // namespace jit @@ -119,5 +119,5 @@ using torch::jit::tensorexpr::Tensor; TORCH_API std::string to_string(ExprPtr expr); TORCH_API std::string to_string(StmtPtr stmt); -TORCH_API std::string to_string(const Tensor* t); +TORCH_API std::string to_string(const Tensor& t); } // namespace std diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index faacd022e7e0b..8076ba2b71d67 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -960,7 +960,7 @@ std::vector TensorExprKernel::broadcastShapesMut( return res.first; } -Tensor* computeOneOperand( +Tensor computeOneOperand( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, @@ -981,7 +981,7 @@ Tensor* computeOneOperand( }); } -Tensor* computeTwoOperand( +Tensor computeTwoOperand( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, @@ -1004,7 +1004,7 @@ Tensor* computeTwoOperand( }); } -Tensor* computeTwoOperandWithAlpha( +Tensor computeTwoOperandWithAlpha( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, @@ -1028,7 +1028,7 @@ Tensor* computeTwoOperandWithAlpha( }); } -Tensor* computeConditionWithTwoOperand( +Tensor computeConditionWithTwoOperand( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, @@ -1055,7 +1055,7 @@ Tensor* computeConditionWithTwoOperand( }); } -Tensor* computeThreeOperand( +Tensor computeThreeOperand( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, @@ -1083,7 +1083,7 @@ Tensor* computeThreeOperand( return demoteOutput(compute, outputType); }); } -Tensor* computeFourOperand( +Tensor computeFourOperand( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, @@ -1135,7 +1135,7 @@ std::pair> processCatList( } return {highType, nonEmptyInputs}; } -Tensor* computeCatWoConditionals( +Tensor computeCatWoConditionals( const std::vector& inputs, const std::vector& outputShape) { // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) @@ -1164,7 +1164,7 @@ Tensor* computeCatWoConditionals( auto output_buf = alloc("aten_cat", output_sizes_expr, ToDtype(high_type)); if (non_empty_inputs.size() == 0) { - return new Tensor( + return Tensor( output_buf, alloc(std::vector({}))); } @@ -1213,10 +1213,10 @@ Tensor* computeCatWoConditionals( concat_dim_size = alloc(concat_dim_size, input_dims[norm_concat_dim].node()); } - return new Tensor(output_buf, IRSimplifier::simplify(block)); + return Tensor(output_buf, IRSimplifier::simplify(block)); } -Tensor* computeCat( +Tensor computeCat( const std::vector& inputs, const std::vector& outputShape, at::Device device) { @@ -1276,7 +1276,7 @@ Tensor* computeCat( }); } -Tensor* computeConv2d( +Tensor computeConv2d( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType) { @@ -1319,10 +1319,10 @@ Tensor* computeConv2d( dilation[0], dilation[1], groups}); - return new Tensor(ResultBuf.node(), s); + return Tensor(ResultBuf.node(), s); } -Tensor* tensorexpr::computeOperandValue( +Tensor tensorexpr::computeOperandValue( c10::Symbol op, const std::vector& inputs, const std::vector& outputShape, @@ -2391,7 +2391,7 @@ c10::optional findDtypeForValue(const torch::jit::Value* v) { return c10::nullopt; } -Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { +Tensor TensorExprKernel::computeValue(const torch::jit::Value* v) { auto inputs = v->node()->inputs(); auto op = v->node()->kind(); @@ -2703,9 +2703,9 @@ static std::vector toExprHandles(const std::vector& sizes) { return dims; } -Tensor* TensorExprKernel::bindInput(const torch::jit::Value* input) { +Tensor TensorExprKernel::bindInput(const torch::jit::Value* input) { auto const& t = input->type(); - Tensor* result = nullptr; + Tensor result(nullptr, nullptr); switch (t->kind()) { case TypeKind::TensorType: { auto tt = input->type()->cast(); @@ -2744,7 +2744,7 @@ Tensor* TensorExprKernel::bindInput(const torch::jit::Value* input) { } return inBuffer.load(idx); }); - bufs_.emplace(input, result->buf()); + bufs_.emplace(input, result.buf()); bufferArgs_.emplace_back(inBuffer); break; @@ -2800,7 +2800,7 @@ bool denseAndNonOverlapping( return (strides == at::infer_dense_strides(sizes, strides)); } -Tensor* TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) { +Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) { const TensorTypePtr& tt = v->type()->expect(); TORCH_INTERNAL_ASSERT(bufs_.count(v)); BufPtr buf = bufs_.at(v); @@ -2816,19 +2816,19 @@ Tensor* TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) { auto sizes = *tt->sizes().concrete_sizes(); std::vector default_strides = TensorType::contiguousStridesOf(sizes); if (!tt->strides().concrete_sizes()) { - return new Tensor(buf, nullptr); + return Tensor(buf, nullptr); } TORCH_INTERNAL_ASSERT(tt->strides().concrete_sizes()); const std::vector strides = *tt->strides().concrete_sizes(); // All Tensors in NNC are layed out in default, contiguous layout. // If the output is also default contiguous we don't need to do anything if (strides == default_strides) { - return new Tensor(buf, nullptr); + return Tensor(buf, nullptr); } // If the tensor is not dense or overlaps, we have // no way of matching the profiled striding if (!denseAndNonOverlapping(sizes, strides)) { - return new Tensor(buf, nullptr); + return Tensor(buf, nullptr); } auto dims = c10::fmap(sizesForValue(v)); @@ -2922,8 +2922,9 @@ void TensorExprKernel::compile() { nInputs_ = graph_->inputs().size(); genInputDebugNames(); for (auto const& input : graph_->inputs()) { - if (Tensor* t = bindInput(input)) { - block->append_stmt(t->stmt()); + Tensor t = bindInput(input); + if (t.stmt()) { + block->append_stmt(t.stmt()); } } @@ -2937,10 +2938,9 @@ void TensorExprKernel::compile() { } else { for (auto const& output : n->outputs()) { if (output->hasUses()) { - Tensor* t = computeValue(output); - bufs_.emplace(output, t->buf()); - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - block->append_stmt(t->stmt()); + Tensor t = computeValue(output); + bufs_.emplace(output, t.buf()); + block->append_stmt(t.stmt()); } } } @@ -2958,12 +2958,12 @@ void TensorExprKernel::compile() { // The "strided" tensor will be incorrect if used in NNC, // since NNC views it as contiguous. Only convert it to the right // strides at the end of the kernel (if already contiguous it's a no-op) - Tensor* properly_strided_output = convertOutputToCorrectStrides(output); - if (properly_strided_output->stmt()) { - block->append_stmt(properly_strided_output->stmt()); + Tensor properly_strided_output = convertOutputToCorrectStrides(output); + if (properly_strided_output.stmt()) { + block->append_stmt(properly_strided_output.stmt()); } // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - bufs_[output] = properly_strided_output->buf(); + bufs_[output] = properly_strided_output.buf(); const auto& tt = output->type()->expect(); auto sizes = *tt->sizes().concrete_sizes(); tensorOutputSizes_.push_back(sizes); diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index 7b35e1e44905c..a8a57b9f15a16 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -47,7 +47,7 @@ using ArgValue = c10::variant< IntList, ArgNone>; -using NNCLoweringFunction = std::function&, const std::vector&, const c10::optional&, @@ -123,7 +123,7 @@ struct TensorInfo { c10::ScalarType dtype; }; -TORCH_API Tensor* computeOperandValue( +TORCH_API Tensor computeOperandValue( c10::Symbol op, const std::vector& inputs, const std::vector& outputShape, @@ -209,7 +209,7 @@ class TORCH_API TensorExprKernel { const torch::jit::Value* v, const std::vector& axes); - Tensor* computeValue(const torch::jit::Value* v); + Tensor computeValue(const torch::jit::Value* v); void bindConstant(const torch::jit::Value* v); @@ -222,9 +222,9 @@ class TORCH_API TensorExprKernel { std::vector& outputs); BackendType inferBackendTypeFromDevice(at::Device device); - Tensor* bindInput(const torch::jit::Value* input); + Tensor bindInput(const torch::jit::Value* input); - Tensor* convertOutputToCorrectStrides(torch::jit::Value* v); + Tensor convertOutputToCorrectStrides(torch::jit::Value* v); // Captures the information for reduction operation nodes. struct ReductionInfo { diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index d9d20736057fb..190499998b289 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -47,14 +47,14 @@ LoopNest::LoopNest(StmtPtr stmt, std::unordered_set output_bufs) // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) LoopNest::LoopNest( - const std::vector& output_tensors, - const std::vector& tensors_to_compute) { + const std::vector& output_tensors, + const std::vector& tensors_to_compute) { initialize(output_tensors, tensors_to_compute); verify(root_stmt_); } // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -LoopNest::LoopNest(const std::vector& output_tensors) { +LoopNest::LoopNest(const std::vector& output_tensors) { initialize(output_tensors, output_tensors); verify(root_stmt_); } @@ -486,15 +486,15 @@ bool LoopNest::vectorize(ForPtr f) { } void LoopNest::initialize( - const std::vector& output_tensors, - const std::vector& tensors_to_compute) { + const std::vector& output_tensors, + const std::vector& tensors_to_compute) { for (auto t : output_tensors) { - output_bufs_.insert(t->buf()); + output_bufs_.insert(t.buf()); } std::vector loops; - for (Tensor* t : tensors_to_compute) { - StmtPtr loop = t->stmt(); + for (Tensor t : tensors_to_compute) { + StmtPtr loop = t.stmt(); if (loop->get_parent()) { std::cerr << "Error: creating a loopnest from already used Tensors\n"; loops = {}; @@ -2384,7 +2384,7 @@ void LoopNest::compressAllBuffers(StmtPtr stmt) { } } -std::vector LoopNest::getLoopStmtsFor(Tensor* t) const { +std::vector LoopNest::getLoopStmtsFor(Tensor t) const { StmtPtr cur_stmt = getLoopBodyFor(t); return getLoopStmtsFor(cur_stmt); } @@ -2407,8 +2407,8 @@ std::vector LoopNest::getLoopStmtsFor(StmtPtr s) const { return result; } -StmtPtr LoopNest::getLoopBodyFor(Tensor* t) const { - return getLoopBodyFor(t->buf()); +StmtPtr LoopNest::getLoopBodyFor(Tensor t) const { + return getLoopBodyFor(t.buf()); } StmtPtr LoopNest::getLoopBodyFor(BufPtr buf) const { diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index c8cf2d8553d2d..42f072d2da7d8 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -27,11 +27,11 @@ class TORCH_API LoopNest { public: // A constructor for building a LoopNest from a list of Tensors LoopNest( - const std::vector& output_tensors, - const std::vector& tensors_to_compute); + const std::vector& output_tensors, + const std::vector& tensors_to_compute); // A convenience constructor for the case when all tensors are output tensors - LoopNest(const std::vector& output_tensors); + LoopNest(const std::vector& output_tensors); // A constructor for building a LoopNest from an Stmt and a list of output // buffers. @@ -45,10 +45,10 @@ class TORCH_API LoopNest { return root_stmt_; } - std::vector getLoopStmtsFor(Tensor*) const; + std::vector getLoopStmtsFor(Tensor) const; std::vector getLoopStmtsFor(BufPtr) const; std::vector getLoopStmtsFor(StmtPtr) const; - StmtPtr getLoopBodyFor(Tensor*) const; + StmtPtr getLoopBodyFor(Tensor) const; StmtPtr getLoopBodyFor(BufPtr) const; // Returns the For stmt indexed by 'indices' in the 'root' For stmt. @@ -547,8 +547,8 @@ class TORCH_API LoopNest { private: void initialize( - const std::vector& output_tensors, - const std::vector& tensors_to_compute); + const std::vector& output_tensors, + const std::vector& tensors_to_compute); StmtPtr insertAllocFree(StmtPtr stmt); const std::unordered_set getIntermediateBufs() const; diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp index c4af83a8cc6f4..51d323f4130a4 100644 --- a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp +++ b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp @@ -16,7 +16,7 @@ void assert_dims_constant(const BufHandle& buf) { using InitFunc = std::function&)>; -Tensor* conv2d_depthwise_static( +Tensor conv2d_depthwise_static( BufHandle input, BufHandle weight, const InitFunc& init_func, @@ -45,7 +45,7 @@ Tensor* conv2d_depthwise_static( auto OH = (H - R + 2 * pad) / stride + 1; auto OW = (W - S + 2 * pad) / stride + 1; - Tensor* conv = Reduce( + Tensor conv = Reduce( "conv2d_depthwise", {{N, "n"}, {K, "k"}, {OH, "oh"}, {OW, "ow"}}, Sum(), @@ -83,7 +83,7 @@ Tensor* conv2d_depthwise_static( } else if (R == 3 && stride == 1 && pad == 1) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr main, peeled; - auto loops = nest.getAllLoopNestsWritingToBuf(conv->buf()); + auto loops = nest.getAllLoopNestsWritingToBuf(conv.buf()); main = loops[1][kLoopW]; nest.sliceHead(main, 1, &peeled, &main); nest.sliceTail(main, 1, &main, &peeled); @@ -92,10 +92,10 @@ Tensor* conv2d_depthwise_static( nest.sliceTail(main, 1, &main, &peeled); } - return new Tensor(conv->buf(), nest.root_stmt()); + return Tensor(conv.buf(), nest.root_stmt()); } -Tensor* conv2d_depthwise_dynamic( +Tensor conv2d_depthwise_dynamic( BufHandle input, BufHandle weight, const InitFunc& init_func, @@ -144,7 +144,7 @@ Tensor* conv2d_depthwise_dynamic( } // namespace -Tensor* conv2d_depthwise( +Tensor conv2d_depthwise( BufHandle input, BufHandle weight, BufHandle bias, @@ -158,7 +158,7 @@ Tensor* conv2d_depthwise( return conv2d_depthwise_static(input, weight, init_func, stride, pad, groups); } -Tensor* conv2d_depthwise( +Tensor conv2d_depthwise( BufHandle input, BufHandle weight, int stride, @@ -170,7 +170,7 @@ Tensor* conv2d_depthwise( return conv2d_depthwise_static(input, weight, init_func, stride, pad, groups); } -Tensor* conv2d_depthwise( +Tensor conv2d_depthwise( BufHandle input, BufHandle weight, BufHandle bias, @@ -206,7 +206,7 @@ Tensor* conv2d_depthwise( groups); } -Tensor* conv2d_depthwise( +Tensor conv2d_depthwise( BufHandle input, BufHandle weight, ExprHandle N, diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.h b/torch/csrc/jit/tensorexpr/operators/conv2d.h index 14612fb17ee74..4c2215b38d868 100644 --- a/torch/csrc/jit/tensorexpr/operators/conv2d.h +++ b/torch/csrc/jit/tensorexpr/operators/conv2d.h @@ -7,7 +7,7 @@ namespace jit { namespace tensorexpr { // An API to compute 2D depthwise convolutions with bias. -TORCH_API Tensor* conv2d_depthwise( +TORCH_API Tensor conv2d_depthwise( BufHandle input, BufHandle weight, BufHandle bias, @@ -16,14 +16,14 @@ TORCH_API Tensor* conv2d_depthwise( int groups); // An API to compute 2D depthwise convolutions without bias. -TORCH_API Tensor* conv2d_depthwise( +TORCH_API Tensor conv2d_depthwise( BufHandle input, BufHandle weight, int stride, int pad, int groups); -TORCH_API Tensor* conv2d_depthwise( +TORCH_API Tensor conv2d_depthwise( BufHandle input, BufHandle weight, BufHandle bias, @@ -39,7 +39,7 @@ TORCH_API Tensor* conv2d_depthwise( ExprHandle pad, ExprHandle groups); -TORCH_API Tensor* conv2d_depthwise( +TORCH_API Tensor conv2d_depthwise( BufHandle input, BufHandle weight, ExprHandle N, diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.cpp b/torch/csrc/jit/tensorexpr/operators/matmul.cpp index 23cb45564c97c..581514cdcb095 100644 --- a/torch/csrc/jit/tensorexpr/operators/matmul.cpp +++ b/torch/csrc/jit/tensorexpr/operators/matmul.cpp @@ -5,7 +5,7 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeMatmul( +Tensor computeMatmul( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType) { @@ -44,13 +44,13 @@ Tensor* computeMatmul( }, {{size_a[1], "K"}}); } else { - return new Tensor( + return Tensor( ResultBuf.node(), ExternalCall::make(ResultBuf, "nnc_aten_matmul", {a, b}, {})); } } -Tensor* computeAddMM( +Tensor computeAddMM( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType) { @@ -59,7 +59,7 @@ Tensor* computeAddMM( dtype = Dtype(*outputType); } BufHandle ResultBuf("addmm", outputShape, dtype); - return new Tensor( + return Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.h b/torch/csrc/jit/tensorexpr/operators/matmul.h index 35b30f4168914..0b52ad65c43c8 100644 --- a/torch/csrc/jit/tensorexpr/operators/matmul.h +++ b/torch/csrc/jit/tensorexpr/operators/matmul.h @@ -6,11 +6,11 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeMatmul( +Tensor computeMatmul( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); -Tensor* computeAddMM( +Tensor computeAddMM( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); diff --git a/torch/csrc/jit/tensorexpr/operators/norm.cpp b/torch/csrc/jit/tensorexpr/operators/norm.cpp index d96ebcd9447db..610f928d4e0b8 100644 --- a/torch/csrc/jit/tensorexpr/operators/norm.cpp +++ b/torch/csrc/jit/tensorexpr/operators/norm.cpp @@ -4,7 +4,7 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeBatchNorm( +Tensor computeBatchNorm( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType) { diff --git a/torch/csrc/jit/tensorexpr/operators/norm.h b/torch/csrc/jit/tensorexpr/operators/norm.h index 98d53b4c306e3..7f1412f0aecd0 100644 --- a/torch/csrc/jit/tensorexpr/operators/norm.h +++ b/torch/csrc/jit/tensorexpr/operators/norm.h @@ -6,7 +6,7 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeBatchNorm( +Tensor computeBatchNorm( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.cpp b/torch/csrc/jit/tensorexpr/operators/reduction.cpp index c1f3f7f4f2630..fe5cb6d286bd5 100644 --- a/torch/csrc/jit/tensorexpr/operators/reduction.cpp +++ b/torch/csrc/jit/tensorexpr/operators/reduction.cpp @@ -19,7 +19,7 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeSum( +Tensor computeSum( const std::vector& inputs, const c10::optional& outputType) { std::vector axes; @@ -100,7 +100,7 @@ Tensor* computeSum( reductionDims); } -Tensor* computeMean( +Tensor computeMean( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType) { @@ -120,13 +120,13 @@ Tensor* computeMean( mean_dims_expr.emplace_back(idx); } } - return new Tensor( + return Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, "nnc_aten_mean", {InputBuf}, mean_dims_expr)); } -Tensor* computeAdaptiveAvgPool2d( +Tensor computeAdaptiveAvgPool2d( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType) { @@ -137,7 +137,7 @@ Tensor* computeAdaptiveAvgPool2d( BufHandle ResultBuf("adaptive_avgpool2d", outputShape, dtype); // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) auto out_size_param = c10::get(inputs[1]); - return new Tensor( + return Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.h b/torch/csrc/jit/tensorexpr/operators/reduction.h index 4335d7b3bd7f7..d76bac6aa34a1 100644 --- a/torch/csrc/jit/tensorexpr/operators/reduction.h +++ b/torch/csrc/jit/tensorexpr/operators/reduction.h @@ -6,14 +6,14 @@ namespace torch { namespace jit { namespace tensorexpr { -TORCH_API Tensor* computeSum( +TORCH_API Tensor computeSum( const std::vector& inputs, const c10::optional& outputType); -TORCH_API Tensor* computeMean( +TORCH_API Tensor computeMean( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); -TORCH_API Tensor* computeAdaptiveAvgPool2d( +TORCH_API Tensor computeAdaptiveAvgPool2d( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); diff --git a/torch/csrc/jit/tensorexpr/operators/softmax.cpp b/torch/csrc/jit/tensorexpr/operators/softmax.cpp index d6cb6c0d7d089..c1c2872cc4efe 100644 --- a/torch/csrc/jit/tensorexpr/operators/softmax.cpp +++ b/torch/csrc/jit/tensorexpr/operators/softmax.cpp @@ -6,7 +6,7 @@ namespace tensorexpr { using namespace torch::jit::tensorexpr; -Tensor* computeSoftmax( +Tensor computeSoftmax( const std::vector& inputs, const std::vector& outputShape, bool log_softmax) { @@ -111,48 +111,43 @@ Tensor* computeSoftmax( Compute("aten_softmax_exp", output_dims, [&](ParameterList& indices) { auto inp = tensorOrConstant( inputs[0], convert_indices_to_expr_handle(indices)); - return exp(inp - max->load(remove_softmax_dim_index(indices))); + return exp(inp - max.load(remove_softmax_dim_index(indices))); }); auto sum = Reduce( "aten_softmax_sum", non_softmax_dims, Sum(), [&](ParameterList& indices) { - return e->load(move_softmax_dim_index_to_pos(indices)); + return e.load(move_softmax_dim_index_to_pos(indices)); }, {output_dims[softmax_dim]}); if (!log_softmax) { auto result = Compute("aten_softmax", output_dims, [&](ParameterList& indices) { - return e->load(indices) / - sum->load(remove_softmax_dim_index(indices)); + return e.load(indices) / sum.load(remove_softmax_dim_index(indices)); }); - return new Tensor( - result->buf(), + return Tensor( + result.buf(), alloc(std::vector( - {max->stmt(), e->stmt(), sum->stmt(), result->stmt()}))); + {max.stmt(), e.stmt(), sum.stmt(), result.stmt()}))); } auto log_sum = Compute( "aten_softmax_log_sum", non_softmax_dims, [&](ParameterList& indices) { - return log(sum->load(indices)); + return log(sum.load(indices)); }); auto result = Compute("aten_log_softmax", output_dims, [&](ParameterList& indices) { auto inp = tensorOrConstant( inputs[0], convert_indices_to_expr_handle(indices)); auto non_softmax_indices = remove_softmax_dim_index(indices); - return inp - max->load(non_softmax_indices) - - log_sum->load(non_softmax_indices); + return inp - max.load(non_softmax_indices) - + log_sum.load(non_softmax_indices); }); - return new Tensor( - result->buf(), + return Tensor( + result.buf(), alloc(std::vector( - {max->stmt(), - e->stmt(), - sum->stmt(), - log_sum->stmt(), - result->stmt()}))); + {max.stmt(), e.stmt(), sum.stmt(), log_sum.stmt(), result.stmt()}))); } } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/softmax.h b/torch/csrc/jit/tensorexpr/operators/softmax.h index 07ddd0f95b355..b74a867a91b9b 100644 --- a/torch/csrc/jit/tensorexpr/operators/softmax.h +++ b/torch/csrc/jit/tensorexpr/operators/softmax.h @@ -6,7 +6,7 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeSoftmax( +Tensor computeSoftmax( const std::vector& inputs, const std::vector& outputShape, bool log_softmax); diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp index 9df70f81be4a9..ea3902dcf3c0d 100644 --- a/torch/csrc/jit/tensorexpr/tensor.cpp +++ b/torch/csrc/jit/tensorexpr/tensor.cpp @@ -48,7 +48,7 @@ StmtPtr Tensor::constructStmt( return s; } -Tensor* Compute( +Tensor Compute( const std::string& name, const std::vector& dim_args, const std::function&)>& body_func) { @@ -57,10 +57,10 @@ Tensor* Compute( unpack_dim_args(dim_args, &dims, &args); ExprPtr body = body_func(VarVectorToVarHandleVector(args)).node(); BufPtr buf = alloc(name, dims, body->dtype()); - return new Tensor(buf, args, body); + return Tensor(buf, args, body); } -Tensor* Compute( +Tensor Compute( const std::string& name, const std::vector& dim_args, const std::function& body_func) { @@ -73,10 +73,10 @@ Tensor* Compute( unpack_dim_args(dim_args, &dims, &args); ExprPtr body = body_func(VarHandle(args[0])).node(); BufPtr buf = alloc(name, dims, body->dtype()); - return new Tensor(buf, args, body); + return Tensor(buf, args, body); } -Tensor* Compute( +Tensor Compute( const std::string& name, const std::vector& dim_args, const std::function& @@ -89,10 +89,10 @@ Tensor* Compute( unpack_dim_args(dim_args, &dims, &args); ExprPtr body = body_func(VarHandle(args[0]), VarHandle(args[1])).node(); BufPtr buf = alloc(name, dims, body->dtype()); - return new Tensor(buf, args, body); + return Tensor(buf, args, body); } -Tensor* Compute( +Tensor Compute( const std::string& name, const std::vector& dim_args, const std::function< @@ -108,10 +108,10 @@ Tensor* Compute( body_func(VarHandle(args[0]), VarHandle(args[1]), VarHandle(args[2])) .node(); BufPtr buf = alloc(name, dims, body->dtype()); - return new Tensor(buf, args, body); + return Tensor(buf, args, body); } -Tensor* Compute( +Tensor Compute( const std::string& name, const std::vector& dim_args, const std::function(name, dims, body->dtype()); - return new Tensor(buf, args, body); + return Tensor(buf, args, body); } -Tensor* Reduce( +Tensor Reduce( const std::string& name, const std::vector& dim_args, const Reducer& reducer, @@ -149,7 +149,7 @@ Tensor* Reduce( reduce_args); } -Tensor* Reduce( +Tensor Reduce( const std::string& name, const std::vector& dim_args, const Reducer& reducer, @@ -163,17 +163,17 @@ Tensor* Reduce( reduce_args); } -Tensor* Reduce( +Tensor Reduce( const std::string& name, const std::vector& dim_args, const Reducer& reducer, - Tensor* tensor, + Tensor tensor, const std::vector& reduce_args) { return Reduce( name, dim_args, reducer, - [&](ParameterList& p) { return tensor->load(p); }, + [&](ParameterList& p) { return tensor.load(p); }, reduce_args); } diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h index 3eb02c69bda78..8d8ffe5cfee44 100644 --- a/torch/csrc/jit/tensorexpr/tensor.h +++ b/torch/csrc/jit/tensorexpr/tensor.h @@ -12,7 +12,7 @@ namespace torch { namespace jit { namespace tensorexpr { -class TORCH_API Tensor : KernelScopedObject { +class TORCH_API Tensor { public: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) Tensor(BufPtr buf, const std::vector& args, ExprPtr body) @@ -42,9 +42,9 @@ class TORCH_API Tensor : KernelScopedObject { } template - inline ExprHandle load(const std::vector& args); + inline ExprHandle load(const std::vector& args) const; template - inline ExprHandle load(const Ts&... ts); + inline ExprHandle load(const Ts&... ts) const; private: StmtPtr constructStmt( @@ -134,22 +134,22 @@ class Placeholder { std::vector strides_; }; -TORCH_API Tensor* Compute( +TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dim_args, const std::function& body_func); -TORCH_API Tensor* Compute( +TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dim_args, const std::function& body_func); -TORCH_API Tensor* Compute( +TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dim_args, const std::function< ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)>& body_func); -TORCH_API Tensor* Compute( +TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dim_args, const std::function& body_func); -TORCH_API Tensor* Compute( +TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dim_args, const std::function&)>& body_func); @@ -179,7 +179,7 @@ inline void unpack_dim_args( // Handle reductions over a Reducer and a body_func which produces values. template -Tensor* Reduce( +Tensor Reduce( const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, @@ -207,7 +207,7 @@ Tensor* Reduce( .node(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) BufPtr func_result = alloc(func_name, dims, body->dtype()); - return new Tensor(func_result, vars, body); + return Tensor(func_result, vars, body); } // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -227,13 +227,12 @@ Tensor* Reduce( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ReduceOpPtr reduce_op = reducer(func_result, body, output_args, reduce_vars); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - Tensor* t = - new Tensor(func_result, vars, reduce_dims, reduce_vars, reduce_op); + Tensor t = Tensor(func_result, vars, reduce_dims, reduce_vars, reduce_op); return t; } template -Tensor* Reduce( +Tensor Reduce( const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, @@ -250,7 +249,7 @@ Tensor* Reduce( // Overload which allows inline lambda functions for the body_func. template -Tensor* Reduce( +Tensor Reduce( const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, @@ -260,14 +259,14 @@ Tensor* Reduce( } // Overload for the common case of all dimensions of a Placeholder. -TORCH_API Tensor* Reduce( +TORCH_API Tensor Reduce( const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, const Placeholder& buffer, const std::vector& reduce_args); -TORCH_API Tensor* Reduce( +TORCH_API Tensor Reduce( const std::string& name, const std::vector& dim_args, const Reducer& reducer, @@ -276,22 +275,22 @@ TORCH_API Tensor* Reduce( // Overload for the common case of all dimensions of a prevously Computed // Tensor. -TORCH_API Tensor* Reduce( +TORCH_API Tensor Reduce( const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, - Tensor* tensor, + Tensor tensor, const std::vector& reduce_args); template -inline ExprHandle Tensor::load(const Ts&... ts) { +inline ExprHandle Tensor::load(const Ts&... ts) const { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector params({ExprHandle(ts)...}); return Load::make(BufHandle(this->buf()), params); } template -inline ExprHandle Tensor::load(const std::vector& args) { +inline ExprHandle Tensor::load(const std::vector& args) const { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector params(args.begin(), args.end()); return Load::make(BufHandle(this->buf()), params); diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp index 4e1618a8745d7..c380233cce16a 100644 --- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp +++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp @@ -185,9 +185,9 @@ void initTensorExprBindings(PyObject* module) { const std::vector& args, const ExprHandle& val) { return self.store(args, val); }) .def("data", [](Placeholder& self) { return BufHandle(self.data()); }); - py::class_>(te, "Tensor") - .def(py::init( - [](BufHandle& b, StmtPtr s) { return new Tensor(b.node(), s); })) + py::class_(te, "Tensor") + .def( + py::init([](BufHandle& b, StmtPtr s) { return Tensor(b.node(), s); })) .def( "load", [](Tensor& self, const std::vector& v) { @@ -268,7 +268,7 @@ void initTensorExprBindings(PyObject* module) { [](const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, - Tensor* buffer, + Tensor buffer, const std::vector& reduce_args) { return Reduce(func_name, dim_args, reducer, buffer, reduce_args); }, @@ -380,7 +380,7 @@ void initTensorExprBindings(PyObject* module) { .def(py::init(&ExternalCall::make)); py::class_(te, "LoopNest") - .def(py::init&>()) + .def(py::init&>()) .def(py::init([](StmtPtr s, const std::vector& bufs) { std::unordered_set buf_nodes; for (auto& buf : bufs) { @@ -392,9 +392,7 @@ void initTensorExprBindings(PyObject* module) { .def("prepare_for_codegen", &LoopNest::prepareForCodegen) .def( "get_loop_body_for", - [](const LoopNest& self, Tensor* t) { - return self.getLoopBodyFor(t); - }, + [](const LoopNest& self, Tensor t) { return self.getLoopBodyFor(t); }, py::return_value_policy::reference) .def( "get_loop_body_for", @@ -404,7 +402,7 @@ void initTensorExprBindings(PyObject* module) { py::return_value_policy::reference) .def( "get_loops_for", - [](const LoopNest& self, Tensor* t) { + [](const LoopNest& self, Tensor t) { return self.getLoopStmtsFor(t); }, py::return_value_policy::reference) @@ -760,12 +758,12 @@ void initTensorExprBindings(PyObject* module) { py::class_(te, "BufferArg") .def(py::init()) - .def(py::init()) + .def(py::init()) .def(py::init()) .def(py::init()); py::implicitly_convertible(); - py::implicitly_convertible(); + py::implicitly_convertible(); py::implicitly_convertible(); py::implicitly_convertible(); From f0d274294d48c7979b47ad0a0257b978d739936d Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Tue, 24 Aug 2021 00:29:22 -0700 Subject: [PATCH 160/530] [TensorExpr] Nuke KernelArena and KernelScope. (#63587) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63587 Now that there is no classes using KernelArena for memory management we can remove it. Differential Revision: D30429115 D30429115 Test Plan: Imported from OSS Reviewed By: navahgar Pulled By: ZolotukhinM fbshipit-source-id: 375f6f9294d27790645eeb7cb5a8e87047a57544 --- benchmarks/cpp/tensorexpr/bench_approx.cpp | 8 - benchmarks/cpp/tensorexpr/bench_batchnorm.cpp | 2 - benchmarks/cpp/tensorexpr/bench_compile.cpp | 2 - benchmarks/cpp/tensorexpr/bench_concat.cpp | 2 - benchmarks/cpp/tensorexpr/bench_gemm.cpp | 5 - benchmarks/cpp/tensorexpr/bench_parallel.cpp | 1 - benchmarks/cpp/tensorexpr/bench_reduce.cpp | 7 - test/cpp/tensorexpr/test_approx.cpp | 1 - test/cpp/tensorexpr/test_aten.cpp | 35 --- test/cpp/tensorexpr/test_boundsinference.cpp | 41 --- test/cpp/tensorexpr/test_conv.cpp | 5 - test/cpp/tensorexpr/test_cpp_codegen.cpp | 3 - test/cpp/tensorexpr/test_cuda.cpp | 27 -- test/cpp/tensorexpr/test_expr.cpp | 31 --- test/cpp/tensorexpr/test_external_calls.cpp | 15 -- test/cpp/tensorexpr/test_graph_opt.cpp | 7 - test/cpp/tensorexpr/test_ir_printer.cpp | 4 - test/cpp/tensorexpr/test_ir_verifier.cpp | 8 - test/cpp/tensorexpr/test_kernel.cpp | 33 --- test/cpp/tensorexpr/test_llvm.cpp | 83 ------ test/cpp/tensorexpr/test_loopnest.cpp | 254 ------------------ test/cpp/tensorexpr/test_memdependency.cpp | 40 --- test/cpp/tensorexpr/test_ops.cpp | 2 - test/cpp/tensorexpr/test_reductions.cpp | 83 ------ test/cpp/tensorexpr/test_registerizer.cpp | 70 ----- test/cpp/tensorexpr/test_simplify.cpp | 114 -------- test/cpp/tensorexpr/test_te_fuser_pass.cpp | 1 - test/cpp/tensorexpr/test_type.cpp | 22 -- test/cpp/tensorexpr/tutorial.cpp | 13 - test/test_tensorexpr_pybind.py | 154 +++++------ tools/build_variables.bzl | 1 - torch/csrc/jit/runtime/static/ops.cpp | 4 - torch/csrc/jit/tensorexpr/expr.h | 1 - torch/csrc/jit/tensorexpr/kernel.cpp | 5 - torch/csrc/jit/tensorexpr/kernel.h | 1 - torch/csrc/jit/tensorexpr/mem_arena.cpp | 67 ----- torch/csrc/jit/tensorexpr/mem_arena.h | 60 ----- torch/csrc/jit/tensorexpr/tensorexpr_init.cpp | 1 - 38 files changed, 70 insertions(+), 1143 deletions(-) delete mode 100644 torch/csrc/jit/tensorexpr/mem_arena.cpp delete mode 100644 torch/csrc/jit/tensorexpr/mem_arena.h diff --git a/benchmarks/cpp/tensorexpr/bench_approx.cpp b/benchmarks/cpp/tensorexpr/bench_approx.cpp index 92c26401f1617..425d19faabc30 100644 --- a/benchmarks/cpp/tensorexpr/bench_approx.cpp +++ b/benchmarks/cpp/tensorexpr/bench_approx.cpp @@ -29,7 +29,6 @@ void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor target) { } static void relu_nnc(benchmark::State& state) { - KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 0; @@ -64,7 +63,6 @@ static void relu_nnc(benchmark::State& state) { } static void log_nnc_sleef(benchmark::State& state) { - KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); torch::jit::tensorexpr::Tensor B = @@ -94,7 +92,6 @@ static void log_nnc_sleef(benchmark::State& state) { } static void log_nnc_fast(benchmark::State& state) { - KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); torch::jit::tensorexpr::Tensor B = @@ -124,7 +121,6 @@ static void log_nnc_fast(benchmark::State& state) { } static void log_nnc_vml(benchmark::State& state) { - KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); torch::jit::tensorexpr::Tensor B = @@ -164,7 +160,6 @@ static void log_aten(benchmark::State& state) { } static void logit_nnc_sleef(benchmark::State& state) { - KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 1e-6f; @@ -201,7 +196,6 @@ static void logit_nnc_sleef(benchmark::State& state) { } static void logit_nnc_fast(benchmark::State& state) { - KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 1e-6f; @@ -238,7 +232,6 @@ static void logit_nnc_fast(benchmark::State& state) { } static void logit_nnc_vml(benchmark::State& state) { - KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 1e-6f; @@ -316,7 +309,6 @@ static void logit_caffe2(benchmark::State& state) { } static void tanh_nnc_fast(benchmark::State& state) { - KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); torch::jit::tensorexpr::Tensor B = diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp index 85bf9d326ffac..702ed1cf3ab9d 100644 --- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp @@ -74,7 +74,6 @@ BENCHMARK_DEFINE_F(BatchNorm, ATen)(benchmark::State& state) { } BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) { - KernelScope ks; Placeholder input("input", kFloat, {N_, C_, H_, W_}); Placeholder weight("weight", kFloat, {C_}); @@ -137,7 +136,6 @@ BENCHMARK_DEFINE_F(BatchNorm, ATenRelu)(benchmark::State& state) { } BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) { - KernelScope ks; Placeholder input("input", kFloat, {N_, C_, H_, W_}); Placeholder weight("weight", kFloat, {C_}); diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp index 50d54e57f889a..f204377ab8126 100644 --- a/benchmarks/cpp/tensorexpr/bench_compile.cpp +++ b/benchmarks/cpp/tensorexpr/bench_compile.cpp @@ -10,7 +10,6 @@ namespace te = torch::jit::tensorexpr; static void BM_CompileSwish(benchmark::State& state) { for (auto _ : state) { constexpr int N = 512; - te::KernelScope ks; te::VarHandle n("n", te::kInt); te::Placeholder A(te::BufHandle("A", {N}, te::kFloat)); te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { @@ -40,7 +39,6 @@ static void BM_CompileSwish(benchmark::State& state) { static void BM_CompileSwishLLVMOnly(benchmark::State& state) { constexpr int N = 512; - te::KernelScope ks; te::VarHandle n("n", te::kInt); te::Placeholder A(te::BufHandle("A", {N}, te::kFloat)); te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { diff --git a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp index 856065d6e789f..c108c867acbf4 100644 --- a/benchmarks/cpp/tensorexpr/bench_concat.cpp +++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp @@ -47,7 +47,6 @@ class ConcatBench : public benchmark::Fixture { } void runNNC(benchmark::State& state) { - KernelScope ks; size_t num_inputs = inputs_.size(); size_t num_dims = 2; @@ -101,7 +100,6 @@ class ConcatBench : public benchmark::Fixture { } void runNNCLoop(benchmark::State& state) { - KernelScope ks; size_t num_inputs = inputs_.size(); size_t num_dims = 2; diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp index 8646e97b756c2..ec13b09025eea 100644 --- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_gemm.cpp @@ -40,7 +40,6 @@ BENCHMARK_DEFINE_F(Gemm, Torch)(benchmark::State& state) { } BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) { - te::KernelScope ks; te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); @@ -64,7 +63,6 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) { } BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) { - te::KernelScope ks; te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); @@ -124,7 +122,6 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) { } BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) { - te::KernelScope ks; te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); @@ -184,7 +181,6 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) { } BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) { - te::KernelScope ks; te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); @@ -252,7 +248,6 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) { } BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) { - te::KernelScope ks; te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp index 847b66d9ee58b..178a8795edd03 100644 --- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp +++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp @@ -35,7 +35,6 @@ class ParallelAdd : public benchmark::Fixture { }; BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) { - KernelScope kernel_scope; Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); Tensor c_tensor = Compute( diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp index 9d3570197414d..e053317feca60 100644 --- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp +++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp @@ -217,7 +217,6 @@ BENCHMARK_REGISTER_F(Reduce1D, NativeTiled)->Args({1 << 24}); #endif // USE_AVX2 BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) { - te::KernelScope ks; int M = A.numel(); @@ -250,7 +249,6 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) { BENCHMARK_REGISTER_F(Reduce1D, TeNaive)->Args({1 << 24}); BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) { - te::KernelScope ks; int M = A.numel(); @@ -291,7 +289,6 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) { BENCHMARK_REGISTER_F(Reduce1D, TeSplitTail)->Args({1 << 24}); BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) { - te::KernelScope ks; int M = A.numel(); @@ -332,7 +329,6 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) { BENCHMARK_REGISTER_F(Reduce1D, TeSplitMask)->Args({1 << 24}); BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { - te::KernelScope ks; int M = A.numel(); const int kChunkSize = 8; @@ -385,7 +381,6 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV1)->Args({1 << 24}); BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) { - te::KernelScope ks; const int M = A.numel(); const int kChunkSize = 8; @@ -450,7 +445,6 @@ BENCHMARK_REGISTER_F(Reduce2DCol, Torch) ->Args({1 << 12, 1 << 12}); BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) { - te::KernelScope ks; constexpr int kCacheSize = 1 << 12; te::Placeholder a("A", te::kFloat, {M, N}); te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat); @@ -557,7 +551,6 @@ BENCHMARK_REGISTER_F(Reduce2DRow, Hand) ->Args({1 << 18, 1 << 6}); BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) { - te::KernelScope ks; constexpr int kChunkSize = 8; te::Placeholder a("A", te::kFloat, {M, N}); te::Tensor b = te::computeSum({a.handle(), te::IntList({1}), false}, at::kFloat); diff --git a/test/cpp/tensorexpr/test_approx.cpp b/test/cpp/tensorexpr/test_approx.cpp index 2a4ce9485acde..8de395fe92796 100644 --- a/test/cpp/tensorexpr/test_approx.cpp +++ b/test/cpp/tensorexpr/test_approx.cpp @@ -30,7 +30,6 @@ std::string diffs(const at::Tensor& a, const at::Tensor& b) { } TEST(Approx, log_vml) { - te::KernelScope ks; te::VarHandle N("N", te::kInt); te::Placeholder A("A", te::kFloat, {N}); te::Tensor B = te::Compute( diff --git a/test/cpp/tensorexpr/test_aten.cpp b/test/cpp/tensorexpr/test_aten.cpp index 9eb141250cb35..040b7b0a920fb 100644 --- a/test/cpp/tensorexpr/test_aten.cpp +++ b/test/cpp/tensorexpr/test_aten.cpp @@ -15,7 +15,6 @@ namespace jit { using namespace torch::jit::tensorexpr; TEST(ATen, _cast_Float) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -43,7 +42,6 @@ TEST(ATen, _cast_Float) { } TEST(ATen, negInt) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); @@ -71,7 +69,6 @@ TEST(ATen, negInt) { } TEST(ATen, negFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -99,7 +96,6 @@ TEST(ATen, negFloat) { } TEST(ATen, addInt) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); @@ -136,7 +132,6 @@ TEST(ATen, addInt) { } TEST(ATen, addFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -173,7 +168,6 @@ TEST(ATen, addFloat) { } TEST(ATen, subInt) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); @@ -210,7 +204,6 @@ TEST(ATen, subInt) { } TEST(ATen, subFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -247,7 +240,6 @@ TEST(ATen, subFloat) { } TEST(ATen, lerp) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -284,7 +276,6 @@ TEST(ATen, lerp) { } TEST(ATen, addcmulInt) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); @@ -326,7 +317,6 @@ TEST(ATen, addcmulInt) { } TEST(ATen, addcmulFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -368,7 +358,6 @@ TEST(ATen, addcmulFloat) { } TEST(ATen, mulInt) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); @@ -400,7 +389,6 @@ TEST(ATen, mulInt) { } TEST(ATen, mulFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -432,7 +420,6 @@ TEST(ATen, mulFloat) { } TEST(ATen, divInt) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); @@ -464,7 +451,6 @@ TEST(ATen, divInt) { } TEST(ATen, divFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -496,7 +482,6 @@ TEST(ATen, divFloat) { } TEST(ATen, maxInt) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); @@ -528,7 +513,6 @@ TEST(ATen, maxInt) { } TEST(ATen, maxFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -560,7 +544,6 @@ TEST(ATen, maxFloat) { } TEST(ATen, minInt) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); @@ -592,7 +575,6 @@ TEST(ATen, minInt) { } TEST(ATen, minFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -624,7 +606,6 @@ TEST(ATen, minFloat) { } void __ubsan_ignore_float_divide_by_zero__ testATenreciprocal() { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -651,7 +632,6 @@ void __ubsan_ignore_float_divide_by_zero__ testATenreciprocal() { } TEST(ATen, reluInt) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); @@ -678,7 +658,6 @@ TEST(ATen, reluInt) { } TEST(ATen, reluFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -707,7 +686,6 @@ TEST(ATen, reluFloat) { } TEST(ATen, logFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -734,7 +712,6 @@ TEST(ATen, logFloat) { } TEST(ATen, fastLogFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -766,7 +743,6 @@ TEST(ATen, fastLogFloat) { } TEST(ATen, fastTanhFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -798,7 +774,6 @@ TEST(ATen, fastTanhFloat) { } TEST(ATen, fastSigmoidFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -831,7 +806,6 @@ TEST(ATen, fastSigmoidFloat) { } TEST(ATen, log10Float) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -858,7 +832,6 @@ TEST(ATen, log10Float) { } TEST(ATen, log2Float) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -885,7 +858,6 @@ TEST(ATen, log2Float) { } TEST(ATen, expFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -913,7 +885,6 @@ TEST(ATen, expFloat) { } TEST(ATen, erfFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -941,7 +912,6 @@ TEST(ATen, erfFloat) { } TEST(ATen, cosFloat) { - KernelScope kernel_scope; const int kTotalSize = 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -969,7 +939,6 @@ TEST(ATen, cosFloat) { } TEST(ATen, eqInt) { - KernelScope kernel_scope; constexpr int N = 128; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -995,7 +964,6 @@ TEST(ATen, eqInt) { } TEST(ATen, geInt) { - KernelScope kernel_scope; constexpr int N = 128; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -1021,7 +989,6 @@ TEST(ATen, geInt) { } TEST(ATen, gtInt) { - KernelScope kernel_scope; constexpr int N = 128; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -1047,7 +1014,6 @@ TEST(ATen, gtInt) { } TEST(ATen, leInt) { - KernelScope kernel_scope; constexpr int N = 128; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -1073,7 +1039,6 @@ TEST(ATen, leInt) { } TEST(ATen, ltInt) { - KernelScope kernel_scope; constexpr int N = 128; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); diff --git a/test/cpp/tensorexpr/test_boundsinference.cpp b/test/cpp/tensorexpr/test_boundsinference.cpp index d038665fad75b..2eb0dfb997da8 100644 --- a/test/cpp/tensorexpr/test_boundsinference.cpp +++ b/test/cpp/tensorexpr/test_boundsinference.cpp @@ -46,7 +46,6 @@ TEST(BoundsInference, _1) { // b[i] = a[i] // For this loop bounds inference should yield the following: // {{b, kStore, 0, 99}, {a, kLoad, 0, 99}} - KernelScope kernel_scope; ExprHandle n(100); Placeholder a(BufHandle("a", {n}, kFloat)); Tensor b = @@ -71,7 +70,6 @@ TEST(BoundsInference, _2) { // b[i] = a[i] // For this loop bounds inference should yield the following: // {{b, kStore, 0, n-1}, {a, kLoad, 0, n-1}} - KernelScope kernel_scope; VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); Tensor b = @@ -96,7 +94,6 @@ TEST(BoundsInference, _3) { // b[i] = a[i] * a[i+10] // For this loop bounds inference should yield the following: // {{b, kStore, 0, 99}, {a, kLoad, 0, 109}} - KernelScope kernel_scope; ExprHandle n(100); Placeholder a(BufHandle("a", {n + 10}, kFloat)); Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { @@ -125,7 +122,6 @@ TEST(BoundsInference, _4) { // for y in 0..200: // for x in 0..320: // c[y,x] = a[y,x] * b[y,x] - KernelScope kernel_scope; ExprHandle W(320); ExprHandle H(200); Placeholder a(BufHandle("a", {H, W}, kFloat)); @@ -205,7 +201,6 @@ TEST(BoundsInference, _5) { // b[i_outer * 16 + i_inner] = a[i_outer * 16 + i_inner] // for i_tail in 0..100%16: // b[i_tail + (100/16)*16] = a[i_tail + (100/16)*16]; - KernelScope kernel_scope; ExprHandle n(100); Placeholder a(BufHandle("a", {n}, kFloat)); Tensor b = @@ -257,7 +252,6 @@ TEST(BoundsInference, _6) { // for y in 0..20: // for x in 0..32: // c[y,x] = a[y+100,x+100] * b[y*2,x*5] - KernelScope kernel_scope; ExprHandle W(320); ExprHandle H(200); ExprHandle CW(32); @@ -328,7 +322,6 @@ TEST(BoundsInference, _6) { } TEST(BoundsInference, Adjacent) { - KernelScope kernel_scope; ExprHandle H(6); Placeholder a(BufHandle("a", {20}, kFloat)); Tensor b = @@ -388,7 +381,6 @@ TEST(BoundsInference, Adjacent) { } TEST(BoundsInference, MultipleTopLoopLoad) { - KernelScope kernel_scope; Placeholder a(BufHandle("a", {100}, kFloat)); Tensor b = Compute("b", {{64, "x"}}, [&](const VarHandle& x) { return a.load(x); }); @@ -444,7 +436,6 @@ TEST(BoundsInference, MultipleTopLoopLoad) { } TEST(BoundsInference, MultipleTopLoopStore) { - KernelScope kernel_scope; BufHandle a("a", {100}, kFloat); BufHandle b("b", {100}, kFloat); BufHandle c("c", {100}, kFloat); @@ -504,8 +495,6 @@ TEST(BoundsInference, MultipleTopLoopStore) { } TEST(BoundsInference, CacheReads) { - KernelScope kernel_scope; - Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; @@ -570,7 +559,6 @@ TEST(BoundsInference, CacheReads) { } TEST(BoundsInference, Flattened) { - KernelScope kernel_scope; Tensor b = Compute( "b", {{3, "z"}, {4, "y"}, {5, "x"}}, @@ -597,7 +585,6 @@ TEST(BoundsInference, Flattened) { } TEST(BoundsInference, GetPotentialHazards) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -649,8 +636,6 @@ TEST(BoundsInference, GetPotentialHazards) { } TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) { - KernelScope kernel_scope; - Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; @@ -677,8 +662,6 @@ TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) { } TEST(BoundsInference, GetPotentialHazardsLoopCall) { - KernelScope kernel_scope; - Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; @@ -704,8 +687,6 @@ TEST(BoundsInference, GetPotentialHazardsLoopCall) { } TEST(BoundsInference, GetPotentialHazardsLoopSplit) { - KernelScope kernel_scope; - Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; @@ -731,8 +712,6 @@ TEST(BoundsInference, GetPotentialHazardsLoopSplit) { } TEST(BoundsInference, HasConflictingOverlapSameBufferWithPartialOverlap) { - KernelScope kernel_scope; - // Input IR: // for (int j = 10; j < 100; j++) { // A[j] = 10 * j; @@ -755,8 +734,6 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferWithPartialOverlap) { } TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlap) { - KernelScope kernel_scope; - // Input IR: // for (int j = 10; j < 100; j++) { // A[j] = 10 * j; @@ -778,8 +755,6 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlap) { } TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlapRAW) { - KernelScope kernel_scope; - // Input IR: // for (int j = 10; j < 100; j++) { // A[j] = 10 * j; @@ -803,8 +778,6 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlapRAW) { } TEST(BoundsInference, HasConflictingOverlapSameBufferNotOverlapping) { - KernelScope kernel_scope; - // Input IR: // for (int j = 10; j < 100; j++) { // A[j] = 10 * j; @@ -827,8 +800,6 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferNotOverlapping) { } TEST(BoundsInference, HasConflictingOverlap2DBufferWithOverlap) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 100; j++) { @@ -868,8 +839,6 @@ TEST(BoundsInference, HasConflictingOverlap2DBufferWithOverlap) { } TEST(BoundsInference, HasConflictingOverlap2DBufferWithNoOverlap) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 100; j++) { @@ -909,8 +878,6 @@ TEST(BoundsInference, HasConflictingOverlap2DBufferWithNoOverlap) { } TEST(BoundsInference, HasConflictingOverlapDifferentBuffers) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 100; j++) { @@ -949,8 +916,6 @@ TEST(BoundsInference, HasConflictingOverlapDifferentBuffers) { } TEST(BoundsInference, HasConflictingOverlapDueToRAWDependence) { - KernelScope kernel_scope; - // Input IR: // for (int j = 0; j < 100; j++) { // A[j] = 10 * j; @@ -978,8 +943,6 @@ TEST(BoundsInference, HasConflictingOverlapDueToRAWDependence) { } TEST(BoundsInference, HasConflictingOverlapDueToWARDependence) { - KernelScope kernel_scope; - // Input IR: // for (int k = 0; k < 100; k++) { // B[k] = 20 * A[99-k]; @@ -1007,8 +970,6 @@ TEST(BoundsInference, HasConflictingOverlapDueToWARDependence) { } TEST(BoundsInference, HasConflictingOverlapWithLoads) { - KernelScope kernel_scope; - // Input IR: // for (int k = 10; k < 100; k++) { // B[k] = 20 * A[99-k]; @@ -1041,8 +1002,6 @@ TEST(BoundsInference, HasConflictingOverlapWithLoads) { } TEST(BoundsInference, IsOverlapping) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 100; i++) { // A[i] = i * 10; // storeA1 diff --git a/test/cpp/tensorexpr/test_conv.cpp b/test/cpp/tensorexpr/test_conv.cpp index 293fbe248f176..19372779094a6 100644 --- a/test/cpp/tensorexpr/test_conv.cpp +++ b/test/cpp/tensorexpr/test_conv.cpp @@ -21,7 +21,6 @@ static at::Tensor genTestData(c10::IntArrayRef args) { #ifdef TORCH_ENABLE_LLVM TEST(Conv, DepthwiseConv2D) { - te::KernelScope kernel_scope; constexpr int N = 1, C = 72, H = 56, W = 56; constexpr int K = 72, R = 3, S = 3; constexpr int kPad = 1, kStride = 2, kGroups = C; @@ -53,7 +52,6 @@ TEST(Conv, DepthwiseConv2D) { } TEST(Conv, DepthwiseConv2DNoBias) { - te::KernelScope kernel_scope; constexpr int N = 1, C = 72, H = 56, W = 56; constexpr int K = 72, R = 3, S = 3; constexpr int kPad = 1, kStride = 2, kGroups = C; @@ -80,7 +78,6 @@ TEST(Conv, DepthwiseConv2DNoBias) { } TEST(Conv, DepthwiseConv2DDynamicShapes) { - te::KernelScope kernel_scope; te::VarHandle N_var("N", te::kInt); te::VarHandle C_var("C", te::kInt); te::VarHandle H_var("H", te::kInt); @@ -164,8 +161,6 @@ TEST(Conv, DepthwiseConv2DDynamicShapes) { #endif TEST(Conv, Conv2D) { - te::KernelScope kernel_scope; - // Input dimensions. constexpr int N = 1; constexpr int C = 3; diff --git a/test/cpp/tensorexpr/test_cpp_codegen.cpp b/test/cpp/tensorexpr/test_cpp_codegen.cpp index 82ea40d995f29..df9166b675859 100644 --- a/test/cpp/tensorexpr/test_cpp_codegen.cpp +++ b/test/cpp/tensorexpr/test_cpp_codegen.cpp @@ -3,7 +3,6 @@ #include #include -#include #include #include @@ -13,7 +12,6 @@ namespace jit { using namespace torch::jit::tensorexpr; TEST(CppPrinter, AllocateOnStackThenFree) { - KernelScope kernel_scope; std::vector dims = {alloc(2), alloc(3)}; BufPtr buf = alloc("x", dims, kInt); AllocatePtr alloc_ = alloc(buf); @@ -32,7 +30,6 @@ TEST(CppPrinter, AllocateOnStackThenFree) { } TEST(CppPrinter, AllocateOnHeapThenFree) { - KernelScope kernel_scope; std::vector dims = { alloc(20), alloc(50), alloc(3)}; BufPtr buf = alloc("y", dims, kLong); diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp index ed5c070ea8689..164ff772d5b46 100644 --- a/test/cpp/tensorexpr/test_cuda.cpp +++ b/test/cpp/tensorexpr/test_cuda.cpp @@ -27,7 +27,6 @@ using namespace torch::jit::tensorexpr; template static void testCudaTestVectorAdd01_impl() { - KernelScope kernel_scope; const int num_iter = 3; const int block_count = 16; const int block_size = 128; @@ -93,7 +92,6 @@ float sigmoid(float x) { } TEST(Cuda, Sigmoid_CUDA) { - KernelScope kernel_scope; const int num_iter = 3; const int block_count = 16; const int block_size = 128; @@ -162,7 +160,6 @@ TEST(Cuda, TestVectorAdd01_CUDA) { } static void testCudaTestVectorAdd02_impl(int N, int block_size) { - KernelScope kernel_scope; Placeholder a_buf("a", kFloat, {N}); Placeholder b_buf("b", kFloat, {N}); Tensor c = Compute( @@ -222,7 +219,6 @@ TEST(Cuda, TestVectorAdd02_CUDA) { } TEST(Cuda, HalfCast_CUDA) { - KernelScope ks; auto half = ToDtype(); Placeholder a("a", half, {4}); Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { @@ -261,7 +257,6 @@ TEST(Cuda, HalfCast_CUDA) { } TEST(Cuda, DynamicShape2D_CUDA) { - KernelScope kernel_scope; auto testWithSize = [](int32_t M, int32_t N) { VarHandle m("m", kInt); VarHandle n("n", kInt); @@ -324,7 +319,6 @@ TEST(Cuda, DynamicShape2D_CUDA) { } TEST(Cuda, TestRand01_CUDA) { - KernelScope kernel_scope; const int num_iter = 3; const int block_count = 16; const int block_size = 128; @@ -383,7 +377,6 @@ TEST(Cuda, TestRand01_CUDA) { } TEST(Cuda, DynamicShapeSplit_CUDA) { - KernelScope ks; constexpr int N = 4096; VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); @@ -434,7 +427,6 @@ TEST(Cuda, DynamicShapeSplit_CUDA) { TEST(Cuda, OneBlockOneThreadGlobalReduce1_CUDA) { const static int N = 1024; - KernelScope kernel_scope; Placeholder data_buf("data", kFloat, {N}); Placeholder output_buf("output", kFloat, {1}); @@ -501,7 +493,6 @@ TEST(Cuda, OneBlockOneThreadGlobalReduce1_CUDA) { TEST(Cuda, OneBlockMultiThreadGlobalReduce1_CUDA) { const static int N = 1024; - KernelScope kernel_scope; // This test does the following reduction: // clang-format off @@ -578,8 +569,6 @@ TEST(Cuda, OneBlockMultiThreadGlobalReduce1_CUDA) { } TEST(Cuda, NoThreadIdxWrite_1_CUDA) { - KernelScope kernel_scope; - // This test does the following reduction: // // for k in 0..1: // block-idx @@ -676,7 +665,6 @@ TEST(Cuda, NoThreadIdxWrite_1_CUDA) { TEST(Cuda, SharedMemReduce_1_CUDA) { // FIXME: this test is flaky in CI. - KernelScope kernel_scope; // This test does the following: // for k in 0..1: // block-idx // alloc(c, 64) @@ -814,7 +802,6 @@ TEST(Cuda, SharedMemReduce_1_CUDA) { } TEST(Cuda, LocalMemReduce_1_CUDA) { - KernelScope kernel_scope; // This test does the following: // for k in 0..1: // block-idx // b(k) = 0 @@ -925,7 +912,6 @@ TEST(Cuda, LocalMemReduce_1_CUDA) { } TEST(Cuda, HalfSupport_CUDA) { - KernelScope ks; auto half = ToDtype(); Placeholder a("a", half, {4}); Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { @@ -983,7 +969,6 @@ TEST(Cuda, HalfSupport_CUDA) { } TEST(Cuda, HalfPropagation_CUDA) { - KernelScope kernel_scope; auto half = ToDtype(); Placeholder a("a", half, {4}); Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { @@ -1032,7 +1017,6 @@ TEST(Cuda, HalfPropagation_CUDA) { } TEST(Cuda, UnusedHalfArgument_CUDA) { - KernelScope kernel_scope; Placeholder a("a", kFloat, {4}); auto half = ToDtype(); Placeholder b("b", half, {4}); @@ -1089,7 +1073,6 @@ TEST(Cuda, UnusedHalfArgument_CUDA) { } TEST(Cuda, PrioritizeDependents_CUDA) { - KernelScope kernel_scope; Placeholder a("a", kFloat, {10}); Placeholder b("b", kFloat, {12}); Placeholder c("c", kFloat, {12}); @@ -1163,7 +1146,6 @@ TEST(Cuda, PrioritizeDependents_CUDA) { /// Tests the case where there are two loops which have different extents bound /// to the same block dimension. We must mask the smaller extent loop body. TEST(Cuda, MaskBlockDim_CUDA) { - KernelScope kernel_scope; int A_SIZE = 100; int B_SIZE = 50; Placeholder a_buf("a", kFloat, {A_SIZE}); @@ -1256,7 +1238,6 @@ TEST(Cuda, MaskBlockDim_CUDA) { /// to the same thread dimension. This is the same as the above - the smaller /// rank write should be masked. But this time we also need to syncthreads. TEST(Cuda, MaskThreadDim_CUDA) { - KernelScope kernel_scope; int A_SIZE = 50; int B_SIZE = 100; Placeholder a_buf("a", kFloat, {A_SIZE}); @@ -1351,7 +1332,6 @@ TEST(Cuda, MaskThreadDim_CUDA) { // Note: this is an extremely dumb pattern which we should never see, but is a // useful edge case to make sure we've got things covered. TEST(Cuda, MaskMultiBlockDim_CUDA) { - KernelScope kernel_scope; int A_SIZE = 100; int B_SIZE = 50; Placeholder a_buf("a", kFloat, {A_SIZE}); @@ -1445,7 +1425,6 @@ TEST(Cuda, MaskMultiBlockDim_CUDA) { // Note: this is an extremely dumb pattern which we should never see, but is a // useful edge case to make sure we've got things covered. TEST(Cuda, MaskBlockAndThreadDim_CUDA) { - KernelScope kernel_scope; int A_SIZE = 100; int B_SIZE = 50; Placeholder a_buf("a", kFloat, {A_SIZE}); @@ -1537,7 +1516,6 @@ TEST(Cuda, MaskBlockAndThreadDim_CUDA) { /// outer loop bound to blockDim.x and the inner loop bound to threadDim.x. In /// this case all writes with a rank smaller than the max should be masked. TEST(Cuda, MaskMultiDim_CUDA) { - KernelScope kernel_scope; int OUTER_SIZE = 10; int A_SIZE = 100; int B_SIZE = 50; @@ -1667,7 +1645,6 @@ TEST(Cuda, MaskMultiDim_CUDA) { // In this case both stores must be masked against the extent of the other loop, // incase it is larger. TEST(Cuda, MaskMultiDimSymbolic_CUDA) { - KernelScope kernel_scope; VarHandle OUTER_SIZE("OUTER_SIZE", kInt); VarHandle A_SIZE("A_SIZE", kInt); VarHandle B_SIZE("B_SIZE", kInt); @@ -1803,7 +1780,6 @@ TEST(Cuda, MaskMultiDimSymbolic_CUDA) { // extents but are bound to the same thread dimension. The smaller loop should // be masked. TEST(Cuda, MaskCompoundInnerLoop_CUDA) { - KernelScope kernel_scope; int OUTER_SIZE = 10; int A_SIZE = 100; int B_SIZE = 50; @@ -1942,7 +1918,6 @@ TEST(Cuda, MaskCompoundInnerLoop_CUDA) { // the first thread dimensions. This should work just like the MaskThreadDim // test where the bigger loop is unmasked but the smaller is masked. TEST(Cuda, MaskInnerLoopOneBlock_CUDA) { - KernelScope kernel_scope; int OUTER_SIZE = 10; int A_SIZE = 100; int B_SIZE = 50; @@ -2081,7 +2056,6 @@ TEST(Cuda, MaskInnerLoopOneBlock_CUDA) { // this case both bodies must be masked against the other dimension being > 0. // Note: this is a bit degenerate no one would actually write this for perf. TEST(Cuda, MaskMultiDimMultiAxis_CUDA) { - KernelScope kernel_scope; int OUTER_SIZE = 10; int A_SIZE = 30; int B_SIZE = 15; @@ -2211,7 +2185,6 @@ TEST(Cuda, MaskMultiDimMultiAxis_CUDA) { // the second loop is smaller in both cases - the second store must be masked // for both the block and thread dimension. TEST(Cuda, MaskMultiDimMultiLevel_CUDA) { - KernelScope kernel_scope; int OUTER_A_SIZE = 10; int OUTER_B_SIZE = 5; int A_SIZE = 30; diff --git a/test/cpp/tensorexpr/test_expr.cpp b/test/cpp/tensorexpr/test_expr.cpp index 7c234fb95cdb1..d2405353e8301 100644 --- a/test/cpp/tensorexpr/test_expr.cpp +++ b/test/cpp/tensorexpr/test_expr.cpp @@ -24,7 +24,6 @@ using namespace torch::jit::tensorexpr; using SimpleIRExprEval = ExprEval; TEST(Expr, BasicValueTest) { - KernelScope kernel_scope; ExprHandle a = IntImm::make(2), b = IntImm::make(3); ExprHandle c = Add::make(a, b); SimpleIRExprEval eval(c); @@ -32,7 +31,6 @@ TEST(Expr, BasicValueTest) { } TEST(Expr, BasicValueTest02) { - KernelScope kernel_scope; ExprHandle a(2.0f); ExprHandle b(3.0f); ExprHandle c(4.0f); @@ -43,7 +41,6 @@ TEST(Expr, BasicValueTest02) { } TEST(Expr, LetTest01) { - KernelScope kernel_scope; VarHandle x("x", kFloat); ExprHandle body = ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f)); SimpleIRExprEval eval(body); @@ -52,7 +49,6 @@ TEST(Expr, LetTest01) { } TEST(Expr, LetTest02) { - KernelScope kernel_scope; VarHandle x("x", kFloat); VarHandle y("y", kFloat); ExprHandle body = @@ -64,7 +60,6 @@ TEST(Expr, LetTest02) { } TEST(Expr, LetStmtTest01) { - KernelScope kernel_scope; Placeholder a_buf("a", kFloat, {1}); Placeholder b_buf("b", kFloat, {1}); @@ -88,7 +83,6 @@ TEST(Expr, LetStmtTest01) { } TEST(Expr, IntTest) { - KernelScope kernel_scope; VarHandle x("x", kInt); ExprHandle body = ExprHandle(2) + (x * ExprHandle(3) + ExprHandle(4)); SimpleIRExprEval eval(body); @@ -97,7 +91,6 @@ TEST(Expr, IntTest) { } TEST(Expr, FloatTest) { - KernelScope kernel_scope; VarHandle x("x", kFloat); ExprHandle body = ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f)); SimpleIRExprEval eval(body); @@ -106,7 +99,6 @@ TEST(Expr, FloatTest) { } TEST(Expr, ByteTest) { - KernelScope kernel_scope; VarHandle x("x", kByte); ExprHandle body = ExprHandle((uint8_t)2) + (x * ExprHandle((uint8_t)3) + ExprHandle((uint8_t)4)); @@ -116,7 +108,6 @@ TEST(Expr, ByteTest) { } TEST(Expr, CharTest) { - KernelScope kernel_scope; VarHandle x("x", kChar); ExprHandle body = ExprHandle((int8_t)2) + (x * ExprHandle((int8_t)3) + ExprHandle((int8_t)4)); @@ -126,7 +117,6 @@ TEST(Expr, CharTest) { } TEST(Expr, ShortTest) { - KernelScope kernel_scope; VarHandle x("x", kShort); ExprHandle body = ExprHandle((int16_t)2) + (x * ExprHandle((int16_t)3) + ExprHandle((int16_t)4)); @@ -136,7 +126,6 @@ TEST(Expr, ShortTest) { } TEST(Expr, LongTest) { - KernelScope kernel_scope; VarHandle x("x", kLong); ExprHandle body = ExprHandle((int64_t)2) + (x * ExprHandle((int64_t)3) + ExprHandle((int64_t)4)); @@ -146,7 +135,6 @@ TEST(Expr, LongTest) { } TEST(Expr, HalfTest) { - KernelScope kernel_scope; VarHandle x("x", kHalf); ExprHandle body = ExprHandle((at::Half)2) + (x * ExprHandle((at::Half)3) + ExprHandle((at::Half)4)); @@ -156,7 +144,6 @@ TEST(Expr, HalfTest) { } TEST(Expr, DoubleTest) { - KernelScope kernel_scope; VarHandle x("x", kDouble); ExprHandle body = ExprHandle((double)2) + (x * ExprHandle((double)3) + ExprHandle((double)4)); @@ -166,7 +153,6 @@ TEST(Expr, DoubleTest) { } TEST(Expr, VectorAdd01) { - KernelScope kernel_scope; const int kVectorSize = 8; const int kVectorCount = 128; const int kTotalSize = kVectorSize * kVectorCount; @@ -212,7 +198,6 @@ TEST(Expr, VectorAdd01) { } TEST(Expr, CompareSelectEQ) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -251,7 +236,6 @@ TEST(Expr, CompareSelectDtypes) { // This test constructs a CompareSelect expression where the input dtype is // different from the output dtype and verifies that it works correctly: // result = ((int)lhs == (int)rhs) ? (float)retval1 : (float)retval2 - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -290,7 +274,6 @@ TEST(Expr, CompareSelectDtypes) { } TEST(Expr, IntrinsicsDtypes) { - KernelScope kernel_scope; constexpr int N = 256; Placeholder a(BufHandle("A", {N}, kDouble)); Placeholder b(BufHandle("B", {N}, kDouble)); @@ -312,7 +295,6 @@ TEST(Expr, IntrinsicsDtypes) { } TEST(Expr, Substitute01) { - KernelScope kernel_scope; VarPtr x = alloc("x", kFloat); VarPtr y = alloc("y", kFloat); ExprPtr e = @@ -334,7 +316,6 @@ TEST(Expr, Substitute01) { } TEST(Expr, Math01) { - KernelScope kernel_scope; ExprHandle v = sin(ExprHandle(1.0f)); std::ostringstream oss; @@ -348,7 +329,6 @@ TEST(Expr, Math01) { } TEST(Expr, UnaryMath01) { - KernelScope kernel_scope; struct TestConfig { std::function func; std::function ref_func; @@ -416,7 +396,6 @@ TEST(Expr, UnaryMath01) { } TEST(Expr, BinaryMath01) { - KernelScope kernel_scope; struct TestConfig { std::function func; std::function ref_func; @@ -440,7 +419,6 @@ TEST(Expr, BinaryMath01) { } TEST(Expr, LogicalOps01) { - KernelScope kernel_scope; ExprHandle a(23); ExprHandle b(11); ExprHandle c(0.72f); @@ -473,7 +451,6 @@ TEST(Expr, LogicalOps01) { } TEST(Expr, LogicalOps02) { - KernelScope kernel_scope; ExprHandle a(23); ExprHandle b(11); ExprHandle c(0.72f); @@ -492,7 +469,6 @@ TEST(Expr, LogicalOps02) { } TEST(Expr, LogicalOps03) { - KernelScope kernel_scope; ExprHandle a(23); ExprHandle b(11); ExprHandle c(0.72f); @@ -550,7 +526,6 @@ TEST(Expr, LogicalOps03) { } TEST(Expr, BitwiseOps) { - KernelScope kernel_scope; ExprHandle a(59); ExprHandle b(11); ExprHandle c(101); @@ -562,7 +537,6 @@ TEST(Expr, BitwiseOps) { } TEST(Expr, DynamicShapeAdd) { - KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); @@ -582,7 +556,6 @@ TEST(Expr, DynamicShapeAdd) { } void testCond01() { - KernelScope kernel_scope; const int N = 16; PaddedBuffer a_v(N); Placeholder a_buf("a", kFloat, {N}); @@ -606,7 +579,6 @@ void testCond01() { } void testIfThenElse01() { - KernelScope kernel_scope; ExprHandle v = ifThenElse(ExprHandle(1), ExprHandle(1.0f), ExprHandle(2.0f)); std::ostringstream oss; @@ -618,7 +590,6 @@ void testIfThenElse01() { } void testIfThenElse02() { - KernelScope kernel_scope; ExprHandle v = ifThenElse(ExprHandle(0), ExprHandle(1.0f), ExprHandle(2.0f)); std::ostringstream oss; @@ -630,7 +601,6 @@ void testIfThenElse02() { } void testIfThenElse03() { - KernelScope kernel_scope; ExprHandle v = ifThenElse(BoolImm::make(false), ExprHandle(1.0f), ExprHandle(2.0f)); @@ -643,7 +613,6 @@ void testIfThenElse03() { } void testStmtClone() { - KernelScope kernel_scope; const int N = 16; Placeholder a_buf("a", kInt, {N}); diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp index a170e530fa98f..176158e7fe13a 100644 --- a/test/cpp/tensorexpr/test_external_calls.cpp +++ b/test/cpp/tensorexpr/test_external_calls.cpp @@ -20,8 +20,6 @@ namespace jit { using namespace torch::jit::tensorexpr; TEST(ExternalCall, Conv2d_float) { - KernelScope kernel_scope; - Placeholder Input("Input", kFloat, {1, 3, 224, 224}); Placeholder Weight("Weight", kFloat, {16, 3, 3, 3}); Placeholder Bias("Bias", kFloat, {16}); @@ -84,7 +82,6 @@ TEST(ExternalCall, Conv2d_float) { TEST(ExternalCall, Conv2d_int) { // A similar test, but now using kInt tensors - KernelScope kernel_scope; Placeholder Input("Input", kInt, {1, 3, 224, 224}); Placeholder Weight("Weight", kInt, {16, 3, 3, 3}); @@ -147,8 +144,6 @@ TEST(ExternalCall, Conv2d_int) { } TEST(ExternalCall, Conv2d_nobias_noargs) { - KernelScope kernel_scope; - Placeholder Input("Input", kFloat, {1, 16, 112, 112}); Placeholder Weight("Weight", kFloat, {16, 16, 1, 1}); BufHandle ResultBuf("Result", {1, 16, 112, 112}, kFloat); @@ -194,8 +189,6 @@ TEST(ExternalCall, Conv2d_nobias_noargs) { } TEST(ExternalCall, Addmm_float) { - KernelScope kernel_scope; - Placeholder Input("Input", kFloat, {100, 300}); Placeholder Mat1("Mat1", kFloat, {100, 200}); Placeholder Mat2("Mat2", kFloat, {200, 300}); @@ -252,8 +245,6 @@ TEST(ExternalCall, Addmm_float) { TEST(ExternalCall, Prepacked_Linear_float) { using namespace at::native::xnnpack; - KernelScope kernel_scope; - Placeholder Input("Input", kFloat, {100, 200}); BufHandle ResultBuf("Result", {100, 300}, kFloat); @@ -317,8 +308,6 @@ TEST(ExternalCall, Prepacked_Linear_float) { TEST(ExternalCall, Prepacked_Conv2d_float) { using namespace at::native::xnnpack; - KernelScope kernel_scope; - Placeholder Input("Input", kFloat, {1, 3, 224, 224}); BufHandle ResultBuf("Result", {1, 16, 112, 112}, kFloat); int64_t stride = 2; @@ -404,7 +393,6 @@ TEST(ExternalCall, Prepacked_Conv2d_float) { #endif // USE_XNNPACK TEST(ExternalCall, BinaryFloat) { - KernelScope kernel_scope; using TensorFunc = std::function; using Test = std::tuple< std::vector, @@ -479,7 +467,6 @@ TEST(ExternalCall, BinaryFloat) { } TEST(ExternalCall, UnaryFloat) { - KernelScope kernel_scope; using TensorFunc = std::function; auto toExprHandleVec = [](std::vector v) { auto intV = std::vector(v.begin(), v.end()); @@ -561,7 +548,6 @@ TEST(ExternalCall, UnaryFloat) { TEST(ExternalCall, ComputeInterop) { // This test verifies that Tensors using external calls can be used by and can // use Tensors built with Compute API. - KernelScope kernel_scope; BufHandle ConvResultBuf("ConvResult", {1, 16, 32, 32}, kFloat); BufHandle MatmulResultBuf("MatmulResult", {1, 16, 32, 32}, kFloat); @@ -654,7 +640,6 @@ TEST(ExternalCall, ComputeInterop) { TEST(ExternalCall, Inlining) { // This test verifies that Tensors using external calls can be used by and // can use Tensors built with Compute API. - KernelScope kernel_scope; BufHandle MatmulResultBuf("MatmulResult", {8, 8}, kFloat); diff --git a/test/cpp/tensorexpr/test_graph_opt.cpp b/test/cpp/tensorexpr/test_graph_opt.cpp index 3175d7f142524..e5a237f5f7541 100644 --- a/test/cpp/tensorexpr/test_graph_opt.cpp +++ b/test/cpp/tensorexpr/test_graph_opt.cpp @@ -45,7 +45,6 @@ TEST_F(GraphOpt, OptimizeCat) { torch::jit::parseIR(graph_string, g.get()); g->lint(); - KernelScope kernel_scope; TensorExprKernel kernel(g); // The `aten::log` op must be moved to the inputs of `aten::cat`. @@ -88,7 +87,6 @@ TEST_F(GraphOpt, OptimizeCat2) { torch::jit::parseIR(graph_string, g.get()); g->lint(); - KernelScope kernel_scope; TensorExprKernel kernel(g); // The `aten::log` and `aten::tanh` ops must be moved to the inputs of @@ -137,7 +135,6 @@ TEST_F(GraphOpt, OptimizeCat3) { torch::jit::parseIR(graph_string, g.get()); g->lint(); - KernelScope kernel_scope; TensorExprKernel kernel(g); // The `aten::tanh` op must be moved to the inputs of `aten::cat`. @@ -183,7 +180,6 @@ TEST_F(GraphOpt, OptimizeCatWithTypePromotionInUser) { torch::jit::parseIR(graph_string, g.get()); g->lint(); - KernelScope kernel_scope; TensorExprKernel kernel(g); // The `aten::tanh` op must be moved to the inputs of `aten::cat`. @@ -227,7 +223,6 @@ TEST_F(GraphOpt, OptimizeCatWithTypePromotionInCat) { torch::jit::parseIR(graph_string, g.get()); g->lint(); - KernelScope kernel_scope; TensorExprKernel kernel(g); // No transformation should have happened because the `aten::cat` op performs @@ -257,7 +252,6 @@ TEST_F(GraphOpt, OptimizeCatNoSingleTensorElementwiseOp) { torch::jit::parseIR(graph_string, g.get()); g->lint(); - KernelScope kernel_scope; TensorExprKernel kernel(g); // No transformation is expected since the consumers of cat are not @@ -290,7 +284,6 @@ TEST_F(GraphOpt, OptimizeCatNoSingleTensorElementwiseOp2) { torch::jit::parseIR(graph_string, g.get()); g->lint(); - KernelScope kernel_scope; TensorExprKernel kernel(g); // No transformation is expected since the consumers of cat are not diff --git a/test/cpp/tensorexpr/test_ir_printer.cpp b/test/cpp/tensorexpr/test_ir_printer.cpp index e11ba06740181..820f12689acca 100644 --- a/test/cpp/tensorexpr/test_ir_printer.cpp +++ b/test/cpp/tensorexpr/test_ir_printer.cpp @@ -17,7 +17,6 @@ namespace jit { using namespace torch::jit::tensorexpr; TEST(IRPrinter, BasicValueTest) { - KernelScope kernel_scope; ExprHandle a = IntImm::make(2), b = IntImm::make(3); ExprHandle c = Add::make(a, b); @@ -27,7 +26,6 @@ TEST(IRPrinter, BasicValueTest) { } TEST(IRPrinter, BasicValueTest02) { - KernelScope kernel_scope; ExprHandle a(2.0f); ExprHandle b(3.0f); ExprHandle c(4.0f); @@ -40,7 +38,6 @@ TEST(IRPrinter, BasicValueTest02) { } TEST(IRPrinter, CastTest) { - KernelScope kernel_scope; VarHandle x("x", kHalf); VarHandle y("y", kFloat); ExprHandle body = ExprHandle(2.f) + @@ -52,7 +49,6 @@ TEST(IRPrinter, CastTest) { } TEST(IRPrinter, FunctionName) { - KernelScope kernel_scope; int M = 4; int N = 20; diff --git a/test/cpp/tensorexpr/test_ir_verifier.cpp b/test/cpp/tensorexpr/test_ir_verifier.cpp index 2c91d8b24b253..cbe15502ad1f9 100644 --- a/test/cpp/tensorexpr/test_ir_verifier.cpp +++ b/test/cpp/tensorexpr/test_ir_verifier.cpp @@ -17,7 +17,6 @@ namespace jit { using namespace torch::jit::tensorexpr; TEST(IRVerifier, BitwiseOps) { - KernelScope kernel_scope; VarPtr X = alloc("x", kInt); VarPtr Y = alloc("y", kFloat); { @@ -48,7 +47,6 @@ TEST(IRVerifier, BitwiseOps) { } TEST(IRVerifier, CompareSelect) { - KernelScope kernel_scope; ExprPtr X = alloc(1); ExprPtr Y = alloc(3.14f); { @@ -64,7 +62,6 @@ TEST(IRVerifier, CompareSelect) { } TEST(IRVerifier, Ramp) { - KernelScope kernel_scope; VarPtr I = alloc("i", kInt); VarPtr J = alloc("j", kFloat); { @@ -75,7 +72,6 @@ TEST(IRVerifier, Ramp) { } TEST(IRVerifier, Load) { - KernelScope kernel_scope; VarPtr I = alloc("i", kInt); VarPtr J = alloc("j", kLong); VarPtr K = alloc("k", kFloat); @@ -105,7 +101,6 @@ TEST(IRVerifier, Load) { } TEST(IRVerifier, IfThenElse) { - KernelScope kernel_scope; VarPtr I = alloc("i", kInt); VarPtr J = alloc("j", kLong); VarPtr K = alloc("k", kFloat); @@ -130,7 +125,6 @@ TEST(IRVerifier, IfThenElse) { } TEST(IRVerifier, For) { - KernelScope kernel_scope; VarPtr I = alloc("i", kInt); VarPtr J = alloc("j", kInt); StmtPtr body = alloc(std::vector({})); @@ -143,7 +137,6 @@ TEST(IRVerifier, For) { } TEST(IRVerifier, Block) { - KernelScope kernel_scope; VarPtr I = alloc("i", kInt); BufPtr B = alloc("B", std::vector({alloc(10)}), kInt); { @@ -160,7 +153,6 @@ TEST(IRVerifier, Block) { } TEST(IRVerifier, Store) { - KernelScope kernel_scope; VarPtr I = alloc("i", kInt); VarPtr J = alloc("j", kLong); VarPtr K = alloc("k", kFloat); diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 765522ecf6cd4..e14282f258893 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -39,7 +39,6 @@ TEST_F(Kernel, InliningIntermediates) { %4 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %2) %5: Float(5, 3, strides=[3, 1]) = aten::add(%4, %1, %one) return (%5))IR"; - KernelScope kernel_scope; auto graph = std::make_shared(); parseIR(graph_string, &*graph); TensorExprKernel k(graph); @@ -63,7 +62,6 @@ TEST_F(Kernel, InliningIntermediates) { continue; } - KernelScope kernel_scope; TemplateEnv env; env.s("device", use_cuda ? "cuda:0" : "cpu"); const auto graph_string = format(graph_template, env); @@ -88,8 +86,6 @@ TEST_F(Kernel, InliningIntermediates) { } TEST_F(Kernel, _1) { - KernelScope kernel_scope; - const auto graph_string = R"IR( graph(%0 : Float(5, 3, strides=[3, 1], device=cpu), %1 : Float(5, 3, strides=[3, 1], device=cpu)): @@ -127,8 +123,6 @@ TEST_F(Kernel, _1) { } TEST_F(Kernel, _2) { - KernelScope kernel_scope; - const auto graph_string = R"IR( graph(%0 : Float(5, 3, strides=[3, 1], device=cpu), %1 : Float(5, 3, strides=[1, 5], device=cpu)): @@ -167,8 +161,6 @@ TEST_F(Kernel, _2) { } TEST_F(Kernel, _3) { - KernelScope kernel_scope; - const auto graph_string = R"IR( graph(%0 : Float(5, 3, strides=[3, 1], device=cpu), %1 : Float(5, 3, strides=[12, 2], device=cpu)): @@ -212,8 +204,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) { // Test TensorExpr shape inference capabilities: it should only require shapes // for the inputs { - KernelScope kernel_scope; - const auto graph_string = R"IR( graph(%0 : Float(5, 3, strides=[3, 1], device=cpu), %1 : Float(5, 3, strides=[12, 2], device=cpu)): @@ -251,8 +241,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) { } } { - KernelScope kernel_scope; - const auto graph_string = R"IR( graph(%0 : Float(8, 8, strides=[8, 1], device=cpu), %1 : Float(8, 8, strides=[8, 1], device=cpu)): @@ -292,7 +280,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) { } { // Test that shape inference handles aten::unsqueeze - KernelScope kernel_scope; const auto graph_string = R"IR( graph(%a : Float(4, 2, strides=[2, 1], device=cpu), @@ -355,7 +342,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) { } { // Test that shape inference handles aten::cat - KernelScope kernel_scope; const auto graph_string = R"IR( graph(%a : Float(5, 3, 2, strides=[6, 2, 1], device=cpu), @@ -409,7 +395,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) { } { // Test that we throw an error when input list for aten::cat is empty - KernelScope kernel_scope; const auto graph_string = R"IR( graph(): @@ -427,7 +412,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) { } { // Test that we throw an error when 'dim' passed to aten::cat is invalid - KernelScope kernel_scope; const auto ir_dim_99 = R"IR( graph(%a : Float(5, 3, 2, strides=[6, 2, 1], device=cpu), @@ -458,7 +442,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) { TEST_F(Kernel, CatInputTypesPromotion) { { // Test that we properly promote input types for aten::cat - KernelScope kernel_scope; const auto graph_string = R"IR( graph(%a : Float(5, 3, 2, strides=[6, 2, 1], device=cpu), @@ -676,7 +659,6 @@ TEST_F(Kernel, SumAllAxes) { auto a = iotaTensor({5, 3}, TensorOptions(kCPU).dtype(at::kFloat)); for (auto scalar_type : {ScalarType::Undefined, ScalarType::Double}) { - KernelScope kernel_scope; TemplateEnv env; env.s("dtype", dtypeConstant(scalar_type)); if (scalar_type == ScalarType::Undefined) { @@ -745,7 +727,6 @@ TEST_F(Kernel, SumOneAxis) { for (int dim = -a.dim(); dim < a.dim(); ++dim) { for (bool keepdim : {false, true}) { for (auto scalar_type : {ScalarType::Undefined, ScalarType::Double}) { - KernelScope kernel_scope; TemplateEnv env; env.d("dim", dim); env.d("keepdim", keepdim); @@ -812,7 +793,6 @@ TEST_F(Kernel, SumMultipleAxes) { for (int dim1 = 0; dim1 < a.dim(); ++dim1) { for (int dim2 = dim1 + 1; dim2 < a.dim(); ++dim2) { for (bool keepdim : {false, true}) { - KernelScope kernel_scope; TemplateEnv env; env.d("dim1", dim1); env.d("dim2", dim2); @@ -888,7 +868,6 @@ TEST_F(Kernel, Softmax2D) { auto other_dim = (softmax_dim + 1) % a.dim(); auto ref = log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim); - KernelScope kernel_scope; TemplateEnv env; env.d("dim", softmax_dim); env.s("op", log_softmax ? "log_softmax" : "softmax"); @@ -964,7 +943,6 @@ TEST_F(Kernel, Softmax3D) { auto ref = log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim); - KernelScope kernel_scope; TemplateEnv env; env.d("dim", softmax_dim); env.s("op", log_softmax ? "log_softmax" : "softmax"); @@ -1046,7 +1024,6 @@ TEST_F(Kernel, Softmax4D) { auto ref = log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim); - KernelScope kernel_scope; TemplateEnv env; env.d("dim", softmax_dim); env.s("op", log_softmax ? "log_softmax" : "softmax"); @@ -1090,8 +1067,6 @@ TEST_F(Kernel, Softmax4D) { } TEST_F(Kernel, InlineProducerIntoReduction) { - KernelScope kernel_scope; - // Inline producer (mul) into reduction (sum). const auto graph_string = R"IR( graph(%0 : Float(5, 3, strides=[3, 1], device=cpu), @@ -1129,8 +1104,6 @@ TEST_F(Kernel, InlineProducerIntoReduction) { } TEST_F(Kernel, InlineReductionIntoConsumer) { - KernelScope kernel_scope; - // Inline producer (mul %2) into reduction (sum %4) but DO NOT // inline the reduction into consumer (mul %4). const auto graph_string = R"IR( @@ -1179,7 +1152,6 @@ TEST_F(Kernel, SanitizeNames_CUDA) { %2 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %1) %4 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %2) return (%4))IR"; - KernelScope kernel_scope; auto graph = std::make_shared(); parseIR(graph_string, &*graph); graph->inputs().at(0)->setDebugName("aten::add:"); @@ -1204,7 +1176,6 @@ TEST_F(Kernel, ConstantTensors) { %y : Float(16, 16, strides=[16, 1], device=cpu) = aten::ones(%sizes, %none, %none, %none, %none) %z : Float(16, 16, strides=[16, 1], device=cpu) = aten::mul(%x, %y) return (%z))IR"; - KernelScope kernel_scope; auto graph = std::make_shared(); parseIR(graph_string, &*graph); // IRParser doesn't support tensor constants, so we insert a call to @@ -1237,7 +1208,6 @@ TEST_F(Kernel, ConstantTensorsNonContiguous) { %y : Tensor = aten::t(%y_t) %z : Float(16, 16, strides=[16, 1], device=cpu) = aten::mul(%x, %y) return (%z))IR"; - KernelScope kernel_scope; auto graph = std::make_shared(); parseIR(graph_string, &*graph); // IRParser doesn't support tensor constants, so we generate several aten @@ -1261,7 +1231,6 @@ TEST_F(Kernel, ConstantTensorsNonContiguous) { TEST_F(Kernel, RunFast) { #ifdef TORCH_ENABLE_LLVM // TODO: Implement call_raw in IREval and remove the ifdef - KernelScope kernel_scope; const auto graph_string = R"IR( graph(%0 : Float(5, 3, strides=[3, 1], device=cpu), @@ -1301,7 +1270,6 @@ TEST_F(Kernel, CodegenInspection) { %y : Tensor = aten::t(%y_t) %z : Float(16, 16, strides=[16, 1], device=cpu) = aten::mul(%x, %y) return (%z))IR"; - KernelScope kernel_scope; auto graph = std::make_shared(); parseIR(graph_string, &*graph); // IRParser doesn't support tensor constants, so we generate several aten @@ -1353,7 +1321,6 @@ TEST_F(Kernel, CustomLowering) { %y : Float(2, 2, strides=[2, 1], requires_grad=0, device=cpu) = aten::nan_to_num(%x, %none, %none, %none) return (%y) )IR"; - KernelScope kernel_scope; auto graph = std::make_shared(); parseIR(graph_string, &*graph); diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 6081403c25650..139763b071317 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -36,7 +36,6 @@ using LLVMExprEval = ExprEval; #define IMM_TEST(Type, Name, Val) \ TEST(LLVM, Name##ImmTest) { \ - KernelScope kernel_scope; \ auto a = Name##Imm::make(Val); \ LLVMExprEval cg(a); \ if (std::is_floating_point()) { \ @@ -50,7 +49,6 @@ TEST_LLVM_SCALAR_TYPES(IMM_TEST) #define ADD_TEST(Type, Name, Val) \ TEST(LLVM, Name##AddTest) { \ - KernelScope kernel_scope; \ auto a = Name##Imm::make(Val); \ auto b = Name##Imm::make(Val * 2); \ auto c = Add::make(a, b); \ @@ -66,7 +64,6 @@ TEST_LLVM_SCALAR_TYPES(ADD_TEST) #define SUB_TEST(Type, Name, Val) \ TEST(LLVM, Name##SubTest) { \ - KernelScope kernel_scope; \ auto a = Name##Imm::make(Val * 2); \ auto b = Name##Imm::make(Val); \ auto c = Sub::make(a, b); \ @@ -82,7 +79,6 @@ TEST_LLVM_SCALAR_TYPES(SUB_TEST) #define MUL_TEST(Type, Name, Val) \ TEST(LLVM, Name##MulTest) { \ - KernelScope kernel_scope; \ auto a = Name##Imm::make(Val); \ auto b = Name##Imm::make((Type)4); \ auto c = Mul::make(a, b); \ @@ -98,7 +94,6 @@ TEST_LLVM_SCALAR_TYPES(MUL_TEST) #define DIV_TEST(Type, Name, Val) \ TEST(LLVM, Name##DivTest) { \ - KernelScope kernel_scope; \ auto a = Name##Imm::make((Type)6); \ auto b = Name##Imm::make((Type)3); \ auto c = Div::make(a, b); \ @@ -113,7 +108,6 @@ TEST_LLVM_SCALAR_TYPES(DIV_TEST) #undef DIV_TEST TEST(LLVM, IntToFloatCastTest) { - KernelScope kernel_scope; auto a = IntImm::make(2); auto b = Cast::make(kFloat, a); LLVMExprEval cg(b, {}); @@ -121,7 +115,6 @@ TEST(LLVM, IntToFloatCastTest) { } TEST(LLVM, FloatToIntCastTest) { - KernelScope kernel_scope; auto a = FloatImm::make(2.0); auto b = Cast::make(kInt, a); LLVMExprEval cg(b); @@ -129,7 +122,6 @@ TEST(LLVM, FloatToIntCastTest) { } TEST(LLVM, IntToLongCastTest) { - KernelScope kernel_scope; auto a = IntImm::make(12345); auto b = Cast::make(kLong, a); LLVMExprEval cg(b); @@ -137,7 +129,6 @@ TEST(LLVM, IntToLongCastTest) { } TEST(LLVM, ByteToCharCastTest) { - KernelScope kernel_scope; auto a = ByteImm::make(250); auto b = Cast::make(kChar, a); LLVMExprEval cg(b); @@ -145,7 +136,6 @@ TEST(LLVM, ByteToCharCastTest) { } TEST(LLVM, HalfToLongCastTest) { - KernelScope kernel_scope; auto a = HalfImm::make(2.0); auto b = Cast::make(kLong, a); LLVMExprEval cg(b); @@ -153,7 +143,6 @@ TEST(LLVM, HalfToLongCastTest) { } TEST(LLVM, ByteToDoubleCastTest) { - KernelScope kernel_scope; auto a = ByteImm::make(2); auto b = Cast::make(kDouble, a); LLVMExprEval cg(b); @@ -170,7 +159,6 @@ TEST(LLVM, BitCast) { // this is broken /*{ - KernelScope kernel_scope; at::Half k_; at::Half* k = &k_; *reinterpret_cast(k) = ref16; @@ -181,7 +169,6 @@ TEST(LLVM, BitCast) { }*/ { - KernelScope kernel_scope; float k = raw_bitcast(ref32); auto a = FloatImm::make(k); auto b = BitCast::make(kInt, a); @@ -190,7 +177,6 @@ TEST(LLVM, BitCast) { } { - KernelScope kernel_scope; double k = raw_bitcast(ref64); auto a = DoubleImm::make(k); auto b = BitCast::make(kLong, a); @@ -199,7 +185,6 @@ TEST(LLVM, BitCast) { } { - KernelScope kernel_scope; int64_t k = raw_bitcast(reff64); auto a = LongImm::make(k); auto b = BitCast::make(kDouble, a); @@ -208,7 +193,6 @@ TEST(LLVM, BitCast) { } { - KernelScope kernel_scope; int32_t k = raw_bitcast(reff32); auto a = IntImm::make(k); auto b = BitCast::make(kFloat, a); @@ -218,7 +202,6 @@ TEST(LLVM, BitCast) { } TEST(LLVM, fastLogFloat) { - KernelScope kernel_scope; const int kTotalSize = 128 * 128; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); @@ -250,8 +233,6 @@ TEST(LLVM, fastLogFloat) { } TEST(LLVM, LetTest01) { - KernelScope kernel_scope; - Placeholder a(BufHandle("A", {1}, kFloat)); std::vector v = {1, 0}; std::vector args({v.data()}); @@ -267,8 +248,6 @@ TEST(LLVM, LetTest01) { } TEST(LLVM, LetTest02) { - KernelScope kernel_scope; - Placeholder a(BufHandle("A", {1}, kFloat)); std::vector v = {1, 0}; std::vector args({v.data()}); @@ -287,8 +266,6 @@ TEST(LLVM, LetTest02) { } TEST(LLVM, LetTestMultitype) { - KernelScope kernel_scope; - Placeholder a(BufHandle("A", {1}, kDouble)); std::vector v = {1, 0}; std::vector args({v.data()}); @@ -310,7 +287,6 @@ TEST(LLVM, LetTestMultitype) { } TEST(LLVM, BufferTest) { - KernelScope kernel_scope; Placeholder a(BufHandle("A", {32}, kFloat)); std::vector v(5); std::vector args({v.data()}); @@ -320,7 +296,6 @@ TEST(LLVM, BufferTest) { } TEST(LLVM, BlockTest) { - KernelScope kernel_scope; Placeholder a(BufHandle("A", {32}, kInt)); std::vector v = {1, 2}; std::vector args({v.data()}); @@ -338,7 +313,6 @@ TEST(LLVM, BlockTest) { } TEST(LLVM, LoadStoreTest) { - KernelScope kernel_scope; Placeholder a(BufHandle("A", {1}, kInt)); Placeholder b(BufHandle("B", {1}, kInt)); std::vector a_buffer = {42}; @@ -353,7 +327,6 @@ TEST(LLVM, LoadStoreTest) { } TEST(LLVM, IfThenElseTest) { - KernelScope kernel_scope; Placeholder a(BufHandle("A", {1}, kInt)); Placeholder b(BufHandle("B", {1}, kInt)); Placeholder c(BufHandle("C", {1}, kInt)); @@ -371,8 +344,6 @@ TEST(LLVM, IfThenElseTest) { // if (x < 10) x = x + 1 TEST(LLVM, CondNoFalseBlockTest) { - KernelScope kernel_scope; - Placeholder x(BufHandle("X", {1}, kInt)); auto cmp = CompareSelect::make(x.load(0), 10, CompareSelectOperation::kLT); auto cond = Cond::make(cmp, x.store({0}, x.load(0) + 1), nullptr); @@ -396,8 +367,6 @@ TEST(LLVM, CondNoFalseBlockTest) { // x = x - 1; // } TEST(LLVM, CondTest) { - KernelScope kernel_scope; - Placeholder x(BufHandle("X", {1}, kInt)); auto cmp = CompareSelect::make(x.load(0), 10, CompareSelectOperation::kLT); auto cond = @@ -434,8 +403,6 @@ TEST(LLVM, CondTest) { // } // } TEST(LLVM, CondNestedTest) { - KernelScope kernel_scope; - Placeholder x(BufHandle("X", {1}, kInt)); auto true_cmp = CompareSelect::make(x.load(0), 5, CompareSelectOperation::kGT); @@ -470,7 +437,6 @@ TEST(LLVM, CondNestedTest) { } TEST(LLVM, DirectVectorization) { - KernelScope ks; constexpr int M = 3; constexpr int N = 64; BufHandle a("a", {M, N}, kFloat); @@ -491,7 +457,6 @@ TEST(LLVM, DirectVectorization) { } TEST(LLVM, VecLoadStoreTest) { - KernelScope kernel_scope; Placeholder a(BufHandle("A", {1}, kInt)); Placeholder b(BufHandle("B", {1}, kInt)); std::vector a_buffer = {1, 1, 1, 1}; @@ -513,7 +478,6 @@ TEST(LLVM, VecLoadStoreTest) { #define FLOAT_INTRINSICS_TEST(Name, Lanes) \ TEST(LLVM, VecFloat_##Name##Lane##Lanes##Test) { \ - KernelScope kernel_scope; \ Placeholder a(BufHandle("A", {1}, kFloat)); \ Placeholder b(BufHandle("B", {1}, kFloat)); \ float val = 0.5f; \ @@ -552,7 +516,6 @@ FLOAT_INTRINSICS_TEST(lgamma, 8) #define DOUBLE_INTRINSICS_TEST(Name, Lanes) \ TEST(LLVM, VecDouble_##Name##Lane##Lanes##Test) { \ - KernelScope kernel_scope; \ Placeholder a(BufHandle("A", {1}, kDouble)); \ Placeholder b(BufHandle("B", {1}, kDouble)); \ float val = 0.5f; \ @@ -590,7 +553,6 @@ DOUBLE_INTRINSICS_TEST(lgamma, 4) #undef DOUBLE_INTRINSICS_TEST TEST(LLVM, VectorizerLoadStoreTest) { - KernelScope kernel_scope; Placeholder a(BufHandle("A", {1}, kInt)); Tensor c = @@ -613,7 +575,6 @@ TEST(LLVM, VectorizerLoadStoreTest) { } TEST(LLVM, VectorizeBitCast) { - KernelScope kernel_scope; Placeholder a(BufHandle("A", {128}, kInt)); Tensor c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) { @@ -639,7 +600,6 @@ TEST(LLVM, VectorizeBitCast) { } TEST(LLVM, MemcpyTest) { - KernelScope kernel_scope; constexpr int N = 32; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -661,7 +621,6 @@ TEST(LLVM, MemcpyTest) { } TEST(LLVM, BzeroTest) { - KernelScope kernel_scope; constexpr int N = 32; Placeholder b(BufHandle("B", {N}, kInt)); std::vector b_buffer(N, 11); @@ -679,7 +638,6 @@ TEST(LLVM, BzeroTest) { } TEST(LLVM, ElemwiseAdd) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -705,7 +663,6 @@ TEST(LLVM, ElemwiseAdd) { } TEST(LLVM, ElemwiseAddFloat) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kFloat)); Placeholder b(BufHandle("B", {N}, kFloat)); @@ -731,7 +688,6 @@ TEST(LLVM, ElemwiseAddFloat) { } TEST(LLVM, ElemwiseLog10Float) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kFloat)); Placeholder b(BufHandle("B", {N}, kFloat)); @@ -758,7 +714,6 @@ TEST(LLVM, ElemwiseLog10Float) { } TEST(LLVM, ElemwiseLog1pFloat) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kFloat)); Placeholder b(BufHandle("B", {N}, kFloat)); @@ -785,7 +740,6 @@ TEST(LLVM, ElemwiseLog1pFloat) { } TEST(LLVM, ElemwiseMaxInt) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -812,7 +766,6 @@ TEST(LLVM, ElemwiseMaxInt) { } TEST(LLVM, ElemwiseMinInt) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -839,7 +792,6 @@ TEST(LLVM, ElemwiseMinInt) { } TEST(LLVM, ElemwiseMaxFloat) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kFloat)); Placeholder b(BufHandle("B", {N}, kFloat)); @@ -866,7 +818,6 @@ TEST(LLVM, ElemwiseMaxFloat) { } TEST(LLVM, ElemwiseMaxNaNFloat) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kFloat)); Placeholder b(BufHandle("B", {N}, kFloat)); @@ -894,7 +845,6 @@ TEST(LLVM, ElemwiseMaxNaNFloat) { } TEST(LLVM, ElemwiseMinFloat) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kFloat)); Placeholder b(BufHandle("B", {N}, kFloat)); @@ -921,7 +871,6 @@ TEST(LLVM, ElemwiseMinFloat) { } TEST(LLVM, ElemwiseMinNaNFloat) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kFloat)); Placeholder b(BufHandle("B", {N}, kFloat)); @@ -949,7 +898,6 @@ TEST(LLVM, ElemwiseMinNaNFloat) { } TEST(LLVM, ElemwiseMod) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -975,7 +923,6 @@ TEST(LLVM, ElemwiseMod) { } TEST(LLVM, CompareSelectIntEQ) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kInt)); Placeholder b(BufHandle("B", {N}, kInt)); @@ -1016,7 +963,6 @@ TEST(LLVM, CompareSelectIntEQ) { } TEST(LLVM, CompareSelectFloatEQ) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kFloat)); Placeholder b(BufHandle("B", {N}, kFloat)); @@ -1050,7 +996,6 @@ TEST(LLVM, CompareSelectFloatEQ) { } TEST(LLVM, CompareSelectByteGT) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kByte)); Placeholder b(BufHandle("B", {N}, kByte)); @@ -1091,7 +1036,6 @@ TEST(LLVM, CompareSelectByteGT) { } TEST(LLVM, CompareSelectByteGE) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kByte)); Placeholder b(BufHandle("B", {N}, kByte)); @@ -1127,7 +1071,6 @@ TEST(LLVM, CompareSelectByteGE) { } TEST(LLVM, CompareSelectByteLT) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kByte)); Placeholder b(BufHandle("B", {N}, kByte)); @@ -1168,7 +1111,6 @@ TEST(LLVM, CompareSelectByteLT) { } TEST(LLVM, CompareSelectByteLE) { - KernelScope kernel_scope; constexpr int N = 1024; Placeholder a(BufHandle("A", {N}, kByte)); Placeholder b(BufHandle("B", {N}, kByte)); @@ -1204,7 +1146,6 @@ TEST(LLVM, CompareSelectByteLE) { } TEST(LLVM, StoreFloat) { - KernelScope kernel_scope; Placeholder result(BufHandle("result", {1}, kFloat)); std::vector result_buffer = {0.0f}; auto expr = result.store({0}, FloatImm::make(3.14f)); @@ -1215,7 +1156,6 @@ TEST(LLVM, StoreFloat) { } TEST(LLVM, SimpleMath01) { - KernelScope kernel_scope; const int N = 1024; Tensor tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) { return cast(i * i + 1); @@ -1237,7 +1177,6 @@ TEST(LLVM, SimpleMath01) { } TEST(LLVM, ComputeMul) { - KernelScope kernel_scope; const int N = 1024; Placeholder a(BufHandle("a", {N}, kFloat)); Placeholder b(BufHandle("b", {N}, kFloat)); @@ -1260,7 +1199,6 @@ TEST(LLVM, ComputeMul) { } TEST(LLVM, BroadcastAdd) { - KernelScope kernel_scope; const int M = 32; const int N = 1024; Placeholder a(BufHandle("a", {M, N}, kFloat)); @@ -1293,7 +1231,6 @@ TEST(LLVM, BroadcastAdd) { } TEST(LLVM, BitwiseOps) { - KernelScope kernel_scope; auto a = IntImm::make(59); auto b = IntImm::make(11); auto c = IntImm::make(101); @@ -1306,7 +1243,6 @@ TEST(LLVM, BitwiseOps) { } TEST(LLVM, ArithmeticRightShift) { - KernelScope ks; auto a = CharImm::make(-4); auto b = CharImm::make(1); ExprHandle f = a >> b; @@ -1315,7 +1251,6 @@ TEST(LLVM, ArithmeticRightShift) { } TEST(LLVM, LogicalRightShift) { - KernelScope ks; auto a = ByteImm::make(0xfc); auto b = ByteImm::make(1); ExprHandle f = a >> b; @@ -1324,7 +1259,6 @@ TEST(LLVM, LogicalRightShift) { } TEST(LLVM, DynamicShapeAdd) { - KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); @@ -1346,7 +1280,6 @@ TEST(LLVM, DynamicShapeAdd) { } TEST(LLVM, BindDynamicShapeAdd) { - KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); @@ -1367,7 +1300,6 @@ TEST(LLVM, BindDynamicShapeAdd) { } TEST(LLVM, TensorDynamicShapeAdd) { - KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); @@ -1390,7 +1322,6 @@ TEST(LLVM, TensorDynamicShapeAdd) { } TEST(LLVM, DynamicShape2D) { - KernelScope kernel_scope; auto testWithSize = [](int32_t M, int32_t N) { VarHandle m("m", kInt); VarHandle n("n", kInt); @@ -1416,7 +1347,6 @@ TEST(LLVM, DynamicShape2D) { } TEST(LLVM, EmptyStmt) { - KernelScope kernel_scope; StmtPtr s = alloc(std::vector({})); LLVMCodeGen cg(s, {}); @@ -1425,7 +1355,6 @@ TEST(LLVM, EmptyStmt) { } TEST(LLVM, EliminatedStmt) { - KernelScope kernel_scope; Placeholder a(BufHandle("a", {1}, kFloat)); Tensor c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; }); @@ -1441,8 +1370,6 @@ TEST(LLVM, EliminatedStmt) { } TEST(LLVM, SimpleReduction) { - KernelScope kernel_scope; - int M = 128; int N = 64; const int kTotalSize = M * N; @@ -1480,8 +1407,6 @@ TEST(LLVM, SimpleReduction) { } TEST(LLVM, RFactorReduction) { - KernelScope kernel_scope; - int M = 128; int N = 64; const int kTotalSize = M * N; @@ -1530,8 +1455,6 @@ TEST(LLVM, RFactorReduction) { } TEST(LLVM, RFactorVectorizedReduction) { - KernelScope kernel_scope; - int M = 128; int N = 64; const int kTotalSize = M * N; @@ -1582,7 +1505,6 @@ TEST(LLVM, SimpleParallel) { for (int test_cfg = 0; test_cfg < 4; test_cfg++) { // Compute a simple operation, and try all loop-axis combination to be // parallel or sequential. - KernelScope kernel_scope; const int M = 4; const int N = 6; Tensor f = Compute( @@ -1623,7 +1545,6 @@ TEST(LLVM, CompositeParallel) { // Compute a composite operation, and try all loop-axis combination to be // parallel or sequential. for (int test_cfg = 0; test_cfg < test_count; test_cfg++) { - KernelScope kernel_scope; int M = 5; int N = 7; Tensor t1 = @@ -1687,8 +1608,6 @@ TEST(LLVM, CompositeParallel) { } TEST(LLVM, VectorizedGEMM) { - KernelScope ks; - int M = 32; int N = 32; int K = 48; @@ -1771,7 +1690,6 @@ TEST(LLVM, VectorizedGEMM) { } TEST(LLVM, CallRaw) { - KernelScope kernel_scope; const int M = 32; VarHandle N("N", kInt); Placeholder a(BufHandle("a", {M, N}, kFloat)); @@ -1813,7 +1731,6 @@ TEST(LLVM, CallRaw) { } TEST(LLVM, CustomTarget) { - KernelScope kernel_scope; constexpr int M = 16; Placeholder a("a", kFloat, {M}); Placeholder b("b", kFloat, {M}); diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 7c3eefaab3b1c..28934f622d057 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -41,7 +41,6 @@ void checkExprIR(const ExprHandle& e, const std::string& pattern) { } TEST(LoopNest, ExprSimple01) { - KernelScope kernel_scope; Tensor tensor = Compute( "f", {{16, "X"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; @@ -54,7 +53,6 @@ TEST(LoopNest, ExprSimple01) { } TEST(LoopNest, ExprLower01) { - KernelScope kernel_scope; Tensor tensor = Compute( "f", {{16, "x"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; @@ -68,7 +66,6 @@ TEST(LoopNest, ExprLower01) { } TEST(LoopNest, ExprSimple02) { - KernelScope kernel_scope; auto func = [](const ExprHandle& x, const ExprHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }; @@ -162,7 +159,6 @@ void assertForRanges( } TEST(LoopNest, ExprSliceHeadWithLoopOptions) { - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -186,7 +182,6 @@ TEST(LoopNest, ExprSliceHeadWithLoopOptions) { } TEST(LoopNest, ExprSliceTailWithLoopOptions) { - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -219,7 +214,6 @@ TEST(LoopNest, ExprSliceTailWithLoopOptions) { TEST(LoopNest, ExprSliceHeadWhenFactorEqualsSize) { // When factor equals the For loop's original size, keep using the original // For loop. - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -240,7 +234,6 @@ TEST(LoopNest, ExprSliceHeadWhenFactorEqualsSize) { } TEST(LoopNest, ExprSliceHeadWhenFactorLargerThanSize) { - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -261,7 +254,6 @@ TEST(LoopNest, ExprSliceHeadWhenFactorLargerThanSize) { } TEST(LoopNest, ExprSliceHead) { - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -284,7 +276,6 @@ TEST(LoopNest, ExprSliceHead) { } TEST(LoopNest, ExprSliceHeadWithNonZeroStart) { - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -311,7 +302,6 @@ TEST(LoopNest, ExprSliceHeadWithNonZeroStart) { TEST(LoopNest, ExprSliceTailWhenFactorEqualsSize) { // When factor equals the For loop's original size, keep using the original // For loop. - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -334,7 +324,6 @@ TEST(LoopNest, ExprSliceTailWhenFactorEqualsSize) { TEST(LoopNest, ExprSliceTailWhenFactorLargerThanSize) { // When factor equals the For loop's original size, keep using the original // For loop. - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -355,7 +344,6 @@ TEST(LoopNest, ExprSliceTailWhenFactorLargerThanSize) { } TEST(LoopNest, ExprSliceTail) { - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -381,7 +369,6 @@ TEST(LoopNest, ExprSplitAndSlice) { // 0: splitWithTail // 1: sliceTail on inner loop // 2: sliceHead on outer loop - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -434,7 +421,6 @@ TEST(LoopNest, ExprSplitAndSlice) { TEST(LoopNest, ExprSliceAndNormalize) { // 0: sliceHead // 1: normalize tail - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -467,7 +453,6 @@ TEST(LoopNest, ExprSliceWithVariableDimension) { auto testWithDimension = [](int dimension, const std::vector>& expected_for_ranges) { - KernelScope kernel_scope; VarHandle dim("dim", kInt); Tensor tensor = Compute("f", {{dim, "x"}}, [](const ExprHandle& x) { return x; }); @@ -504,7 +489,6 @@ TEST(LoopNest, ExprSliceWithVariableDimension) { } TEST(LoopNest, ExprSplitWithTail) { - KernelScope kernel_scope; auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; @@ -534,7 +518,6 @@ TEST(LoopNest, ExprSplitWithTail) { } TEST(LoopNest, ExprSplitWithTailNone) { - KernelScope kernel_scope; auto func = [](const ExprHandle& x, const ExprHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }; @@ -592,7 +575,6 @@ TEST(LoopNest, ExprSplitWithTailNone) { } TEST(LoopNest, ExprSplitWithMask01) { - KernelScope kernel_scope; const int M = 26; const int N = 5; Placeholder a_buf("a", kFloat, {M, N}); @@ -628,7 +610,6 @@ TEST(LoopNest, ExprSplitWithMask01) { // Tests the case where we split a loop cleanly multiple times, we should not // insert any masks. TEST(LoopNest, ExprSplitWithMaskRepeatedNoMask) { - KernelScope kernel_scope; const int M = 64; Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); @@ -655,8 +636,6 @@ TEST(LoopNest, ExprSplitWithMaskRepeatedNoMask) { } TEST(LoopNest, getLoopAt) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 100; i++) { // for (int j = 0; j < 100; j++) { @@ -714,7 +693,6 @@ TEST(LoopNest, getLoopAt) { } TEST(LoopNest, TileSimple) { - KernelScope kernel_scope; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) const int M = 64, N = 64; Placeholder a_buf("a", kFloat, {M, N}); @@ -760,7 +738,6 @@ TEST(LoopNest, TileSimple) { } TEST(LoopNest, TileWithTails) { - KernelScope kernel_scope; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) const int M = 64, N = 64; Placeholder a_buf("a", kFloat, {M, N}); @@ -807,7 +784,6 @@ TEST(LoopNest, TileWithTails) { } TEST(LoopNest, TileInMiddle) { - KernelScope kernel_scope; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) const int M = 8, N = 8, L = 8, K = 8; Placeholder a_buf("a", kFloat, {M, N, L, K}); @@ -868,7 +844,6 @@ TEST(LoopNest, TileInMiddle) { } TEST(LoopNest, SplitWithTailWithLoopOptions) { - KernelScope kernel_scope; const int M = 21; Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); @@ -899,7 +874,6 @@ TEST(LoopNest, SplitWithTailWithLoopOptions) { } TEST(LoopNest, SplitWithMaskWithLoopOptions) { - KernelScope kernel_scope; const int M = 21; Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); @@ -924,7 +898,6 @@ TEST(LoopNest, SplitWithMaskWithLoopOptions) { } TEST(LoopNest, ScheduleBroadcastAddBuffer) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -973,7 +946,6 @@ TEST(LoopNest, ScheduleBroadcastAddBuffer) { } TEST(LoopNest, ScheduleFunctionCall01) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1030,7 +1002,6 @@ TEST(LoopNest, ScheduleFunctionCall01) { } TEST(LoopNest, ScheduleInlineSimple) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1111,7 +1082,6 @@ static std::string remove_space(const std::string& str) { } void InlineFunc01Helper(const std::vector& inline_order) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1230,7 +1200,6 @@ TEST(LoopNest, ScheduleInlineFunc01) { // Make sure we cache random vars if we should. TEST(LoopNest, ScheduleInlineRandom) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1266,7 +1235,6 @@ TEST(LoopNest, ScheduleInlineRandom) { // Make sure we don't cache random vars that are not being inlined. TEST(LoopNest, ScheduleInlineRandomUnrelated) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1303,7 +1271,6 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) { // Make sure we generate the right number of random values == the dimensionality // of the production tensor. TEST(LoopNest, ScheduleInlineRandomLowerDimensions) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1336,7 +1303,6 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) { // Make sure we don't screw up intrinsics thinking they're rand. TEST(LoopNest, ScheduleInlineIntrinsics) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1397,7 +1363,6 @@ TEST(LoopNest, ScheduleInlineIntrinsics) { // Make sure we can handle rand and non-rand intrinsics. TEST(LoopNest, ScheduleInlineRandWithIntrinsics) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1431,7 +1396,6 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) { // Split a Compute then inline it into another compute. TEST(LoopNest, ScheduleSplitAThenInline) { - KernelScope kernel_scope; Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { @@ -1446,7 +1410,6 @@ TEST(LoopNest, ScheduleSplitAThenInline) { // Split a Compute then inline another Compute into it. TEST(LoopNest, ScheduleSplitBThenInline) { - KernelScope kernel_scope; Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { @@ -1471,7 +1434,6 @@ TEST(LoopNest, ScheduleSplitBThenInline) { // Split a Compute twice then inline it. TEST(LoopNest, ScheduleSplitTwiceThenInline) { - KernelScope kernel_scope; Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { @@ -1489,7 +1451,6 @@ TEST(LoopNest, ScheduleSplitTwiceThenInline) { // Inline a Compute, then split. TEST(LoopNest, ScheduleInlineThenSplit) { - KernelScope kernel_scope; Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { @@ -1514,7 +1475,6 @@ TEST(LoopNest, ScheduleInlineThenSplit) { // Split a Compute, inline it, then split the result. TEST(LoopNest, ScheduleSplitInlineThenSplit) { - KernelScope kernel_scope; Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); Tensor b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) { @@ -1541,7 +1501,6 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) { // Oversplit a loop that is simplified out after inlining. TEST(LoopNest, ScheduleSplitInlineSimplify) { - KernelScope kernel_scope; Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return ExprHandle(4) * i - ExprHandle(2) * i; }); @@ -1557,7 +1516,6 @@ TEST(LoopNest, ScheduleSplitInlineSimplify) { // Inline a Compute with two consumers. TEST(LoopNest, ScheduleInlineThreeMixedOnce) { - KernelScope kernel_scope; Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { @@ -1587,7 +1545,6 @@ TEST(LoopNest, ScheduleInlineThreeMixedOnce) { // Inline Compute A into B, then inline B into C. TEST(LoopNest, ScheduleInlineThreeMixedTwice) { - KernelScope kernel_scope; Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { @@ -1618,7 +1575,6 @@ TEST(LoopNest, ScheduleInlineThreeMixedTwice) { // Inline a Compute that is both a producer and consumer. TEST(LoopNest, ScheduleInlineThreeMixedInner) { - KernelScope kernel_scope; Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { @@ -1648,7 +1604,6 @@ TEST(LoopNest, ScheduleInlineThreeMixedInner) { // Split 3 Computes, then inline the first two into the last. TEST(LoopNest, ScheduleInlineThreeMixedSplit) { - KernelScope kernel_scope; Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { @@ -1672,7 +1627,6 @@ TEST(LoopNest, ScheduleInlineThreeMixedSplit) { // Check that inlining works for output tensors too TEST(LoopNest, ScheduleInlineOutputTensors) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1710,7 +1664,6 @@ TEST(LoopNest, ScheduleInlineOutputTensors) { } TEST(LoopNest, ScheduleFuserStyle) { - KernelScope kernel_scope; const int kVectorSize = 8; const int kVectorCount = 128; const int kTotalSize = kVectorSize * kVectorCount; @@ -1743,7 +1696,6 @@ TEST(LoopNest, ScheduleFuserStyle) { } TEST(LoopNest, ScheduleFuserThreeArg) { - KernelScope kernel_scope; const int kVectorSize = 8; const int kVectorCount = 128; const int kTotalSize = kVectorSize * kVectorCount; @@ -1782,7 +1734,6 @@ TEST(LoopNest, ScheduleFuserThreeArg) { } TEST(LoopNest, ScheduleDynamicShape2D) { - KernelScope kernel_scope; auto testWithSize = [](int32_t M, int32_t N) { VarHandle m("m", kInt); VarHandle n("n", kInt); @@ -1821,7 +1772,6 @@ TEST(LoopNest, LoopNestComputeAt_1) { // should be in that loop after the transformation. Also, computation of A // should not be inlined into B. Instead, it should be computed into the temp, // and the temp should be used in B. - KernelScope kernel_scope; VarHandle N("N", kInt); Tensor A = Compute( "A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; }); @@ -1867,7 +1817,6 @@ TEST(LoopNest, LoopNestComputeAt_2) { // p[cy,cx+1] + p[cy+1,cx+1] // } // } - KernelScope kernel_scope; const int kW = 16, kH = 16; VarHandle W("W", kInt); @@ -1955,7 +1904,6 @@ TEST(LoopNest, LoopNestComputeAt_3) { // D(x,y) = A(x, y+1) + C(x, y) // // i.e. when 'A' comes to 'D' directly and indirectly through 'C'. - KernelScope kernel_scope; const int kW = 16, kH = 16; VarHandle W("W", kInt); @@ -2056,8 +2004,6 @@ TEST(LoopNest, LoopNestComputeAt_3) { using Axis = const VarHandle&; TEST(LoopNest, Reduce2dComputeAt) { - KernelScope kernel_scope; - const int kW = 16, kH = 16; VarHandle W("W", kInt); VarHandle H("H", kInt); @@ -2176,7 +2122,6 @@ TEST(LoopNest, DISABLED_Conv1d_NH) { // Lots of stuff is broken here. The computeAt swaps the axes for some odd // reason. Even without that, the index flattener fails due to "dimensions // mismatch in flatten index". - KernelScope kernel_scope; int N = 4; int H = 256; @@ -2270,7 +2215,6 @@ class LoopOrderHelper : public IRVisitor { }; TEST(LoopNest, LoopNestReorderAxis1) { - KernelScope kernel_scope; Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; @@ -2319,7 +2263,6 @@ TEST(LoopNest, LoopNestReorderAxis1) { } TEST(LoopNest, LoopNestReorderPartialAxes) { - KernelScope kernel_scope; Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}, {4, "z"}}, @@ -2367,7 +2310,6 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { } TEST(LoopNest, LoopNestReorderInternalAxis) { - KernelScope kernel_scope; Tensor tensor = Compute( "f", {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, @@ -2404,7 +2346,6 @@ TEST(LoopNest, LoopNestReorderInternalAxis) { } TEST(LoopNest, LoopNestReorderEnclosingAxis) { - KernelScope kernel_scope; Tensor tensor = Compute( "f", {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, @@ -2440,7 +2381,6 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) { } TEST(LoopNest, LoopNestReorderSameAxis) { - KernelScope kernel_scope; Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; @@ -2469,8 +2409,6 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { * Stmt 4 */ - KernelScope kernel_scope; - Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}, {4, "z"}}, @@ -2605,8 +2543,6 @@ void LoopNestReorderTestHelper( bool append, int index1, int index2) { - KernelScope kernel_scope; - Tensor c = Compute( "5d", {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}}, @@ -2722,7 +2658,6 @@ TEST(LoopNest, LoopNestReorderLongStringFull) { } TEST(LoopNest, LoopNestReorderInternalLoopNest) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -2823,7 +2758,6 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { } TEST(LoopNest, OuterLoopVectorization) { - KernelScope kernel_scope; Tensor tensor = Compute( "f", {{8, "X"}, {8, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; @@ -2851,8 +2785,6 @@ TEST(LoopNest, OuterLoopVectorization) { } TEST(LoopNest, VectorizeLoopNotNormalized) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 10; i++) { // for (int j = 1; j < 5; j++) { @@ -2876,7 +2808,6 @@ TEST(LoopNest, VectorizeLoopNotNormalized) { namespace { std::string constantUpperBoundLoopIR(int upper_bound_val) { - KernelScope kernel_scope; ExprHandle upper_bound(upper_bound_val); Tensor A = Compute( "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); @@ -2903,7 +2834,6 @@ TEST(LoopNest, Unroll) { } TEST(LoopNest, UnrollOuter) { - KernelScope kernel_scope; ExprHandle outer_bound(3); ExprHandle inner_bound(4); Tensor A = Compute( @@ -2927,7 +2857,6 @@ TEST(LoopNest, UnrollOuter) { } TEST(LoopNest, UnrollInner) { - KernelScope kernel_scope; ExprHandle outer_bound(3); ExprHandle inner_bound(4); Tensor A = Compute( @@ -2949,7 +2878,6 @@ TEST(LoopNest, UnrollInner) { } TEST(LoopNest, UnrollMultipleStatements) { - KernelScope kernel_scope; const int kTotalSize = 3; BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt); BufHandle b_buf("B", {ExprHandle(kTotalSize)}, kInt); @@ -2975,8 +2903,6 @@ TEST(LoopNest, UnrollMultipleStatements) { } TEST(LoopNest, UnrollNonLiteralConstantBounds) { - KernelScope kernel_scope; - // Input IR: // for (int i = 2 - 1; i < 12 / 3; i++) { // for (int j = 0; j < 4; j++) { @@ -3021,7 +2947,6 @@ TEST(LoopNest, UnrollEmpty) { } TEST(LoopNest, NoUnroll) { - KernelScope kernel_scope; VarHandle upper_bound("N", kInt); Tensor A = Compute( "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); @@ -3033,7 +2958,6 @@ TEST(LoopNest, NoUnroll) { } TEST(LoopNest, UnrollWithLet) { - KernelScope kernel_scope; const int kTotalSize = 3; BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt); BufHandle b_buf("B", {ExprHandle(kTotalSize)}, kInt); @@ -3076,8 +3000,6 @@ TEST(LoopNest, UnrollWithLet) { } TEST(LoopNest, IsNormalized) { - KernelScope kernel_scope; - // Input IR: // for (int i = 50; i < 100; i++) { // A[i] = B[i]; @@ -3100,8 +3022,6 @@ TEST(LoopNest, IsNormalized) { } TEST(LoopNest, NormalizeStartPositive) { - KernelScope kernel_scope; - // Input IR: // for (int x = 50; x < 100; x++) { // A[x] = B[x]; @@ -3132,8 +3052,6 @@ TEST(LoopNest, NormalizeStartPositive) { } TEST(LoopNest, NormalizeStartNegative) { - KernelScope kernel_scope; - // Input IR: // for (int x = -50; x < 100; x++) { // A[x + 50] = B[x + 50]; @@ -3164,8 +3082,6 @@ TEST(LoopNest, NormalizeStartNegative) { } TEST(LoopNest, NormalizeStartZero) { - KernelScope kernel_scope; - // Input IR: // for (int x = 0; x < 100; x++) { // A[x] = B[x]; @@ -3198,8 +3114,6 @@ TEST(LoopNest, NormalizeStartZero) { } TEST(LoopNest, NormalizeStartVariable) { - KernelScope kernel_scope; - // Input IR: // for (int x = y; x < 100; x++) { // A[x] = B[x]; @@ -3232,8 +3146,6 @@ TEST(LoopNest, NormalizeStartVariable) { } TEST(LoopNest, NormalizeOnNestedOuterLoop) { - KernelScope kernel_scope; - // Input IR: // for (int x = 50; x < 100; x++) { // for (int y = 10; y < 100; y++) { @@ -3266,8 +3178,6 @@ TEST(LoopNest, NormalizeOnNestedOuterLoop) { } TEST(LoopNest, NormalizeOnNestedInnerLoop) { - KernelScope kernel_scope; - // Input IR: // for (int x = 50; x < 100; x++) { // for (int y = 10; y < 100; y++) { @@ -3300,8 +3210,6 @@ TEST(LoopNest, NormalizeOnNestedInnerLoop) { } TEST(LoopNest, NormalizeAndSplitWithTail) { - KernelScope kernel_scope; - // Create a dummy tensor to construct LoopNest. ExprHandle n(100); Placeholder a(BufHandle("a", {n}, kFloat)); @@ -3349,8 +3257,6 @@ TEST(LoopNest, NormalizeAndSplitWithTail) { } TEST(LoopNest, FlattenSimpleLoopNest2D) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 10; i++) { // for (int j = 0; j < 5; j++) { @@ -3392,8 +3298,6 @@ TEST(LoopNest, FlattenSimpleLoopNest2D) { } TEST(LoopNest, FlattenSimpleLoopNest3D) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 10; i++) { // for (int j = 0; j < 5; j++) { @@ -3439,8 +3343,6 @@ TEST(LoopNest, FlattenSimpleLoopNest3D) { } TEST(LoopNest, FlattenLoopNestAfterNormalize) { - KernelScope kernel_scope; - // Input IR: // for (int i = 2; i < 10; i++) { // for (int j = 3; j < 15; j++) { @@ -3482,8 +3384,6 @@ TEST(LoopNest, FlattenLoopNestAfterNormalize) { } TEST(LoopNest, FlattenLoopNestWithNonLiteralConstantBounds) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 15-5; i++) { // for (int j = 0; j < 20/4; j++) { @@ -3524,8 +3424,6 @@ TEST(LoopNest, FlattenLoopNestWithNonLiteralConstantBounds) { } TEST(LoopNest, FlattenImperfectLoopNest) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 10; i++) { // A[i, i] = 0; @@ -3555,8 +3453,6 @@ TEST(LoopNest, FlattenImperfectLoopNest) { } TEST(LoopNest, FlattenReductionLoopNest) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 10; i++) { // S[i] = 0; @@ -3588,7 +3484,6 @@ TEST(LoopNest, FlattenReductionLoopNest) { } TEST(LoopNest, FlattenReductionLoopNestFromTensor) { - KernelScope kernel_scope; const int M = 3; const int N = 7; VarHandle m("m", kInt); @@ -3608,8 +3503,6 @@ TEST(LoopNest, FlattenReductionLoopNestFromTensor) { } TEST(LoopNest, FlattenIncorrectLoopsAsInput) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 10; i++) { // for (int j = 0; j < 5; j++) { @@ -3648,7 +3541,6 @@ TEST(LoopNest, FlattenIncorrectLoopsAsInput) { } TEST(LoopNest, DetectInlineRankMismatch) { - KernelScope kernel_scope; const int kTotalSize = 8; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); @@ -3666,8 +3558,6 @@ TEST(LoopNest, DetectInlineRankMismatch) { } TEST(LoopNest, CacheReadsSimple) { - KernelScope kernel_scope; - Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; @@ -3734,8 +3624,6 @@ TEST(LoopNest, CacheReadsSimple) { } TEST(LoopNest, CacheReadsOuter) { - KernelScope kernel_scope; - Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; @@ -3782,8 +3670,6 @@ TEST(LoopNest, CacheReadsOuter) { } TEST(LoopNest, CacheReadsInternal) { - KernelScope kernel_scope; - Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; @@ -3829,8 +3715,6 @@ TEST(LoopNest, CacheReadsInternal) { } TEST(LoopNest, CacheReadsInner) { - KernelScope kernel_scope; - Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; @@ -3877,8 +3761,6 @@ TEST(LoopNest, CacheReadsInner) { } TEST(LoopNest, CacheWritesSimple) { - KernelScope kernel_scope; - Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; @@ -3929,7 +3811,6 @@ TEST(LoopNest, CacheWritesSimple) { } TEST(LoopNest, DeadStoreElimination) { - KernelScope kernel_scope; VarHandle y("y", kInt); VarHandle x("x_tail", kInt); BufHandle f("f", {26, 5}, kInt); @@ -3970,7 +3851,6 @@ TEST(LoopNest, DeadStoreElimination) { } TEST(LoopNest, DeadStoreEliminationWithIntermediates) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); VarHandle z("z", kInt); @@ -4017,8 +3897,6 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) { } TEST(LoopNest, CompoundTensorSimple) { - KernelScope kernel_scope; - BufHandle a_buf("A", {10, 5}, kInt); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -4056,7 +3934,6 @@ TEST(LoopNest, CompoundTensorSimple) { } TEST(LoopNest, InlineConstantIndex) { - KernelScope kernel_scope; const int N = 10; Placeholder x_buf("a", kFloat, {1, N, 1}); Tensor y = Compute( @@ -4078,8 +3955,6 @@ TEST(LoopNest, InlineConstantIndex) { } TEST(LoopNest, CompoundTensorUsed) { - KernelScope kernel_scope; - BufHandle a_buf("A", {10, 5}, kInt); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -4124,8 +3999,6 @@ TEST(LoopNest, CompoundTensorUsed) { } TEST(LoopNest, InlineFromLoad) { - KernelScope kernel_scope; - constexpr int N = 1024; BufHandle a("A", {N}, kInt); BufHandle b("B", {N}, kInt); @@ -4150,8 +4023,6 @@ TEST(LoopNest, InlineFromLoad) { } TEST(LoopNest, OptimizeConditionalsSimple) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // A[i] = IfThenElse(i<5 ? 1 : 0, B[i], C[i-5]) @@ -4192,8 +4063,6 @@ TEST(LoopNest, OptimizeConditionalsSimple) { } TEST(LoopNest, OptimizeConditionalsNestedConditions) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // A[i] = IfThenElse(i<10, IfThenElse(i<5, B[i], C[i-5]), D[i-10]) @@ -4241,8 +4110,6 @@ TEST(LoopNest, OptimizeConditionalsNestedConditions) { } TEST(LoopNest, OptimizeConditionalsMultipleStores) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // A[i] = IfThenElse(i<5 ? 1 : 0, B[i], C[i-5]) @@ -4301,8 +4168,6 @@ TEST(LoopNest, OptimizeConditionalsMultipleStores) { } TEST(LoopNest, OptimizeConditionalsMultipleStoresInOneLoop) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 50; i++) { // A[i] = IfThenElse(i<5 ? 1 : 0, B[i], C[i-5]) @@ -4355,8 +4220,6 @@ TEST(LoopNest, OptimizeConditionalsMultipleStoresInOneLoop) { } TEST(LoopNest, OptimizeConditionalsOuterLoopVar) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 100; j++) { @@ -4400,8 +4263,6 @@ TEST(LoopNest, OptimizeConditionalsOuterLoopVar) { } TEST(LoopNest, OptimizeConditionalsCompValuesNotOrdered) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // A[i] = IfThenElse(i<5, IfThenElse(i<10, B[i], C[i-5]), D[i-10]) @@ -4441,8 +4302,6 @@ TEST(LoopNest, OptimizeConditionalsCompValuesNotOrdered) { } TEST(LoopNest, OptimizeConditionalsCompValuesNotConstants) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // A[i] = IfThenElse(i5, B[i], C[i-5]), D[i-10]) @@ -4524,8 +4381,6 @@ TEST(LoopNest, OptimizeConditionalsInvalidCondition) { } TEST(LoopNest, OptimizeConditionalsInvalidCondition2) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // A[i] = IfThenElse(10 5 { @@ -4867,7 +4710,6 @@ TEST(LoopNest, ReorderAxisWithMultipleConds) { } TEST(LoopNest, VectorizeUse) { - KernelScope kernel_scope; constexpr int N = 8; Placeholder a("a", kFloat, {N}); Tensor b = Compute( @@ -4898,8 +4740,6 @@ const char* int64Loop = R"IR( )IR"; TEST(LoopNest, Int64Direct) { - KernelScope kernel_scope; - constexpr int64_t N = 12; Placeholder a("a", kLong, {N}); Placeholder b("b", kLong, {N}); @@ -4912,8 +4752,6 @@ TEST(LoopNest, Int64Direct) { } TEST(LoopNest, Int64Compute) { - KernelScope kernel_scope; - constexpr int64_t N = 12; Placeholder a("a", kLong, {N}); Tensor b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) { @@ -4928,8 +4766,6 @@ TEST(LoopNest, Int64Compute) { } TEST(LoopNest, DistributeLoopWithAllStmtsAsPivots) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // A[i] = 0; @@ -4990,8 +4826,6 @@ TEST(LoopNest, DistributeLoopWithAllStmtsAsPivots) { } TEST(LoopNest, DistributeLoopWithOneStmtAsPivot) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // A[i] = 0; @@ -5049,8 +4883,6 @@ TEST(LoopNest, DistributeLoopWithOneStmtAsPivot) { } TEST(LoopNest, DistributeLoopWithoutAnyPivot) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // A[i] = 0; @@ -5111,8 +4943,6 @@ TEST(LoopNest, DistributeLoopWithoutAnyPivot) { } TEST(LoopNest, DistributeLoopOverInnerLoops) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // A[i] = 0; @@ -5170,8 +5000,6 @@ TEST(LoopNest, DistributeLoopOverInnerLoops) { } TEST(LoopNest, DistributeLoopAndParentsWithoutAnyPivot) { - KernelScope kernel_scope; - // Input IR: // for (int m = 0; m < 50; m++) { // for (int i = 0; i < 20; i++) { @@ -5281,8 +5109,6 @@ TEST(LoopNest, DistributeLoopAndParentsWithoutAnyPivot) { } TEST(LoopNest, fuseLoopsSimple) { - KernelScope kernel_scope; - // Input IR: // for (int j = 0; j < 100; j++) { // A[j] = 10 * j; @@ -5317,8 +5143,6 @@ TEST(LoopNest, fuseLoopsSimple) { } TEST(LoopNest, fuseLoopsMultiple) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 100; i++) { // A[i+100] = 20 + i; @@ -5360,8 +5184,6 @@ TEST(LoopNest, fuseLoopsMultiple) { } TEST(LoopNest, fuseLoopsNested) { - KernelScope kernel_scope; - // Input IR: // for (int m = 0; m < 20; m++) { // A[m] = 0; @@ -5422,8 +5244,6 @@ TEST(LoopNest, fuseLoopsNested) { } TEST(LoopNest, fuseLoopsNested2D) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 100; j++) { @@ -5482,8 +5302,6 @@ TEST(LoopNest, fuseLoopsNested2D) { } TEST(LoopNest, fuseLoopsNested2DInner) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 100; j++) { @@ -5524,8 +5342,6 @@ TEST(LoopNest, fuseLoopsNested2DInner) { } TEST(LoopNest, fuseLoopsDifferentStopBounds) { - KernelScope kernel_scope; - // Input IR: // for (int j = 0; j < 100; j++) { // A[j] = 10 * j; @@ -5547,8 +5363,6 @@ TEST(LoopNest, fuseLoopsDifferentStopBounds) { } TEST(LoopNest, fuseLoopsDifferentStartBounds) { - KernelScope kernel_scope; - // Input IR: // for (int j = 0; j < 100; j++) { // A[j] = 10 * j; @@ -5570,8 +5384,6 @@ TEST(LoopNest, fuseLoopsDifferentStartBounds) { } TEST(LoopNest, fuseLoopsNotContiguous) { - KernelScope kernel_scope; - // Input IR: // for (int j = 0; j < 100; j++) { // A[j] = 10 * j; @@ -5595,8 +5407,6 @@ TEST(LoopNest, fuseLoopsNotContiguous) { } TEST(LoopNest, fuseLoopsWithDifferentParents) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 50; i++) { // for (int j = 0; j < 100; j++) { @@ -5624,8 +5434,6 @@ TEST(LoopNest, fuseLoopsWithDifferentParents) { } TEST(LoopNest, fuseLoopsWithVariableBounds) { - KernelScope kernel_scope; - // Input IR: // for (int j = 0; j < N; j++) { // A[j] = 10 * j; @@ -5662,8 +5470,6 @@ TEST(LoopNest, fuseLoopsWithVariableBounds) { } TEST(LoopNest, fuseLoopsWithExprBounds) { - KernelScope kernel_scope; - // Input IR: // for (int j = 0; j < M + N; j++) { // A[j] = 10 * j; @@ -5700,8 +5506,6 @@ TEST(LoopNest, fuseLoopsWithExprBounds) { } TEST(LoopNest, fuseLoopsWithDifferentExprBounds) { - KernelScope kernel_scope; - // Input IR: // for (int j = M; j < N * 2; j++) { // A[j] = 10 * j; @@ -5739,8 +5543,6 @@ TEST(LoopNest, fuseLoopsWithDifferentExprBounds) { } TEST(LoopNest, fuseLoopsWithNonOverlappingBufferAccesses) { - KernelScope kernel_scope; - // Input IR: // for (int j = 10; j < 100; j++) { // A[j] = 10 * j; @@ -5776,8 +5578,6 @@ TEST(LoopNest, fuseLoopsWithNonOverlappingBufferAccesses) { } TEST(LoopNest, fuseLoopsWithNonOverlapping2DBufferAccesses) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 100; j++) { @@ -5826,8 +5626,6 @@ TEST(LoopNest, fuseLoopsWithNonOverlapping2DBufferAccesses) { } TEST(LoopNest, fuseLoopsWithReductions) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // A[i] = 0 @@ -5874,8 +5672,6 @@ TEST(LoopNest, fuseLoopsWithReductions) { } TEST(LoopNest, fuseLoopsWith2DReductions) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 50; j++) { @@ -5934,8 +5730,6 @@ TEST(LoopNest, fuseLoopsWith2DReductions) { } TEST(LoopNest, fuseLoopsWithComplexIndices) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 20; j++) { @@ -5982,8 +5776,6 @@ TEST(LoopNest, fuseLoopsWithComplexIndices) { } TEST(LoopNest, fuseLoopsWithMixedLoopVarsAsIndices) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 20; j++) { @@ -6013,8 +5805,6 @@ TEST(LoopNest, fuseLoopsWithMixedLoopVarsAsIndices) { } TEST(LoopNest, fuseLoopsWithTranspose) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 20; j++) { @@ -6044,8 +5834,6 @@ TEST(LoopNest, fuseLoopsWithTranspose) { } TEST(LoopNest, fuseLoopsThatViolateDependencies1) { - KernelScope kernel_scope; - // Input IR: // for (int j = 10; j < 100; j++) { // A[j] = 10 * j; @@ -6067,8 +5855,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies1) { } TEST(LoopNest, fuseLoopsThatViolateDependencies2) { - KernelScope kernel_scope; - // Input IR: // for (int j = 10; j < 100; j++) { // A[j] = 10 * j; @@ -6090,8 +5876,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies2) { } TEST(LoopNest, fuseLoopsThatViolateDependencies3) { - KernelScope kernel_scope; - // Input IR: // for (int m = 0; m < 20; m++) { // A[m] = 0; @@ -6135,8 +5919,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies3) { } TEST(LoopNest, fuseLoopsThatViolateDependencies4) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 100; j++) { @@ -6179,8 +5961,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies4) { } TEST(LoopNest, fuseLoopsThatViolateDependencies5) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 100; j++) { @@ -6209,8 +5989,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies5) { } TEST(LoopNest, fuseLoopsThatViolateDependencies6) { - KernelScope kernel_scope; - // Input IR: // for (int j = 0; j < 100; j++) { // A[j] = 10 * j; @@ -6237,8 +6015,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies6) { } TEST(LoopNest, fuseLoopsThatViolateDependencies7) { - KernelScope kernel_scope; - // Input IR: // for (int k = 0; k < 100; k++) { // B[k] = 20 * A[99-k]; @@ -6265,8 +6041,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies7) { } TEST(LoopNest, areLoopsPerfectlyNested) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 30; j++) { @@ -6309,8 +6083,6 @@ TEST(LoopNest, areLoopsPerfectlyNested) { } TEST(LoopNest, reorderNestedLoops2D) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 30; j++) { @@ -6335,8 +6107,6 @@ TEST(LoopNest, reorderNestedLoops2D) { } TEST(LoopNest, reorderNestedLoops3D) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 30; j++) { @@ -6366,8 +6136,6 @@ TEST(LoopNest, reorderNestedLoops3D) { } TEST(LoopNest, reorderNestedLoops4D) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 30; j++) { @@ -6405,8 +6173,6 @@ TEST(LoopNest, reorderNestedLoops4D) { } TEST(LoopNest, reorderTrivialPermutation) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 30; j++) { @@ -6436,8 +6202,6 @@ TEST(LoopNest, reorderTrivialPermutation) { } TEST(LoopNest, reorderInvalidPermutations) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 30; j++) { @@ -6475,8 +6239,6 @@ TEST(LoopNest, reorderInvalidPermutations) { } TEST(LoopNest, reorderInvalidLoopNest) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 20; i++) { // for (int j = 0; j < 30; j++) { @@ -6518,8 +6280,6 @@ TEST(LoopNest, reorderInvalidLoopNest) { } TEST(LoopNest, compressBufferSimple) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 100; ++i) { // for (int j = 0; j < 200; ++j) { @@ -6564,8 +6324,6 @@ TEST(LoopNest, compressBufferSimple) { } TEST(LoopNest, compressBufferMultipleDims) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 100; ++i) { // for (int j = 0; j < 200; ++j) { @@ -6604,8 +6362,6 @@ TEST(LoopNest, compressBufferMultipleDims) { } TEST(LoopNest, compressBufferMultipleDims2) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 100; ++i) { // for (int j = 0; j < 200; ++j) { @@ -6654,8 +6410,6 @@ TEST(LoopNest, compressBufferMultipleDims2) { } TEST(LoopNest, compressBufferDifferentOrderIndices) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 100; ++i) { // for (int j = 0; j < 200; ++j) { @@ -6700,8 +6454,6 @@ TEST(LoopNest, compressBufferDifferentOrderIndices) { } TEST(LoopNest, compressBufferVariableBounds) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < M; ++i) { // for (int j = 0; j < N; ++j) { @@ -6749,8 +6501,6 @@ TEST(LoopNest, compressBufferVariableBounds) { } TEST(LoopNest, compressBufferNoCommonParentLoops) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 100; ++i) { // for (int j = 0; j < 200; ++j) { @@ -6800,8 +6550,6 @@ TEST(LoopNest, compressBufferNoCommonParentLoops) { } TEST(LoopNest, compressBufferIndicesMixed) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 100; ++i) { // for (int j = 0; j < 200; ++j) { @@ -6848,8 +6596,6 @@ TEST(LoopNest, compressBufferIndicesMixed) { } TEST(LoopNest, compressMultipleBuffers) { - KernelScope kernel_scope; - // Input IR: // for (int i = 0; i < 100; ++i) { // for (int j = 0; j < 200; ++j) { diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp index db37b66876976..c9990dcacfb41 100644 --- a/test/cpp/tensorexpr/test_memdependency.cpp +++ b/test/cpp/tensorexpr/test_memdependency.cpp @@ -19,8 +19,6 @@ using namespace torch::jit::tensorexpr; // larger and fully encloses B, while ContainedOrEqual is the reverse. Equal // ranges are ContainedOrEqual. TEST(MemDependency, BoundOverlap) { - KernelScope kernel_scope; - using namespace analysis; auto CB = [](int s, int e) { @@ -79,7 +77,6 @@ TEST(MemDependency, BoundOverlap) { } TEST(MemDependency, BoundOverlapSymbolic) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); VarHandle z("z", kInt); @@ -116,8 +113,6 @@ TEST(MemDependency, BoundOverlapSymbolic) { // This uses boundOverlap on each dimension and return the "lowest" kind of // overlap. TEST(MemDependency, BoundOverlapMultiDim) { - KernelScope kernel_scope; - using namespace analysis; auto CB = [](int s, int e) { @@ -189,8 +184,6 @@ TEST(MemDependency, BoundOverlapMultiDim) { // Test the helper we use to subtract bounds: returns the regions(s) of A which // remain after removing the region of B. TEST(MemDependency, BoundSubtract) { - KernelScope kernel_scope; - using namespace analysis; auto CB = [](int s, int e) { @@ -224,7 +217,6 @@ TEST(MemDependency, BoundSubtract) { } TEST(MemDependency, BoundSubtractSymbolic) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); VarHandle z("z", kInt); @@ -273,8 +265,6 @@ TEST(MemDependency, BoundSubtractSymbolic) { // Tests the helper function that does subtraction, but for multi dimensional // indices bounds. TEST(MemDependency, BoundSubtractMultiDim) { - KernelScope kernel_scope; - using namespace analysis; auto CB = [](int s, int e) { @@ -335,7 +325,6 @@ TEST(MemDependency, BoundSubtractMultiDim) { // Tests the multi dimensional subtraction code for bounds that cannot be fully // materialized. TEST(MemDependency, BoundSubtractMultiDimSymbolic) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -403,7 +392,6 @@ TEST(MemDependency, BoundSubtractMultiDimSymbolic) { // Simple check that the analyzer does anything at all... TEST(MemDependency, MemDependencyCheckerSimple) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); BufHandle b("B", {1}, kInt); @@ -429,7 +417,6 @@ TEST(MemDependency, MemDependencyCheckerSimple) { // Check that there is a difference between direct and indirect dependence. TEST(MemDependency, MemDependencyCheckerMultiStmt) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); BufHandle b("B", {1}, kInt); BufHandle c("C", {1}, kInt); @@ -466,7 +453,6 @@ TEST(MemDependency, MemDependencyCheckerMultiStmt) { // Verify that we do filter writes that are totally overlapped by later writes. TEST(MemDependency, MemDependencyCheckerOverlap) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); BufHandle b("B", {1}, kInt); @@ -499,7 +485,6 @@ TEST(MemDependency, MemDependencyCheckerOverlap) { // Verify that bounds match loop iterations, and that dependencies progress // across loop scopes. TEST(MemDependency, MemDependencyCheckerLoop) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); BufHandle b("B", {1}, kInt); VarHandle x("x", kInt); @@ -541,7 +526,6 @@ TEST(MemDependency, MemDependencyCheckerLoop) { // Reductions should promote dependencies as well. TEST(MemDependency, MemDependencyCheckerLoopReduce) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -600,7 +584,6 @@ TEST(MemDependency, MemDependencyCheckerLoopReduce) { // Lowering a reduction doesn't affect dependency analysis. TEST(MemDependency, MemDependencyCheckerLoopReduceExpanded) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -654,7 +637,6 @@ TEST(MemDependency, MemDependencyCheckerLoopReduceExpanded) { // Can determine dependencies of outputs, through to inputs. TEST(MemDependency, MemDependencyCheckerInputsOutputs) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -707,7 +689,6 @@ TEST(MemDependency, MemDependencyCheckerInputsOutputs) { // Can tell if an output does not depend on an input. TEST(MemDependency, MemDependencyCheckerOutputDoesntDepend) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -746,7 +727,6 @@ TEST(MemDependency, MemDependencyCheckerOutputDoesntDepend) { // Verify different loop extents produce accesses with different bounds, and // that later accesses find dependencies that overlap their entire bound range. TEST(MemDependency, MemDependencyCheckerLoopBounds) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); BufHandle c("C", {10}, kInt); @@ -928,7 +908,6 @@ TEST(MemDependency, MemDependencyCheckerLoopBounds) { // Verify that we can still infer bounds when the loop var is offset. TEST(MemDependency, MemDependencyCheckerLoopBoundsIndexShift) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -1111,7 +1090,6 @@ TEST(MemDependency, MemDependencyCheckerLoopBoundsIndexShift) { // iteration. This is affected by whether or not we can trust the execution // order of the loop. TEST(MemDependency, MemDependencyCheckerLoopSelfDependency) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); VarHandle x("x", kInt); @@ -1749,7 +1727,6 @@ TEST(MemDependency, MemDependencyCheckerLoopSelfDependency) { // TODO: actually this only works because of the size of the ranges, revist this // test after strided overlap is implemented. TEST(MemDependency, MemDependencyCheckerLoopDistinctStrides) { - KernelScope kernel_scope; BufHandle a("A", {20}, kInt); BufHandle b("B", {20}, kInt); VarHandle x("x", kInt); @@ -1775,7 +1752,6 @@ TEST(MemDependency, MemDependencyCheckerLoopDistinctStrides) { /* TODO(nickg) - this test will fail due to the lack of stride math in Bound TEST(MemDependency, MemDependencyCheckerLoopDistinctStrides) { - KernelScope kernel_scope; BufHandle a("A", {20}, kInt); BufHandle b("B", {20}, kInt); BufHandle c("C", {10}, kInt); @@ -1806,7 +1782,6 @@ TEST(MemDependency, MemDependencyCheckerLoopDistinctStrides) { // analysis on Stmts using Cond. TEST(MemDependency, MemDependencyCheckerLoopBoundsCond) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); BufHandle c("C", {10}, kInt); @@ -2002,7 +1977,6 @@ TEST(MemDependency, MemDependencyCheckerLoopBoundsCond) { // Stmts using IfThenElse. TEST(MemDependency, MemDependencyCheckerIfThenElse) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); BufHandle c("C", {10}, kInt); @@ -2112,7 +2086,6 @@ TEST(MemDependency, MemDependencyCheckerIfThenElse) { // Cutting a loop with single elem writes TEST(MemDependency, MemDependencyCheckerCutLoop) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -2194,7 +2167,6 @@ TEST(MemDependency, MemDependencyCheckerCutLoop) { // Dynamic shapes (load in indices). TEST(MemDependency, MemDependencyCheckerDynamicShapes) { - KernelScope kernel_scope; BufHandle a("A", {100}, kInt); BufHandle b("B", {100}, kInt); BufHandle c("C", {100}, kInt); @@ -2436,7 +2408,6 @@ TEST(MemDependency, MemDependencyCheckerDynamicShapes) { // Verify multi dimensional bounds work. TEST(MemDependency, MemDependencyCheckerMultiDim) { - KernelScope kernel_scope; int M = 10, N = 9, K = 12; BufHandle a("A", {M, N, K}, kInt); BufHandle b("B", {M, N, K}, kInt); @@ -2703,8 +2674,6 @@ TEST(MemDependency, MemDependencyCheckerMultiDim) { // Various tests using the external Compute/Reduce API. TEST(MemDependency, MemDependencyCheckerComputeAPI) { - KernelScope kernel_scope; - using namespace analysis; /* for (int m = 0; m < 4; m++) { @@ -2756,8 +2725,6 @@ TEST(MemDependency, MemDependencyCheckerComputeAPI) { } TEST(MemDependency, MemDependencyCheckerComputeInline) { - KernelScope kernel_scope; - using namespace analysis; /* for (int m = 0; m < 4; m++) { @@ -2803,8 +2770,6 @@ TEST(MemDependency, MemDependencyCheckerComputeInline) { } TEST(MemDependency, MemDependencyCheckerComputeSplit) { - KernelScope kernel_scope; - using namespace analysis; // Split an axis, so the number of loops != the number of dimensions. @@ -2851,8 +2816,6 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) { } TEST(MemDependency, MemDependencyCheckerComputeReorder) { - KernelScope kernel_scope; - using namespace analysis; // Reorder an axis, so the loop order doesn't match the indexing order. @@ -2900,8 +2863,6 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) { } TEST(MemDependency, MemDependencyCheckerComputeReduce) { - KernelScope kernel_scope; - using namespace analysis; /* for (int l2 = 0; l2 < 2; l2++) { * for (int n1 = 0; n1 < 3; n1++) { @@ -2955,7 +2916,6 @@ TEST(MemDependency, MemDependencyCheckerComputeReduce) { } TEST(MemDependency, MemDependencyCheckerComputeGEMM) { - KernelScope kernel_scope; int M = 1024; int N = 1024; int K = 2048; diff --git a/test/cpp/tensorexpr/test_ops.cpp b/test/cpp/tensorexpr/test_ops.cpp index 122a498276f24..586c093e213d1 100644 --- a/test/cpp/tensorexpr/test_ops.cpp +++ b/test/cpp/tensorexpr/test_ops.cpp @@ -20,8 +20,6 @@ std::unique_ptr compile( } TEST(Ops, Sum) { - KernelScope ks; - std::vector testDims = {{0}, {1}, {0, 1}}; for (auto const& dims : testDims) { constexpr int M = 8; diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index 6620ef2686a94..411b58db57f9e 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -24,7 +24,6 @@ namespace jit { using namespace torch::jit::tensorexpr; TEST(Reductions, ReduceSum0D_1) { - KernelScope kernel_scope; const int M = 10; Placeholder b(BufHandle("b", {M}, kFloat)); @@ -50,7 +49,6 @@ TEST(Reductions, ReduceSum0D_1) { } TEST(Reductions, ReduceSum0D_2) { - KernelScope kernel_scope; const int M = 10; Placeholder b(BufHandle("b", {}, kFloat)); @@ -73,8 +71,6 @@ TEST(Reductions, ReduceSum0D_2) { // Sum an array to a single value. TEST(Reductions, ReduceSum1D) { - KernelScope kernel_scope; - Placeholder b(BufHandle("b", {10}, kFloat)); std::vector in(10); for (int j = 0; j < 10; ++j) { @@ -96,8 +92,6 @@ TEST(Reductions, ReduceSum1D) { } // Sum a 2D tensor to a 1D tensor with dynamic shapes. TEST(Reductions, ReduceSum2D) { - KernelScope kernel_scope; - const int M = 3; const int N = 7; @@ -138,8 +132,6 @@ TEST(Reductions, ReduceSum2D) { // Sum a 3D tensor to both a 2D and 1D tensor, then reduce the 2D tensor flat to // check our work. TEST(Reductions, ReduceSum3D) { - KernelScope kernel_scope; - const int M = 10; VarHandle m("m", kInt); @@ -209,8 +201,6 @@ TEST(Reductions, ReduceSum3D) { // Sum a large (10 D) Tensor 5 dimensions in. TEST(Reductions, ReduceSum10D) { - KernelScope kernel_scope; - Placeholder in_(BufHandle("in_", {2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, kFloat)); const int InputSize = 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3; Placeholder out_(BufHandle("out_", {2, 3, 2, 3, 2}, kFloat)); @@ -243,8 +233,6 @@ TEST(Reductions, ReduceSum10D) { // Reduce via Mul rather than Add using a custom Reducer. TEST(Reductions, ReduceProduct) { - KernelScope kernel_scope; - const int M = 4; const int N = 4; @@ -284,8 +272,6 @@ TEST(Reductions, ReduceProduct) { // Maximum reductions. TEST(Reductions, ReduceMax) { - KernelScope kernel_scope; - Placeholder in_(BufHandle("b", {10}, kFloat)); std::vector in(10); @@ -325,8 +311,6 @@ TEST(Reductions, ReduceMax) { // Minimum reduction, with custom initialization. TEST(Reductions, ReduceMinCustomInitializer) { - KernelScope kernel_scope; - VarHandle minInit("minInit", kFloat); Placeholder in_(BufHandle("b", {10}, kFloat)); @@ -363,8 +347,6 @@ TEST(Reductions, ReduceMinCustomInitializer) { // Example implementation of Any/All. // TODO: this is very awkward without logical And/Or operators. TEST(Reductions, ReduceAnyAll) { - KernelScope kernel_scope; - VarHandle searchValue("searchValue", kInt); Placeholder b(BufHandle("b", {4, 10}, kInt)); @@ -449,8 +431,6 @@ TEST(Reductions, ReduceAnyAll) { } TEST(Reductions, ReduceMatmul2D) { - KernelScope kernel_scope; - Placeholder tA(BufHandle("tA", {3, 2}, kFloat)); Placeholder tB(BufHandle("tB", {2, 3}, kFloat)); @@ -491,8 +471,6 @@ TEST(Reductions, ReduceMatmul2D) { } TEST(Reductions, ReduceRfactorLike) { - KernelScope kernel_scope; - Placeholder in(BufHandle("in", {10, 10}, kFloat)); std::vector in_(100); for (int i = 0; i < 100; ++i) { @@ -518,8 +496,6 @@ TEST(Reductions, ReduceRfactorLike) { } TEST(Reductions, ReduceAsProducer) { - KernelScope kernel_scope; - const int M = 10; VarHandle m("m", kInt); @@ -563,8 +539,6 @@ TEST(Reductions, ReduceAsProducer) { } TEST(Reductions, ReduceAsConsumer) { - KernelScope kernel_scope; - const int M = 10; VarHandle m("m", kInt); @@ -614,8 +588,6 @@ TEST(Reductions, ReduceAsConsumer) { } TEST(Reductions, SplitReduceAxis) { - KernelScope kernel_scope; - Placeholder in(BufHandle("in", {16, 8}, kFloat)); std::vector in_(16 * 8); @@ -645,8 +617,6 @@ TEST(Reductions, SplitReduceAxis) { } TEST(Reductions, SplitNonReduceAxis) { - KernelScope kernel_scope; - Placeholder in(BufHandle("in", {16, 8}, kFloat)); std::vector in_(16 * 8); @@ -676,7 +646,6 @@ TEST(Reductions, SplitNonReduceAxis) { } TEST(Reductions, ReorderedReductionInitializer) { - KernelScope kernel_scope; /* From the quip: for k in 0..1: // blockIdx for m in 0..128: @@ -726,8 +695,6 @@ TEST(Reductions, ReorderedReductionInitializer) { } TEST(Reductions, ReduceRfactor) { - KernelScope kernel_scope; - const int M = 10; const int N = 10; VarHandle m("m", kInt); @@ -759,8 +726,6 @@ TEST(Reductions, ReduceRfactor) { } TEST(Reductions, Reduce3DRfactorInner) { - KernelScope kernel_scope; - const int M = 10; const int N = 10; const int K = 10; @@ -794,8 +759,6 @@ TEST(Reductions, Reduce3DRfactorInner) { } TEST(Reductions, Reduce3DRfactorOuter) { - KernelScope kernel_scope; - const int M = 10; const int N = 10; const int K = 10; @@ -828,8 +791,6 @@ TEST(Reductions, Reduce3DRfactorOuter) { } TEST(Reductions, ReduceRepeatedInternalRfactor) { - KernelScope kernel_scope; - Placeholder in_(BufHandle("in_", {2, 3, 4, 5, 6}, kFloat)); const int InputSize = 2 * 3 * 4 * 5 * 6; @@ -875,8 +836,6 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) { // Split a reduction axis with a tail loop. TEST(Reductions, ReduceSplitTail) { - KernelScope kernel_scope; - const int M = 10; const int N = 10; const int K = 10; @@ -908,8 +867,6 @@ TEST(Reductions, ReduceSplitTail) { // Split a reduction axis cleanly so there is no tail loop. TEST(Reductions, ReduceSplitNoTail) { - KernelScope kernel_scope; - const int M = 10; const int N = 10; const int K = 10; @@ -941,8 +898,6 @@ TEST(Reductions, ReduceSplitNoTail) { // Split a reduction axis with only a tail loop (the split loop will be size 0 // and eliminated out). TEST(Reductions, ReduceOverSplitTail) { - KernelScope kernel_scope; - const int M = 10; const int N = 10; const int K = 10; @@ -974,8 +929,6 @@ TEST(Reductions, ReduceOverSplitTail) { // Split a reduction axis with a mask. TEST(Reductions, ReduceSplitMask) { - KernelScope kernel_scope; - const int M = 10; const int N = 10; const int K = 10; @@ -1007,8 +960,6 @@ TEST(Reductions, ReduceSplitMask) { // Split a reduction axis cleanly not requiring a mask. TEST(Reductions, ReduceSplitNoMask) { - KernelScope kernel_scope; - const int M = 10; const int N = 10; const int K = 10; @@ -1039,8 +990,6 @@ TEST(Reductions, ReduceSplitNoMask) { // Split a reduction axis with all logic in the mask. TEST(Reductions, ReduceOverSplitMask) { - KernelScope kernel_scope; - const int M = 10; const int N = 10; const int K = 10; @@ -1073,8 +1022,6 @@ TEST(Reductions, ReduceOverSplitMask) { // Test an rfactor when there are two ReduceOps in the graph due to a // splitWithTail. TEST(Reductions, ReduceSplitRfactor) { - KernelScope kernel_scope; - const int M = 2; const int N = 10; const int K = 10; @@ -1117,8 +1064,6 @@ TEST(Reductions, ReduceSplitRfactor) { // Test an rfactor which ends up being eliminated since the total loop size is // smaller than the split factor. TEST(Reductions, ReduceOverSplitRfactor) { - KernelScope kernel_scope; - const int N = 10; const int K = 10; const int SPLIT_FACTOR = 16; @@ -1174,7 +1119,6 @@ TEST(Reductions, ReduceOverSplitRfactor) { } TEST(Reductions, ReduceInlineReduction) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1207,7 +1151,6 @@ TEST(Reductions, ReduceInlineReduction) { } TEST(Reductions, ReduceInlineConsumer) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1261,7 +1204,6 @@ TEST(Reductions, ReduceInlineConsumer) { } TEST(Reductions, ReduceInlineReducerInternal) { - KernelScope kernel_scope; const int M = 4; const int N = 5; const int K = 6; @@ -1319,8 +1261,6 @@ TEST(Reductions, ReduceInlineReducerInternal) { } TEST(Reductions, ReductionCacheAccessesOperatorAxis) { - KernelScope kernel_scope; - int L = 4; int N = 3; int M = 2; @@ -1396,8 +1336,6 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) { } TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { - KernelScope kernel_scope; - int L = 4; int N = 3; int M = 2; @@ -1471,8 +1409,6 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { } TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { - KernelScope kernel_scope; - int L = 4; int N = 3; int M = 2; @@ -1546,8 +1482,6 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { } TEST(Reductions, ReductionCacheBodyAccess) { - KernelScope kernel_scope; - Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); @@ -1587,8 +1521,6 @@ TEST(Reductions, ReductionCacheBodyAccess) { } TEST(Reductions, ReductionCacheConsumerAccess) { - KernelScope kernel_scope; - Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); @@ -1628,8 +1560,6 @@ TEST(Reductions, ReductionCacheConsumerAccess) { } TEST(Reductions, ReductionSplitCacheConsumerAccess) { - KernelScope kernel_scope; - Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); @@ -1676,8 +1606,6 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { } TEST(Reductions, ReductionReorderCacheConsumerAccess) { - KernelScope kernel_scope; - Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); @@ -1725,8 +1653,6 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { } TEST(Reductions, ReductionRfactorCacheTempOuter) { - KernelScope kernel_scope; - const int M = 10; const int N = 10; const int K = 10; @@ -1794,8 +1720,6 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) { } TEST(Reductions, ReductionRfactorCacheTempInner) { - KernelScope kernel_scope; - const int M = 10; const int N = 10; const int K = 10; @@ -1858,8 +1782,6 @@ TEST(Reductions, ReductionRfactorCacheTempInner) { } TEST(Reductions, ReductionVectorize) { - KernelScope kernel_scope; - std::vector in_(8 * 8); for (int i = 0; i < 8; ++i) { for (int j = 0; j < 8; ++j) { @@ -1905,8 +1827,6 @@ TEST(Reductions, ReductionVectorize) { } TEST(Reductions, ReductionVectorizeInner) { - KernelScope kernel_scope; - Placeholder in(BufHandle("in", {8, 8}, kFloat)); Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); @@ -1916,8 +1836,6 @@ TEST(Reductions, ReductionVectorizeInner) { } TEST(Reductions, ReductionVectorizeRfactor) { - KernelScope kernel_scope; - std::vector in_(8 * 8); for (int i = 0; i < 8; ++i) { for (int j = 0; j < 8; ++j) { @@ -1983,7 +1901,6 @@ TEST(Reductions, ReductionVectorizeRfactor) { } TEST(Reductions, InitFunction) { - KernelScope ks; constexpr int M = 32; constexpr int N = 16; Placeholder A("A", kFloat, {M, N}); diff --git a/test/cpp/tensorexpr/test_registerizer.cpp b/test/cpp/tensorexpr/test_registerizer.cpp index 98a53058a1a65..1338b6d19c929 100644 --- a/test/cpp/tensorexpr/test_registerizer.cpp +++ b/test/cpp/tensorexpr/test_registerizer.cpp @@ -13,7 +13,6 @@ using namespace torch::jit::tensorexpr; // Can replace a simple scalar access with a local variable. TEST(Registerizer, RegisterizerSimple) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -58,7 +57,6 @@ TEST(Registerizer, RegisterizerSimple) { // Won't do replacement of a loop access. TEST(Registerizer, RegisterizerLoop) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -105,7 +103,6 @@ TEST(Registerizer, RegisterizerLoop) { // Won't replace even if the load is a fixed scalar, since the store could // invalidate it. TEST(Registerizer, RegisterizerLoopFixedLoad) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -152,7 +149,6 @@ TEST(Registerizer, RegisterizerLoopFixedLoad) { // We can registerize accesses that occur entirely within inner scopes, even if // they depend on the loop var. TEST(Registerizer, RegisterizerLoopInternal) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make({For::make( @@ -203,7 +199,6 @@ TEST(Registerizer, RegisterizerLoopInternal) { // An access can be overlapped by another read in the same Expr. In this case // B[z] and B[y] overlap and prevent registerization of both accesses. TEST(Registerizer, RegisterizerLoopInternalLoadOverlap) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -235,7 +230,6 @@ TEST(Registerizer, RegisterizerLoopInternalLoadOverlap) { } TEST(Registerizer, RegisterizerLoopInternalRepeated) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -307,7 +301,6 @@ TEST(Registerizer, RegisterizerLoopInternalRepeated) { } TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapLoopVar) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -353,7 +346,6 @@ TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapLoopVar) { } TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapOther) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -400,7 +392,6 @@ TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapOther) { // Will registerize multiple accesses of different items of the same buffer. TEST(Registerizer, RegisterizerMultiVar) { - KernelScope kernel_scope; BufHandle a("A", {2}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make({ @@ -456,7 +447,6 @@ TEST(Registerizer, RegisterizerMultiVar) { // Will registerize the valid accesses while skipping invalid replacements. TEST(Registerizer, RegisterizerVariableLoad) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -512,7 +502,6 @@ TEST(Registerizer, RegisterizerVariableLoad) { // Can registerize variable accesses so long as the variable does not change. TEST(Registerizer, RegisterizerSymbolicIndices) { - KernelScope kernel_scope; VarHandle i("i", kInt); VarHandle N("N", kInt); BufHandle a("A", {N}, kInt); @@ -559,7 +548,6 @@ TEST(Registerizer, RegisterizerSymbolicIndices) { // Can registerize accesses dependent on multiple loop vars. TEST(Registerizer, RegisterizerMultiLoop) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -616,7 +604,6 @@ TEST(Registerizer, RegisterizerMultiLoop) { // Can registerize correctly if scalars already exist in the program. TEST(Registerizer, RegisterizerRepeated) { - KernelScope kernel_scope; BufHandle a("A", {2}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make({ @@ -673,7 +660,6 @@ TEST(Registerizer, RegisterizerRepeated) { // Can registerize the load of A. TEST(Registerizer, RegisterizerNoLoads) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -714,7 +700,6 @@ TEST(Registerizer, RegisterizerNoLoads) { // Can registerize the load of A but not the store of B. TEST(Registerizer, RegisterizerNoRepeatedStores) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -763,7 +748,6 @@ TEST(Registerizer, RegisterizerNoRepeatedStores) { // Won't registerize if there are multiple accesses which may overlap. TEST(Registerizer, RegisterizerMultiVarOverlap) { - KernelScope kernel_scope; BufHandle a("A", {2}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make({ @@ -792,8 +776,6 @@ TEST(Registerizer, RegisterizerMultiVarOverlap) { } TEST(Registerizer, RegisterizerAllocs) { - KernelScope kernel_scope; - BufHandle a("A", {2}, kInt); BufHandle c("C", {1}, kInt); VarHandle x("x", kInt); @@ -860,7 +842,6 @@ TEST(Registerizer, RegisterizerAllocs) { } TEST(Registerizer, RegisterizerNoInitializer) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make({For::make( @@ -900,7 +881,6 @@ TEST(Registerizer, RegisterizerNoInitializer) { } TEST(Registerizer, RegisterizerNoInitializerLoopVar) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make({For::make( @@ -929,7 +909,6 @@ TEST(Registerizer, RegisterizerNoInitializerLoopVar) { } TEST(Registerizer, RegisterizerLoadThenStore) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); BufHandle b("B", {1}, kInt); VarHandle x("x", kInt); @@ -980,7 +959,6 @@ TEST(Registerizer, RegisterizerLoadThenStore) { } TEST(Registerizer, RegisterizerParallelized) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); LoopOptions loopOpts; @@ -1009,7 +987,6 @@ TEST(Registerizer, RegisterizerParallelized) { // Should be able to registerize this since the scalar would exist before the // branch. TEST(Registerizer, RegisterizerConditionAfter) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1059,7 +1036,6 @@ TEST(Registerizer, RegisterizerConditionAfter) { // Should be able to registerize this since the scalar exists in the same form // after the branch and there is no overlap. TEST(Registerizer, RegisterizerConditionBefore) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1111,7 +1087,6 @@ TEST(Registerizer, RegisterizerConditionBefore) { // Should be able to registerize this as the combination of the two above rules. TEST(Registerizer, RegisterizerConditionInside) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1171,7 +1146,6 @@ TEST(Registerizer, RegisterizerConditionInside) { // condition, and both sides are large enough to be registerized but cannot be // because there is no safe place to put the initializer or finalizer. TEST(Registerizer, RegisterizerConditionInsideOverlap1) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1231,7 +1205,6 @@ TEST(Registerizer, RegisterizerConditionInsideOverlap1) { // the condition, and the first group must be finalized before the Cond, the // second initialized after it. TEST(Registerizer, RegisterizerConditionInsideOverlap2) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1317,7 +1290,6 @@ TEST(Registerizer, RegisterizerConditionInsideOverlap2) { // the accesses in it don't need to be valid (think size checks on the index). // In this case the accesses cannot be registerized. TEST(Registerizer, RegisterizerConditionHidden) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1359,7 +1331,6 @@ TEST(Registerizer, RegisterizerConditionHidden) { // the user's fault). It "unhides" the conditional accesses, allowing // registerization to occur. TEST(Registerizer, RegisterizerConditionUnhidden) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1420,7 +1391,6 @@ TEST(Registerizer, RegisterizerConditionUnhidden) { // Can registerize a load that occurs in the condition of a Cond. TEST(Registerizer, RegisterizerCondCondition) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1471,7 +1441,6 @@ TEST(Registerizer, RegisterizerCondCondition) { // Appearing in the condition of a Cond makes it visible to the enclosing scope, // and so we can registerize internal usages. TEST(Registerizer, RegisterizerCondConditionUnhidden) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1520,7 +1489,6 @@ TEST(Registerizer, RegisterizerCondConditionUnhidden) { // Conditional hiding also works for IfThenElse exprs. TEST(Registerizer, RegisterizerIfThenElseHidden) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1562,7 +1530,6 @@ TEST(Registerizer, RegisterizerIfThenElseHidden) { // Conditional unhiding also works for IfThenElse exprs. TEST(Registerizer, RegisterizerIfThenElseUnhidden) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1617,7 +1584,6 @@ TEST(Registerizer, RegisterizerIfThenElseUnhidden) { // Nested IfThenElse exprs can't promote to higher level scopes. TEST(Registerizer, RegisterizerIfThenElseNested) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1661,7 +1627,6 @@ TEST(Registerizer, RegisterizerIfThenElseNested) { // to check that we don't promote the initializer/finalizer to the enclosing // Block. TEST(Registerizer, RegisterizerIfThenElseInternal) { - KernelScope kernel_scope; // Making these floats so they don't get simplified to a single access. BufHandle a("A", {5}, kFloat); BufHandle b("B", {5}, kFloat); @@ -1740,7 +1705,6 @@ TEST(Registerizer, RegisterizerIfThenElseInternal) { // Can registerize a load that occurs in the condition of an IfThenElse; TEST(Registerizer, RegisterizerIfThenElseCondition) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1786,7 +1750,6 @@ TEST(Registerizer, RegisterizerIfThenElseCondition) { // Appearing in the condition of a Cond makes it visible to the enclosing scope, // and so we can registerize internal usages. TEST(Registerizer, RegisterizerIfThenElseConditionUnhidden) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1826,7 +1789,6 @@ TEST(Registerizer, RegisterizerIfThenElseConditionUnhidden) { // Cannot promote accesses internal to IfThenElse branches even if the enclosing // scope if conditional. TEST(Registerizer, RegisterizerConditionBranchOnly) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make({For::make( @@ -1877,7 +1839,6 @@ TEST(Registerizer, RegisterizerConditionBranchOnly) { // We can registerize an IfThenElse that appears in the condition branch of a // Cond. This is a weird but valid thing to do. TEST(Registerizer, RegisterizerCondIfThenElse) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); BufHandle c("C", {5}, kInt); @@ -1927,7 +1888,6 @@ TEST(Registerizer, RegisterizerCondIfThenElse) { // Can registerize a conditional access in the RHS of a store unhidden by it's // LHS, and hoist it out of a loop. TEST(Registerizer, RegisterizerIfThenElseLoop) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); VarHandle x("x", kInt); @@ -1977,7 +1937,6 @@ TEST(Registerizer, RegisterizerIfThenElseLoop) { // Cannot registerize if the RHS overlaps the access creating visibility. TEST(Registerizer, RegisterizerIfThenElseLoopCut) { - KernelScope kernel_scope; BufHandle a("A", {5}, kInt); BufHandle b("B", {5}, kInt); VarHandle x("x", kInt); @@ -2016,7 +1975,6 @@ TEST(Registerizer, RegisterizerIfThenElseLoopCut) { // Simple case where an access is cut by an overlapping access later in the // program, we can registerize up until the overlap. TEST(Registerizer, RegisterizerPartialAfter) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -2073,7 +2031,6 @@ TEST(Registerizer, RegisterizerPartialAfter) { // We can registerize an access which overlaps a previous access, the // initializer must be inserted after the previous access. TEST(Registerizer, RegisterizerPartialBefore) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -2130,7 +2087,6 @@ TEST(Registerizer, RegisterizerPartialBefore) { // The combination of the previous two tests, an access is cut by an overlapping // access in both directions. TEST(Registerizer, RegisterizerPartialInside) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x1("x1", kInt); VarHandle x2("x2", kInt); @@ -2200,7 +2156,6 @@ TEST(Registerizer, RegisterizerPartialInside) { // access, we should break this into two scalars and write back to the buffer // before the condition. TEST(Registerizer, RegisterizerPartialCondition) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -2270,7 +2225,6 @@ TEST(Registerizer, RegisterizerPartialCondition) { // Tests case where an access is cut by an internal conditional access which // itself is registerized. TEST(Registerizer, RegisterizerPartialConditionInternalCut) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -2333,7 +2287,6 @@ TEST(Registerizer, RegisterizerPartialConditionInternalCut) { // First statment in condition closes outer access, but can be registerized with // later statements. TEST(Registerizer, RegisterizerPartialConditionInternalStart) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -2397,7 +2350,6 @@ TEST(Registerizer, RegisterizerPartialConditionInternalStart) { // An access cuts two open overlaps and creates four scalar variables. TEST(Registerizer, RegisterizerPartialOverlapsTwo) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -2468,7 +2420,6 @@ TEST(Registerizer, RegisterizerPartialOverlapsTwo) { // Nested blocks will automatically be flattened and do not provent // registerization of enclosed accesses. TEST(Registerizer, RegisterizerNestedBlocks) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -2522,7 +2473,6 @@ TEST(Registerizer, RegisterizerNestedBlocks) { // The access can be registerized internally to a condition, but must ensure // that both initializer and finalizer are within the same condition. TEST(Registerizer, RegisterizerNestedConditions) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make({Cond::make( @@ -2578,7 +2528,6 @@ TEST(Registerizer, RegisterizerNestedConditions) { // If an access exists outside the scope of the condition then we can lift // nested conditional usages into the same scalar. TEST(Registerizer, RegisterizerNestedConditionsUnhidden) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -2634,7 +2583,6 @@ TEST(Registerizer, RegisterizerNestedConditionsUnhidden) { } TEST(Registerizer, RegisterizerNestedConditionsHiddenFirst) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -2677,7 +2625,6 @@ TEST(Registerizer, RegisterizerNestedConditionsHiddenFirst) { } TEST(Registerizer, RegisterizerNestedConditionsHiddenSecond) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -2722,7 +2669,6 @@ TEST(Registerizer, RegisterizerNestedConditionsHiddenSecond) { // If an access is cut by another access internal to a condition block, it still // cuts the access. TEST(Registerizer, RegisterizerNestedConditionsCut) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -2761,7 +2707,6 @@ TEST(Registerizer, RegisterizerNestedConditionsCut) { } TEST(Registerizer, RegisterizerNestedConditionLoopHidden) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -2808,7 +2753,6 @@ TEST(Registerizer, RegisterizerNestedConditionLoopHidden) { // Three loops and four element regions, three of which should be registerized // at different levels of the IR. TEST(Registerizer, RegisterizerNestedConditionThreeDeep) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -2908,7 +2852,6 @@ TEST(Registerizer, RegisterizerNestedConditionThreeDeep) { // Can replace a simple scalar access with a local variable even when that // variable is an outer loop var. TEST(Registerizer, RegisterizerNestedLoopSimple) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -2963,7 +2906,6 @@ TEST(Registerizer, RegisterizerNestedLoopSimple) { // conditional access can be hoisted up through a loop to match an existing // access in a higher scope and the two can be registerized. TEST(Registerizer, RegisterizerHiddenAccessYes) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -3046,7 +2988,6 @@ TEST(Registerizer, RegisterizerHiddenAccessYes) { // never unhidden at a higher scope and registerization occurs at the lower // scope. TEST(Registerizer, RegisterizerHiddenAccessNo) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -3126,7 +3067,6 @@ TEST(Registerizer, RegisterizerHiddenAccessNo) { // two accesses here one is unhidden and the other isnt. A[0] can be // registerized but B[0] cannot. TEST(Registerizer, RegisterizerHiddenAccessMultiLoop) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); @@ -3208,7 +3148,6 @@ TEST(Registerizer, RegisterizerHiddenAccessMultiLoop) { // Accesses are registerized inside two conditions, but the immeidate parent is // not a condition. TEST(Registerizer, RegisterizerTwoConditionalLoops) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -3280,7 +3219,6 @@ TEST(Registerizer, RegisterizerTwoConditionalLoops) { // Accesses are registerized inside two conditions, cut in the middle. TEST(Registerizer, RegisterizerTwoConditionalLoopsCut) { - KernelScope kernel_scope; BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -3362,7 +3300,6 @@ TEST(Registerizer, RegisterizerTwoConditionalLoopsCut) { // references a Let var in a local scope which cannot be hoisted out of the // loop. TEST(Registerizer, RegisterizerLoopLetVar) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -3396,7 +3333,6 @@ TEST(Registerizer, RegisterizerLoopLetVar) { // references a Let var in an outer scope that does not prevent hoisting the // initializer. TEST(Registerizer, RegisterizerLoopLetVarOuter) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -3444,7 +3380,6 @@ TEST(Registerizer, RegisterizerLoopLetVarOuter) { // Okay so the registerizer generally goes after index flattening, but just in // case. Test multi index registerization. TEST(Registerizer, RegisterizerMultiDim) { - KernelScope kernel_scope; BufHandle a("A", {3, 4, 5}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -3490,7 +3425,6 @@ TEST(Registerizer, RegisterizerMultiDim) { // Wont registerize if only some dims match, but will still registerize distinct // elements. TEST(Registerizer, RegisterizerMultiDimPartial) { - KernelScope kernel_scope; BufHandle a("A", {3, 4, 5}, kInt); VarHandle x("x", kInt); StmtPtr stmt = Block::make( @@ -3538,7 +3472,6 @@ TEST(Registerizer, RegisterizerMultiDimPartial) { // If they could overlap across all dimensions we cannot registerize. TEST(Registerizer, RegisterizerMultiDimOverlap) { - KernelScope kernel_scope; BufHandle a("A", {3, 4, 5}, kInt); VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -3573,7 +3506,6 @@ TEST(Registerizer, RegisterizerMultiDimOverlap) { // But, if one dimension is known to be distinct they do not overlap. TEST(Registerizer, RegisterizerMultiDimPartialOverlap) { - KernelScope kernel_scope; BufHandle a("A", {3, 4, 5}, kInt); VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -3619,7 +3551,6 @@ TEST(Registerizer, RegisterizerMultiDimPartialOverlap) { // A 3D reduction with different input dimensionality. TEST(Registerizer, RegisterizerMultiDim3DReduction1) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10, 10}, kInt); BufHandle c("C", {10, 10, 10}, kInt); @@ -3691,7 +3622,6 @@ TEST(Registerizer, RegisterizerMultiDim3DReduction1) { // A 3D reduction with the same smaller dimensionality using different loop // vars. TEST(Registerizer, RegisterizerMultiDim3DReduction2) { - KernelScope kernel_scope; BufHandle a("A", {10}, kInt); BufHandle b("B", {10}, kInt); BufHandle c("C", {10}, kInt); diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index 0df9e9242e198..48983c8f4ba33 100644 --- a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -14,7 +14,6 @@ using namespace torch::jit::tensorexpr; using SimpleIRExprEval = ExprEval; TEST(Simplify, ConstantFoldSimple) { - KernelScope kernel_scope; ExprHandle a(2.0f); ExprHandle b(3.0f); ExprHandle f = (a + b); @@ -28,7 +27,6 @@ TEST(Simplify, ConstantFoldSimple) { } TEST(Simplify, ConstantFoldTwoLayer) { - KernelScope kernel_scope; ExprHandle a(2.0f); ExprHandle b(3.0f); ExprHandle c(4.0f); @@ -44,7 +42,6 @@ TEST(Simplify, ConstantFoldTwoLayer) { } TEST(Simplify, ConstantFoldShifts) { - KernelScope kernel_scope; ExprHandle a(7); ExprHandle b(2); ExprHandle c(3); @@ -59,7 +56,6 @@ TEST(Simplify, ConstantFoldShifts) { } TEST(Simplify, ConstantFoldBitwise) { - KernelScope kernel_scope; ExprHandle a(59); ExprHandle b(22); ExprHandle c(101); @@ -74,7 +70,6 @@ TEST(Simplify, ConstantFoldBitwise) { } TEST(Simplify, ConstantFoldMultiOp) { - KernelScope kernel_scope; ExprHandle a(2.0f); ExprHandle b(3.0f); ExprHandle c(4.0f); @@ -93,7 +88,6 @@ TEST(Simplify, ConstantFoldMultiOp) { } TEST(Simplify, ConstantFoldMinMax) { - KernelScope kernel_scope; ExprHandle a(12.0f); ExprHandle b(15.0f); ExprHandle c(17.0f); @@ -113,7 +107,6 @@ TEST(Simplify, ConstantFoldMinMax) { } TEST(Simplify, ConstantFoldIntrinsics) { - KernelScope kernel_scope; ExprHandle a(2.0f); ExprHandle b(3.0f); ExprHandle c(4.0f); @@ -135,7 +128,6 @@ TEST(Simplify, ConstantFoldIntrinsics) { } TEST(Simplify, ConstantFoldCastToBool) { - KernelScope kernel_scope; ExprHandle f = Cast::make(kBool, IntImm::make(0)); ExprHandle newF = IRSimplifier::simplify(f); SimpleIRExprEval eval(newF); @@ -143,7 +135,6 @@ TEST(Simplify, ConstantFoldCastToBool) { } TEST(Simplify, ConstantFoldWithVar) { - KernelScope kernel_scope; { VarHandle x("x", kInt); ExprHandle body = x * (ExprHandle(2) + ExprHandle(4)); @@ -174,7 +165,6 @@ TEST(Simplify, ConstantFoldWithVar) { } TEST(Simplify, ConditionalSelectFoldSimple) { - KernelScope kernel_scope; ExprHandle a(3.0f); ExprHandle b(4.0f); ExprHandle c(3.0f); @@ -221,7 +211,6 @@ TEST(Simplify, ConditionalSelectFoldSimple) { } TEST(Simplify, ConditionalSelectFoldTwoLayer) { - KernelScope kernel_scope; ExprHandle a(3.0f); ExprHandle b(2.0f); ExprHandle c(2.0f); @@ -269,7 +258,6 @@ TEST(Simplify, ConditionalSelectFoldTwoLayer) { } TEST(Simplify, ConditionalSelectFoldWithVar) { - KernelScope kernel_scope; VarHandle x("x", kFloat); ExprHandle f = x < 4.f; @@ -290,7 +278,6 @@ TEST(Simplify, ConditionalSelectFoldWithVar) { } TEST(Simplify, UnFoldableExpr) { - KernelScope kernel_scope; VarHandle x("x", kFloat); VarHandle y("y", kFloat); ExprHandle body = (ExprHandle(3) * x) + (ExprHandle(5) * y); @@ -308,7 +295,6 @@ TEST(Simplify, UnFoldableExpr) { } TEST(Simplify, HashSimple) { - KernelScope kernel_scope; VarHandle x("x", kFloat); ExprHandle a(2.0f); ExprHandle b(3.0f); @@ -329,7 +315,6 @@ TEST(Simplify, HashSimple) { } TEST(Simplify, HashEquivalence) { - KernelScope kernel_scope; VarHandle x("x", kFloat); VarHandle y("y", kFloat); ExprHandle f = (x * y) + (x * y); @@ -366,7 +351,6 @@ TEST(Simplify, HashEquivalence) { } TEST(Simplify, HashEquivalenceRand) { - KernelScope kernel_scope; ExprHandle f = Intrinsics::make(kRand, kFloat) + Intrinsics::make(kRand, kInt); @@ -386,7 +370,6 @@ TEST(Simplify, HashEquivalenceRand) { } TEST(Simplify, HashEquivalenceAfterFolding) { - KernelScope kernel_scope; VarHandle x("x", kFloat); ExprHandle a(2.0f); ExprHandle b(3.0f); @@ -412,8 +395,6 @@ TEST(Simplify, HashEquivalenceAfterFolding) { } TEST(Simplify, HashDifferenceTypes) { - KernelScope kernel_scope; - HashProvider hasher; std::vector immediates; @@ -446,7 +427,6 @@ TEST(Simplify, HashDifferenceTypes) { } TEST(Simplify, HashLargeExpression) { - KernelScope kernel_scope; constexpr int N = 1024; BufHandle a("A", {N}, kInt); BufHandle b("B", {N}, kInt); @@ -490,7 +470,6 @@ TEST(Simplify, HashLargeExpression) { } TEST(Simplify, HashForLoopOptions) { - KernelScope kernel_scope; constexpr int N = 1024; BufHandle a("A", {N}, kInt); BufHandle b("B", {N}, kInt); @@ -532,7 +511,6 @@ TEST(Simplify, HashForLoopOptions) { /// (2 + x) + 4 => x + 6 TEST(Simplify, SimplifyAdd) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -558,7 +536,6 @@ TEST(Simplify, SimplifyAdd) { /// (2 - x) - 4 => -2 - x TEST(Simplify, SimplifySub) { - KernelScope kernel_scope; VarHandle x("x", kInt); ExprHandle body = (ExprHandle(2) - x) - ExprHandle(4); @@ -575,7 +552,6 @@ TEST(Simplify, SimplifySub) { /// 2 * (1 - x) - 4 => 2 * (-3 - x) TEST(Simplify, SimplifyMultiLayer) { - KernelScope kernel_scope; VarHandle x("x", kInt); ExprHandle body = ExprHandle(2) * ((ExprHandle(1) - x) - ExprHandle(4)); ExprHandle simplified = IRSimplifier::simplify(body); @@ -588,7 +564,6 @@ TEST(Simplify, SimplifyMultiLayer) { /// 2 * (3 * x) - (x * 4) => 2 * x TEST(Simplify, SimplifyMultiTerm) { - KernelScope kernel_scope; VarHandle x("x", kInt); ExprHandle body = (ExprHandle(2) * ((ExprHandle(3) * x)) - (x * ExprHandle(4))); @@ -606,7 +581,6 @@ TEST(Simplify, SimplifyMultiTerm) { /// 2 * (3 * (long)x) - (x * 4) => 2 * x TEST(Simplify, SimplifyCasts) { - KernelScope kernel_scope; VarHandle x("x", kLong); ExprHandle body = (ExprHandle(2) * ((ExprHandle(3) * x)) - (x * ExprHandle(4))); @@ -624,7 +598,6 @@ TEST(Simplify, SimplifyCasts) { /// (x + 0) * 1 => x TEST(Simplify, SimplifyEliminatesNoOps) { - KernelScope kernel_scope; VarHandle x("x", kInt); ExprHandle body = (x + ExprHandle(0)) * 1; @@ -636,7 +609,6 @@ TEST(Simplify, SimplifyEliminatesNoOps) { /// Cannot simplify this. TEST(Simplify, SimplifyMultiVar) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); ExprHandle body = x * 24 + y * 34; @@ -659,7 +631,6 @@ TEST(Simplify, SimplifyMultiVar) { // x + 2 + y => x + y + 2 TEST(Simplify, DISABLED_SimplifyReorderings) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); ExprHandle body = x + 2 + y; @@ -676,7 +647,6 @@ TEST(Simplify, DISABLED_SimplifyReorderings) { /// y + x * 0 => y TEST(Simplify, SimplifyEliminatesVar) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); ExprHandle body = y + x * ExprHandle(0); @@ -686,7 +656,6 @@ TEST(Simplify, SimplifyEliminatesVar) { } TEST(Simplify, SimplifyAdds) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -757,7 +726,6 @@ TEST(Simplify, SimplifyAdds) { } TEST(Simplify, SimplifyMuls) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -959,7 +927,6 @@ TEST(Simplify, SimplifyMuls) { // Sub an expr from itself will result in zero. TEST(Simplify, SimplifySubs) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -1125,7 +1092,6 @@ TEST(Simplify, SimplifySubs) { } TEST(Simplify, SimplifyDiv) { - KernelScope kernel_scope; VarHandle x("x", kInt); { @@ -1144,7 +1110,6 @@ TEST(Simplify, SimplifyDiv) { } TEST(Simplify, SimplifyDivWithLoopContext1) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 6; i++) { // A[i] = (i + 24) / 6; @@ -1166,7 +1131,6 @@ TEST(Simplify, SimplifyDivWithLoopContext1) { } TEST(Simplify, SimplifyDivWithLoopContext2) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 5; i++) { // A[i] = (i + 25) / 6; @@ -1188,7 +1152,6 @@ TEST(Simplify, SimplifyDivWithLoopContext2) { } TEST(Simplify, SimplifyDivWithLoopContext3) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 6; i++) { // A[i] = (i + 24) / (-6); @@ -1210,7 +1173,6 @@ TEST(Simplify, SimplifyDivWithLoopContext3) { } TEST(Simplify, SimplifyDivWithLoopContext4) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 5; i++) { // A[i] = (i - 5) / 6; @@ -1232,7 +1194,6 @@ TEST(Simplify, SimplifyDivWithLoopContext4) { } TEST(Simplify, SimplifyDivWithLoopContext5) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 6; i++) { // for (int j = 0; j < 10; j++) { @@ -1259,7 +1220,6 @@ TEST(Simplify, SimplifyDivWithLoopContext5) { } TEST(Simplify, SimplifyDivWithLoopContext6) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 6; i++) { // for (int j = -1; j < 9; j++) { @@ -1287,7 +1247,6 @@ TEST(Simplify, SimplifyDivWithLoopContext6) { } TEST(Simplify, SimplifyDivWithLoopContext7) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 6; i++) { // for (int j = 0; j < 10; j++) { @@ -1315,7 +1274,6 @@ TEST(Simplify, SimplifyDivWithLoopContext7) { } TEST(Simplify, SimplifyModWithLoopContext0) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 100; i++) { // A[i] = i % 100; @@ -1337,7 +1295,6 @@ TEST(Simplify, SimplifyModWithLoopContext0) { } TEST(Simplify, SimplifyModWithLoopContext1) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 6; i++) { // A[i] = (i + 24) % 6; @@ -1359,7 +1316,6 @@ TEST(Simplify, SimplifyModWithLoopContext1) { } TEST(Simplify, SimplifyModWithLoopContext2) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 5; i++) { // A[i] = (i + 25) % 6; @@ -1381,7 +1337,6 @@ TEST(Simplify, SimplifyModWithLoopContext2) { } TEST(Simplify, SimplifyModWithLoopContext3) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 6; i++) { // A[i] = (i + 24) % (-6); @@ -1403,7 +1358,6 @@ TEST(Simplify, SimplifyModWithLoopContext3) { } TEST(Simplify, SimplifyModWithLoopContext4) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 5; i++) { // A[i] = (i - 5) % 6; @@ -1425,7 +1379,6 @@ TEST(Simplify, SimplifyModWithLoopContext4) { } TEST(Simplify, SimplifyModWithLoopContext5) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 6; i++) { // for (int j = 0; j < 10; j++) { @@ -1452,7 +1405,6 @@ TEST(Simplify, SimplifyModWithLoopContext5) { } TEST(Simplify, SimplifyModWithLoopContext6) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 6; i++) { // for (int j = -1; j < 9; j++) { @@ -1480,7 +1432,6 @@ TEST(Simplify, SimplifyModWithLoopContext6) { } TEST(Simplify, SimplifyModWithLoopContext7) { - KernelScope kernel_scope; // Stmt to simplify: // for (int i = 0; i < 6; i++) { // for (int j = 0; j < 10; j++) { @@ -1508,7 +1459,6 @@ TEST(Simplify, SimplifyModWithLoopContext7) { } TEST(Simplify, SimplifyMod) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); VarHandle z("z", kInt); @@ -1635,7 +1585,6 @@ TEST(Simplify, SimplifyMod) { // Test that mixing ops together simplifies as expected. TEST(Simplify, SimplifyMultiOp) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -1704,7 +1653,6 @@ TEST(Simplify, SimplifyMultiOp) { // Test that chaining many ops together works as expected. TEST(Simplify, SimplifyManyOps) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -1752,7 +1700,6 @@ TEST(Simplify, SimplifyManyOps) { } TEST(Simplify, SimplifyFactorization) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -1874,7 +1821,6 @@ TEST(Simplify, SimplifyFactorization) { // (4 * x + y + z * 2) + (4 * x + y + z * 4) => 2 * (y + 3 * z + 4 * x) TEST(Simplify, SimplifyFactorizeUneven) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); VarHandle z("z", kInt); @@ -1901,7 +1847,6 @@ TEST(Simplify, SimplifyFactorizeUneven) { // (x * y) + (2 * x) * (x + y) => 2 * (x * x) + 3 * (x * y) // This is kind of a placeholder test for variable factorization. TEST(Simplify, SimplifyDeeperTerms) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); ExprHandle body = (x * y) + (ExprHandle(2) * x) * (x + y); @@ -1925,7 +1870,6 @@ TEST(Simplify, SimplifyDeeperTerms) { // Tests the difference between two less trivial expressions. // (m * (1 * n_1) + (n + 1)) - (m * (1 * n_1) + n) => 1 TEST(Simplify, SimplifyDeeperDifference) { - KernelScope kernel_scope; VarHandle n("n", kInt); VarHandle n_1("n_1", kInt); VarHandle m("m", kInt); @@ -1939,7 +1883,6 @@ TEST(Simplify, SimplifyDeeperDifference) { // Test constant folding into the difference between expressions. // 2 + char((m * (1 * n_1) + (n + 1)) - (m * (1 * n_1) + n)) => 3 TEST(Simplify, SimplifyFoldComplexDifference) { - KernelScope kernel_scope; VarHandle n("n", kInt); VarHandle n_1("n_1", kInt); VarHandle m("m", kInt); @@ -1954,7 +1897,6 @@ TEST(Simplify, SimplifyFoldComplexDifference) { } TEST(Simplify, SimplifyIfComponents) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); ExprHandle body = IfThenElse::make( @@ -1976,7 +1918,6 @@ TEST(Simplify, SimplifyIfComponents) { } TEST(Simplify, SimplifyOpaqueTerms) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -2002,8 +1943,6 @@ TEST(Simplify, SimplifyOpaqueTerms) { } TEST(Simplify, SimplifySymbolicMinMax) { - KernelScope kernel_scope; - { // Minimum with constant difference between terms. VarHandle x("x", kInt); @@ -2038,7 +1977,6 @@ TEST(Simplify, SimplifySymbolicMinMax) { } TEST(Simplify, SimplifyNestedMax) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); VarHandle z("z", kInt); @@ -2315,7 +2253,6 @@ TEST(Simplify, SimplifyNestedMax) { } TEST(Simplify, SimplifyNestedMin) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); VarHandle z("z", kInt); @@ -2592,8 +2529,6 @@ TEST(Simplify, SimplifyNestedMin) { } TEST(Simplify, SimplifyWontReorderFloat) { - KernelScope kernel_scope; - { // 3 * (3 * x) - 3 * (3 * y) => 9 * (x - y) // This is an expression we can simplify. @@ -2704,8 +2639,6 @@ TEST(Simplify, SimplifyWontReorderFloat) { } TEST(Simplify, SimplifyRoundModPattern) { - KernelScope kernel_scope; - { // (x/y)*y + x%y => x. VarHandle x("x", kInt); @@ -2887,8 +2820,6 @@ TEST(Simplify, SimplifyRoundModPattern) { } TEST(Simplify, SimplifyRoundModPatternFactorization) { - KernelScope kernel_scope; - { // Full factorization. // 2 * (x/y * y) + 2 * (x%y) => 2 * x. @@ -2947,8 +2878,6 @@ TEST(Simplify, SimplifyRoundModPatternFactorization) { } TEST(Simplify, SimplifyRoundModPatternMultivar) { - KernelScope kernel_scope; - { // Multivar. // (x/8) * 8 + (y/5)*5 + x%8 + y%5 => x + y. @@ -2997,8 +2926,6 @@ TEST(Simplify, SimplifyRoundModPatternMultivar) { } TEST(Simplify, SimplifyModRoundModPattern) { - KernelScope kernel_scope; - { // t/7 % 9 * 7 + t % 7 => t%63 VarHandle t("t", kInt); @@ -3085,8 +3012,6 @@ TEST(Simplify, SimplifyModRoundModPattern) { } TEST(Simplify, SimplifyModRoundModPatternFactorization) { - KernelScope kernel_scope; - { // 2 * (t /7 % 9 * 7) + 2 * (t % 7) => 2 * (t % 63) VarHandle t("t", kInt); @@ -3154,8 +3079,6 @@ TEST(Simplify, SimplifyModRoundModPatternFactorization) { } TEST(Simplify, SimplifyModRoundModPatternMultivar) { - KernelScope kernel_scope; - { // t/7 % 9 * 7 + t % 7 + t => t % 63 + t VarHandle t("t", kInt); @@ -3260,8 +3183,6 @@ TEST(Simplify, SimplifyModRoundModPatternMultivar) { } TEST(Simplify, SimplifyDivisionScalarFactorization) { - KernelScope kernel_scope; - { // Simple factorization of numerator and denominator. // 8x / 4y => 2x / y. @@ -3332,8 +3253,6 @@ TEST(Simplify, SimplifyDivisionScalarFactorization) { } TEST(Simplify, SimplifyConstantBranches) { - KernelScope kernel_scope; - { // If the condition is constant true then take the true_value. // 1 ? x : y => x @@ -3390,8 +3309,6 @@ TEST(Simplify, SimplifyConstantBranches) { } TEST(Simplify, SimplifyConstantCond) { - KernelScope kernel_scope; - { // If the condition is constant true then take the true_value. // 1 ? A[0] = 1 : B[0] = 1 => A[0] = 1 @@ -3508,7 +3425,6 @@ TEST(Simplify, SimplifyConstantCond) { } TEST(Simplify, SimplifyEliminateEmptyCond) { - KernelScope kernel_scope; // If the branches are empty in different ways, eliminate. { VarHandle x("x", kInt); @@ -3536,8 +3452,6 @@ TEST(Simplify, SimplifyEliminateEmptyCond) { } TEST(Simplify, SimplifyConstantComparisons) { - KernelScope kernel_scope; - auto ComparisonTest = [](ExprHandle a, ExprHandle b, CompareSelectOperation op, int result) { ExprHandle body = CompareSelect::make(a, b, op); @@ -3582,7 +3496,6 @@ TEST(Simplify, SimplifyConstantComparisons) { } TEST(Simplify, SimplifySymbolicComparisons) { - KernelScope kernel_scope; VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -3720,8 +3633,6 @@ TEST(Simplify, SimplifySymbolicComparisons) { } TEST(Simplify, SimplifyEliminateZeroLengthFor) { - KernelScope kernel_scope; - { // Will eliminate zero loop For. BufHandle a("A", {4}, kInt); @@ -3780,8 +3691,6 @@ TEST(Simplify, SimplifyEliminateZeroLengthFor) { } TEST(Simplify, SimplifyOneLoopFor) { - KernelScope kernel_scope; - { // Will remove the loop if the body is run once. BufHandle a("A", {4}, kInt); @@ -3849,8 +3758,6 @@ TEST(Simplify, SimplifyOneLoopFor) { } TEST(Simplify, SimplifyForWontLoseLoopOptions) { - KernelScope kernel_scope; - { // Sanity check does nothing if the condition is not met. BufHandle a("A", {4}, kInt); @@ -3868,8 +3775,6 @@ TEST(Simplify, SimplifyForWontLoseLoopOptions) { } TEST(Simplify, SimplifyMultilevelFor) { - KernelScope kernel_scope; - { // Multiple layers of For will be simplified out. BufHandle a("A", {4}, kInt); @@ -3927,8 +3832,6 @@ TEST(Simplify, SimplifyMultilevelFor) { } TEST(Simplify, SimplifyForCleansUp) { - KernelScope kernel_scope; - { Placeholder a("a", kFloat, {1, 12, 1}); VarHandle x("x", kInt); @@ -3957,8 +3860,6 @@ TEST(Simplify, SimplifyForCleansUp) { } TEST(Simplify, SimplifyEliminateEmptyFor) { - KernelScope kernel_scope; - { // Flatten many layers around an empty block to an empty block. StmtPtr last = alloc(std::vector({})); @@ -3974,8 +3875,6 @@ TEST(Simplify, SimplifyEliminateEmptyFor) { } TEST(Simplify, SimplifyFlattenBlock) { - KernelScope kernel_scope; - { // Flatten multiple blocks down to one. // { { { stmt1, stmt2 } } } => { stmt1, stmt2 } @@ -4059,8 +3958,6 @@ TEST(Simplify, SimplifyFlattenBlock) { } TEST(Simplify, SimplifyEliminateZeroLengthAlloc) { - KernelScope kernel_scope; - { // Simple positive case. BufHandle b("x", {0}, kInt); @@ -4135,8 +4032,6 @@ TEST(Simplify, SimplifyEliminateZeroLengthAlloc) { } TEST(Simplify, DontSimplifyRand) { - KernelScope kernel_scope; - { // rand() + rand() = rand() + rand() NOT 2 * rand(). ExprHandle body = @@ -4169,7 +4064,6 @@ TEST(Simplify, DontSimplifyRand) { } TEST(Simplify, SimplifyReorderForCond) { - KernelScope kernel_scope; BufHandle a("A", {4}, kInt); BufHandle b("B", {1}, kInt); BufHandle c("C", {4}, kInt); @@ -4368,7 +4262,6 @@ TEST(Simplify, SimplifyReorderForCond) { } TEST(Simplify, SimplifyFuseConditions) { - KernelScope kernel_scope; BufHandle a("A", {2}, kInt); BufHandle b("B", {2}, kInt); VarHandle i("i", kInt); @@ -4778,7 +4671,6 @@ TEST(Simplify, SimplifyFuseConditions) { } TEST(Simplify, SimplifySyncThreads) { - KernelScope kernel_scope; BufHandle a("A", {4}, kInt); VarHandle i("i", kInt); @@ -4876,7 +4768,6 @@ TEST(Simplify, SimplifySyncThreads) { } TEST(Simplify, SimplifyRampSubBroadcast) { - KernelScope kernel_scope; int num_lanes = 4; ExprHandle ramp = Ramp::make(ExprHandle(0), ExprHandle(6), num_lanes); ExprHandle broadcast = Broadcast::make(ExprHandle(-5), num_lanes); @@ -4890,7 +4781,6 @@ TEST(Simplify, SimplifyRampSubBroadcast) { } TEST(Simplify, SimplifyBroadcastTermExpander) { - KernelScope kernel_scope; int num_lanes = 8; ExprHandle bc0 = Broadcast::make(ExprHandle(0), num_lanes); ExprHandle bc1 = Broadcast::make(ExprHandle(1), num_lanes); @@ -4920,7 +4810,6 @@ TEST(Simplify, DISABLED_CompareSelectCondAlwaysInLoopBounds) { // for (int n = 1; n < N; n++) { // b[n] = 1.f; // } - KernelScope kernel_scope; constexpr int N = 8; Placeholder b("b", kFloat, {N}); VarHandle n("n", kInt); @@ -4945,7 +4834,6 @@ TEST(Simplify, DISABLED_IfThenCondAlwaysInLoopBounds) { // for (int n = 1; n < N; n++) { // b[n] = 1.f; // } - KernelScope kernel_scope; constexpr int N = 8; Placeholder b("b", kFloat, {N}); VarHandle n("n", kInt); @@ -4974,7 +4862,6 @@ TEST(Simplify, DISABLED_MultiClauseCondAlwaysInLoopBounds) { // for (int i = 1; i < 7; i++) { // for (int j = 1; j < 7; j++) { // b[i, j] = 1.f; - KernelScope kernel_scope; constexpr int N = 8; Placeholder b("b", kFloat, {N, N}); VarHandle i("i", kInt); @@ -5010,7 +4897,6 @@ TEST(Simplify, DISABLED_SimplifyLoopBounds) { // for (int i = 1; i < 3; i++) { // for (int j = 1; j < 3; j++) { // b[i, j] = (b[i, j]) + 1.f; - KernelScope kernel_scope; constexpr int N = 8; constexpr int K = 3; Placeholder a("a", kFloat, {N, N}); diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index 8dd616453362b..723a8fef81bea 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include diff --git a/test/cpp/tensorexpr/test_type.cpp b/test/cpp/tensorexpr/test_type.cpp index cc8a6967b7255..67c1a0a528b7c 100644 --- a/test/cpp/tensorexpr/test_type.cpp +++ b/test/cpp/tensorexpr/test_type.cpp @@ -9,7 +9,6 @@ namespace jit { using namespace torch::jit::tensorexpr; TEST(Type, Test01) { - KernelScope kernel_scope; { Dtype dt1 = kInt; ASSERT_EQ(dt1, kInt); @@ -45,28 +44,24 @@ TEST(Type, Test01) { TEST(Type, BitCasting) { { - KernelScope kernel_scope; VarHandle x("x", kFloat); ExprHandle y = bitcast(x); // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) ASSERT_EQ(y.dtype(), kInt); } { - KernelScope kernel_scope; VarHandle x("x", kInt); ExprHandle y = bitcast(x); // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) ASSERT_EQ(y.dtype(), kFloat); } { - KernelScope kernel_scope; VarHandle x("x", kShort); ExprHandle y = bitcast(x); // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) ASSERT_EQ(y.dtype(), kHalf); } { - KernelScope kernel_scope; VarHandle x("x", kHalf); ExprHandle y = bitcast(x); // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) @@ -82,7 +77,6 @@ TEST(Type, BitCasting) { using SimpleIRExprEval = ExprEval; // this is broken /*{ - KernelScope kernel_scope; at::Half k_; at::Half* k = &k_; *reinterpret_cast(k) = ref16; @@ -93,7 +87,6 @@ TEST(Type, BitCasting) { }*/ { - KernelScope kernel_scope; float k = raw_bitcast(ref32); auto a = FloatImm::make(k); auto b = BitCast::make(kInt, a); @@ -102,7 +95,6 @@ TEST(Type, BitCasting) { } { - KernelScope kernel_scope; double k = raw_bitcast(ref64); auto a = DoubleImm::make(k); auto b = BitCast::make(kLong, a); @@ -111,7 +103,6 @@ TEST(Type, BitCasting) { } { - KernelScope kernel_scope; int64_t k = raw_bitcast(reff64); auto a = LongImm::make(k); auto b = BitCast::make(kDouble, a); @@ -120,7 +111,6 @@ TEST(Type, BitCasting) { } { - KernelScope kernel_scope; int32_t k = raw_bitcast(reff32); auto a = IntImm::make(k); auto b = BitCast::make(kFloat, a); @@ -130,27 +120,22 @@ TEST(Type, BitCasting) { // This segfaults :( /*{ - KernelScope kernel_scope; VarHandle x("x", kDouble); ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); } { - KernelScope kernel_scope; VarHandle x("x", kFloat); ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); } { - KernelScope kernel_scope; VarHandle x("x", kLong); ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); } { - KernelScope kernel_scope; VarHandle x("x", kShort); ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); } { - KernelScope kernel_scope; VarHandle x("x", kInt); ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); }*/ @@ -159,7 +144,6 @@ TEST(Type, BitCasting) { TEST(Type, Propagation) { // Same types: { - KernelScope kernel_scope; VarHandle x("x", kFloat); VarHandle y("y", kFloat); ExprHandle body = FloatImm::make(2.f) + @@ -168,7 +152,6 @@ TEST(Type, Propagation) { } // Int to bigger int: { - KernelScope kernel_scope; VarHandle x("x", kShort); VarHandle y("y", kLong); ExprHandle body = @@ -177,7 +160,6 @@ TEST(Type, Propagation) { } // Float to bigger float: { - KernelScope kernel_scope; VarHandle x("x", kHalf); VarHandle y("y", kDouble); ExprHandle body = @@ -186,7 +168,6 @@ TEST(Type, Propagation) { } // Int to Float: { - KernelScope kernel_scope; VarHandle x("x", kFloat); VarHandle y("y", kInt); ExprHandle body = @@ -195,7 +176,6 @@ TEST(Type, Propagation) { } // Smaller float, bigger Int: { - KernelScope kernel_scope; VarHandle x("x", kHalf); VarHandle y("y", kLong); ExprHandle body = @@ -204,7 +184,6 @@ TEST(Type, Propagation) { } // Bigger float, smaller Int: { - KernelScope kernel_scope; VarHandle x("x", kChar); VarHandle y("y", kDouble); ExprHandle body = @@ -213,7 +192,6 @@ TEST(Type, Propagation) { } // Sign change char/byte upgrades to short: { - KernelScope kernel_scope; VarHandle x("x", kChar); VarHandle y("y", kByte); ExprHandle body = diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp index 0f0277e37292e..16605e5e6d501 100644 --- a/test/cpp/tensorexpr/tutorial.cpp +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -49,19 +49,6 @@ using namespace torch::jit::tensorexpr; int main(int argc, char* argv[]) { - // Memory management for tensor expressions is currently done with memory - // arenas. That is, whenever an object is created it registers itself in an - // arena and the object is kept alive as long as the arena is alive. When the - // arena gets destructed, it deletes all objects registered in it. - // - // The easiest way to set up a memory arena is to use `KernelScope` class - it - // is a resource guard that creates a new arena on construction and restores - // the previously set arena on destruction. - // - // We will create a kernel scope here, and thus we'll set up a mem arena for - // the entire tutorial. - KernelScope kernel_scope; - std::cout << "*** Structure of tensor expressions ***" << std::endl; { // A tensor expression is a tree of expressions. Each expression has a type, diff --git a/test/test_tensorexpr_pybind.py b/test/test_tensorexpr_pybind.py index 0ae59e1c56484..6a348053c01fd 100644 --- a/test/test_tensorexpr_pybind.py +++ b/test/test_tensorexpr_pybind.py @@ -9,14 +9,6 @@ LLVM_ENABLED = torch._C._llvm_enabled() -class kernel_arena_scope(object): - def __enter__(self): - self.scope = torch._C._te.KernelScope() - - def __exit__(self, typ, val, traceback): - self.scope = None - - def construct_adder(n: int, dtype=te.Dtype.Float): dN = te.ExprHandle.int(n) A = te.Placeholder('A', dtype, [dN]) @@ -36,85 +28,80 @@ def compute(i): class TestTensorExprPyBind(JitTestCase): def test_simple_sum(self): - with kernel_arena_scope(): - n = 32 - cg = construct_adder(n) + n = 32 + cg = construct_adder(n) - tA = torch.randn(n) - tB = torch.randn(n) - tC = torch.empty(n) - cg.call([tA, tB, tC]) - torch.testing.assert_close(tA + tB, tC) + tA = torch.randn(n) + tB = torch.randn(n) + tC = torch.empty(n) + cg.call([tA, tB, tC]) + torch.testing.assert_close(tA + tB, tC) def test_call_raw(self): - with kernel_arena_scope(): - n = 16 - cg = construct_adder(n, dtype=torch.float64) + n = 16 + cg = construct_adder(n, dtype=torch.float64) - tA = torch.randn(n, dtype=torch.float64) - tB = torch.randn(n, dtype=torch.float64) - tC = torch.empty(n, dtype=torch.float64) - cg.call_raw([tA.data_ptr(), tB.data_ptr(), tC.data_ptr()]) - torch.testing.assert_close(tA + tB, tC) + tA = torch.randn(n, dtype=torch.float64) + tB = torch.randn(n, dtype=torch.float64) + tC = torch.empty(n, dtype=torch.float64) + cg.call_raw([tA.data_ptr(), tB.data_ptr(), tC.data_ptr()]) + torch.testing.assert_close(tA + tB, tC) def test_external_calls(self): - with kernel_arena_scope(): - dtype = torch.float32 + dtype = torch.float32 - ONE = te.ExprHandle.int(1) - FOUR = te.ExprHandle.int(4) - A = te.BufHandle('A', [ONE, FOUR], dtype) - B = te.BufHandle('B', [FOUR, ONE], dtype) - C = te.BufHandle('C', [ONE, ONE], dtype) + ONE = te.ExprHandle.int(1) + FOUR = te.ExprHandle.int(4) + A = te.BufHandle('A', [ONE, FOUR], dtype) + B = te.BufHandle('B', [FOUR, ONE], dtype) + C = te.BufHandle('C', [ONE, ONE], dtype) - s = te.ExternalCall(C, "nnc_aten_matmul", [A, B], []) + s = te.ExternalCall(C, "nnc_aten_matmul", [A, B], []) - loopnest = te.LoopNest(s, [C]) - loopnest.prepare_for_codegen() - codegen = te.construct_codegen('ir_eval', s, [te.BufferArg(x) for x in [A, B, C]]) + loopnest = te.LoopNest(s, [C]) + loopnest.prepare_for_codegen() + codegen = te.construct_codegen('ir_eval', s, [te.BufferArg(x) for x in [A, B, C]]) - tA = torch.ones(1, 4) - tB = torch.ones(4, 1) - tC = torch.empty(1, 1) - codegen.call([tA, tB, tC]) - torch.testing.assert_close(torch.matmul(tA, tB), tC) + tA = torch.ones(1, 4) + tB = torch.ones(4, 1) + tC = torch.empty(1, 1) + codegen.call([tA, tB, tC]) + torch.testing.assert_close(torch.matmul(tA, tB), tC) def test_dynamic_shape(self): - with kernel_arena_scope(): - dN = te.VarHandle(torch.int32) - A = te.BufHandle(torch.float64) - B = te.BufHandle(torch.float64) + dN = te.VarHandle(torch.int32) + A = te.BufHandle(torch.float64) + B = te.BufHandle(torch.float64) - def compute(i): - return A.load(i) - B.load(i) + def compute(i): + return A.load(i) - B.load(i) - C = te.Compute('C', [dN], compute) + C = te.Compute('C', [dN], compute) - loopnest = te.LoopNest([C]) - loopnest.prepare_for_codegen() + loopnest = te.LoopNest([C]) + loopnest.prepare_for_codegen() - cg = te.construct_codegen( - 'ir_eval', - loopnest.simplify(), - [A, B, C, dN]) + cg = te.construct_codegen( + 'ir_eval', + loopnest.simplify(), + [A, B, C, dN]) - def test_with_shape(n): - tA = torch.randn(n, dtype=torch.double) - tB = torch.randn(n, dtype=torch.double) - tC = torch.empty(n, dtype=torch.double) - cg.call([tA, tB, tC, n]) - torch.testing.assert_close(tA - tB, tC) + def test_with_shape(n): + tA = torch.randn(n, dtype=torch.double) + tB = torch.randn(n, dtype=torch.double) + tC = torch.empty(n, dtype=torch.double) + cg.call([tA, tB, tC, n]) + torch.testing.assert_close(tA - tB, tC) - test_with_shape(8) - test_with_shape(31) + test_with_shape(8) + test_with_shape(31) def test_dtype_error(self): - with kernel_arena_scope(): - one = te.ExprHandle.int(1) - te.Placeholder([one], torch.float32) # ok - te.Placeholder([one]) # ok - self.assertRaises(TypeError, - lambda: te.Placeholder([one], "float55")) + one = te.ExprHandle.int(1) + te.Placeholder([one], torch.float32) # ok + te.Placeholder([one]) # ok + self.assertRaises(TypeError, + lambda: te.Placeholder([one], "float55")) @unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled") def test_kernel_with_tensor_inputs(self): @@ -396,23 +383,22 @@ def f(a): @unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled") def test_alloc_in_loop(self): - with kernel_arena_scope(): - a, tmp, b = [ - te.Placeholder(name, te.Dtype.Float, [te.ExprHandle.int(1)]) - for name in ["a", "tmp", "b"]] - t0, t100 = [te.ExprHandle.int(n) for n in [0, 100]] - body = te.Block([ - tmp.store([t0], a.load([t0])), - b.store([t0], tmp.load([t0])) - ]) - for _ in range(4): - i = te.VarHandle("i", te.Dtype.Int) - body = te.For.make(i, t0, t100, body) - nest = te.LoopNest(body, [b.data()]) - nest.prepare_for_codegen() - f = te.construct_codegen("llvm", nest.simplify(), [a, b]) - ta, tb = [torch.ones(1) for _ in range(2)] - f.call([ta.data_ptr(), tb.data_ptr()]) + a, tmp, b = [ + te.Placeholder(name, te.Dtype.Float, [te.ExprHandle.int(1)]) + for name in ["a", "tmp", "b"]] + t0, t100 = [te.ExprHandle.int(n) for n in [0, 100]] + body = te.Block([ + tmp.store([t0], a.load([t0])), + b.store([t0], tmp.load([t0])) + ]) + for _ in range(4): + i = te.VarHandle("i", te.Dtype.Int) + body = te.For.make(i, t0, t100, body) + nest = te.LoopNest(body, [b.data()]) + nest.prepare_for_codegen() + f = te.construct_codegen("llvm", nest.simplify(), [a, b]) + ta, tb = [torch.ones(1) for _ in range(2)] + f.call([ta.data_ptr(), tb.data_ptr()]) if __name__ == '__main__': run_tests() diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 2eabbd0a8b230..157c30663ce1c 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -301,7 +301,6 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/tensorexpr/llvm_codegen.cpp", "torch/csrc/jit/tensorexpr/llvm_jit.cpp", "torch/csrc/jit/tensorexpr/loopnest.cpp", - "torch/csrc/jit/tensorexpr/mem_arena.cpp", "torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp", "torch/csrc/jit/tensorexpr/operators/conv2d.cpp", "torch/csrc/jit/tensorexpr/operators/matmul.cpp", diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 0a34f476b0d3b..5ef770c6755fc 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -481,8 +481,6 @@ static constexpr int kVectorWidth = 16; #ifdef TORCH_ENABLE_LLVM struct TEWrapper { - tensorexpr::KernelArena ka; - tensorexpr::KernelScope ks; std::unique_ptr cg; TEWrapper() = default; void update(std::unique_ptr&& cg_) { @@ -534,8 +532,6 @@ std::shared_ptr wrapTECompute( #else struct TEWrapper { - tensorexpr::KernelArena ka; - tensorexpr::KernelScope ks; TEWrapper() = default; template void operator()(const Ts&... ts) { diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index 1b942eaf353fc..108236e2e17f8 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -8,7 +8,6 @@ #include #include #include -#include #include namespace torch { diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 8076ba2b71d67..fed5e1e139d3d 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -2909,7 +2909,6 @@ void TensorExprKernel::bindConstant(const torch::jit::Value* v) { } void TensorExprKernel::compile() { - KernelScope kernelScope(&kernelArena_); GRAPH_DUMP("TensorExprKernel graph:", graph_); device_ = *pickDeviceType(graph_); @@ -3080,8 +3079,6 @@ StmtPtr TensorExprKernel::getCodeGenStmt() { } void TensorExprKernel::runKernel(Stack& stack) { - KernelScope kernelScope(&kernelArena_); - // Set up arguments (inputs, then outputs) for kernel call. auto inputs = last(stack, nInputs_); std::vector outputs; @@ -3101,8 +3098,6 @@ void TensorExprKernel::runKernel(Stack& stack) { void TensorExprKernel::runFast( const std::vector& inputs, const std::vector& outputs) { - KernelScope kernelScope(&kernelArena_); - std::vector args(inputs); args.reserve(inputs.size() + outputs.size() + constants_.size()); args.insert(args.end(), outputs.begin(), outputs.end()); diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index a8a57b9f15a16..99a3b123a6816 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -266,7 +266,6 @@ class TORCH_API TensorExprKernel { std::unordered_map input_name_map_; std::unique_ptr codegen_; at::Device device_ = at::kCPU; - KernelArena kernelArena_; std::shared_ptr graph_; Code code_; bool allow_fallback_{false}; diff --git a/torch/csrc/jit/tensorexpr/mem_arena.cpp b/torch/csrc/jit/tensorexpr/mem_arena.cpp deleted file mode 100644 index 1769563424f5c..0000000000000 --- a/torch/csrc/jit/tensorexpr/mem_arena.cpp +++ /dev/null @@ -1,67 +0,0 @@ -#include -#include -#include - -namespace torch { -namespace jit { -namespace tensorexpr { - -namespace { -// Define in an anonymous namespace to hide this symbol from other compilation -// units -thread_local KernelArena* current_arena = nullptr; -} // namespace - -KernelArena::~KernelArena() { - for (KernelScopedObject* p : kernel_objects_) { - delete p; - } -} - -KernelScopedObject::KernelScopedObject() { - KernelArena* kernel = KernelArena::GetCurrentKernelArena(); - if (kernel == nullptr) { - throw std::runtime_error( - "KernelScope() must be constructed before calling this"); - } - kernel->kernel_objects_.push_back(this); -} - -void KernelArena::SetCurrentKernelArena(KernelArena* new_kernel_arena) { - current_arena = new_kernel_arena; -} - -KernelArena* KernelArena::GetCurrentKernelArena() { - return current_arena; -} - -KernelScope::KernelScope() - : kernel_arena_(new KernelArena()), - old_kernel_arena_(KernelArena::GetCurrentKernelArena()), - owning_(true) { - KernelArena::SetCurrentKernelArena(kernel_arena_); -} - -KernelScope::KernelScope(KernelArena* arena_) - : kernel_arena_(arena_), - old_kernel_arena_(KernelArena::GetCurrentKernelArena()), - owning_(false) { - KernelArena::SetCurrentKernelArena(kernel_arena_); -} - -KernelScope::~KernelScope() { - if (KernelArena::GetCurrentKernelArena() != kernel_arena_) { - // This should be an error, but it gets triggered in - // caffe2/benchmarks/static_runtime:static_runtime_cpptest - TORCH_WARN("KernelScope() destructed out of order, leaking memory"); - return; - } - KernelArena::SetCurrentKernelArena(old_kernel_arena_); - if (owning_) { - delete kernel_arena_; - } -} - -} // namespace tensorexpr -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/tensorexpr/mem_arena.h b/torch/csrc/jit/tensorexpr/mem_arena.h deleted file mode 100644 index a39ab6f0068c7..0000000000000 --- a/torch/csrc/jit/tensorexpr/mem_arena.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once -#include -#include - -namespace torch { -namespace jit { -namespace tensorexpr { - -class KernelScopedObject; - -// An arena that manages all the underlying kernel-scoped objects. -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class KernelArena { - public: - static KernelArena* GetCurrentKernelArena(); - static void SetCurrentKernelArena(KernelArena* new_arena); - TORCH_API KernelArena() = default; - TORCH_API ~KernelArena(); - KernelArena(const KernelArena&) = delete; - KernelArena& operator=(const KernelArena&) = delete; - - private: - friend class KernelScopedObject; - std::vector kernel_objects_; // owned -}; - -// A RAII convenience wrapper on top of a kernel. -// It either creates or takes an existing Kernel and sets it as the current -// Kernel. When this object is destroyed, the previous Kernel is set as current, -// and the created kernel is freed. If the kernel was passed, it stays alive. -class KernelScope { - public: - TORCH_API KernelScope(); - TORCH_API explicit KernelScope(KernelArena* arena_); - TORCH_API ~KernelScope(); - KernelScope(const KernelScope&) = delete; - KernelScope& operator=(const KernelScope&) = delete; - - private: - KernelArena* kernel_arena_; // maybe owned - KernelArena* old_kernel_arena_; // previous arena, restored in destructor - bool owning_; // determines whether the arena will be freed along with - // the scope object -}; - -// The base object managed by the Kernel. -// The object must be created through "new", and when the Kernel is destroyed, -// All its registered objects are destroyed through "delete". -class TORCH_API KernelScopedObject { - public: - KernelScopedObject(); - virtual ~KernelScopedObject() = default; - - KernelScopedObject(const KernelScopedObject&) = delete; - KernelScopedObject& operator=(const KernelScopedObject&) = delete; -}; - -} // namespace tensorexpr -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp index c380233cce16a..c7f48824303d4 100644 --- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp +++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp @@ -61,7 +61,6 @@ void initTensorExprBindings(PyObject* module) { // Tensor Expr Classes auto te = m.def_submodule("_te"); - py::class_(te, "KernelScope").def(py::init<>()); auto dtype_class = py::class_(te, "Dtype").def(py::init(&parsePythonDtype)); From c545b099aac56a7394875346613789bfc7f71cb7 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 24 Aug 2021 01:43:33 -0700 Subject: [PATCH 161/530] Separating quantization test from distributed_test (#63058) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63058 Dedicating separate tests for different quantization methods. Currently supporting FP16 method. ghstack-source-id: 136499767 Test Plan: uck test mode/dev //caffe2/test/distributed/algorithms/quantization:quantization_gloo_fork -- name_of_the_test Reviewed By: wanchaol Differential Revision: D30142580 fbshipit-source-id: 3aacec1a231a662067d2b48c001f0c69fefcdd60 --- .../quantization/test_quantization.py | 180 ++++++++++++++++++ .../{ => quantization}/quantization.py | 3 - .../_internal/distributed/distributed_test.py | 36 +--- 3 files changed, 183 insertions(+), 36 deletions(-) create mode 100644 test/distributed/algorithms/quantization/test_quantization.py rename torch/distributed/algorithms/{ => quantization}/quantization.py (99%) diff --git a/test/distributed/algorithms/quantization/test_quantization.py b/test/distributed/algorithms/quantization/test_quantization.py new file mode 100644 index 0000000000000..7872920f21141 --- /dev/null +++ b/test/distributed/algorithms/quantization/test_quantization.py @@ -0,0 +1,180 @@ +import torch +import os +import torch.cuda +import sys +import torch.distributed as dist +import torch.distributed.algorithms.quantization.quantization as quant +from torch.distributed.algorithms.quantization.quantization import DQuantType +from torch.testing._internal.common_distributed import ( + MultiProcessTestCase, + requires_gloo, + skip_if_lt_x_gpu, + requires_nccl, +) +from torch.testing._internal.distributed.distributed_test import ( + apply_hack_for_nccl +) +from torch.testing._internal.common_utils import sandcastle_skip_if, run_tests, TEST_WITH_DEV_DBG_ASAN, NO_MULTIPROCESSING_SPAWN + +torch.backends.cuda.matmul.allow_tf32 = False + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +def _build_tensor(size, value=None, dtype=torch.float, device_id=None): + if value is None: + value = size + if device_id is None: + return torch.empty(size, size, size, dtype=dtype).fill_(value) + else: + return torch.empty(size, size, size, dtype=dtype).fill_(value).cuda(device_id) +if TEST_WITH_DEV_DBG_ASAN: + print("Skip dev-asan as torch + multiprocessing spawn have known issues", file=sys.stderr) + sys.exit(0) + +if NO_MULTIPROCESSING_SPAWN: + print("Spawn not available, skipping tests.", file=sys.stderr) + sys.exit(0) + +BACKEND = os.environ["BACKEND"] + +if BACKEND == "gloo" or BACKEND == "nccl": + class DistQuantizationTests(MultiProcessTestCase): + + def setUp(self): + super(DistQuantizationTests, self).setUp() + self._spawn_processes() + torch.backends.cudnn.flags(allow_tf32=False).__enter__() + + def tearDown(self): + super(DistQuantizationTests, self).tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + @property + def op_timeout_sec(self): + return 1 + + @property + def world_size(self): + return 2 + + def _init_multigpu_helper(self): + """Multigpu tests are designed to simulate the multi nodes with multi + GPUs on each node. Nccl backend requires equal #GPUs in each process. + On a single node, all visible GPUs are evenly + divided to subsets, each process only uses a subset. + """ + nGPUs = torch.cuda.device_count() + world_size = dist.get_world_size() + visible_devices = range(nGPUs) + + if BACKEND == "nccl": + apply_hack_for_nccl() + + # If rank is lesser than or equal to number of available GPU's + # then each rank can be mapped to corresponding GPU. + nGPUs_per_process = 1 + if world_size > nGPUs: + nGPUs_per_process = nGPUs // world_size + rank_to_GPU = { + i: list( + visible_devices[i * nGPUs_per_process : (i + 1) * nGPUs_per_process] + ) + for i in range(world_size) + } + return rank_to_GPU + + @requires_gloo() + @sandcastle_skip_if(BACKEND != "gloo", "Only gloo backend supports all_gather_fp16") + def test_all_gather_fp16(self): + store = dist.FileStore(self.file_name, int(self.world_size)) + dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo') + device = torch.device(f"cuda:{self.rank}") + group = list(range(0, self.world_size)) + group_id = dist.group.WORLD + self._test_all_gather(group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.FP16) + + @requires_nccl() + @sandcastle_skip_if(BACKEND != "nccl", "Only nccl backend supports all_to_all_fp16") + @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"])) + def test_all_to_all_fp16(self): + store = dist.FileStore(self.file_name, int(self.world_size)) + dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='nccl') + device = torch.device(f"cuda:{self.rank}") + group = list(range(0, self.world_size)) + group_id = dist.new_group(range(self.world_size)) + rank_to_GPU = self._init_multigpu_helper() + self._test_all_to_all( + group, + group_id, + self.rank, + cuda=True, + rank_to_GPU=rank_to_GPU, + dtype=torch.float32, + qtype=DQuantType.FP16) + + def _test_all_gather( + self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float, qtype=None): + for dest in group: + tensor = _build_tensor(dest + 1, rank, dtype=dtype) + tensors = [_build_tensor(dest + 1, -1, dtype=dtype) for i in group] + expected_tensors = [_build_tensor(dest + 1, i, dtype=dtype) for i in group] + if (qtype is not None): + allgather = quant.auto_quantize(dist.all_gather, qtype, quant_loss=None) + else: + allgather = dist.all_gather + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors] + if tensors[0].dtype == torch.complex64: + tensor_shapes = [torch.view_as_real(tensors[0]).shape] + else: + tensor_shapes = [tensors[0].shape] + allgather(tensors, tensor, group=group_id, async_op=False) + + for t1, t2 in zip(tensors, expected_tensors): + self.assertEqual(t1, t2) + + def _test_all_to_all( + self, + group, + group_id, + rank, + cuda=False, + rank_to_GPU=None, + dtype=torch.float, + qtype=None + ): + if group_id is not None: + size = len(group) + in_splits = [i + 1 for i in group] + in_tensors = [ + torch.ones([in_splits[i], size], dtype=dtype) * rank + for i, _ in enumerate(group) + ] + out_tensors = [ + torch.ones([(rank + 1), size], dtype=dtype) for _ in group + ] + expected_tensors = [ + torch.ones([rank + 1, size], dtype=dtype) * i for i in group + ] + if cuda: + in_tensors = [t.cuda(rank_to_GPU[rank][0]) for t in in_tensors] + expected_tensors = [ + t.cuda(rank_to_GPU[rank][0]) for t in expected_tensors + ] + out_tensors = [t.cuda(rank_to_GPU[rank][0]) for t in out_tensors] + if(qtype is not None): + quantize_alltoall = quant.auto_quantize(dist.all_to_all, qtype, quant_loss=None) + quantize_alltoall(out_tensors, in_tensors, group=group_id) + else: + dist.all_to_all(out_tensors, in_tensors, group=group_id) + for t1, t2 in zip(out_tensors, expected_tensors): + self.assertEqual(t1, t2) + +if __name__ == "__main__": + run_tests() diff --git a/torch/distributed/algorithms/quantization.py b/torch/distributed/algorithms/quantization/quantization.py similarity index 99% rename from torch/distributed/algorithms/quantization.py rename to torch/distributed/algorithms/quantization/quantization.py index dead78af600b2..724d6aa362487 100644 --- a/torch/distributed/algorithms/quantization.py +++ b/torch/distributed/algorithms/quantization/quantization.py @@ -86,16 +86,13 @@ def auto_quantize(func, qtype, quant_loss=None): """ This is a prototype API that automatically quantize the input tensors, choose the precision types, and pass other necessary arguments and then dequantizes the output. - Currently it only supports: . FP16 quantization method . all_gather, all_to_all collective ops - Args: func (callable): A function representing collective operations. qtype (QuantType): Quantization method quant_loss (float, optional): This can be used to improve accuracy in the dequantization. - Returns: (callable): the same collective as func but enables automatic quantization/dequantization. """ diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 1631983d32ec7..f4bc073a4317e 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -19,7 +19,6 @@ import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD import torch.distributed.algorithms.model_averaging.averagers as averagers import torch.distributed.algorithms.model_averaging.utils as model_averaging_utils -import torch.distributed.algorithms.quantization as quant import torch.nn as nn import torch.nn.functional as F from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR @@ -29,7 +28,6 @@ from torch.distributed.algorithms.ddp_comm_hooks import ( quantization as quantization_hooks, ) -from torch.distributed.algorithms.quantization import DQuantType from torch.distributed.distributed_c10d import ( get_world_size, _get_default_group, @@ -2764,15 +2762,12 @@ def test_gather_full_group(self): # ALL GATHER def _test_all_gather_helper( - self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float, qtype=None + self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float ): for dest in group: tensor = _build_tensor(dest + 1, rank, dtype=dtype) tensors = [_build_tensor(dest + 1, -1, dtype=dtype) for i in group] - if qtype is not None: - allgather = quant.auto_quantize(dist.all_gather, qtype, quant_loss=None) - else: - allgather = dist.all_gather + allgather = dist.all_gather if cuda: tensor = tensor.cuda(rank_to_GPU[rank][0]) tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors] @@ -2838,12 +2833,6 @@ def test_all_gather_full_group(self): group, group_id, rank = self._init_full_group_test() self._test_all_gather_helper(group, group_id, rank) - @sandcastle_skip_if(BACKEND == "nccl", "Nccl does not support CPU tensors") - @sandcastle_skip_if(BACKEND == "mpi", "all_gather_quantized does not support MPI") - def test_all_gather_quantized(self): - group, group_id, rank = self._init_global_test() - self._test_all_gather_helper(group, group_id, rank, dtype=torch.float32, qtype=DQuantType.FP16) - def _run_all_gather_coalesced_and_verify( self, output_tensor_lists, input_tensors, expected_tensors, group_id ): @@ -3046,7 +3035,6 @@ def _test_all_to_all_helper( cuda=False, rank_to_GPU=None, dtype=torch.float, - qtype=None ): if group_id is not None: size = len(group) @@ -3067,11 +3055,7 @@ def _test_all_to_all_helper( t.cuda(rank_to_GPU[rank][0]) for t in expected_tensors ] out_tensors = [t.cuda(rank_to_GPU[rank][0]) for t in out_tensors] - if(qtype is not None): - quantize_alltoall = quant.auto_quantize(dist.all_to_all, qtype, quant_loss=None) - quantize_alltoall(out_tensors, in_tensors, group=group_id) - else: - dist.all_to_all(out_tensors, in_tensors, group=group_id) + dist.all_to_all(out_tensors, in_tensors, group=group_id) for t1, t2 in zip(out_tensors, expected_tensors): self.assertEqual(t1, t2) self._barrier() @@ -3154,20 +3138,6 @@ def test_all_to_all(self): group, group_id, rank = self._init_global_test() self._test_all_to_all_helper(group, group_id, rank) - @sandcastle_skip_if(BACKEND != "nccl", "Only NCCL supports all_to_all") - @skip_if_rocm - def test_all_to_all_quantized(self): - group, group_id, rank = self._init_global_test() - rank_to_GPU = self._init_multigpu_helper() - self._test_all_to_all_helper( - group, - group_id, - rank, - cuda=True, - rank_to_GPU=rank_to_GPU, - dtype=torch.float32, - qtype=DQuantType.FP16) - @sandcastle_skip_if(BACKEND != "nccl", "Only NCCL supports CUDA all_to_all") @skip_if_rocm def test_all_to_all_cuda(self): From 83d9bad44a1e1e6202103cd22e4dbd2bd3d7dae0 Mon Sep 17 00:00:00 2001 From: albanD Date: Tue, 24 Aug 2021 06:52:38 -0700 Subject: [PATCH 162/530] Add a common autograd TLS state (#63114) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63114 This PR collapses the GradMode and InferenceMode thread local booleans into a single thread local uint8. This helps reducing the number of thread local variable accesses done when we propagate ThreadLocalStates. Note that this is even more beneficial as we will add a forward mode AD TLS (similar to GradMode) higher in this stack and this new structure should reduce the perf impact of adding this new TLS. Here is the full benchmark result between master and the top of this stack: https://gist.github.com/albanD/e421101e9ed344e94999bef3a54bf0f3 tl;dr: give a benefit in most cases. It is never detrimental. Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30388099 Pulled By: albanD fbshipit-source-id: 8e03f940150ff063c2edd792733663413ae2f486 --- aten/src/ATen/ThreadLocalState.cpp | 22 +++++++++++------ aten/src/ATen/ThreadLocalState.h | 5 +--- c10/core/AutogradState.cpp | 19 +++++++++++++++ c10/core/AutogradState.h | 39 ++++++++++++++++++++++++++++++ c10/core/GradMode.cpp | 7 +++--- c10/core/InferenceMode.cpp | 8 +----- c10/core/InferenceMode.h | 19 +++++++-------- 7 files changed, 87 insertions(+), 32 deletions(-) create mode 100644 c10/core/AutogradState.cpp create mode 100644 c10/core/AutogradState.h diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index ba7be1a06b8a1..fc4b8fa9c27ec 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -12,15 +12,12 @@ namespace at { ThreadLocalState::ThreadLocalState(bool keep_grad_mode) : dispatch_key_(c10::impl::tls_local_dispatch_key_set()), debug_info_(c10::ThreadLocalDebugInfo::current()), - inference_mode_enabled_(c10::InferenceMode::is_enabled()) { + autograd_tls_(c10::AutogradState::get_tls_state()) { rf_tls_ = at::get_record_function_tls_(); saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks(); #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) keep_grad_mode_ = keep_grad_mode; - if (keep_grad_mode_) { - grad_mode_enabled_ = GradMode::is_enabled(); - } #endif bumped_record_all_functions_ = at::checkRecordAllFunctions(); } @@ -28,10 +25,23 @@ ThreadLocalState::ThreadLocalState(bool keep_grad_mode) /* static */ void ThreadLocalState::setThreadLocalState( const ThreadLocalState& state) { + // Note that setting the InferenceMode TLS in this function is ONLY ok because we always + // restore the dispatch key set TLS at the same time. #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) if (state.keep_grad_mode_) { - GradMode::set_enabled(state.grad_mode_enabled_); + c10::AutogradState::set_tls_state(state.autograd_tls_); + } else { + auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(), + /* inference_mode */ state.autograd_tls_.get_inference_mode()); + c10::AutogradState::set_tls_state(new_state); } +#else + // The mobile build explicitly ignore grad_mode but fails if we propagate + // its value across threads or set it to a fixed value. + // So we have to make sure the grad_mode value is not changed here. + auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(), + /* inference_mode */ state.autograd_tls_.get_inference_mode()); + c10::AutogradState::set_tls_state(new_state); #endif at::set_record_function_tls_(state.rf_tls_); @@ -43,8 +53,6 @@ void ThreadLocalState::setThreadLocalState( c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_); c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_); - - c10::InferenceMode::_set_enabled(state.inference_mode_enabled_); } } // namespace at diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index f30f5e3442cc1..4942399cbd6d7 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -35,14 +35,11 @@ class TORCH_API ThreadLocalState { // RecordFunction TLS RecordFunctionTLS rf_tls_; + AutogradState autograd_tls_; #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) bool keep_grad_mode_ = true; - bool grad_mode_enabled_; #endif - // TLS for InferenceMode - bool inference_mode_enabled_; - // TLS for saved tensors default hooks std::pair saved_tensors_default_hooks_; diff --git a/c10/core/AutogradState.cpp b/c10/core/AutogradState.cpp new file mode 100644 index 0000000000000..9684a76b78564 --- /dev/null +++ b/c10/core/AutogradState.cpp @@ -0,0 +1,19 @@ +#include + +namespace c10 { + +namespace { +// By default, grad mode is enabled and inference mode is disabled +thread_local AutogradState autograd_state_tls = + AutogradState(/* grad_mode */ true, /* inference_mode */ false); +} // namespace + +AutogradState& AutogradState::get_tls_state() { + return autograd_state_tls; +} + +void AutogradState::set_tls_state(AutogradState state) { + autograd_state_tls = state; +} + +} // namespace c10 diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h new file mode 100644 index 0000000000000..83ea3607cd2af --- /dev/null +++ b/c10/core/AutogradState.h @@ -0,0 +1,39 @@ +#pragma once + +#include + +#include + +namespace c10 { + +// Structure used to pack all the thread local boolean +// flags used by autograd +struct TORCH_API AutogradState { + static AutogradState& get_tls_state(); + static void set_tls_state(AutogradState state); + + AutogradState(bool grad_mode, bool inference_mode) + : grad_mode_(grad_mode), inference_mode_(inference_mode) {} + + void set_grad_mode(bool enabled) { + grad_mode_ = enabled; + } + + void set_inference_mode(bool enabled) { + inference_mode_ = enabled; + } + + bool get_grad_mode() const { + return grad_mode_; + } + + bool get_inference_mode() const { + return inference_mode_; + } + + private: + bool grad_mode_ : 1; + bool inference_mode_ : 1; +}; + +} // namespace c10 diff --git a/c10/core/GradMode.cpp b/c10/core/GradMode.cpp index 32747a6698afa..a5db198083b2b 100644 --- a/c10/core/GradMode.cpp +++ b/c10/core/GradMode.cpp @@ -1,16 +1,15 @@ +#include #include #include namespace c10 { -thread_local bool GradMode_enabled = true; - bool GradMode::is_enabled() { - return GradMode_enabled; + return AutogradState::get_tls_state().get_grad_mode(); } void GradMode::set_enabled(bool enabled) { - GradMode_enabled = enabled; + AutogradState::get_tls_state().set_grad_mode(enabled); } } // namespace c10 diff --git a/c10/core/InferenceMode.cpp b/c10/core/InferenceMode.cpp index b588ab4da54b5..59eca760cf504 100644 --- a/c10/core/InferenceMode.cpp +++ b/c10/core/InferenceMode.cpp @@ -2,18 +2,12 @@ #include namespace c10 { -thread_local bool InferenceMode_enabled = false; - // Invariant: // is_enabled() == // !c10::impl::tls_is_dispatch_key_included(DispatchKey::ADInplaceOrView); // InferenceMode::is_enabled() is in perf critical path (TensorImpl constructor) // so it worths a separate TLS to skip the DispatchKeySet check. bool InferenceMode::is_enabled() { - return InferenceMode_enabled; -} - -void InferenceMode::_set_enabled(bool enabled) { - InferenceMode_enabled = enabled; + return AutogradState::get_tls_state().get_inference_mode(); } } // namespace c10 diff --git a/c10/core/InferenceMode.h b/c10/core/InferenceMode.h index 7a9c2c593a453..9748d6eccfb54 100644 --- a/c10/core/InferenceMode.h +++ b/c10/core/InferenceMode.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -50,10 +51,12 @@ struct TORCH_API InferenceMode { // are applicable to InferenceMode as well, e.g. // `tensorTypeInCurrentExecutionContext` in interpreter.cpp. InferenceMode(bool enabled = true) - : prev_mode(InferenceMode::is_enabled()), - prev_keyset(c10::impl::tls_local_dispatch_key_set()), - grad_mode(at::AutoGradMode(!enabled)) { - _set_enabled(enabled); + : prev_mode(AutogradState::get_tls_state()), + prev_keyset(c10::impl::tls_local_dispatch_key_set()) { + // Enabling inference mode means disabling grad mode + // And disabling inference mode means enabling grad mode + AutogradState::set_tls_state( + AutogradState(/* grad_mode */ !enabled, /* inference_mode */ enabled)); DispatchKeySet included = enabled ? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView) : prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView); @@ -67,17 +70,13 @@ struct TORCH_API InferenceMode { } ~InferenceMode() { - _set_enabled(prev_mode); + AutogradState::set_tls_state(prev_mode); c10::impl::_force_tls_local_dispatch_key_set(prev_keyset); } static bool is_enabled(); - // _set_enabled() is not user facing and should be only used in - // ThreadLocalState.cpp. - static void _set_enabled(bool enabled); private: - bool prev_mode; + AutogradState prev_mode; c10::impl::LocalDispatchKeySet prev_keyset; - at::AutoGradMode grad_mode; }; } // namespace c10 From 9914fb66155ac2dc674f62e2705f78f837d00c3f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 24 Aug 2021 06:58:05 -0700 Subject: [PATCH 163/530] ENH Adds no_batch_dim tests/docs for LPPool1d and Identity (#62190) Summary: Fixes https://github.com/pytorch/pytorch/issues/60585 Pull Request resolved: https://github.com/pytorch/pytorch/pull/62190 Reviewed By: ejguan Differential Revision: D29942385 Pulled By: jbschlosser fbshipit-source-id: 00df6f6f01ad039631bb8679f8de94863aac7650 --- torch/nn/modules/linear.py | 4 ++++ torch/nn/modules/pooling.py | 4 ++-- torch/testing/_internal/common_nn.py | 8 ++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/torch/nn/modules/linear.py b/torch/nn/modules/linear.py index 07fe1063283fc..21425f2be2aad 100644 --- a/torch/nn/modules/linear.py +++ b/torch/nn/modules/linear.py @@ -16,6 +16,10 @@ class Identity(Module): args: any argument (unused) kwargs: any keyword argument (unused) + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + Examples:: >>> m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False) diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index d09e257452e44..3665e893fa5ec 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -882,8 +882,8 @@ class LPPool1d(_LPPoolNd): ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape Shape: - - Input: :math:`(N, C, L_{in})` - - Output: :math:`(N, C, L_{out})`, where + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where .. math:: L_{out} = \left\lfloor\frac{L_{in} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index aeaf6616e28b1..6b1bcf66f8bd5 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -2246,6 +2246,14 @@ def single_batch_reference_fn(input, parameters, module): cpp_constructor_args='torch::nn::LPPool1dOptions(2, 2).stride(3)', input_size=(1, 3, 7), ), + dict( + module_name='LPPool1d', + constructor_args=(2, 2, 3), + cpp_constructor_args='torch::nn::LPPool1dOptions(2, 2).stride(3)', + input_size=(3, 7), + reference_fn=single_batch_reference_fn, + desc='no_batch_dim', + ), dict( module_name='LocalResponseNorm', constructor_args=(3, ), From 688f06cac353149d92dfda4793a3fb003b4c0e5a Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Tue, 24 Aug 2021 07:20:56 -0700 Subject: [PATCH 164/530] Revert D30388099: Add a common autograd TLS state Test Plan: revert-hammer Differential Revision: D30388099 (https://github.com/pytorch/pytorch/commit/83d9bad44a1e1e6202103cd22e4dbd2bd3d7dae0) Original commit changeset: 8e03f940150f fbshipit-source-id: f6d60fec66e8292f5268335bb8a3e7e1a662f23b --- aten/src/ATen/ThreadLocalState.cpp | 22 ++++++----------- aten/src/ATen/ThreadLocalState.h | 5 +++- c10/core/AutogradState.cpp | 19 --------------- c10/core/AutogradState.h | 39 ------------------------------ c10/core/GradMode.cpp | 7 +++--- c10/core/InferenceMode.cpp | 8 +++++- c10/core/InferenceMode.h | 19 ++++++++------- 7 files changed, 32 insertions(+), 87 deletions(-) delete mode 100644 c10/core/AutogradState.cpp delete mode 100644 c10/core/AutogradState.h diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index fc4b8fa9c27ec..ba7be1a06b8a1 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -12,12 +12,15 @@ namespace at { ThreadLocalState::ThreadLocalState(bool keep_grad_mode) : dispatch_key_(c10::impl::tls_local_dispatch_key_set()), debug_info_(c10::ThreadLocalDebugInfo::current()), - autograd_tls_(c10::AutogradState::get_tls_state()) { + inference_mode_enabled_(c10::InferenceMode::is_enabled()) { rf_tls_ = at::get_record_function_tls_(); saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks(); #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) keep_grad_mode_ = keep_grad_mode; + if (keep_grad_mode_) { + grad_mode_enabled_ = GradMode::is_enabled(); + } #endif bumped_record_all_functions_ = at::checkRecordAllFunctions(); } @@ -25,23 +28,10 @@ ThreadLocalState::ThreadLocalState(bool keep_grad_mode) /* static */ void ThreadLocalState::setThreadLocalState( const ThreadLocalState& state) { - // Note that setting the InferenceMode TLS in this function is ONLY ok because we always - // restore the dispatch key set TLS at the same time. #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) if (state.keep_grad_mode_) { - c10::AutogradState::set_tls_state(state.autograd_tls_); - } else { - auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(), - /* inference_mode */ state.autograd_tls_.get_inference_mode()); - c10::AutogradState::set_tls_state(new_state); + GradMode::set_enabled(state.grad_mode_enabled_); } -#else - // The mobile build explicitly ignore grad_mode but fails if we propagate - // its value across threads or set it to a fixed value. - // So we have to make sure the grad_mode value is not changed here. - auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(), - /* inference_mode */ state.autograd_tls_.get_inference_mode()); - c10::AutogradState::set_tls_state(new_state); #endif at::set_record_function_tls_(state.rf_tls_); @@ -53,6 +43,8 @@ void ThreadLocalState::setThreadLocalState( c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_); c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_); + + c10::InferenceMode::_set_enabled(state.inference_mode_enabled_); } } // namespace at diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index 4942399cbd6d7..f30f5e3442cc1 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -35,11 +35,14 @@ class TORCH_API ThreadLocalState { // RecordFunction TLS RecordFunctionTLS rf_tls_; - AutogradState autograd_tls_; #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) bool keep_grad_mode_ = true; + bool grad_mode_enabled_; #endif + // TLS for InferenceMode + bool inference_mode_enabled_; + // TLS for saved tensors default hooks std::pair saved_tensors_default_hooks_; diff --git a/c10/core/AutogradState.cpp b/c10/core/AutogradState.cpp deleted file mode 100644 index 9684a76b78564..0000000000000 --- a/c10/core/AutogradState.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include - -namespace c10 { - -namespace { -// By default, grad mode is enabled and inference mode is disabled -thread_local AutogradState autograd_state_tls = - AutogradState(/* grad_mode */ true, /* inference_mode */ false); -} // namespace - -AutogradState& AutogradState::get_tls_state() { - return autograd_state_tls; -} - -void AutogradState::set_tls_state(AutogradState state) { - autograd_state_tls = state; -} - -} // namespace c10 diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h deleted file mode 100644 index 83ea3607cd2af..0000000000000 --- a/c10/core/AutogradState.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include - -#include - -namespace c10 { - -// Structure used to pack all the thread local boolean -// flags used by autograd -struct TORCH_API AutogradState { - static AutogradState& get_tls_state(); - static void set_tls_state(AutogradState state); - - AutogradState(bool grad_mode, bool inference_mode) - : grad_mode_(grad_mode), inference_mode_(inference_mode) {} - - void set_grad_mode(bool enabled) { - grad_mode_ = enabled; - } - - void set_inference_mode(bool enabled) { - inference_mode_ = enabled; - } - - bool get_grad_mode() const { - return grad_mode_; - } - - bool get_inference_mode() const { - return inference_mode_; - } - - private: - bool grad_mode_ : 1; - bool inference_mode_ : 1; -}; - -} // namespace c10 diff --git a/c10/core/GradMode.cpp b/c10/core/GradMode.cpp index a5db198083b2b..32747a6698afa 100644 --- a/c10/core/GradMode.cpp +++ b/c10/core/GradMode.cpp @@ -1,15 +1,16 @@ -#include #include #include namespace c10 { +thread_local bool GradMode_enabled = true; + bool GradMode::is_enabled() { - return AutogradState::get_tls_state().get_grad_mode(); + return GradMode_enabled; } void GradMode::set_enabled(bool enabled) { - AutogradState::get_tls_state().set_grad_mode(enabled); + GradMode_enabled = enabled; } } // namespace c10 diff --git a/c10/core/InferenceMode.cpp b/c10/core/InferenceMode.cpp index 59eca760cf504..b588ab4da54b5 100644 --- a/c10/core/InferenceMode.cpp +++ b/c10/core/InferenceMode.cpp @@ -2,12 +2,18 @@ #include namespace c10 { +thread_local bool InferenceMode_enabled = false; + // Invariant: // is_enabled() == // !c10::impl::tls_is_dispatch_key_included(DispatchKey::ADInplaceOrView); // InferenceMode::is_enabled() is in perf critical path (TensorImpl constructor) // so it worths a separate TLS to skip the DispatchKeySet check. bool InferenceMode::is_enabled() { - return AutogradState::get_tls_state().get_inference_mode(); + return InferenceMode_enabled; +} + +void InferenceMode::_set_enabled(bool enabled) { + InferenceMode_enabled = enabled; } } // namespace c10 diff --git a/c10/core/InferenceMode.h b/c10/core/InferenceMode.h index 9748d6eccfb54..7a9c2c593a453 100644 --- a/c10/core/InferenceMode.h +++ b/c10/core/InferenceMode.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include @@ -51,12 +50,10 @@ struct TORCH_API InferenceMode { // are applicable to InferenceMode as well, e.g. // `tensorTypeInCurrentExecutionContext` in interpreter.cpp. InferenceMode(bool enabled = true) - : prev_mode(AutogradState::get_tls_state()), - prev_keyset(c10::impl::tls_local_dispatch_key_set()) { - // Enabling inference mode means disabling grad mode - // And disabling inference mode means enabling grad mode - AutogradState::set_tls_state( - AutogradState(/* grad_mode */ !enabled, /* inference_mode */ enabled)); + : prev_mode(InferenceMode::is_enabled()), + prev_keyset(c10::impl::tls_local_dispatch_key_set()), + grad_mode(at::AutoGradMode(!enabled)) { + _set_enabled(enabled); DispatchKeySet included = enabled ? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView) : prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView); @@ -70,13 +67,17 @@ struct TORCH_API InferenceMode { } ~InferenceMode() { - AutogradState::set_tls_state(prev_mode); + _set_enabled(prev_mode); c10::impl::_force_tls_local_dispatch_key_set(prev_keyset); } static bool is_enabled(); + // _set_enabled() is not user facing and should be only used in + // ThreadLocalState.cpp. + static void _set_enabled(bool enabled); private: - AutogradState prev_mode; + bool prev_mode; c10::impl::LocalDispatchKeySet prev_keyset; + at::AutoGradMode grad_mode; }; } // namespace c10 From f4aff3a346a0525e37d6071f318f7a4c54d5e1fb Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Tue, 24 Aug 2021 08:01:36 -0700 Subject: [PATCH 165/530] [BE] add distributed run_test options (#63147) Summary: Currently distributed tests are mixed within test_python. We would like to split the distributed tests into its own batch thus we need to split them out. Adding an option to include/exclude distributed tests with CUSTOM_HANDLERS. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63147 Test Plan: - locally run with the addition run_test.py options. - CI Dependency: found a bug in mpiexec test and need https://github.com/pytorch/pytorch/issues/63580 to fix it first. Reviewed By: bdhirsh Differential Revision: D30496178 Pulled By: walterddr fbshipit-source-id: 7903a57b619f2425028028f944211938823918a6 --- .jenkins/pytorch/test.sh | 11 ++++++++--- test/run_test.py | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 4fce9ab00dcc9..e27ba3e0cd838 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -158,17 +158,17 @@ test_python_legacy_jit() { } test_python_shard1() { - time python test/run_test.py --exclude-jit-executor --shard 1 2 --verbose --determine-from="$DETERMINE_FROM" + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard 1 2 --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } test_python_shard2() { - time python test/run_test.py --exclude-jit-executor --shard 2 2 --verbose --determine-from="$DETERMINE_FROM" + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard 2 2 --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } test_python() { - time python test/run_test.py --exclude-jit-executor --verbose --determine-from="$DETERMINE_FROM" + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } @@ -297,6 +297,10 @@ test_vulkan() { } test_distributed() { + echo "Testing distributed python tests" + time python test/run_test.py --distributed-tests --verbose --determine-from="$DETERMINE_FROM" + assert_git_not_dirty + if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then echo "Testing distributed C++ tests" ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR" @@ -505,6 +509,7 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || "$ test_without_numpy install_torchvision test_python_shard1 + test_distributed test_aten elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 || "${SHARD_NUMBER}" == 2 ]]; then install_torchvision diff --git a/test/run_test.py b/test/run_test.py index ad3cbb90b6d30..f3b7bf72bbbba 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -394,6 +394,11 @@ 'test_jit_fuser_legacy', ] +DISTRIBUTED_TESTS = [ + 'distributed/test_distributed_fork', + 'distributed/test_distributed_spawn', +] + # Dictionary matching test modules (in TESTS) to lists of test cases (within that test_module) that would be run when # options.run_specified_test_cases is enabled. # For example: @@ -640,6 +645,11 @@ def parse_args(): '--jit', action='store_true', help='run all jit tests') + parser.add_argument( + '--distributed-tests', + '--distributed-tests', + action='store_true', + help='run all distributed tests') parser.add_argument( '-pt', '--pytest', action='store_true', help='If true, use `pytest` to execute the tests. E.g., this runs ' @@ -723,6 +733,11 @@ def parse_args(): action='store_true', help='exclude tests that are run for a specific jit config' ) + parser.add_argument( + '--exclude-distributed-tests', + action='store_true', + help='exclude distributed tests' + ) parser.add_argument( '--run-specified-test-cases', nargs='?', @@ -800,6 +815,7 @@ def exclude_tests(exclude_list, selected_tests, exclude_message=None): def get_selected_tests(options): + # First make sure run specific test cases options are processed. if options.run_specified_test_cases: if options.use_specified_test_cases_by == 'include': options.include = list(SPECIFIED_TEST_CASES_DICT.keys()) @@ -808,6 +824,16 @@ def get_selected_tests(options): selected_tests = options.include + # filter if there's JIT only and distributed only test options + if options.jit: + selected_tests = list( + filter(lambda test_name: "jit" in test_name, selected_tests)) + + if options.distributed_tests: + selected_tests = list( + filter(lambda test_name: test_name in DISTRIBUTED_TESTS, selected_tests)) + + # process reordering if options.bring_to_front: to_front = set(options.bring_to_front) selected_tests = options.bring_to_front + list(filter(lambda name: name not in to_front, @@ -821,9 +847,13 @@ def get_selected_tests(options): last_index = find_test_index(options.last, selected_tests, find_last_index=True) selected_tests = selected_tests[:last_index + 1] + # process exclusion if options.exclude_jit_executor: options.exclude.extend(JIT_EXECUTOR_TESTS) + if options.exclude_distributed_tests: + options.exclude.extend(DISTRIBUTED_TESTS) + selected_tests = exclude_tests(options.exclude, selected_tests) if sys.platform == 'win32' and not options.ignore_win_blocklist: @@ -840,6 +870,7 @@ def get_selected_tests(options): elif TEST_WITH_ROCM: selected_tests = exclude_tests(ROCM_BLOCKLIST, selected_tests, 'on ROCm') + # sharding if options.shard: assert len(options.shard) == 2, "Unexpected shard format" assert min(options.shard) > 0, "Shards must be positive numbers" @@ -1030,9 +1061,6 @@ def main(): if options.coverage and not PYTORCH_COLLECT_COVERAGE: shell(['coverage', 'erase']) - if options.jit: - selected_tests = filter(lambda test_name: "jit" in test_name, TESTS) - if options.determine_from is not None and os.path.exists(options.determine_from): slow_tests = get_slow_tests_based_on_S3(TESTS, TARGET_DET_LIST, SLOW_TEST_THRESHOLD) print('Added the following tests to target_det tests as calculated based on S3:') From 1385f9fb12e6607c98d2d9d5edaaaab2bc07386f Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Tue, 24 Aug 2021 08:19:38 -0700 Subject: [PATCH 166/530] [JIT] Add variadic stack op (#63578) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63578 Added a new op `prim::VarStack` and a pass that transforms instances of `aten::stack(list, dim)` into `prim::VarStack(list[0], ..., list[n], dim)`. Also provided a JIT interpreter implementation. Most of the implementation/tests are the same as `prim::VarConcat`. Test Plan: `buck test caffe2/test/cpp/jit:jit -- TestStackOpt` Reviewed By: navahgar Differential Revision: D30426232 fbshipit-source-id: 9829a7db6e0a5038c9b7528c43c25b0c221aa2ce --- aten/src/ATen/core/interned_strings.h | 1 + test/cpp/jit/CMakeLists.txt | 1 + test/cpp/jit/test_stack_opt.cpp | 308 +++++++++++++++++++ torch/csrc/jit/passes/variadic_ops.cpp | 9 + torch/csrc/jit/passes/variadic_ops.h | 7 + torch/csrc/jit/runtime/register_prim_ops.cpp | 12 + torch/csrc/jit/runtime/static/ops.cpp | 2 +- 7 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 test/cpp/jit/test_stack_opt.cpp diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index c1dbc75a26b99..69e5f97f7127a 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -84,6 +84,7 @@ namespace c10 { _(prim, NumToTensor) \ _(prim, Uninitialized) \ _(prim, VarConcat) \ + _(prim, VarStack) \ _(prim, With) \ _(prim, Enter) \ _(prim, Exit) \ diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index 08115433312f5..e766f33a250b2 100644 --- a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -62,6 +62,7 @@ set(JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_qualified_name.cpp ${JIT_TEST_ROOT}/test_save_load.cpp ${JIT_TEST_ROOT}/test_schema_matching.cpp + ${JIT_TEST_ROOT}/test_stack_opt.cpp ${JIT_TEST_ROOT}/test_subgraph_matcher.cpp ${JIT_TEST_ROOT}/test_subgraph_rewriter.cpp ${JIT_TEST_ROOT}/test_subgraph_utils.cpp diff --git a/test/cpp/jit/test_stack_opt.cpp b/test/cpp/jit/test_stack_opt.cpp new file mode 100644 index 0000000000000..fea1bb5f81042 --- /dev/null +++ b/test/cpp/jit/test_stack_opt.cpp @@ -0,0 +1,308 @@ +#include + +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +TEST(StackOptTest, UseVariadicStack) { + auto graph = std::make_shared(); + + const std::string input = + R"IR( + graph(%0: Float(56, 56, 56), + %1: Float(56, 56, 56), + %2: Float(56, 56, 56), + %3: Float(56, 56, 56), + %4: Float(56, 56, 56), + %5: Float(56, 56, 56)): + %10 : int = prim::Constant[value=0]() + %input : Tensor[] = prim::ListConstruct(%0, %1, %2, %3, %4, %5) + %stack : Float(5, 56, 56, 56) = aten::stack(%input, %10) + return (%stack) + )IR"; + parseIR(input, graph.get()); + std::vector inputs = { + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU)}; + auto orig_outputs = runGraph(graph, inputs); + + ASSERT_TRUE(UseVariadicStack(graph)); + graph->lint(); + auto opt_outputs = runGraph(graph, inputs); + + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); + + // After replacing `aten::stack` with `prim::VarStack` we should have the + // following graph: + // + // graph(%0 : ..., + // %1 : ...): + // %zero : int = prim:Constant[value=0]() + // %varstack : Tensor = prim::VarStack(%0, %1, %2, %3, %4, %5, %zero) + // return (%varstack) + testing::FileCheck() + .check_count("= prim::VarStack(", 1, /*exactly*/ true) + ->check_count("= aten::stack(", 0, /*exactly*/ true) + ->check_count("= prim::ListConstruct(", 0, /*exactly*/ true) + ->run(*graph); +} + +TEST(StackOptTest, UseVariadicStackReplaceMultiple) { + auto graph = std::make_shared(); + + const std::string input = + R"IR( + graph(%0: Float(56, 56, 56), + %1: Float(56, 56, 56), + %2: Float(56, 56, 56), + %3: Float(56, 56, 56)): + %10 : int = prim::Constant[value=0]() + %input1 : Tensor[] = prim::ListConstruct(%0, %1) + %stack1 : Float(4, 56, 56, 56) = aten::stack(%input1, %10) + %input2 : Tensor[] = prim::ListConstruct(%2, %3) + %stack2 : Float(4, 56, 56, 56) = aten::stack(%input2, %10) + return (%stack1, %stack2) + )IR"; + parseIR(input, graph.get()); + std::vector inputs = { + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU)}; + auto orig_outputs = runGraph(graph, inputs); + + ASSERT_TRUE(UseVariadicStack(graph)); + graph->lint(); + auto opt_outputs = runGraph(graph, inputs); + + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); + + // After full stack optimization we should have the following graph: + // + // graph(%0 : ..., + // %1 : ..., + // %2 : ..., + // %3 : ....): + // %zero : int = prim:Constant[value=0]() + // %varcat1 : Tensor = prim::VarStack(%0, %1, %zero) + // %varcat2 : Tensor = prim::VarStack(%2, %3, %zero) + // return (%varcat1, %varcat2) + testing::FileCheck() + .check_count("= prim::VarStack(", 2, /*exactly*/ true) + ->check_count("= aten::stack(", 0, /*exactly*/ true) + ->check_count("= prim::ListConstruct(", 0, /*exactly*/ true) + ->run(*graph); +} + +TEST(StackOptTest, UseVariadicStackWithMultipleListUses) { + auto graph = std::make_shared(); + + const std::string input = + R"IR( + graph(%0: Float(56, 56, 56), + %1: Float(56, 56, 56)): + %2 : int = prim::Constant[value=0]() + %input : Tensor[] = prim::ListConstruct(%0, %1) + %stack : Float(2, 56, 56, 56) = aten::stack(%input, %2) + return (%stack, %input) + )IR"; + parseIR(input, graph.get()); + std::vector inputs = { + at::rand({56, 56, 56}, at::kCPU), at::rand({56, 56, 56}, at::kCPU)}; + auto orig_outputs = runGraph(graph, inputs); + + ASSERT_TRUE(UseVariadicStack(graph)); + graph->lint(); + auto opt_outputs = runGraph(graph, inputs); + + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); + + // After replacing `aten::stack` with `prim::VarStack` we should have the + // following graph: + // + // graph(%0 : ..., + // %1 : ...): + // %zero : int = prim:Constant[value=0]() + // %input : Tensor[] = prim::ListConstruct(%0, %1) + // %varcat : Tensor = prim::VarStack(%0, %1, %zero) + // return (%varcat, %input) + testing::FileCheck() + .check_count("= prim::ListConstruct(", 1, /*exactly*/ true) + ->check_count("= prim::VarStack(", 1, /*exactly*/ true) + ->check_count("= aten::stack(", 0, /*exactly*/ true) + ->run(*graph); +} + +TEST(StackOptTest, UseVariadicStackWithListMutationAfterCat) { + auto graph = std::make_shared(); + + const std::string input = + R"IR( + graph(%0: Float(56, 56, 56), + %1: Float(56, 56, 56), + %2: Float(56, 56, 56)): + %10 : int = prim::Constant[value=0]() + %input : Tensor[] = prim::ListConstruct(%0, %1) + %stack : Float(3, 56, 56, 56) = aten::stack(%input, %10) + %11 : Tensor = aten::append(%input, %2) + return (%stack, %input) + )IR"; + parseIR(input, graph.get()); + std::vector inputs = { + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU)}; + auto orig_outputs = runGraph(graph, inputs); + + ASSERT_TRUE(UseVariadicStack(graph)); + graph->lint(); + auto opt_outputs = runGraph(graph, inputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); + + // The input list to `aten::stack` is mutated only after `aten::stack` op. So, + // it should have been replaced with `prim::VarStack`. The transformed graph + // should look like the following: + // + // graph(%0 : ..., + // %1 : ..., + // %2 : ...): + // %3 : int = prim:Constant[value=0]() + // %4 : Tensor[] = prim::ListConstruct(%0, %1) + // %7 : Tensor = prim::VarStack(%0, %1, %3) + // %6 : Tensor = aten::append(%4, %2) + // return (%7, %4) + testing::FileCheck() + .check_count("= prim::ListConstruct(", 1, /*exactly*/ true) + ->check_count("= prim::VarStack(", 1, /*exactly*/ true) + ->check_count("= aten::stack(", 0, /*exactly*/ true) + ->run(*graph); +} + +TEST(StackOptTest, UseVariadicStackWithListMutationBeforeCat) { + auto graph = std::make_shared(); + + const std::string input = + R"IR( + graph(%0: Float(56, 56, 56), + %1: Float(56, 56, 56), + %2: Float(56, 56, 56)): + %10 : int = prim::Constant[value=0]() + %input : Tensor[] = prim::ListConstruct(%0, %1) + %11 : Tensor = aten::append(%input, %2) + %stack : Float(3, 56, 56, 56) = aten::stack(%input, %10) + return (%stack) + )IR"; + parseIR(input, graph.get()); + std::vector inputs = { + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU)}; + auto orig_outputs = runGraph(graph, inputs); + + { + ASSERT_FALSE(UseVariadicStack(graph)); + graph->lint(); + auto opt_outputs = runGraph(graph, inputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); + + // No transformation should have happened since the `prim::ListConstruct` is + // mutated before `aten::stack`. + testing::FileCheck() + .check_count("= prim::ListConstruct(", 1, /*exactly*/ true) + ->check_count("= aten::stack(", 1, /*exactly*/ true) + ->check_count("= prim::VarStack(", 0, /*exactly*/ true) + ->run(*graph); + } + + { + ASSERT_TRUE(RemoveListMutationAndUseVariadicStack(graph)); + graph->lint(); + auto opt_outputs = runGraph(graph, inputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); + + // The mutation of the list must be removed and the `aten::stack` op must + // be replaced with the `prim::VarStack` op in the graph. The transformed + // graph should look like the following: + // + // graph(%0 : ..., + // %1 : ..., + // %2 : ...): + // %3 : int = prim:Constant[value=0]() + // %7 : Tensor = prim::VarStack(%0, %1, %2, %3) + // return (%7) + testing::FileCheck() + .check_count("= prim::VarStack(", 1, /*exactly*/ true) + ->check_count("= prim::ListConstruct(", 0, /*exactly*/ true) + ->check_count("= aten::stack(", 0, /*exactly*/ true) + ->run(*graph); + } +} + +TEST(StackOptTest, UseVariadicStackWithMultipleListMutations) { + auto graph = std::make_shared(); + + const std::string input = + R"IR( + graph(%0: Float(56, 56, 56), + %1: Float(56, 56, 56), + %2: Float(56, 56, 56), + %3: Float(56, 56, 56), + %4: Float(56, 56, 56)): + %10 : int = prim::Constant[value=0]() + %input : Tensor[] = prim::ListConstruct(%0, %1) + %stack.1 : Float(5, 56, 56, 56) = aten::stack(%input, %10) + %11 : Tensor = aten::append(%input, %2) + %stack.2 : Float(5, 56, 56, 56) = aten::stack(%input, %10) + %12 : Tensor = aten::append(%input, %3) + %stack.3 : Float(5, 56, 56, 56) = aten::stack(%input, %10) + %13 : Tensor = aten::append(%input, %4) + %stack.4 : Float(5, 56, 56, 56) = aten::stack(%input, %10) + return (%stack.1, %stack.2, %stack.3, %stack.4) + )IR"; + parseIR(input, graph.get()); + std::vector inputs = { + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU), + at::rand({56, 56, 56}, at::kCPU)}; + auto orig_outputs = runGraph(graph, inputs); + + ASSERT_TRUE(RemoveListMutationAndUseVariadicStack(graph)); + graph->lint(); + auto opt_outputs = runGraph(graph, inputs); + ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs)); + + // All the mutations of the list must be removed and the `aten::stack` ops + // must be replaced with `prim::VarStack` ops in the graph. The transformed + // graph should look like the following: + // + // graph(%0 : ..., + // %1 : ..., + // %2 : ..., + // %3 : ..., + // %4 : ...): + // %10 : int = prim:Constant[value=0]() + // %5 : Tensor = prim::VarStack(%0, %1, %10) + // %6 : Tensor = prim::VarStack(%0, %1, %2, %10) + // %7 : Tensor = prim::VarStack(%0, %1, %2, %3, %10) + // %8 : Tensor = prim::VarStack(%0, %1, %2, %3, %4, %10) + // return (%5, %6, %7, %8) + testing::FileCheck() + .check_count("= prim::VarStack(", 4, /*exactly*/ true) + ->check_count("= prim::ListConstruct(", 0, /*exactly*/ true) + ->check_count("= aten::stack(", 0, /*exactly*/ true) + ->run(*graph); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/passes/variadic_ops.cpp b/torch/csrc/jit/passes/variadic_ops.cpp index aeb70747b3a57..6f4d23cec7b66 100644 --- a/torch/csrc/jit/passes/variadic_ops.cpp +++ b/torch/csrc/jit/passes/variadic_ops.cpp @@ -122,5 +122,14 @@ bool RemoveListMutationAndUseVariadicCat(const std::shared_ptr& graph) { return RemoveListMutationAndUseVariadicOp(graph, aten::cat, prim::VarConcat); } +bool UseVariadicStack(const std::shared_ptr& graph) { + return UseVariadicOp(graph, aten::stack, prim::VarStack); +} + +bool RemoveListMutationAndUseVariadicStack( + const std::shared_ptr& graph) { + return RemoveListMutationAndUseVariadicOp(graph, aten::stack, prim::VarStack); +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/variadic_ops.h b/torch/csrc/jit/passes/variadic_ops.h index 1c52e9513ae2b..20cc6648dddb4 100644 --- a/torch/csrc/jit/passes/variadic_ops.h +++ b/torch/csrc/jit/passes/variadic_ops.h @@ -12,5 +12,12 @@ TORCH_API bool UseVariadicCat(const std::shared_ptr& graph); TORCH_API bool RemoveListMutationAndUseVariadicCat( const std::shared_ptr& graph); +// Replaces the `aten::stack` ops in the given graph with variadic cat ops. +// Returns true if the graph is modified. +TORCH_API bool UseVariadicStack(const std::shared_ptr& graph); + +TORCH_API bool RemoveListMutationAndUseVariadicStack( + const std::shared_ptr& graph); + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index 984073fbf72c1..60458a0ae11e5 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -776,6 +776,18 @@ RegisterOperators reg( push(stack, at::cat(inputs, dim)); }, aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("prim::VarStack(...) -> Tensor"), + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + auto dim = pop(stack).toInt(); + std::vector inputs(num_inputs - 1); + for (int i = 0; i < num_inputs - 1; ++i) { + inputs[num_inputs - 2 - i] = pop(stack).toTensor(); + } + push(stack, at::stack(inputs, dim)); + }, + aliasAnalysisFromSchema()), OperatorGenerator( TORCH_SELECTIVE_SCHEMA( "aten::eq.enum(AnyEnumType a, AnyEnumType b) -> bool"), diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 5ef770c6755fc..62d86126a0577 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -214,7 +214,7 @@ std::function getOutOfPlaceOperation(Node* n) { // Returns true if the node represents an op with variadic arguments. bool hasVarArgs(Node* n) { - if (n->kind() == prim::VarConcat) { + if (n->kind() == prim::VarConcat || n->kind() == prim::VarStack) { return true; } return false; From d3be02d1004aa84cb2e001a2d72a04356d4d4cd3 Mon Sep 17 00:00:00 2001 From: mingfeima Date: Tue, 24 Aug 2021 08:22:47 -0700 Subject: [PATCH 167/530] fix batchnorm2d issue when input is non contiguous (#63392) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63392 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30476317 Pulled By: VitalyFedyunin fbshipit-source-id: 03055a0aec21cf2c029b6f32315da2b09cb722d0 --- aten/src/ATen/native/Normalization.cpp | 14 ++- .../src/ATen/native/cpu/batch_norm_kernel.cpp | 95 ++++++++----------- test/test_nn.py | 19 ++++ 3 files changed, 71 insertions(+), 57 deletions(-) diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 611faf010abaf..25ae1a765e85f 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -74,6 +74,13 @@ static inline bool is_contiguous(const Tensor& t) { return t.is_contiguous() || t.is_contiguous(at::MemoryFormat::ChannelsLast); } +// For some ambiguous cases, it is possible a channels last contiguous Tensor has +// `suggest_memory_format` of Contiguous. +// See https://github.com/pytorch/pytorch/issues/63224 for details. +static inline MemoryFormat suggest_memory_format_contig(const Tensor& t) { + return t.is_contiguous() ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; +} + template std::tuple batch_norm_cpu_transform_input_template( const Tensor& input, const Tensor& weight, const Tensor& bias, @@ -87,10 +94,9 @@ std::tuple batch_norm_cpu_transform_input_template( && running_mean.is_contiguous() && running_var.is_contiguous(); - Tensor output = at::empty_like(input, input.suggest_memory_format()); - // inference contiguous path if (all_contiguous) { + Tensor output = at::empty_like(input, suggest_memory_format_contig(input)); batch_norm_cpu_stub(kCPU, output, input, weight, bias, save_mean, save_invstd, running_mean, running_var, train, eps); return std::make_tuple(output, save_mean, save_invstd); @@ -120,6 +126,7 @@ std::tuple batch_norm_cpu_transform_input_template( auto b = bias.defined() ? as_nd(bias) : at::detail::scalar_tensor_static(0, input.scalar_type(), kCPU); + Tensor output = at::empty_like(input, input.suggest_memory_format()); auto iter = TensorIteratorConfig() .add_output(output) .add_input(input) @@ -250,6 +257,9 @@ std::tuple batch_norm_backward_cpu_template( && input.suggest_memory_format() == grad_out_.suggest_memory_format(); if (all_contiguous) { + if (grad_input_mask[0]) { + grad_input = at::empty_like(input, suggest_memory_format_contig(input)); + } batch_norm_cpu_backward_stub(kCPU, grad_input, grad_weight, grad_bias, grad_out_, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); return std::make_tuple(grad_input, grad_weight, grad_bias); diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index 2d1275538d89f..75037606d3ff4 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -611,48 +611,38 @@ void batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad void batch_norm_cpu_kernel(Tensor& output, const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& save_mean, const Tensor& save_invstd, const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { - switch (input.suggest_memory_format()) { - case at::MemoryFormat::Contiguous: { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_contiguous", [&] { - batch_norm_cpu_contiguous_impl(output, input, weight, bias, - save_mean, save_invstd, running_mean, running_var, train, eps); - }); - break; - } - case at::MemoryFormat::ChannelsLast: { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_channels_last", [&] { - batch_norm_cpu_channels_last_impl(output, input, weight, bias, - save_mean, save_invstd, running_mean, running_var, train, eps); - }); - break; - } - default: - TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + if (input.is_contiguous()) { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_contiguous", [&] { + batch_norm_cpu_contiguous_impl(output, input, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + }); + } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_channels_last", [&] { + batch_norm_cpu_channels_last_impl(output, input, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + }); + } else { + TORCH_CHECK(false, "batch_norm_cpu_kernel: expecting input to be contiguous."); } } void batch_norm_cpu_collect_stats_kernel( Tensor& mean, Tensor& var_sum, const Tensor& input) { int64_t image_size = input.numel() / input.size(0) / input.size(1); - switch (input.suggest_memory_format()) { - case at::MemoryFormat::Contiguous: { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_contiguous", [&] { - if (image_size == 1) { // NC11 is also channels last - batch_norm_cpu_collect_stats_channels_last_impl(mean, var_sum, input); - } else { - batch_norm_cpu_collect_stats_contiguous_impl(mean, var_sum, input); - } - }); - break; - } - case at::MemoryFormat::ChannelsLast: { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_channels_last", [&] { + if (input.is_contiguous()) { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_contiguous", [&] { + if (image_size == 1) { // NC11 is also channels last batch_norm_cpu_collect_stats_channels_last_impl(mean, var_sum, input); - }); - break; - } - default: - TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } else { + batch_norm_cpu_collect_stats_contiguous_impl(mean, var_sum, input); + } + }); + } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_channels_last", [&] { + batch_norm_cpu_collect_stats_channels_last_impl(mean, var_sum, input); + }); + } else { + TORCH_CHECK(false, "batch_norm_cpu_collect_stats_kernel: expecting input to be contiguous."); } } @@ -661,28 +651,23 @@ void batch_norm_cpu_backward_kernel(Tensor& grad_input, Tensor& grad_weight, Ten const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, bool train, double eps) { int64_t image_size = input.numel() / input.size(0) / input.size(1); - switch (input.suggest_memory_format()) { - case at::MemoryFormat::Contiguous: { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_contiguous", [&] { - if (image_size == 1) { // NC11 is also channels last - batch_norm_cpu_backward_channels_last_impl(grad_input, grad_weight, grad_bias, - grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); - } else { - batch_norm_cpu_backward_contiguous_impl(grad_input, grad_weight, grad_bias, - grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); - } - }); - break; - } - case at::MemoryFormat::ChannelsLast: { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_channels_last", [&] { + if (input.is_contiguous()) { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_contiguous", [&] { + if (image_size == 1) { // NC11 is also channels last batch_norm_cpu_backward_channels_last_impl(grad_input, grad_weight, grad_bias, grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); - }); - break; - } - default: - TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } else { + batch_norm_cpu_backward_contiguous_impl(grad_input, grad_weight, grad_bias, + grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); + } + }); + } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_channels_last", [&] { + batch_norm_cpu_backward_channels_last_impl(grad_input, grad_weight, grad_bias, + grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); + }); + } else { + TORCH_CHECK(false, "batch_norm_cpu_backward_kernel: expecting input to be contiguous."); } } diff --git a/test/test_nn.py b/test/test_nn.py index f4691e6a5fa03..07a2b48cc6a20 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -8929,6 +8929,25 @@ def helper(self, size): helper(self, (4, 1, 9, 9)) helper(self, (4, 9, 1, 1)) + def test_batchnorm_non_contig_cpu(self): + input = torch.arange(6, dtype=torch.float).reshape(1, 3, 2, 1).cpu() + input = input.permute(0, 2, 1, 3) + + bn = torch.nn.BatchNorm2d(2).cpu().float().eval() + bn.weight.data.uniform_() + bn.bias.data.uniform_() + + ref_input = input.detach().clone().contiguous() + ref_bn = nn.BatchNorm2d(2).cpu().float().eval() + ref_bn.load_state_dict(bn.state_dict()) + + out = bn(input) + ref_out = ref_bn(ref_input) + + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_out.is_contiguous()) + self.assertEqual(out, ref_out) + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @unittest.skipIf(not TEST_CUDNN, "needs cudnn") @skipIfRocm From 2ca2761f3c448e58d587b440d459d328ae8efc95 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 24 Aug 2021 08:26:21 -0700 Subject: [PATCH 168/530] ENH Adds no_batch_dim for NLLLoss (#62651) Summary: Towards https://github.com/pytorch/pytorch/issues/60585 Pull Request resolved: https://github.com/pytorch/pytorch/pull/62651 Reviewed By: VitalyFedyunin Differential Revision: D30303340 Pulled By: jbschlosser fbshipit-source-id: 7ab478cf63bf6cd1f850cad5fd101e74a2cfe3f5 --- aten/src/ATen/native/LossNLL.cpp | 32 +++++++++++++++++----------- aten/src/ATen/native/cuda/Loss.cu | 1 - torch/nn/modules/loss.py | 5 +++-- torch/testing/_internal/common_nn.py | 11 ++++++++-- 4 files changed, 31 insertions(+), 18 deletions(-) diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 7c306c2bb863c..c7c65f7b8cc22 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -22,10 +22,12 @@ TORCH_META_FUNC(nll_loss_forward) TORCH_CHECK( self.dim() > 0 && self.dim() <= 2, "input tensor should be 1D or 2D"); TORCH_CHECK( - target.dim() == 1, - "1D target tensor expected, multi-target not supported"); + target.dim() <= 1, + "0D or 1D target tensor expected, multi-target not supported"); + + auto no_batch_dim = self.dim() == 1 && target.dim() == 0; TORCH_CHECK( - self.size(0) == target.size(0), + no_batch_dim || (self.size(0) == target.size(0)), "size mismatch (got input: ", self.sizes(), ", target: ", @@ -66,10 +68,12 @@ TORCH_META_FUNC(nll_loss_backward) TORCH_CHECK( self.dim() > 0 && self.dim() <= 2, "input tensor should be 1D or 2D"); TORCH_CHECK( - target.dim() == 1, - "1D target tensor expected, multi-target not supported"); + target.dim() <= 1, + "0D or 1D target tensor expected, multi-target not supported"); + + auto no_batch_dim = self.dim() == 1 && target.dim() == 0; TORCH_CHECK( - self.size(0) == target.size(0), + no_batch_dim || (self.size(0) == target.size(0)), "size mismatch (got input: ", self.sizes(), ", target: ", @@ -181,7 +185,6 @@ static void nll_loss_out_frame( const int64_t ndim = input.dim(); TORCH_CHECK(ndim <= 2); const int64_t batch_size = ndim == 1 ? 1 : input.size(0); - TORCH_CHECK(target.size(0) == batch_size); constexpr int64_t cascade_sum_num_levels = 8; const int64_t level_power = @@ -298,7 +301,11 @@ static void nll_loss_backward_out_frame( const auto n_dims = input.dim(); const auto n_classes = input.size(-1); - auto target_acc = target.accessor(); + auto target_ = target; + if (target.dim() == 0) { + target_ = target.unsqueeze(0); + } + auto target_acc = target_.accessor(); auto weight_contiguous = optional_contiguous(weight); const scalar_t* weight_data = optional_data(weight_contiguous); @@ -349,7 +356,6 @@ static void nll_loss_backward_out_frame( auto grad_input_acc = grad_input.accessor(); const auto batch_size = input.size(0); - TORCH_CHECK(target.size(0) == batch_size); for (int64_t i = 0; i < batch_size; i++) { const auto cur_target = target_acc[i]; @@ -548,12 +554,12 @@ Tensor nll_loss_nd( const c10::optional& weight, int64_t reduction, int64_t ignore_index) { - if (self.dim() < 2) { + if (self.dim() < 1) { TORCH_CHECK_VALUE( - false, "Expected 2 or more dimensions (got ", self.dim(), ")"); + false, "Expected 1 or more dimensions (got ", self.dim(), ")"); } - if (self.sizes()[0] != target.sizes()[0]) { + if (self.dim() != 1 && self.sizes()[0] != target.sizes()[0]) { TORCH_CHECK_VALUE( false, "Expected input batch_size (", @@ -566,7 +572,7 @@ Tensor nll_loss_nd( Tensor ret; Tensor input_ = self; Tensor target_ = target; - if (input_.dim() == 2) { + if (input_.dim() == 1 || input_.dim() == 2) { ret = at::nll_loss(input_, target_, weight, reduction, ignore_index); } else if (input_.dim() == 4) { ret = at::nll_loss2d(input_, target_, weight, reduction, ignore_index); diff --git a/aten/src/ATen/native/cuda/Loss.cu b/aten/src/ATen/native/cuda/Loss.cu index d814eae01f4ec..ac9c3c0d8130f 100644 --- a/aten/src/ATen/native/cuda/Loss.cu +++ b/aten/src/ATen/native/cuda/Loss.cu @@ -468,7 +468,6 @@ void nll_loss_backward_out_cuda_template( int64_t n_dims = input.dim(); int64_t n_classes = input.size(-1); int64_t batch_size = n_dims == 1 ? 1 : input.size(0); - int64_t num_targets = target.size(0); auto weight_ = weight.defined() ? weight.contiguous() : weight; diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index 7f39db405c861..03732b6d192d8 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -164,10 +164,11 @@ class NLLLoss(_WeightedLoss): :attr:`reduction`. Default: ``'mean'`` Shape: - - Input: :math:`(N, C)` where `C = number of classes`, or + - Input: :math:`(N, C)` or :math:`(C)`, where `C = number of classes`, or :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of `K`-dimensional loss. - - Target: :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`, or + - Target: :math:`(N)` or :math:`()`, where each value is + :math:`0 \leq \text{targets}[i] \leq C-1`, or :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of K-dimensional loss. - Output: If :attr:`reduction` is ``'none'``, shape :math:`(N)` or diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 6b1bcf66f8bd5..90024dea510ed 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -97,6 +97,7 @@ def get_weight(m): # - `test_cpp_api_parity`: if `False`, skips the C++ parity test for this test dict. Default: True. # - `has_parity`: if `False`, expects this test dict to fail the C++ parity test. Default: True. + module_tests = [ dict( module_name='Linear', @@ -1308,6 +1309,7 @@ def single_batch_reference_fn(input, parameters, module): with freeze_rng_state(): return module(single_batch_input).squeeze(0) + new_module_tests = [ poissonnllloss_no_reduce_test(), bceloss_no_reduce_test(), @@ -4055,6 +4057,7 @@ def kldivloss_reference(input, target, reduction='mean'): return result.sum() / result.size(0) return result + def kldivloss_log_target_reference(input, target, reduction='mean'): result = torch.exp(target) * (target - input) if reduction == 'mean': @@ -5182,6 +5185,7 @@ def single_batch_reference_criterion_fn(*args): ('HingeEmbeddingLoss', lambda: torch.randn(9), lambda: torch.tensor([-1, 1, 1] * 3)), ('MultiLabelMarginLoss', lambda: torch.randn(4), lambda: torch.tensor([3, 0, -1, 1])), ('SoftMarginLoss', lambda: torch.randn(9), lambda: torch.tensor([-1, 1, 1] * 3)), + ('NLLLoss', lambda: F.log_softmax(torch.randn(3), dim=0), lambda: torch.tensor(1)), ] classification_criterion_no_batch_extra_info: Dict[str, dict] = { 'MultiLabelMarginLoss': {'check_gradgrad': False}, @@ -5580,6 +5584,7 @@ def test_cuda(self, test_case): self.test_noncontig(test_case, gpu_module, gpu_input_tuple) + class InputVariableMixin(object): def _get_input(self): input = TestBase._get_input(self, False) # type: ignore[arg-type] @@ -5888,8 +5893,10 @@ def convert_dtype(obj, dtype, requires_grad=False): test_case.assertEqualIgnoreType(cpu_output, gpu_output, atol=1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4, rtol=0) - cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_output, cpu_target, extra_args=extra_args) - gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_output, gpu_target, extra_args=extra_args) + cpu_gradInput = test_case._backward_criterion( + cpu_module, cpu_input, cpu_output, cpu_target, extra_args=extra_args) + gpu_gradInput = test_case._backward_criterion( + gpu_module, gpu_input, gpu_output, gpu_target, extra_args=extra_args) # dtype used to be able to be None, so set precision in this way instead of a precision map # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 test_case.assertEqualIgnoreType(cpu_gradInput, gpu_gradInput, From 33a163d886d1a7b236bc34e69d5a7415a133bc23 Mon Sep 17 00:00:00 2001 From: yanbing-j Date: Tue, 24 Aug 2021 08:32:33 -0700 Subject: [PATCH 169/530] Enable BFloat16 LeakyReLU and RReLU in CPU path (#61514) Summary: Enable and optimize BFloat16 LeakyReLU and RReLU in CPU path. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61514 Reviewed By: ejguan Differential Revision: D30257612 Pulled By: VitalyFedyunin fbshipit-source-id: 8cc0d1faacd02dcc9827af724a86d95b6952748f --- aten/src/ATen/native/Activation.cpp | 2 +- aten/src/ATen/native/cpu/Activation.cpp | 93 +++++++++++++------ test/cpp/api/functional.cpp | 70 +++++++------- test/cpp/api/modules.cpp | 80 ++++++++-------- test/test_autograd.py | 6 ++ .../_internal/common_methods_invocations.py | 1 + 6 files changed, 154 insertions(+), 98 deletions(-) diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index e6ae3c9ebc3d7..37700bb586793 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -524,7 +524,7 @@ Tensor& rrelu_with_noise_out_cpu(const Tensor& self, c10::optional generator, Tensor& output) { if (training) { - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "rrelu_with_noise_out_cpu", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "rrelu_with_noise_out_cpu", [&] { _rrelu_with_noise_train(output, self.contiguous(), noise, lower, upper, generator); }); return output; diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index ae1403d1a25d1..fc5cc0d1924fb 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -401,41 +401,80 @@ void hardswish_backward_kernel(TensorIterator& iter) { } static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "leaky_relu_cpu", [&] { - using Vec = Vectorized; - auto zero_vec = Vec((scalar_t)(0)); - auto one_vec = Vec((scalar_t)(1)); - scalar_t negval = negval_.to(); - Vec negval_v = Vec(negval); + if (iter.common_dtype() == kBFloat16) { + auto zero_vec = Vectorized((float)(0)); + auto one_vec = Vectorized((float)(1)); + float negval = negval_.to(); + Vectorized negval_v = Vectorized(negval); cpu_kernel_vec( iter, - [&](scalar_t a) -> scalar_t { - return a > scalar_t(0) ? a : a * negval; + [&](BFloat16 a) -> BFloat16 { + return float(a) > float(0) ? float(a) : float(a) * negval; }, - [&](Vec a) -> Vec { - auto r = Vec::blendv(negval_v, one_vec, a > zero_vec); - return a * r; + [&](Vectorized a) -> Vectorized { + Vectorized a0, a1; + std::tie(a0, a1) = convert_bfloat16_float(a); + auto res0 = a0 * (Vectorized::blendv(negval_v, one_vec, a0 > zero_vec)); + auto res1 = a1 * (Vectorized::blendv(negval_v, one_vec, a1 > zero_vec)); + return convert_float_bfloat16(res0, res1); }); - }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "leaky_relu_cpu", [&] { + using Vec = Vectorized; + auto zero_vec = Vec((scalar_t)(0)); + auto one_vec = Vec((scalar_t)(1)); + scalar_t negval = negval_.to(); + Vec negval_v = Vec(negval); + cpu_kernel_vec( + iter, + [&](scalar_t a) -> scalar_t { + return a > scalar_t(0) ? a : a * negval; + }, + [&](Vec a) -> Vec { + auto r = Vec::blendv(negval_v, one_vec, a > zero_vec); + return a * r; + }); + }); + } } static void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& negval_) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "leaky_relu_backward_cpu", [&] { - using Vec = Vectorized; - auto zero_vec = Vec((scalar_t)(0)); - auto one_vec = Vec((scalar_t)(1)); - scalar_t negval = negval_.to(); - Vec negval_v = Vec(negval); + if (iter.common_dtype() == kBFloat16) { + auto zero_vec = Vectorized((float)(0)); + auto one_vec = Vectorized((float)(1)); + float negval = negval_.to(); + Vectorized negval_v = Vectorized(negval); cpu_kernel_vec( - iter, - [&](scalar_t a, scalar_t b) -> scalar_t { - return a > scalar_t(0) ? b : b * negval; - }, - [&](Vec a, Vec b) -> Vec { - auto r = Vec::blendv(negval_v, one_vec, a > zero_vec); - return b * r; - }); - }); + iter, + [&](BFloat16 a, BFloat16 b) -> BFloat16 { + return float(a) > float(0) ? float(b) : float(b) * negval; + }, + [&](Vectorized a, Vectorized b) -> Vectorized { + Vectorized a0, a1, b0, b1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + auto res0 = b0 * (Vectorized::blendv(negval_v, one_vec, a0 > zero_vec)); + auto res1 = b1 * (Vectorized::blendv(negval_v, one_vec, a1 > zero_vec)); + return convert_float_bfloat16(res0, res1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "leaky_relu_backward_cpu", [&] { + using Vec = Vectorized; + auto zero_vec = Vec((scalar_t)(0)); + auto one_vec = Vec((scalar_t)(1)); + scalar_t negval = negval_.to(); + Vec negval_v = Vec(negval); + cpu_kernel_vec( + iter, + [&](scalar_t a, scalar_t b) -> scalar_t { + return a > scalar_t(0) ? b : b * negval; + }, + [&](Vec a, Vec b) -> Vec { + auto r = Vec::blendv(negval_v, one_vec, a > zero_vec); + return b * r; + }); + }); + } } void softplus_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_) { diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index 582b1eebdb784..2ecb84189c55a 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -1034,17 +1034,19 @@ TEST_F(FunctionalTest, LeakyReLU) { const auto size = 3; for (const auto negative_slope : {0.0, 0.42, 1.0}) { for (const auto inplace : {false, true}) { - auto x = torch::linspace(-10.0, 10.0, size * size * size); - x.resize_({size, size, size}); - auto y_exp = (x < 0) * x * negative_slope + (x >= 0) * x; - auto y = F::leaky_relu(x, F::LeakyReLUFuncOptions() - .negative_slope(negative_slope).inplace(inplace)); + for (const auto type : {torch::kFloat, torch::kBFloat16}) { + auto x = torch::linspace(-10.0, 10.0, size * size * size).to(type); + x.resize_({size, size, size}); + auto y_exp = (x < 0) * x * negative_slope + (x >= 0) * x; + auto y = F::leaky_relu(x, F::LeakyReLUFuncOptions() + .negative_slope(negative_slope).inplace(inplace)); - ASSERT_EQ(y.ndimension(), 3); - ASSERT_EQ(y.sizes(), std::vector({size, size, size})); - ASSERT_TRUE(torch::allclose(y, y_exp)); - if (inplace) { - ASSERT_TRUE(torch::allclose(x, y_exp)); + ASSERT_EQ(y.ndimension(), 3); + ASSERT_EQ(y.sizes(), std::vector({size, size, size})); + ASSERT_TRUE(torch::allclose(y, y_exp)); + if (inplace) { + ASSERT_TRUE(torch::allclose(x, y_exp)); + } } } } @@ -1443,19 +1445,21 @@ TEST_F(FunctionalTest, RReLU) { for (const auto lower : {0.01, 0.1, 0.2}) { for (const auto upper : {0.3, 0.4, 0.5}) { for (const auto inplace : {false, true}) { - auto x = torch::linspace(-10.0, 10.0, size * size * size); - x.resize_({size, size, size}); - auto x_copy = x.clone(); - auto y = F::rrelu(x, F::RReLUFuncOptions().lower(lower) - .upper(upper).inplace(inplace)); - auto z = ((x_copy >= 0) * (x_copy == y) + - (x_copy < 0) * (y >= x_copy * upper) * (y <= lower * x_copy)) * 1.0; - - ASSERT_EQ(y.ndimension(), 3); - ASSERT_EQ(y.sizes(), std::vector({size, size, size})); - ASSERT_TRUE(torch::allclose(z, torch::ones_like(z))); - if (inplace) { - ASSERT_TRUE(torch::allclose(x, y)); + for (const auto type : {torch::kFloat, torch::kBFloat16}) { + auto x = torch::linspace(-10.0, 10.0, size * size * size).to(type); + x.resize_({size, size, size}); + auto x_copy = x.clone(); + auto y = F::rrelu(x, F::RReLUFuncOptions().lower(lower) + .upper(upper).inplace(inplace)); + auto z = ((x_copy >= 0) * (x_copy == y) + + (x_copy < 0) * (y >= x_copy * upper) * (y <= lower * x_copy)) * 1.0; + + ASSERT_EQ(y.ndimension(), 3); + ASSERT_EQ(y.sizes(), std::vector({size, size, size})); + ASSERT_TRUE(torch::allclose(z, torch::ones_like(z))); + if (inplace) { + ASSERT_TRUE(torch::allclose(x, y)); + } } } } @@ -1467,16 +1471,18 @@ TEST_F(FunctionalTest, RReLUDefaultOptions) { const auto size = 3; const auto lower = 1.0 / 8.0; const auto upper = 1.0 / 3.0; - auto x = torch::linspace(-10.0, 10.0, size * size * size); - x.resize_({size, size, size}); - auto x_copy = x.clone(); - auto y = F::rrelu(x); - auto z = ((x_copy >= 0) * (x_copy == y) + - (x_copy < 0) * (y >= x_copy * upper) * (y <= lower * x_copy)) * 1.0; + for (const auto type : {torch::kFloat, torch::kBFloat16}) { + auto x = torch::linspace(-10.0, 10.0, size * size * size).to(type); + x.resize_({size, size, size}); + auto x_copy = x.clone(); + auto y = F::rrelu(x); + auto z = ((x_copy >= 0) * (x_copy == y) + + (x_copy < 0) * (y >= x_copy * upper) * (y <= lower * x_copy)) * 1.0; - ASSERT_EQ(y.ndimension(), 3); - ASSERT_EQ(y.sizes(), std::vector({size, size, size})); - ASSERT_TRUE(torch::allclose(z, torch::ones_like(z))); + ASSERT_EQ(y.ndimension(), 3); + ASSERT_EQ(y.sizes(), std::vector({size, size, size})); + ASSERT_TRUE(torch::allclose(z, torch::ones_like(z))); + } } TEST_F(FunctionalTest, CELU) { diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 036ff5e4bf2ec..23d75efeee21f 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -2521,25 +2521,27 @@ TEST_F(ModulesTest, LeakyReLU) { const auto size = 3; for (const auto inplace : {false, true}) { for (const auto negative_slope : {0.0, 0.42, 1.0}) { - LeakyReLU model {LeakyReLUOptions().negative_slope(negative_slope).inplace(inplace)}; - auto x = torch::linspace(-10.0, 10.0, size * size * size); - x.resize_({size, size, size}); - if (!inplace) { - x.requires_grad_(true); - } - auto x_orig = x.clone(); - auto y = model(x); - torch::Tensor s = y.sum(); + for (const auto type : {torch::kFloat, torch::kBFloat16}) { + LeakyReLU model {LeakyReLUOptions().negative_slope(negative_slope).inplace(inplace)}; + auto x = torch::linspace(-10.0, 10.0, size * size * size).to(type); + x.resize_({size, size, size}); + if (!inplace) { + x.requires_grad_(true); + } + auto x_orig = x.clone(); + auto y = model(x); + torch::Tensor s = y.sum(); - ASSERT_EQ(s.ndimension(), 0); - ASSERT_EQ(y.ndimension(), 3); - ASSERT_EQ(y.sizes(), std::vector({size, size, size})); - auto y_exp = (x_orig < 0) * x_orig * negative_slope + (x_orig >= 0) * x_orig; - ASSERT_TRUE(torch::allclose(y, y_exp)); - if (inplace) { - ASSERT_TRUE(torch::allclose(x, y_exp)); - } else { - s.backward(); + ASSERT_EQ(s.ndimension(), 0); + ASSERT_EQ(y.ndimension(), 3); + ASSERT_EQ(y.sizes(), std::vector({size, size, size})); + auto y_exp = (x_orig < 0) * x_orig * negative_slope + (x_orig >= 0) * x_orig; + ASSERT_TRUE(torch::allclose(y, y_exp)); + if (inplace) { + ASSERT_TRUE(torch::allclose(x, y_exp)); + } else { + s.backward(); + } } } } @@ -2740,26 +2742,28 @@ TEST_F(ModulesTest, RReLU) { for (const auto lower : {0.01, 0.1, 0.2}) { for (const auto upper : {0.3, 0.4, 0.5}) { for (const auto inplace : {false, true}) { - RReLU model {RReLUOptions().lower(lower).upper(upper).inplace(inplace)}; - auto x = torch::linspace(-10.0, 10.0, size * size * size); - x.resize_({size, size, size}); - if (!inplace) { - x.requires_grad_(true); - } - auto x_orig = x.clone(); - auto y = model(x); - torch::Tensor s = y.sum(); - - ASSERT_EQ(s.ndimension(), 0); - ASSERT_EQ(y.ndimension(), 3); - ASSERT_EQ(y.sizes(), std::vector({size, size, size})); - auto z = ((x_orig >= 0) * (x_orig == y) + - (x_orig < 0) * (y >= x_orig * upper) * (y <= lower * x_orig)) * 1.0; - ASSERT_TRUE(torch::allclose(z, torch::ones_like(z))); - if (inplace) { - ASSERT_TRUE(torch::allclose(x, y)); - } else { - s.backward(); + for (const auto type : {torch::kFloat, torch::kBFloat16}) { + RReLU model {RReLUOptions().lower(lower).upper(upper).inplace(inplace)}; + auto x = torch::linspace(-10.0, 10.0, size * size * size).to(type); + x.resize_({size, size, size}); + if (!inplace) { + x.requires_grad_(true); + } + auto x_orig = x.clone(); + auto y = model(x); + torch::Tensor s = y.sum(); + + ASSERT_EQ(s.ndimension(), 0); + ASSERT_EQ(y.ndimension(), 3); + ASSERT_EQ(y.sizes(), std::vector({size, size, size})); + auto z = ((x_orig >= 0) * (x_orig == y) + + (x_orig < 0) * (y >= x_orig * upper) * (y <= lower * x_orig)) * 1.0; + ASSERT_TRUE(torch::allclose(z, torch::ones_like(z))); + if (inplace) { + ASSERT_TRUE(torch::allclose(x, y)); + } else { + s.backward(); + } } } } diff --git a/test/test_autograd.py b/test/test_autograd.py index 8b7aeb4159f23..126d9230fe687 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -8256,6 +8256,12 @@ def test_leaky_relu_inplace_with_zero_slope(self, device): expected = torch.tensor([0., 0., 1.], device=device) self.assertEqual(a.grad, expected) + a_bf16 = torch.tensor([-2., 0., 2.], device=device, dtype=torch.bfloat16, requires_grad=True) + b_bf16 = torch.nn.functional.leaky_relu_(a_bf16.clone(), 0.0) + b_bf16.backward(torch.ones(3, device=device)) + expected_bf16 = torch.tensor([0., 0., 1.], device=device, dtype=torch.bfloat16) + self.assertEqual(a_bf16.grad, expected_bf16) + @onlyOnCPUAndCUDA def test_elu_inplace_with_neg_alpha(self, device): a = torch.tensor([-1., 1.], device=device, requires_grad=True) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 63af3965a2e7d..8dd9db3303456 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -7154,6 +7154,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): aten_name="leaky_relu", dtypes=floating_types(), sample_inputs_func=sample_inputs_leaky_relu, + dtypesIfCPU=floating_types_and(torch.bfloat16), dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), supports_autograd=True, assert_autodiffed=True, From 94d621584a8d2780252546aa787aab23203221b2 Mon Sep 17 00:00:00 2001 From: mingfeima Date: Tue, 24 Aug 2021 08:54:36 -0700 Subject: [PATCH 170/530] optimize BFloat16 elemwise operators CPU: sigmoid, sigmoid_backward, tanh_backward, addcmul, addcdiv (#55221) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55221 Test Plan: Imported from OSS Reviewed By: bdhirsh Differential Revision: D28836797 Pulled By: VitalyFedyunin fbshipit-source-id: 6b79098c902ffe65d228668118ef36fb49bab800 --- aten/src/ATen/native/cpu/BinaryOpsKernel.cpp | 65 ++++++++++---- .../ATen/native/cpu/PointwiseOpsKernel.cpp | 84 ++++++++++++++----- aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 36 ++++++-- .../_internal/common_methods_invocations.py | 4 +- 4 files changed, 143 insertions(+), 46 deletions(-) diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 2a8f73cb88dd0..16efa2511899f 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -684,19 +684,35 @@ void sigmoid_backward_kernel(TensorIteratorBase& iter) { return a * ((one_vec - b) * b).conj(); }); }); + } else if (iter.dtype() == kBFloat16) { + auto one_vec = Vectorized((float)(1)); + cpu_kernel_vec( + iter, + [=](BFloat16 a, BFloat16 b) -> BFloat16 { + float a0 = static_cast(a); + float b0 = static_cast(b); + return a0 * (float(1) - b0) * b0; + }, + [=](Vectorized a, Vectorized b) { + Vectorized a0, a1, b0, b1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + a0 = a0 * (one_vec - b0) * b0; + a1 = a1 * (one_vec - b1) * b1; + return convert_float_bfloat16(a0, a1); + }); } else { - AT_DISPATCH_FLOATING_TYPES_AND2( - kBFloat16, kHalf, iter.dtype(), "sigmoid_backward_cpu", [&]() { - auto one_vec = Vectorized((scalar_t)(1)); - cpu_kernel_vec( - iter, - [=](scalar_t a, scalar_t b) -> scalar_t { - return a * (scalar_t(1) - b) * b; - }, - [=](Vectorized a, Vectorized b) { - return a * (one_vec - b) * b; - }); + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "sigmoid_backward_cpu", [&]() { + auto one_vec = Vectorized((scalar_t)(1)); + cpu_kernel_vec( + iter, + [=](scalar_t a, scalar_t b) -> scalar_t { + return a * (scalar_t(1) - b) * b; + }, + [=](Vectorized a, Vectorized b) { + return a * (one_vec - b) * b; }); + }); } } @@ -754,15 +770,32 @@ void tanh_backward_kernel(TensorIteratorBase& iter) { if (isComplexType(iter.dtype())) { AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "tanh_backward_cpu", [&]() { auto one_vec = Vectorized(scalar_t{1}); + cpu_kernel_vec( + iter, + [=](scalar_t a, scalar_t b) -> scalar_t { + return a * std::conj(scalar_t{1} - b * b); + }, + [=](Vectorized a, Vectorized b) { + return a * (one_vec - b * b).conj(); + }); + }); + } else if (iter.dtype() == kBFloat16) { + auto one_vec = Vectorized(float{1}); cpu_kernel_vec( iter, - [=](scalar_t a, scalar_t b) -> scalar_t { - return a * std::conj(scalar_t{1} - b * b); + [=](BFloat16 a, BFloat16 b) -> BFloat16 { + float a0 = float(a); + float b0 = float(b); + return a0 * (float{1} - b0 * b0); }, - [=](Vectorized a, Vectorized b) { - return a * (one_vec - b * b).conj(); + [=](Vectorized a, Vectorized b) { + Vectorized a0, a1, b0, b1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + a0 = a0 * (one_vec - b0 * b0); + a1 = a1 * (one_vec - b1 * b1); + return convert_float_bfloat16(a0, a1); }); - }); } else { AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "tanh_backward_cpu", [&]() { auto one_vec = Vectorized(scalar_t{1}); diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp index e0807d14b1b5a..0d0508adb7c11 100644 --- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp @@ -12,38 +12,82 @@ namespace { static void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { ScalarType dtype = iter.dtype(0); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(dtype, "addcmul_cpu_out", [&] { - scalar_t scalar_val = value.to(); - auto scalar_vec = Vectorized(scalar_val); + if (iter.dtype() == kBFloat16) { + float float_val = value.to(); + auto float_vec = Vectorized(float_val); cpu_kernel_vec( iter, - [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t { - return self_val + scalar_val * t1_val * t2_val; + [=](BFloat16 self_val, BFloat16 t1_val, BFloat16 t2_val) -> BFloat16 { + return float(self_val) + float_val * float(t1_val) * float(t2_val); }, - [=](Vectorized self_vec, - Vectorized t1_vec, - Vectorized t2_vec) { - return self_vec + scalar_vec * t1_vec * t2_vec; + [=](Vectorized self_vec, + Vectorized t1_vec, + Vectorized t2_vec) { + Vectorized self_vec0, self_vec1; + std::tie(self_vec0, self_vec1) = convert_bfloat16_float(self_vec); + Vectorized t1_vec0, t1_vec1, t2_vec0, t2_vec1; + std::tie(t1_vec0, t1_vec1) = convert_bfloat16_float(t1_vec); + std::tie(t2_vec0, t2_vec1) = convert_bfloat16_float(t2_vec); + self_vec0 = self_vec0 + float_vec * t1_vec0 * t2_vec0; + self_vec1 = self_vec1 + float_vec * t1_vec1 * t2_vec1; + return convert_float_bfloat16(self_vec0, self_vec1); }); - }); + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX(dtype, "addcmul_cpu_out", [&] { + scalar_t scalar_val = value.to(); + auto scalar_vec = Vectorized(scalar_val); + cpu_kernel_vec( + iter, + [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t { + return self_val + scalar_val * t1_val * t2_val; + }, + [=](Vectorized self_vec, + Vectorized t1_vec, + Vectorized t2_vec) { + return self_vec + scalar_vec * t1_vec * t2_vec; + }); + }); + } } static void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { ScalarType dtype = iter.dtype(0); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(dtype, "addcdiv_cpu_out", [&] { - scalar_t scalar_val = value.to(); - auto scalar_vec = Vectorized(scalar_val); + if (dtype == kBFloat16) { + float float_val = value.to(); + auto float_vec = Vectorized(float_val); cpu_kernel_vec( iter, - [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t { - return self_val + scalar_val * t1_val / t2_val; + [=](BFloat16 self_val, BFloat16 t1_val, BFloat16 t2_val) -> BFloat16 { + return float(self_val) + float_val * float(t1_val) / float(t2_val); }, - [=](Vectorized self_vec, - Vectorized t1_vec, - Vectorized t2_vec) { - return self_vec + scalar_vec * t1_vec / t2_vec; + [=](Vectorized self_vec, + Vectorized t1_vec, + Vectorized t2_vec) { + Vectorized self_vec0, self_vec1; + std::tie(self_vec0, self_vec1) = convert_bfloat16_float(self_vec); + Vectorized t1_vec0, t1_vec1, t2_vec0, t2_vec1; + std::tie(t1_vec0, t1_vec1) = convert_bfloat16_float(t1_vec); + std::tie(t2_vec0, t2_vec1) = convert_bfloat16_float(t2_vec); + self_vec0 = self_vec0 + float_vec * t1_vec0 / t2_vec0; + self_vec1 = self_vec1 + float_vec * t1_vec1 / t2_vec1; + return convert_float_bfloat16(self_vec0, self_vec1); }); - }); + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX(dtype, "addcdiv_cpu_out", [&] { + scalar_t scalar_val = value.to(); + auto scalar_vec = Vectorized(scalar_val); + cpu_kernel_vec( + iter, + [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t { + return self_val + scalar_val * t1_val / t2_val; + }, + [=](Vectorized self_vec, + Vectorized t1_vec, + Vectorized t2_vec) { + return self_vec + scalar_vec * t1_vec / t2_vec; + }); + }); + } } static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) { diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index a867a2a0ce519..007e444d5cd33 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -35,18 +35,36 @@ namespace CPU_CAPABILITY { using namespace vec; static void sigmoid_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.common_dtype(), "sigmoid_cpu", [&]() { + if (iter.common_dtype() == kBFloat16) { cpu_kernel_vec( iter, - [=](scalar_t a) -> scalar_t { return (static_cast(1) / (static_cast(1) + std::exp((-a)))); }, - [=](Vectorized a) { - a = Vectorized(static_cast(0)) - a; - a = a.exp(); - a = Vectorized(static_cast(1)) + a; - a = a.reciprocal(); - return a; + [=](BFloat16 a) -> BFloat16 { + float a0 = static_cast(a); + return static_cast(1) / (static_cast(1) + std::exp((-a0))); + }, + [=](Vectorized a) { + Vectorized a0, a1; + std::tie(a0, a1) = convert_bfloat16_float(a); + a0 = (Vectorized(static_cast(1)) + a0.neg().exp()).reciprocal(); + a1 = (Vectorized(static_cast(1)) + a1.neg().exp()).reciprocal(); + return convert_float_bfloat16(a0, a1); }); - }); + } else { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.common_dtype(), "sigmoid_cpu", [&]() { + cpu_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { + return (static_cast(1) / (static_cast(1) + std::exp((-a)))); + }, + [=](Vectorized a) { + a = Vectorized(static_cast(0)) - a; + a = a.exp(); + a = Vectorized(static_cast(1)) + a; + a = a.reciprocal(); + return a; + }); + }); + } } #if AT_MKL_ENABLED() diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 8dd9db3303456..b725c4831d25f 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -5689,6 +5689,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): gradcheck_nondet_tol=GRADCHECK_NONDET_TOL), OpInfo('addcmul', dtypes=all_types_and_complex(), + dtypesIfCPU=all_types_and_complex_and(torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.float16, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, @@ -5699,6 +5700,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): sample_inputs_func=sample_inputs_addcmul_addcdiv), OpInfo('addcdiv', dtypes=floating_and_complex_types(), + dtypesIfCPU=floating_and_complex_types_and(torch.bfloat16), dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), supports_inplace_autograd=False, supports_forward_ad=True, @@ -7677,7 +7679,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), # "tanh_backward_cpu" not implemented for 'BFloat16' - backward_dtypesIfCPU=all_types_and_complex_and(torch.bool), + backward_dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16), assert_autodiffed=True, safe_casts_outputs=True, supports_forward_ad=True, From 227cb268bccd22feb8aa8651773a202ec1e09c7f Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 24 Aug 2021 09:24:50 -0700 Subject: [PATCH 171/530] [Reland] Embedding thrust->cub migration (#63806) Summary: Fixes https://github.com/pytorch/pytorch/issues/63427 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63806 Reviewed By: bdhirsh Differential Revision: D30498255 Pulled By: ngimel fbshipit-source-id: 78b7085a92a168cf0163f53dcb712bac922f5235 --- aten/src/ATen/cuda/cub.cuh | 19 +++- aten/src/ATen/native/cuda/Embedding.cu | 91 +++++-------------- .../native/cuda/EmbeddingBackwardKernel.cuh | 4 - aten/src/ATen/native/cuda/Indexing.cu | 3 - .../ATen/native/cuda/LegacyThrustHelpers.cu | 43 +++++++++ aten/src/ATen/native/cuda/Randperm.cu | 2 - aten/src/ATen/native/cuda/UniqueCub.cu | 13 +-- torch/testing/_internal/common_nn.py | 8 ++ 8 files changed, 95 insertions(+), 88 deletions(-) diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 62da28d34e8e5..38e5852260f3a 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -3,6 +3,7 @@ #include #include #include +#include // include cub in a safe manner, see: // https://github.com/pytorch/pytorch/pull/55292 @@ -102,6 +103,8 @@ static inline void sort_keys( const key_t *keys_in, key_t *keys_out, int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8 ) { + TORCH_CHECK(n <= std::numeric_limits::max(), + "cub sort does not support sorting more than INT_MAX elements"); using key_t_ = typename detail::cuda_type::type; const key_t_ *keys_in_ = reinterpret_cast(keys_in); @@ -124,6 +127,8 @@ static inline void sort_pairs( const value_t *values_in, value_t *values_out, int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8 ) { + TORCH_CHECK(n <= std::numeric_limits::max(), + "cub sort does not support sorting more than INT_MAX elements"); using key_t_ = typename detail::cuda_type::type; auto allocator = c10::cuda::CUDACachingAllocator::get(); @@ -156,6 +161,10 @@ static inline void segmented_sort_pairs( OffsetIteratorT begin_offsets, OffsetIteratorT end_offsets, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8 ) { + TORCH_CHECK(num_elements <= std::numeric_limits::max(), + "cub sort does not support sorting more than INT_MAX elements"); + TORCH_CHECK(num_segments <= std::numeric_limits::max(), + "cub sort does not support sorting more than INT_MAX elements"); using key_t_ = typename detail::cuda_type::type; auto allocator = c10::cuda::CUDACachingAllocator::get(); @@ -305,4 +314,12 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT } } -}}} +template +inline void unique(InputIteratorT input, OutputIteratorT output, NumSelectedIteratorT num_selected_out, int64_t num_items) { + TORCH_CHECK(num_items <= std::numeric_limits::max(), + "cub unique does not support more than INT_MAX elements"); + CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceSelect::Unique, + input, output, num_selected_out, num_items, at::cuda::getCurrentCUDAStream()); +} + +}}} // namespace at::cuda::cub diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 10a42b8914e62..ba79fa10f926a 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -7,12 +7,9 @@ #include #include -#include #include -#include -#include -#include +#include #include #include @@ -224,14 +221,19 @@ __global__ void renorm_kernel( } // anonymous namespace -Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices, +template +void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); + +Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) { auto grad_arg = TensorArg(grad_, "grad", 1); - auto indices_arg = TensorArg(indices, "indices", 1); + auto indices_arg = TensorArg(indices_, "indices", 1); checkScalarTypes("embedding_backward", indices_arg, {kLong, kInt}); checkSameGPU("embedding_backward", grad_arg, indices_arg); + auto indices = indices_.contiguous(); + auto num_indices = indices.numel(); auto grad = grad_.contiguous().view({num_indices, grad_.size(-1)}); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -272,59 +274,16 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice auto orig_indices = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); Tensor count; AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { - using device_ptr = thrust::device_ptr; - - // Sort the inputs into sorted with the corresponding indices; we - // don't need a stable or multidimensional sort, so just use Thrust - // directly - { - sorted_indices.copy_(indices); - - auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); - auto policy = thrust::cuda::par(allocator).on(stream); - - // Fill sortedOrigIndices with sequential indices - auto count_iter = thrust::counting_iterator(0); - auto orig_data = device_ptr(orig_indices.data_ptr()); - thrust::copy(policy, count_iter, count_iter + num_indices, orig_data); - - // Sort; a stable sort is not required - auto sorted_data = device_ptr(sorted_indices.data_ptr()); - thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data, - LTOp()); - } + auto range = at::arange(num_indices, indices.options()); + int64_t nbits = cuda::cub::get_num_bits(num_weights); + cuda::cub::sort_pairs( + indices.data_ptr(), sorted_indices.data_ptr(), + range.data_ptr(), orig_indices.data_ptr(), + num_indices, false/*, 0, nbits*/); if (scale_grad_by_freq) { count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - - auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); - auto policy = thrust::cuda::par(allocator).on(stream); - - // Compute an increasing sequence per unique item in sortedIndices: - // sorted: 2 5 5 5 7 7 8 9 9 - // count: 1 1 2 3 1 2 1 1 2 - auto sorted_data = device_ptr(sorted_indices.data_ptr()); - auto count_data = device_ptr(count.data_ptr()); - thrust::inclusive_scan_by_key( - policy, - sorted_data, - sorted_data + num_indices, - thrust::make_constant_iterator(1), - count_data - ); - - // Take the maximum of each count per unique key in reverse: - // sorted: 2 5 5 5 7 7 8 9 9 - // count: 1 3 3 3 2 2 1 2 2 - thrust::inclusive_scan_by_key( - policy, - thrust::make_reverse_iterator(sorted_data + num_indices), - thrust::make_reverse_iterator(sorted_data), - thrust::make_reverse_iterator(count_data + num_indices), - thrust::make_reverse_iterator(count_data + num_indices), - thrust::equal_to(), - thrust::maximum() - ); + embedding_dense_backward_cuda_scan(sorted_indices, count); } }); @@ -340,23 +299,23 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, checkSameGPU("embedding_renorm", self_arg, indices_arg); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); - auto policy = thrust::cuda::par(allocator).on(stream); AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_renorm_cuda_", [&] () { - using device_ptr = thrust::device_ptr; auto num_indices = indices.numel(); auto indices_contig = std::get<0>(indices.sort()).contiguous(); - auto indices_data = device_ptr(indices_contig.data_ptr()); - auto unique_indices = at::empty(indices.numel(), indices.options()); - auto unique_data = device_ptr(unique_indices.data_ptr()); - auto end = thrust::unique_copy(policy, indices_data, indices_data + num_indices, unique_data); - auto num_unique_indices = static_cast(end - unique_data); + auto num_unique_indices = at::empty({}, indices.options().dtype(kLong)); + + cuda::cub::unique( + indices_contig.data_ptr(), + unique_indices.data_ptr(), + num_unique_indices.data_ptr(), + num_indices + ); - dim3 grid(num_unique_indices); - dim3 block(128); + dim3 grid = num_unique_indices.item(); + dim3 block = 128; int dim = self.stride(0); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "embedding_backward", [&] { diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh index f06b850668591..c79bf83cc8a6a 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh @@ -10,10 +10,6 @@ #include #include -#include -#include -#include - #pragma once namespace at { diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 95ab33e512f02..57654f2fb9b74 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -218,9 +218,6 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List::max(), - "index_put of tensors larger than INT_MAX is not supported yet in pytorch"); - if (num_indices > 0 && sliceSize > 0) { const bool permuted = !src.is_contiguous(); auto src_ = permuted ? src.contiguous() : src; diff --git a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu index 582dc9ebe0498..446aa085a31d3 100644 --- a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu +++ b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu @@ -5,6 +5,8 @@ #include #include #include +#include +#include namespace at { namespace native { @@ -30,4 +32,45 @@ void index_put_with_sort_kernel_thrust_helper(Tensor &linearIndex, Tensor &orig_ thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data, LTOp()); } +template +void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) { + using device_ptr = thrust::device_ptr; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto num_indices = count.numel(); + + // Compute an increasing sequence per unique item in sortedIndices: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 1 2 3 1 2 1 1 2 + auto sorted_data = device_ptr(sorted_indices.data_ptr()); + auto count_data = device_ptr(count.data_ptr()); + thrust::inclusive_scan_by_key( + policy, + sorted_data, + sorted_data + num_indices, + thrust::make_constant_iterator(1), + count_data + ); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( + policy, + thrust::make_reverse_iterator(sorted_data + num_indices), + thrust::make_reverse_iterator(sorted_data), + thrust::make_reverse_iterator(count_data + num_indices), + thrust::make_reverse_iterator(count_data + num_indices), + thrust::equal_to(), + thrust::maximum() + ); +} + +template +void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); +template +void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); + }} diff --git a/aten/src/ATen/native/cuda/Randperm.cu b/aten/src/ATen/native/cuda/Randperm.cu index 4c5e16a1ceed0..56b8eb20faae6 100644 --- a/aten/src/ATen/native/cuda/Randperm.cu +++ b/aten/src/ATen/native/cuda/Randperm.cu @@ -47,8 +47,6 @@ template struct alignas(N) OpaqueType { char data[N]; }; Tensor& randperm_out_cuda(int64_t n, c10::optional generator, Tensor& result) { TORCH_CHECK(n >= 0, "n must be non-negative, got", n); - TORCH_CHECK(n <= std::numeric_limits::max(), - "randperm of tensors larger than INT_MAX is not supported yet in pytorch"); check_supported_max_int_with_precision(n, result); diff --git a/aten/src/ATen/native/cuda/UniqueCub.cu b/aten/src/ATen/native/cuda/UniqueCub.cu index 1b9619b29812b..eb31fd2f76bb8 100644 --- a/aten/src/ATen/native/cuda/UniqueCub.cu +++ b/aten/src/ATen/native/cuda/UniqueCub.cu @@ -94,13 +94,7 @@ std::tuple compute_unique( Tensor length = at::empty({1}, options); int64_t num_out; if (!return_counts) { - CUB_WRAPPER( - cub::DeviceSelect::Unique, - data, - data_out.data_ptr(), - length.data_ptr(), - num_inp, - stream); + cuda::cub::unique(data, data_out.data_ptr(), length.data_ptr(), num_inp); num_out = length.item(); } else { counts.resize_(num_inp); @@ -135,11 +129,6 @@ std::tuple unique_cuda_template( auto options = self.options().dtype(kLong); int64_t num_inp = self.numel(); - TORCH_CHECK( - num_inp <= INT_MAX, - "num_inp ", - num_inp, - " is too big to be handled by cub"); Tensor sorted; Tensor self_c = self.contiguous(); if (consecutive) { diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 90024dea510ed..e0d09b7ba03fc 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -2774,6 +2774,14 @@ def single_batch_reference_fn(input, parameters, module): input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), check_gradgrad=False, ), + dict( + module_name='Embedding', + constructor_args=(4, 3), + cpp_constructor_args='torch::nn::EmbeddingOptions(4, 3)', + input_fn=lambda: torch.empty(1, 512, dtype=torch.long).random_(4).expand(7, 512), + check_gradgrad=False, + desc='discontiguous' + ), dict( module_name='EmbeddingBag', constructor_args=(4, 3), From 7774a4e95b69d7a61ae1644a27f9b924d93ca600 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Tue, 24 Aug 2021 09:38:25 -0700 Subject: [PATCH 172/530] [Static Runtime] Implement prim::VarStack out variant (#63579) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63579 Provide a static runtime out variant implementation for the new op introduced in D30426232 (https://github.com/pytorch/pytorch/commit/1385f9fb12e6607c98d2d9d5edaaaab2bc07386f). Test Plan: `buck test //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- IndividualOps_VarStack` Reviewed By: navahgar Differential Revision: D30410525 fbshipit-source-id: bc59a3d8ad23e3d94561ec2dca9cc20687dbadf8 --- benchmarks/static_runtime/test_scripts.h | 5 +++ .../static_runtime/test_static_runtime.cc | 16 ++++++++++ torch/csrc/jit/runtime/static/impl.cpp | 1 + torch/csrc/jit/runtime/static/ops.cpp | 31 ++++++++++++++++--- 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 9e01d3b8d0b87..c82dd57752bd6 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -587,6 +587,11 @@ const auto var_cat_script = R"JIT( return torch.cat([inp1, inp2], dim).clone() )JIT"; +const auto var_stack_script = R"JIT( + def forward(self, inp1: Tensor, inp2: Tensor, dim: int): + return torch.stack([inp1, inp2], dim).clone() +)JIT"; + const auto isinstance_int_script = R"JIT( def forward(self, a: Any): return isinstance(a, int) diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 71102215b4e2e..701231e7720d1 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1195,3 +1195,19 @@ TEST(StaticRuntime, QuantizedLinear) { testStaticRuntime(quantize_script, {input, weight}, {input_2, weight_2}); } + +TEST(StaticRuntime, IndividualOps_VarStack) { + // 2D tensors - stack dim = 0 + std::vector args1 = {at::randn({6, 6}), at::randn({6, 6}), 0}; + testStaticRuntime(var_stack_script, args1); + + // 3D tensors - stack dim = 1 + std::vector args2 = {at::randn({4, 5, 6}), at::randn({4, 5, 6}), 1}; + testStaticRuntime(var_stack_script, args2); + + // 3D tensors - stack dim = 2 + std::vector args3 = {at::randn({4, 5, 6}), at::randn({4, 5, 6}), 2}; + testStaticRuntime(var_stack_script, args3); + + testStaticRuntime(var_stack_script, args1, args2); +} diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 1b5ee724b45a4..cb9342b364cc5 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -66,6 +66,7 @@ void OptimizeGraph( EliminateDeadCode(graph); FuseInferenceOpsForSparseNN(graph); UseVariadicCat(graph); + UseVariadicStack(graph); // TODO: we can avoid this guard by moving operations // to exposed folders. diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 62d86126a0577..e6af641083fc2 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -452,6 +452,29 @@ SROperator aten_stack(Node* n) { REGISTER_OPERATOR_FUNCTOR(aten::stack, aten_stack, aten_stack); +REGISTER_OPERATOR_FUNCTOR( + prim::VarStack, + prim_VarStack, + [](Node* n) -> SROperator { + return [](ProcessedNode* p_node) { + const size_t num_inputs = p_node->inputs().size(); + + std::vector inputs(num_inputs - 1); + for (size_t i = 0; i < num_inputs - 1; ++i) { + inputs[i] = p_node->Input(i).toTensor(); + } + + const auto dim = p_node->Input(num_inputs - 1).toInt(); + if (p_node->Output(0).isNone()) { + p_node->Output(0) = at::native::_stack_cpu(inputs, dim); + } else { + auto& out_t = p_node->Output(0).toTensor(); + fastResizeToZero(out_t); + at::native::_stack_out_cpu(inputs, dim, out_t); + } + }; + }); + REGISTER_OPERATOR_FUNCTOR(aten::leaky_relu, aten_leaky_relu, [](Node* n) -> SROperator { if (!n->matches(torch::schema( "aten::leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor"))) { @@ -472,10 +495,10 @@ REGISTER_OPERATOR_FUNCTOR(aten::leaky_relu, aten_leaky_relu, [](Node* n) -> SROp namespace { -// Use the width of an AVX-512 vector by default; this happens to work OK for -// AVX2 as well. Some ops benefit from using multiple AVX ports, in which case -// they are vectorized by twice this constant. An exception is logit, since it -// contains FP divide, which is single-ported. +// Use the width of an AVX-512 vector by default; this happens to work OK +// for AVX2 as well. Some ops benefit from using multiple AVX ports, in +// which case they are vectorized by twice this constant. An exception is +// logit, since it contains FP divide, which is single-ported. static constexpr int kVectorWidth = 16; #ifdef TORCH_ENABLE_LLVM From 5dee15401c7730ae2a0e49e774c9da2d067527b7 Mon Sep 17 00:00:00 2001 From: Karen Zhou Date: Tue, 24 Aug 2021 10:17:28 -0700 Subject: [PATCH 173/530] [pruner] refactor `ActivationReconstruction` forward hooks (#63158) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63158 Combined functionality for `ActivationReconstruction` for both Linear and Conv2d in one class. The only difference between the old classes was the size and indexing of the reconstructed tensor -- that logic can be generalized by iterating over the size of `output`. ghstack-source-id: 136467465 Test Plan: `buck test mode/dev-nosan //caffe2/test:ao -- TestBasePruner` https://pxl.cl/1MSSv Reviewed By: raghuramank100 Differential Revision: D30282765 fbshipit-source-id: 08a1e4e0650511019fff85cf52b41dd818b0c7f8 --- torch/ao/sparsity/__init__.py | 3 +-- .../experimental/pruner/base_pruner.py | 10 +++----- .../experimental/pruner/parametrization.py | 25 +++++++++---------- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/torch/ao/sparsity/__init__.py b/torch/ao/sparsity/__init__.py index ef03c71c64732..9ba05f2cde5ab 100644 --- a/torch/ao/sparsity/__init__.py +++ b/torch/ao/sparsity/__init__.py @@ -17,8 +17,7 @@ # Parametrizations from .experimental.pruner.parametrization import PruningParametrization -from .experimental.pruner.parametrization import LinearActivationReconstruction -from .experimental.pruner.parametrization import Conv2dActivationReconstruction +from .experimental.pruner.parametrization import ActivationReconstruction # Pruner from .experimental.pruner.base_pruner import BasePruner diff --git a/torch/ao/sparsity/experimental/pruner/base_pruner.py b/torch/ao/sparsity/experimental/pruner/base_pruner.py index 075a7ceae305a..92e1945666262 100644 --- a/torch/ao/sparsity/experimental/pruner/base_pruner.py +++ b/torch/ao/sparsity/experimental/pruner/base_pruner.py @@ -8,7 +8,7 @@ from torch.nn.modules.container import ModuleDict, ModuleList -from .parametrization import PruningParametrization, LinearActivationReconstruction, Conv2dActivationReconstruction +from .parametrization import PruningParametrization, ActivationReconstruction SUPPORTED_MODULES = { nn.Linear, @@ -140,13 +140,9 @@ def prepare(self, use_path=False, *args, **kwargs): assert isinstance(module.parametrizations, ModuleDict) # make mypy happy assert isinstance(module.parametrizations.weight, ModuleList) - if isinstance(module, nn.Linear): + if isinstance(module, tuple(SUPPORTED_MODULES)): self.activation_handles.append(module.register_forward_hook( - LinearActivationReconstruction(module.parametrizations.weight[0]) - )) - elif isinstance(module, nn.Conv2d): - self.activation_handles.append(module.register_forward_hook( - Conv2dActivationReconstruction(module.parametrizations.weight[0]) + ActivationReconstruction(module.parametrizations.weight[0]) )) else: raise NotImplementedError("This module type is not supported yet.") diff --git a/torch/ao/sparsity/experimental/pruner/parametrization.py b/torch/ao/sparsity/experimental/pruner/parametrization.py index 1156ea8af4ef1..d4bebb27725cd 100644 --- a/torch/ao/sparsity/experimental/pruner/parametrization.py +++ b/torch/ao/sparsity/experimental/pruner/parametrization.py @@ -1,5 +1,6 @@ import torch from torch import nn +from typing import Any, List class PruningParametrization(nn.Module): @@ -13,27 +14,25 @@ def forward(self, x): return x[list(valid_outputs)] -class LinearActivationReconstruction: +class ActivationReconstruction: def __init__(self, parametrization): self.param = parametrization def __call__(self, module, input, output): max_outputs = self.param.original_outputs pruned_outputs = self.param.pruned_outputs - reconstructed_tensor = torch.zeros((output.shape[0], len(max_outputs))) valid_columns = list(max_outputs - pruned_outputs) - reconstructed_tensor[:, valid_columns] = output - return reconstructed_tensor + # get size of reconstructed output + sizes = list(output.shape) + sizes[1] = len(max_outputs) -class Conv2dActivationReconstruction: - def __init__(self, parametrization): - self.param = parametrization + # get valid indices of reconstructed output + indices: List[Any] = [] + for size in output.shape: + indices.append(slice(0, size, 1)) + indices[1] = valid_columns - def __call__(self, module, input, output): - max_outputs = self.param.original_outputs - pruned_outputs = self.param.pruned_outputs - reconstructed_tensor = torch.zeros((output.shape[0], len(max_outputs), output.shape[2], output.shape[3])) - valid_columns = list(max_outputs - pruned_outputs) - reconstructed_tensor[:, valid_columns, :, :] = output + reconstructed_tensor = torch.zeros(sizes) + reconstructed_tensor[indices] = output return reconstructed_tensor From 16ba20507a7a8fcb62d88c719eceab578e09e210 Mon Sep 17 00:00:00 2001 From: Karen Zhou Date: Tue, 24 Aug 2021 10:17:28 -0700 Subject: [PATCH 174/530] [pruner] amend base pruner API to match base sparsifier (#63178) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63178 Update base pruner API to match base sparsifier API as defined in D28970960 / PR58955 Changes include: - `enable_mask_update = True` in `__init__` - `prepare` takes model and config instead of constructor - convert functionality renamed to `squash_mask`, `convert` method call now raises Error - `activation_handles` ad `bias_handles` initialized in `_prepare` instead of constructor ghstack-source-id: 136467595 Test Plan: Function names updates according to changes `buck test mode/dev-nosan //caffe2/test:ao -- TestBasePruner` https://pxl.cl/1MTgH TODO will need to modify `fbcode/scripts/kazhou/fusion_tests.py` to use new API Reviewed By: z-a-f Differential Revision: D30287179 fbshipit-source-id: d4727bea1873b500f2d4bb784db26d532bf26cce --- test/ao/sparsity/test_pruner.py | 61 +++++----- torch/ao/sparsity/__init__.py | 2 + .../experimental/pruner/base_pruner.py | 109 +++--------------- .../ao/sparsity/sparsifier/base_sparsifier.py | 24 +--- torch/ao/sparsity/sparsifier/utils.py | 18 +++ 5 files changed, 66 insertions(+), 148 deletions(-) diff --git a/test/ao/sparsity/test_pruner.py b/test/ao/sparsity/test_pruner.py index 8f5f6dd19abbe..c358df6ac95ae 100644 --- a/test/ao/sparsity/test_pruner.py +++ b/test/ao/sparsity/test_pruner.py @@ -161,7 +161,7 @@ def _check_pruner_prepared(self, model, pruner, device): # Assume that this is the 1st/only parametrization assert type(module.parametrizations.weight[0]) == PruningParametrization - def _check_pruner_converted(self, model, pruner, device): + def _check_pruner_mask_squashed(self, model, pruner, device): for g in pruner.module_groups: module = g['module'] assert module.weight.device == device @@ -184,16 +184,18 @@ def _test_constructor_on_device(self, model, device): self.assertRaisesRegex(TypeError, 'with abstract methods update_mask', BasePruner) model = model.to(device) - pruner = SimplePruner(model, None, None) + pruner = SimplePruner(None) + pruner.prepare(model, None) for g in pruner.module_groups: module = g['module'] assert module.weight.device == device assert len(pruner.module_groups) == 2 pruner.step() # Can instantiate the model with configs - pruner = SimplePruner(model, [model.linear], {'test': 3}) + pruner = SimplePruner({'test': 3}) + pruner.prepare(model, [model.linear]) assert len(pruner.module_groups) == 1 - assert pruner.module_groups[0]['path'] == 'linear' + assert pruner.module_groups[0]['fqn'] == 'linear' assert 'test' in pruner.module_groups[0] assert pruner.module_groups[0]['test'] == 3 @@ -205,8 +207,8 @@ def test_constructor(self): def _test_prepare_linear_on_device(self, model, device): model = model.to(device) x = torch.ones(128, 16) - pruner = SimplePruner(model, None, None) - pruner.prepare() + pruner = SimplePruner(None) + pruner.prepare(model, None) self._check_pruner_prepared(model, pruner, device) assert model(x).shape == (128, 16) @@ -219,8 +221,8 @@ def test_prepare_linear(self): def _test_prepare_conv2d_on_device(self, model, device): model = model.to(device) x = torch.ones((1, 1, 28, 28)) - pruner = SimplePruner(model, None, None) - pruner.prepare() + pruner = SimplePruner(None) + pruner.prepare(model, None) self._check_pruner_prepared(model, pruner, device) assert model(x).shape == (1, 64, 24, 24) @@ -230,51 +232,49 @@ def test_prepare_conv2d(self): for model in models: self._test_prepare_conv2d_on_device(model, torch.device(device)) - def _test_convert_linear_on_device(self, model, device): + def _test_squash_mask_linear_on_device(self, model, device): model = model.to(device) x = torch.ones(128, 16) - pruner = SimplePruner(model, None, None) - pruner.prepare() - pruner.convert() - self._check_pruner_converted(model, pruner, device) + pruner = SimplePruner(None) + pruner.prepare(model, None) + pruner.squash_mask() + self._check_pruner_mask_squashed(model, pruner, device) assert model(x).shape == (128, 16) - def test_convert_linear(self): + def test_squash_mask_linear(self): models = [Linear(), LinearB()] # without and with bias for device in DEVICES: for model in models: - self._test_convert_linear_on_device(model, torch.device(device)) + self._test_squash_mask_linear_on_device(model, torch.device(device)) - def _test_convert_conv2d_on_device(self, model, device): + def _test_squash_mask_conv2d_on_device(self, model, device): model = model.to(device) x = torch.ones((1, 1, 28, 28)) - pruner = SimplePruner(model, None, None) - pruner.prepare() - pruner.convert() - self._check_pruner_converted(model, pruner, device) + pruner = SimplePruner(None) + pruner.prepare(model, None) + pruner.squash_mask() + self._check_pruner_mask_squashed(model, pruner, device) assert model(x).shape == (1, 64, 24, 24) - def test_convert_conv2d(self): + def test_squash_mask_conv2d(self): models = [Conv2dA(), Conv2dB(), Conv2dC()] for device in DEVICES: for model in models: - self._test_convert_conv2d_on_device(model, torch.device(device)) + self._test_squash_mask_conv2d_on_device(model, torch.device(device)) def _test_step_linear_on_device(self, model, is_basic, device): model = model.to(device) if is_basic: x = torch.ones(16, 16) - pruner = SimplePruner(model, None, None) - pruner.prepare() - pruner.enable_mask_update = True + pruner = SimplePruner(None) + pruner.prepare(model, None) self._check_pruner_valid_before_step(model, pruner, device) pruner.step() self._check_pruner_valid_after_step(model, pruner, {1}, device) else: x = torch.ones(7, 7) - pruner = MultiplePruner(model, None, None) - pruner.prepare() - pruner.enable_mask_update = True + pruner = MultiplePruner(None) + pruner.prepare(model, None) self._check_pruner_valid_before_step(model, pruner, device) pruner.step() self._check_pruner_valid_after_step(model, pruner, {1, 2}, device) @@ -291,9 +291,8 @@ def test_step_linear(self): def _test_step_conv2d_on_device(self, model, device): model = model.to(device) x = torch.ones((1, 1, 28, 28)) - pruner = SimplePruner(model, None, None) - pruner.prepare() - pruner.enable_mask_update = True + pruner = SimplePruner(None) + pruner.prepare(model, None) self._check_pruner_valid_before_step(model, pruner, device) pruner.step() self._check_pruner_valid_after_step(model, pruner, {1}, device) diff --git a/torch/ao/sparsity/__init__.py b/torch/ao/sparsity/__init__.py index 9ba05f2cde5ab..55b8d7059c9ae 100644 --- a/torch/ao/sparsity/__init__.py +++ b/torch/ao/sparsity/__init__.py @@ -12,6 +12,8 @@ # Parametrizations from .sparsifier.utils import FakeSparsity +from .sparsifier.utils import module_to_fqn +from .sparsifier.utils import fqn_to_module # === Experimental === diff --git a/torch/ao/sparsity/experimental/pruner/base_pruner.py b/torch/ao/sparsity/experimental/pruner/base_pruner.py index 92e1945666262..d89b3cc86d550 100644 --- a/torch/ao/sparsity/experimental/pruner/base_pruner.py +++ b/torch/ao/sparsity/experimental/pruner/base_pruner.py @@ -1,6 +1,5 @@ import abc -import copy import torch from torch import nn @@ -10,31 +9,15 @@ from .parametrization import PruningParametrization, ActivationReconstruction +from torch.ao.sparsity import BaseSparsifier, fqn_to_module + SUPPORTED_MODULES = { nn.Linear, nn.Conv2d } -def _module_to_path(model, layer, prefix=''): - for name, child in model.named_children(): - new_name = prefix + '.' + name - if child is layer: - return new_name - child_path = _module_to_path(child, layer, prefix=new_name) - if child_path is not None: - return child_path - return None - -def _path_to_module(model, path): - path = path.split('.') - for name in path: - model = getattr(model, name, None) - if model is None: - return None - return model - - -class BasePruner(abc.ABC): + +class BasePruner(BaseSparsifier): r"""Base class for all pruners. Abstract methods that need to be implemented: @@ -53,66 +36,8 @@ class BasePruner(abc.ABC): be updated. """ - def __init__(self, model, config, defaults): - super().__init__() - self.config = config - self.defaults = defaults - if self.defaults is None: - self.defaults = dict() - - self.module_groups = [] - self.enable_mask_update = False - self.activation_handles = [] - self.bias_handles = [] - - self.model = model - # If no config -- try getting all the supported layers - if self.config is None: - # Add all models to the config - self.config = [] - stack = [model] - while stack: - module = stack.pop() - for name, child in module.named_children(): - if type(child) in SUPPORTED_MODULES: - self.config.append(child) - else: - stack.append(child) - - for module_config in self.config: - if isinstance(module_config, nn.Module): - module_config = {'module': module_config} - local_args = copy.deepcopy(self.defaults) - local_args.update(module_config) - module = local_args['module'] - module_path = _module_to_path(self.model, module) - if module_path and module_path[0] == '.': - module_path = module_path[1:] - local_args['path'] = module_path - self.module_groups.append(local_args) - - def __getstate__(self): - return { - 'defaults': self.defaults, - 'module_groups': self.module_groups, - } - - def __setstate__(self, state): - self.__dict__.update(state) - - def __repr__(self): - format_string = self.__class__.__name__ + ' (' - for i, sparse_args in enumerate(self.module_groups): - module = sparse_args['module'] - format_string += '\n' - format_string += f'\tModule Group {i}\n' - format_string += f'\t module: {module}\n' - for key in sorted(sparse_args.keys()): - if key == 'module': - continue - format_string += f'\t {key}: {sparse_args[key]}\n' - format_string += ')' - return format_string + def __init__(self, defaults): + super().__init__(defaults) def bias_hook(self, module, input, output): if getattr(module, '_bias', None) is not None: @@ -122,12 +47,15 @@ def bias_hook(self, module, input, output): output += bias return output - def prepare(self, use_path=False, *args, **kwargs): + def _prepare(self, use_path=False, *args, **kwargs): r"""Adds mask parametrization to the layer weight """ + self.activation_handles = [] # store removable hook handles + self.bias_handles = [] + for config in self.module_groups: if use_path: - module = _path_to_module(self.model, config['path']) + module = fqn_to_module(self.model, config['fqn']) else: module = config['module'] @@ -152,10 +80,10 @@ def prepare(self, use_path=False, *args, **kwargs): module.bias = None self.bias_handles.append(module.register_forward_hook(self.bias_hook)) - def convert(self, use_path=False, *args, **kwargs): + def squash_mask(self, use_path=False, *args, **kwargs): for config in self.module_groups: if use_path: - module = _path_to_module(self.model, config['path']) + module = fqn_to_module(self.model, config['fqn']) else: module = config['module'] parametrize.remove_parametrizations(module, 'weight', @@ -166,17 +94,6 @@ def convert(self, use_path=False, *args, **kwargs): del module._buffers['mask'] delattr(module, 'mask') - def step(self, use_path=True): - if not self.enable_mask_update: - return - with torch.no_grad(): - for config in self.module_groups: - if use_path: - module = _path_to_module(self.model, config['path']) - else: - module = config['module'] - self.update_mask(module, **config) - @abc.abstractmethod def update_mask(self, layer, **kwargs): pass diff --git a/torch/ao/sparsity/sparsifier/base_sparsifier.py b/torch/ao/sparsity/sparsifier/base_sparsifier.py index d6bc7d75248cf..1d01b71daae25 100644 --- a/torch/ao/sparsity/sparsifier/base_sparsifier.py +++ b/torch/ao/sparsity/sparsifier/base_sparsifier.py @@ -8,30 +8,12 @@ from torch import nn from torch.nn.utils import parametrize -from .utils import FakeSparsity +from .utils import FakeSparsity, module_to_fqn, fqn_to_module SUPPORTED_MODULES = { nn.Linear } -def _module_to_fqn(model, layer, prefix=''): - for name, child in model.named_children(): - new_name = prefix + '.' + name - if child is layer: - return new_name - child_path = _module_to_fqn(child, layer, prefix=new_name) - if child_path is not None: - return child_path - return None - -def _fqn_to_module(model, path): - path = path.split('.') - for name in path: - model = getattr(model, name, None) - if model is None: - return None - return model - class BaseSparsifier(abc.ABC): r"""Base class for all sparsifiers. @@ -136,7 +118,7 @@ def load_state_dict(self, state_dict, strict=True): module_groups = copy.deepcopy(state_dict['module_groups']) states = state_dict['state'] for fqn, s in states.items(): - layer = _fqn_to_module(self.model, fqn) + layer = fqn_to_module(self.model, fqn) if strict and layer is None: raise RuntimeError(f'Error loading {fqn} into the model') @@ -186,7 +168,7 @@ def prepare(self, model, config): local_args = copy.deepcopy(self.defaults) local_args.update(module_config) module = local_args['module'] - module_fqn = _module_to_fqn(model, module) + module_fqn = module_to_fqn(model, module) if module_fqn and module_fqn[0] == '.': module_fqn = module_fqn[1:] local_args['fqn'] = module_fqn diff --git a/torch/ao/sparsity/sparsifier/utils.py b/torch/ao/sparsity/sparsifier/utils.py index 6271a8d502f0d..3124b1b767b0f 100644 --- a/torch/ao/sparsity/sparsifier/utils.py +++ b/torch/ao/sparsity/sparsifier/utils.py @@ -1,5 +1,23 @@ from torch import nn +def module_to_fqn(model, layer, prefix=''): + for name, child in model.named_children(): + new_name = prefix + '.' + name + if child is layer: + return new_name + child_path = module_to_fqn(child, layer, prefix=new_name) + if child_path is not None: + return child_path + return None + +def fqn_to_module(model, path): + path = path.split('.') + for name in path: + model = getattr(model, name, None) + if model is None: + return None + return model + # Parametrizations class FakeSparsity(nn.Module): r"""Parametrization for the weights. Should be attached to the 'weight' or From 1256dcd50967b18c2ca335662558e77aeefe4f13 Mon Sep 17 00:00:00 2001 From: Karen Zhou Date: Tue, 24 Aug 2021 10:17:28 -0700 Subject: [PATCH 175/530] [pruner] modify base pruner to prune bias by default (#63202) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63202 By default, the prune will also prune biases, such that the whole output channel is removed. The user can manually set `also_prune_bias` to False when calling `prepare` if they don't want the bias to be pruned. ghstack-source-id: 136466671 Test Plan: `buck test mode/dev-nosan //caffe2/test:ao -- TestBasePruner` https://pxl.cl/1MV32 modify `fusion_tests` according to API change `buck test mode/opt //scripts/kazhou:fusion_tests` https://pxl.cl/1NbKz Reviewed By: z-a-f Differential Revision: D30294494 fbshipit-source-id: c84655648bee0035559195ca855b98fb7edaa134 --- torch/ao/sparsity/__init__.py | 1 + .../experimental/pruner/base_pruner.py | 22 +++++-------------- .../experimental/pruner/parametrization.py | 22 +++++++++++++++++++ 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/torch/ao/sparsity/__init__.py b/torch/ao/sparsity/__init__.py index 55b8d7059c9ae..06854a42cf9ce 100644 --- a/torch/ao/sparsity/__init__.py +++ b/torch/ao/sparsity/__init__.py @@ -20,6 +20,7 @@ # Parametrizations from .experimental.pruner.parametrization import PruningParametrization from .experimental.pruner.parametrization import ActivationReconstruction +from .experimental.pruner.parametrization import BiasHook # Pruner from .experimental.pruner.base_pruner import BasePruner diff --git a/torch/ao/sparsity/experimental/pruner/base_pruner.py b/torch/ao/sparsity/experimental/pruner/base_pruner.py index d89b3cc86d550..a8a7b69141be3 100644 --- a/torch/ao/sparsity/experimental/pruner/base_pruner.py +++ b/torch/ao/sparsity/experimental/pruner/base_pruner.py @@ -7,7 +7,7 @@ from torch.nn.modules.container import ModuleDict, ModuleList -from .parametrization import PruningParametrization, ActivationReconstruction +from .parametrization import PruningParametrization, ActivationReconstruction, BiasHook from torch.ao.sparsity import BaseSparsifier, fqn_to_module @@ -26,26 +26,16 @@ class BasePruner(BaseSparsifier): `module_groups`. Args: - - model [nn.Module]: model to configure. The model itself is not saved - but used for the state_dict saving / loading. - - config [list]: configuration elements could either be instances of - nn.Module or dict maps. The dicts must have a key 'module' with the - value being an instance of a nn.Module. - defaults [dict]: default configurations will be attached to the configuration. Only the keys that don't exist in the `config` will be updated. + - also_prune_bias [bool]: whether to prune bias in addition to weights (to prune full output channel) + or not; default=True. """ - def __init__(self, defaults): + def __init__(self, defaults, also_prune_bias=True): super().__init__(defaults) - - def bias_hook(self, module, input, output): - if getattr(module, '_bias', None) is not None: - idx = [1] * len(output.shape) - idx[1] = output.shape[1] - bias = module._bias.reshape(idx) - output += bias - return output + self.prune_bias = also_prune_bias def _prepare(self, use_path=False, *args, **kwargs): r"""Adds mask parametrization to the layer weight @@ -78,7 +68,7 @@ def _prepare(self, use_path=False, *args, **kwargs): if module.bias is not None: module.register_parameter('_bias', nn.Parameter(module.bias.detach())) module.bias = None - self.bias_handles.append(module.register_forward_hook(self.bias_hook)) + self.bias_handles.append(module.register_forward_hook(BiasHook(module.parametrizations.weight[0], self.prune_bias))) def squash_mask(self, use_path=False, *args, **kwargs): for config in self.module_groups: diff --git a/torch/ao/sparsity/experimental/pruner/parametrization.py b/torch/ao/sparsity/experimental/pruner/parametrization.py index d4bebb27725cd..696b16e1edccc 100644 --- a/torch/ao/sparsity/experimental/pruner/parametrization.py +++ b/torch/ao/sparsity/experimental/pruner/parametrization.py @@ -36,3 +36,25 @@ def __call__(self, module, input, output): reconstructed_tensor = torch.zeros(sizes) reconstructed_tensor[indices] = output return reconstructed_tensor + + +class BiasHook: + def __init__(self, parametrization, prune_bias): + self.param = parametrization + self.prune_bias = prune_bias + + def __call__(self, module, input, output): + pruned_outputs = self.param.pruned_outputs + + if getattr(module, '_bias', None) is not None: + bias = module._bias.data + if self.prune_bias: + bias[list(pruned_outputs)] = 0 + + # reshape bias to broadcast over output dimensions + idx = [1] * len(output.shape) + idx[1] = -1 + bias = bias.reshape(idx) + + output += bias + return output From 14d4723abde40375118afd097d8e47a99145c222 Mon Sep 17 00:00:00 2001 From: mingfeima Date: Tue, 24 Aug 2021 10:30:18 -0700 Subject: [PATCH 176/530] add bf16 support for bucketize (#55588) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55588 Test Plan: Imported from OSS Reviewed By: bdhirsh Differential Revision: D28836796 Pulled By: VitalyFedyunin fbshipit-source-id: c9ae5b969c30a45473533be5f29bb497f8da5143 --- aten/src/ATen/native/Bucketization.cpp | 4 ++-- test/test_reductions.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/Bucketization.cpp b/aten/src/ATen/native/Bucketization.cpp index 7dc76a7577aa2..c11ce253f1d4a 100644 --- a/aten/src/ATen/native/Bucketization.cpp +++ b/aten/src/ATen/native/Bucketization.cpp @@ -74,12 +74,12 @@ void searchsorted_cpu_contiguous(Tensor& result, const Tensor& input, const Tens void dispatch(Tensor& result, const Tensor& input, const Tensor& boundaries, bool out_int32, bool right) { if (!out_int32) { - AT_DISPATCH_ALL_TYPES(input.scalar_type(), "searchsorted_out_cpu", [&] { + AT_DISPATCH_ALL_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "searchsorted_out_cpu", [&] { searchsorted_cpu_contiguous(result, input, boundaries, right); }); } else { - AT_DISPATCH_ALL_TYPES(input.scalar_type(), "searchsorted_out_cpu", [&] { + AT_DISPATCH_ALL_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "searchsorted_out_cpu", [&] { searchsorted_cpu_contiguous(result, input, boundaries, right); }); } diff --git a/test/test_reductions.py b/test/test_reductions.py index f3f0d4c936451..1497ed6ad419d 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -1011,6 +1011,23 @@ def test_output_dtype(dtype, is_int32): test_output_dtype(torch.int32, False) test_output_dtype(torch.int64, True) + # scalar type bfloat16 + if self.device_type == 'cpu': + def test_dtype_bfloat16(values_bf16=False, boundaries_bf16=False): + values_1d_float = values_1d.to(torch.float32) + boundaries = torch.tensor([0.9, 1, 2, 2, 3, 3, 4, 4.1, 9, 9], device=device, dtype=torch.float32) + if values_bf16: + values_1d_float = values_1d_float.to(torch.bfloat16) + if boundaries_bf16: + boundaries = boundaries.to(torch.bfloat16) + expected_result = torch.tensor([1, 2, 4, 6, 8, 8, 8, 8, 8], device=device, dtype=torch.int32) + self.assertEqual(torch.searchsorted(boundaries, values_1d_float, out_int32=True), expected_result) + self.assertEqual(torch.bucketize(values_1d_float, boundaries, out_int32=True), expected_result) + + test_dtype_bfloat16(True, False) + test_dtype_bfloat16(False, True) + test_dtype_bfloat16(True, True) + @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_complex=False)) def test_nansum(self, device, dtype): args = product( From e6dc7bc61b106427bb44bb6822451369dfab0eda Mon Sep 17 00:00:00 2001 From: peterjc123 Date: Tue, 24 Aug 2021 10:44:45 -0700 Subject: [PATCH 177/530] Subprocess encoding fixes for cpp extension (#63756) Summary: Fixes https://github.com/pytorch/pytorch/issues/63584 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63756 Reviewed By: bdhirsh Differential Revision: D30485046 Pulled By: ezyang fbshipit-source-id: 4f0ac383da4e8843e2a602dceae85f389d7434ee --- torch/utils/cpp_extension.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index b313423426caa..bb0a85982c665 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -37,6 +37,8 @@ BUILD_SPLIT_CUDA = os.getenv('BUILD_SPLIT_CUDA') or (os.path.exists(os.path.join( TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cu{CLIB_EXT}')) and os.path.exists(os.path.join(TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cpp{CLIB_EXT}'))) +SUBPROCESS_DECODE_ARGS = ('oem',) if IS_WINDOWS else () + # Taken directly from python stdlib < 3.9 # See https://github.com/pytorch/pytorch/issues/48617 def _nt_quote_args(args: Optional[List[str]]) -> List[str]: @@ -60,7 +62,7 @@ def _find_cuda_home() -> Optional[str]: which = 'where' if IS_WINDOWS else 'which' with open(os.devnull, 'w') as devnull: nvcc = subprocess.check_output([which, 'nvcc'], - stderr=devnull).decode().rstrip('\r\n') + stderr=devnull).decode(*SUBPROCESS_DECODE_ARGS).rstrip('\r\n') cuda_home = os.path.dirname(os.path.dirname(nvcc)) except Exception: # Guess #3 @@ -90,7 +92,7 @@ def _find_rocm_home() -> Optional[str]: ["which hipcc | xargs readlink -f"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) hipcc, _ = pipe_hipcc.communicate() # this will be either /hip/bin/hipcc or /bin/hipcc - rocm_home = os.path.dirname(os.path.dirname(hipcc.decode().rstrip('\r\n'))) + rocm_home = os.path.dirname(os.path.dirname(hipcc.decode(*SUBPROCESS_DECODE_ARGS).rstrip('\r\n'))) if os.path.basename(rocm_home) == 'hip': rocm_home = os.path.dirname(rocm_home) except Exception: @@ -251,12 +253,12 @@ def check_compiler_ok_for_platform(compiler: str) -> bool: return True which = subprocess.check_output(['which', compiler], stderr=subprocess.STDOUT) # Use os.path.realpath to resolve any symlinks, in particular from 'c++' to e.g. 'g++'. - compiler_path = os.path.realpath(which.decode().strip()) + compiler_path = os.path.realpath(which.decode(*SUBPROCESS_DECODE_ARGS).strip()) # Check the compiler name if any(name in compiler_path for name in _accepted_compilers_for_platform()): return True # If ccache is used the compiler path is /usr/bin/ccache. Check by -v flag. - version_string = subprocess.check_output([compiler, '-v'], stderr=subprocess.STDOUT).decode() + version_string = subprocess.check_output([compiler, '-v'], stderr=subprocess.STDOUT).decode(*SUBPROCESS_DECODE_ARGS) if sys.platform.startswith('linux'): # Check for 'gcc' or 'g++' pattern = re.compile("^COLLECT_GCC=(.*)$", re.MULTILINE) @@ -303,11 +305,11 @@ def check_compiler_abi_compatibility(compiler) -> bool: if sys.platform.startswith('linux'): minimum_required_version = MINIMUM_GCC_VERSION versionstr = subprocess.check_output([compiler, '-dumpfullversion', '-dumpversion']) - version = versionstr.decode().strip().split('.') + version = versionstr.decode(*SUBPROCESS_DECODE_ARGS).strip().split('.') else: minimum_required_version = MINIMUM_MSVC_VERSION compiler_info = subprocess.check_output(compiler, stderr=subprocess.STDOUT) - match = re.search(r'(\d+)\.(\d+)\.(\d+)', compiler_info.decode().strip()) + match = re.search(r'(\d+)\.(\d+)\.(\d+)', compiler_info.decode(*SUBPROCESS_DECODE_ARGS).strip()) version = (0, 0, 0) if match is None else match.groups() except Exception: _, error, _ = sys.exc_info() @@ -767,7 +769,7 @@ def _check_abi(self): def _check_cuda_version(self): if CUDA_HOME: nvcc = os.path.join(CUDA_HOME, 'bin', 'nvcc') - cuda_version_str = subprocess.check_output([nvcc, '--version']).strip().decode() + cuda_version_str = subprocess.check_output([nvcc, '--version']).strip().decode(*SUBPROCESS_DECODE_ARGS) cuda_version = re.search(r'release (\d+[.]\d+)', cuda_version_str) if cuda_version is not None: cuda_str_version = cuda_version.group(1) @@ -1727,7 +1729,7 @@ def _run_ninja_build(build_directory: str, verbose: bool, error_prefix: str) -> # `error` is a CalledProcessError (which has an `ouput`) attribute, but # mypy thinks it's Optional[BaseException] and doesn't narrow if hasattr(error, 'output') and error.output: # type: ignore[union-attr] - message += f": {error.output.decode()}" # type: ignore[union-attr] + message += f": {error.output.decode(*SUBPROCESS_DECODE_ARGS)}" # type: ignore[union-attr] raise RuntimeError(message) from e @@ -1996,7 +1998,7 @@ def sanitize_flags(flags): link_rule = ['rule link'] if IS_WINDOWS: cl_paths = subprocess.check_output(['where', - 'cl']).decode().split('\r\n') + 'cl']).decode(*SUBPROCESS_DECODE_ARGS).split('\r\n') if len(cl_paths) >= 1: cl_path = os.path.dirname(cl_paths[0]).replace(':', '$:') else: From 73431449b357de30ccb5a775c7395f343cb24d73 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 24 Aug 2021 10:50:57 -0700 Subject: [PATCH 178/530] update readme and contributing.md (#63843) Summary: 1. In fact, Visual Studio isn't supported as CMAKE generator 2. I was asked many times why there's error as 'Could NOT find OpenMP' 3. Add Newly added Best Practices link in contributing.md Pull Request resolved: https://github.com/pytorch/pytorch/pull/63843 Reviewed By: seemethere, heitorschueroff Differential Revision: D30514095 Pulled By: janeyx99 fbshipit-source-id: 76715a1d8c049122546e5a7778cafe54e4dfd5d6 --- CONTRIBUTING.md | 1 + README.md | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index baafcefdc59fe..7cf3aecabd7c1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -197,6 +197,7 @@ with `brew install cmake` if you are developing on MacOS or Linux system. Could not find .../pytorch/third_party/pybind11/CMakeLists.txt ``` remove any `submodule.*` settings in your local git config (`.git/config` of your pytorch repo) and try again. +* If you're a Windows contributor, please check out [Best Practices](https://github.com/pytorch/pytorch/wiki/Best-Practices-to-Edit-and-Compile-Pytorch-Source-Code-On-Windows). ## Nightly Checkout & Pull diff --git a/README.md b/README.md index 9b2a854ef3557..ed793fb8874e6 100644 --- a/README.md +++ b/README.md @@ -291,9 +291,10 @@ You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob ```cmd cmd -:: [Optional] If you want to build with the VS 2017 generator for old CUDA and PyTorch, please change the value in the next line to `Visual Studio 15 2017`. -:: Note: This value is useless if Ninja is detected. However, you can force that by using `set USE_NINJA=OFF`. -set CMAKE_GENERATOR=Visual Studio 16 2019 +:: Set the environment variables after you have downloaded and upzipped the mkl package, +:: else CMake would throw error as `Could NOT find OpenMP`. +set CMAKE_INCLUDE_PATH={Your directory}\mkl\include +set LIB={Your directory}\mkl\lib;%LIB% :: Read the content in the previous section carefully before you proceed. :: [Optional] If you want to override the underlying toolset used by Ninja and Visual Studio with CUDA, please run the following script block. From d08a36f831cbcb4516fc1b68e3e3deff8ab45aba Mon Sep 17 00:00:00 2001 From: Aayush Prakash Date: Tue, 24 Aug 2021 11:19:34 -0700 Subject: [PATCH 179/530] Removing tensor.data usage in utils with tensor set_ method (#63867) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63867 When updating the model parameter, updating `parameter.data` is no longer recommended, because this `data` field will be deprecated in the future. The replacement is `tensor.set_`. ghstack-source-id: 136531233 Test Plan: buck test mode/dev-nosan //caffe2/test/distributed:distributed_nccl_spawn -- test_periodic_model_averager Reviewed By: SciPioneer Differential Revision: D30513613 fbshipit-source-id: 402efb9c30fafc3f285bebc631639f656ceae585 --- torch/distributed/algorithms/model_averaging/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/distributed/algorithms/model_averaging/utils.py b/torch/distributed/algorithms/model_averaging/utils.py index 44ee422b9e92d..5d796e885d2ae 100644 --- a/torch/distributed/algorithms/model_averaging/utils.py +++ b/torch/distributed/algorithms/model_averaging/utils.py @@ -29,5 +29,6 @@ def average_parameters( offset = 0 for p in params_it2: - p.data = flat_params[offset : offset + p.numel()].view_as(p) + with torch.no_grad(): + p.set_(flat_params[offset : offset + p.numel()].view_as(p)) offset += p.numel() From 835dac0869fa155ab9bdb434a230f141dd5afad3 Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Tue, 24 Aug 2021 11:45:54 -0700 Subject: [PATCH 180/530] Merge common fields from TensorInitParams and ShardedTensorMetadata into TensorProperties (#63731) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63731 1) Follow up [PR/63378 last comment](https://github.com/pytorch/pytorch/pull/63378#discussion_r693143053) 2) Also updated the caller side (usage of ShardedTensorMetadta) in fbcode Ref: [landing workflow 3](https://www.internalfb.com/intern/wiki/PyTorch/PyTorchDev/Workflow/Landing/#landing-your-prs-from-gi-1) Test Plan: Imported from OSS OSS: (pytorch).. $ python test/distributed/_sharded_tensor/test_sharded_tensor.py --v FB: fbcode $ buck test mode/dev //aiplatform/modelstore/checkpointing/pyper/tests:checkpoint_utils_test Reviewed By: wanchaol, heitorschueroff Differential Revision: D30472281 fbshipit-source-id: 727fb0e7f10eab4eb7a10476194e9008f2ac1fb5 --- .../_sharded_tensor/test_sharded_tensor.py | 65 ++++--- torch/distributed/_sharded_tensor/__init__.py | 62 +++---- torch/distributed/_sharded_tensor/api.py | 161 +++++++++--------- 3 files changed, 154 insertions(+), 134 deletions(-) diff --git a/test/distributed/_sharded_tensor/test_sharded_tensor.py b/test/distributed/_sharded_tensor/test_sharded_tensor.py index 26a176b1455c1..6c03d9fdf631c 100644 --- a/test/distributed/_sharded_tensor/test_sharded_tensor.py +++ b/test/distributed/_sharded_tensor/test_sharded_tensor.py @@ -19,6 +19,7 @@ from torch.distributed._sharded_tensor.api import ( CreateOp, TensorInitParams, + TensorProperties, _create_tensor_from_params, ) from torch.testing._internal.common_distributed import ( @@ -125,13 +126,14 @@ def wrapper(self): class TestCreateTensorFromParams(TestCase): @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed') def test_empty(self): - tensor_init_params = TensorInitParams( - create_op=CreateOp.EMPTY, + tensor_properties = TensorProperties( dtype=torch.double, layout=torch.strided, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format, ) + tensor_init_params = TensorInitParams(create_op=CreateOp.EMPTY, + tensor_properties=tensor_properties) local_device = torch.device('cuda:0') local_tensor = _create_tensor_from_params( 5, 10, local_device=local_device, tensor_init_params=tensor_init_params) @@ -142,13 +144,14 @@ def test_empty(self): @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed') def test_ones(self): - tensor_init_params = TensorInitParams( - create_op=CreateOp.ONES, + tensor_properties = TensorProperties( dtype=torch.double, layout=torch.strided, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format, ) + tensor_init_params = TensorInitParams( + create_op=CreateOp.ONES, tensor_properties=tensor_properties) local_device = torch.device('cuda:0') local_tensor = _create_tensor_from_params( 5, 10, local_device=local_device, tensor_init_params=tensor_init_params) @@ -1267,15 +1270,18 @@ def test_init_from_local_shards(self): local_shards = [_sharded_tensor.Shard(torch.randn(5, 5, device=f"cuda:{self.rank}"), local_shard_metadata)] - sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata( - shards_metadata=shards_metadata, - size=torch.Size([10, 10]), + tensor_properties = TensorProperties( dtype=torch.get_default_dtype(), layout=torch.strided, requires_grad=False, memory_format=torch.contiguous_format, pin_memory=False, ) + sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata( + shards_metadata=shards_metadata, + size=torch.Size([10, 10]), + tensor_properties=tensor_properties, + ) sharded_tensor = _sharded_tensor.init_from_local_shards(local_shards, sharded_tensor_metadata, init_rrefs=True) self.assertEqual((10, 10), sharded_tensor.size()) @@ -1334,15 +1340,19 @@ def test_init_from_local_shards_new_group(self): local_shard_metadata = rank1_shard_metadata if self.rank == 1 else rank3_shard_metadata local_shards.append(_sharded_tensor.Shard(torch.randn(5, 5, device=f"cuda:{self.rank}"), local_shard_metadata)) - sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata( - shards_metadata=shards_metadata, - size=torch.Size([10, 5]), + tensor_properties = TensorProperties( dtype=torch.get_default_dtype(), layout=torch.strided, requires_grad=False, memory_format=torch.contiguous_format, pin_memory=False, ) + + sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata( + shards_metadata=shards_metadata, + size=torch.Size([10, 5]), + tensor_properties=tensor_properties + ) sharded_tensor = _sharded_tensor.init_from_local_shards(local_shards, sharded_tensor_metadata, new_pg, init_rrefs=True) if self.rank == 1 or self.rank == 3: @@ -1403,15 +1413,18 @@ def test_init_from_local_shards_invalid_shards(self): placement=f"rank:{r}/cuda:{r}" )) - sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata( - shards_metadata=shards_metadata, - size=torch.Size([10, 10]), + tensor_properties = TensorProperties( dtype=torch.get_default_dtype(), layout=torch.strided, requires_grad=False, memory_format=torch.contiguous_format, pin_memory=False, ) + sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata( + shards_metadata=shards_metadata, + size=torch.Size([10, 10]), + tensor_properties=tensor_properties + ) empty_local_shards = [] with self.assertRaisesRegex(RuntimeError, 'does not match number of local shards metadata'): @@ -1435,7 +1448,7 @@ def test_init_from_local_shards_invalid_shards(self): wrong_dtype_shards = [ _sharded_tensor.Shard(torch.ones(5, 5, device=f"cuda:{self.rank}", dtype=torch.int), local_shard_metadata) ] - with self.assertRaisesRegex(ValueError, 'Local shard tensor dtype does not match with sharded_tensor_metadata'): + with self.assertRaisesRegex(ValueError, 'Local shard tensor dtype does not match with tensor_properties!'): sharded_tensor = _sharded_tensor.init_from_local_shards(wrong_dtype_shards, sharded_tensor_metadata, init_rrefs=True) indices = [[0, 1, 1], [2, 0, 2]] @@ -1445,21 +1458,21 @@ def test_init_from_local_shards_invalid_shards(self): wrong_layout_shards = [ _sharded_tensor.Shard(sparse_tensor, local_shard_metadata) ] - with self.assertRaisesRegex(ValueError, 'Local shard tensor layout does not match with sharded_tensor_metadata'): + with self.assertRaisesRegex(ValueError, 'Local shard tensor layout does not match with tensor_properties!'): sharded_tensor = _sharded_tensor.init_from_local_shards( wrong_layout_shards, sharded_tensor_metadata, init_rrefs=True) wrong_requires_grad_shards = [ _sharded_tensor.Shard(torch.randn(5, 5, device=f"cuda:{self.rank}", requires_grad=True), local_shard_metadata) ] - with self.assertRaisesRegex(ValueError, 'Local shard tensor requires_grad does not match with sharded_tensor_metadata'): + with self.assertRaisesRegex(ValueError, 'Local shard tensor requires_grad does not match with tensor_properties!'): sharded_tensor = _sharded_tensor.init_from_local_shards( wrong_requires_grad_shards, sharded_tensor_metadata, init_rrefs=True) wrong_pin_memory_shards = [ _sharded_tensor.Shard(torch.randn(5, 5, pin_memory=True), local_shard_metadata) ] - with self.assertRaisesRegex(ValueError, 'Local shard tensor pin_memory does not match with sharded_tensor_metadata'): + with self.assertRaisesRegex(ValueError, 'Local shard tensor pin_memory does not match with tensor_properties!'): sharded_tensor = _sharded_tensor.init_from_local_shards( wrong_pin_memory_shards, sharded_tensor_metadata, init_rrefs=True) @@ -1492,15 +1505,18 @@ def test_init_from_local_shards_invalid_shards_overlap(self): placement=f"rank:{r}/cuda:{r}" )) - sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata( - shards_metadata=shards_metadata, - size=torch.Size([10, 10]), + tensor_properties = TensorProperties( dtype=torch.get_default_dtype(), layout=torch.strided, requires_grad=False, memory_format=torch.contiguous_format, pin_memory=False, ) + sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata( + shards_metadata=shards_metadata, + size=torch.Size([10, 10]), + tensor_properties=tensor_properties + ) local_shard_size = (5, 5) if self.rank != 0 else (6, 6) @@ -1531,15 +1547,18 @@ def test_init_from_local_shards_invalid_shards_gaps(self): placement=f"rank:{r}/cuda:{r}" )) - sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata( - shards_metadata=shards_metadata, - size=torch.Size([10, 10]), + tensor_properties = TensorProperties( dtype=torch.get_default_dtype(), layout=torch.strided, requires_grad=False, memory_format=torch.contiguous_format, pin_memory=False, ) + sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata( + shards_metadata=shards_metadata, + size=torch.Size([10, 10]), + tensor_properties=tensor_properties + ) local_shard_size = (5, 5) if self.rank != 0 else (4, 4) diff --git a/torch/distributed/_sharded_tensor/__init__.py b/torch/distributed/_sharded_tensor/__init__.py index ecb7ea1fed8c6..4cbdded8ba1c4 100644 --- a/torch/distributed/_sharded_tensor/__init__.py +++ b/torch/distributed/_sharded_tensor/__init__.py @@ -8,20 +8,20 @@ ShardedTensor, ShardedTensorMetadata, TensorInitParams, + TensorProperties, load_with_process_group, ) -def empty( - sharding_spec: ShardingSpec, - *size, - dtype=None, - layout=torch.strided, - requires_grad=False, - pin_memory=False, - memory_format=torch.contiguous_format, - process_group=None, - init_rrefs=False): +def empty(sharding_spec: ShardingSpec, + *size, + dtype=None, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, + process_group=None, + init_rrefs=False): """ Creates an empty :class:`ShardedTensor`. Needs to be called on all ranks in an SPMD fashion. @@ -52,9 +52,10 @@ def empty( Returns: A :class:`ShardedTensor` object on each rank """ - tensor_init_params = TensorInitParams(create_op=CreateOp.EMPTY, dtype=dtype, layout=layout, - requires_grad=requires_grad, - pin_memory=pin_memory, memory_format=memory_format) + tensor_properties = TensorProperties(dtype=dtype, layout=layout, + requires_grad=requires_grad, + pin_memory=pin_memory, memory_format=memory_format, ) + tensor_init_params = TensorInitParams(create_op=CreateOp.EMPTY, tensor_properties=tensor_properties, ) return ShardedTensor( sharding_spec, *size, @@ -63,16 +64,15 @@ def empty( init_rrefs=init_rrefs, ) -def ones( - sharding_spec: ShardingSpec, - *size, - dtype=None, - layout=torch.strided, - requires_grad=False, - pin_memory=False, - memory_format=torch.contiguous_format, - process_group=None, - init_rrefs=False): +def ones(sharding_spec: ShardingSpec, + *size, + dtype=None, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, + process_group=None, + init_rrefs=False): """ Creates a ones :class:`ShardedTensor`. Needs to be called on all ranks in an SPMD fashion. @@ -101,9 +101,10 @@ def ones( Returns: A :class:`ShardedTensor` object on each rank """ - tensor_init_params = TensorInitParams(create_op=CreateOp.ONES, dtype=dtype, layout=layout, - requires_grad=requires_grad, - pin_memory=pin_memory, memory_format=memory_format) + tensor_properties = TensorProperties(dtype=dtype, layout=layout, + requires_grad=requires_grad, + pin_memory=pin_memory, memory_format=memory_format, ) + tensor_init_params = TensorInitParams(create_op=CreateOp.ONES, tensor_properties=tensor_properties) return ShardedTensor( sharding_spec, *size, @@ -112,11 +113,10 @@ def ones( init_rrefs=init_rrefs, ) -def init_from_local_shards( - local_shards: List[Shard], - sharded_tensor_metadata: ShardedTensorMetadata, - process_group=None, - init_rrefs=False): +def init_from_local_shards(local_shards: List[Shard], + sharded_tensor_metadata: ShardedTensorMetadata, + process_group=None, + init_rrefs=False): """ Creates an :class:`ShardedTensor` from local shards and the global metadata. Needs to be called on all ranks in an SPMD fashion. diff --git a/torch/distributed/_sharded_tensor/api.py b/torch/distributed/_sharded_tensor/api.py index 5f501b7689e4e..ae1a3a9f38844 100644 --- a/torch/distributed/_sharded_tensor/api.py +++ b/torch/distributed/_sharded_tensor/api.py @@ -59,6 +59,17 @@ class Shard(object): tensor: torch.Tensor metadata: ShardMetadata +@dataclass +class TensorProperties(object): + """ Properties used to create :class:`Tensor` """ + + # Regular tensor fields + dtype: torch.dtype = field(default=torch.get_default_dtype()) + layout: torch.layout = field(default=torch.strided) + requires_grad: bool = False + memory_format: torch.memory_format = field(default=torch.contiguous_format) + pin_memory: bool = False + @dataclass class ShardedTensorMetadata(object): """ @@ -71,50 +82,55 @@ class ShardedTensorMetadata(object): # Size of each dim of the overall Tensor. size: torch.Size = field(default=torch.Size([])) - # Regular tensor fields - dtype: torch.dtype = field(default=torch.get_default_dtype()) - layout: torch.layout = field(default=torch.strided) - requires_grad: bool = False - memory_format: torch.memory_format = field(default=torch.contiguous_format) - pin_memory: bool = False + tensor_properties: TensorProperties = field( + default=TensorProperties(dtype=torch.get_default_dtype(), + layout=torch.strided, + requires_grad=False, + memory_format=torch.contiguous_format, + pin_memory=False)) def __getstate__(self): # Since torch.memory_format cannot be pickled! - if self.memory_format == torch.contiguous_format: + memory_format = self.tensor_properties.memory_format + if memory_format == torch.contiguous_format: mem_format_encoding = 0 - elif self.memory_format == torch.channels_last: + elif memory_format == torch.channels_last: mem_format_encoding = 1 - elif self.memory_format == torch.preserve_format: + elif memory_format == torch.preserve_format: mem_format_encoding = 1 else: - raise RuntimeError(f'Invalid torch.memory_format: {self.memory_format}') + raise RuntimeError(f'Invalid torch.memory_format: {memory_format}') + # Keep old seriazation to ensure backward compatibility return ( self.shards_metadata, self.size, - self.dtype, - self.layout, - self.requires_grad, + self.tensor_properties.dtype, + self.tensor_properties.layout, + self.tensor_properties.requires_grad, mem_format_encoding, - self.pin_memory, + self.tensor_properties.pin_memory, ) def __setstate__( self, state, ): - (self.shards_metadata, self.size, self.dtype, self.layout, - self.requires_grad, mem_format_encoding, self.pin_memory) = state + (self.shards_metadata, self.size, dtype, layout, requires_grad, mem_format_encoding, pin_memory) = state if mem_format_encoding == 0: - self.memory_format = torch.contiguous_format + memory_format = torch.contiguous_format elif mem_format_encoding == 1: - self.memory_format = torch.channels_last + memory_format = torch.channels_last elif mem_format_encoding == 2: - self.memory_format = torch.preserve_format + memory_format = torch.preserve_format else: raise RuntimeError(f'Invalid torch.memory_format encoding: {mem_format_encoding}') + self.tensor_properties = TensorProperties( + dtype=dtype, layout=layout, requires_grad=requires_grad, + memory_format=memory_format, pin_memory=pin_memory, ) + def _register_remote_shards(sharded_tensor_id: int, rrefs: List[rpc.RRef[Shard]], rpc_rank: int): with _sharded_tensor_lock: @@ -134,15 +150,10 @@ class CreateOp(Enum): class TensorInitParams(object): """ Container for list of common params to create new local tensor. """ - __slots__ = ['create_op', 'dtype', 'layout', 'requires_grad', 'pin_memory', - 'memory_format'] + __slots__ = ['create_op', 'tensor_properties'] create_op: CreateOp - dtype: torch.dtype - layout: torch.layout - requires_grad: bool - pin_memory: bool - memory_format: torch.memory_format + tensor_properties: TensorProperties class ShardedTensor(object): @@ -188,13 +199,16 @@ def __init__( # _process_group, _local_shards, etc. self._prepare_init(process_group=process_group, init_rrefs=init_rrefs) - if tensor_init_params.dtype is None: - tensor_init_params.dtype = torch.get_default_dtype() + if tensor_init_params.tensor_properties is None: + raise ValueError('tensor_properties must not be None.') + + if tensor_init_params.tensor_properties.dtype is None: + tensor_init_params.tensor_properties.dtype = torch.get_default_dtype() - if tensor_init_params.layout != torch.strided: + if tensor_init_params.tensor_properties.layout != torch.strided: raise ValueError('Only torch.strided layout is currently supported') - if tensor_init_params.memory_format != torch.contiguous_format: + if tensor_init_params.tensor_properties.memory_format != torch.contiguous_format: raise ValueError('Only torch.contiguous_format memory_format is currently supported') if len(size) == 1 and isinstance(size[0], collections.Sequence): @@ -309,11 +323,12 @@ def _init_from_local_shards( init_rrefs=False, ): shards_metadata = sharded_tensor_metadata.shards_metadata + tensor_properties = sharded_tensor_metadata.tensor_properties if len(shards_metadata) == 0: raise ValueError("shards_metadata must not be empty!") - if sharded_tensor_metadata.layout != torch.strided: + if tensor_properties.layout != torch.strided: raise ValueError('Only torch.strided layout is currently supported') sharded_tensor = cls.__new__(cls) @@ -354,11 +369,11 @@ def _init_from_local_shards( assert shard_meta in local_shard_metadatas, \ "local shard metadata not in sharded_tensor_metadata!" - if local_shard_tensor.layout != sharded_tensor_metadata.layout: + if local_shard_tensor.layout != tensor_properties.layout: raise ValueError( - f'Local shard tensor layout does not match with sharded_tensor_metadata! ' + f'Local shard tensor layout does not match with tensor_properties! ' f'local shard tensor layout: {local_shard_tensor.dtype}, ' - f'sharded_tensor_metadata layout: {sharded_tensor_metadata.layout}' + f'tensor_properties layout: {tensor_properties.layout}' ) if not local_shard_tensor.is_contiguous(): @@ -371,11 +386,11 @@ def _init_from_local_shards( f'local ShardMetadata shard lengths: {shard_meta.shard_lengths}' ) - if local_shard_tensor.is_pinned() != sharded_tensor_metadata.pin_memory: + if local_shard_tensor.is_pinned() != tensor_properties.pin_memory: raise ValueError( - f'Local shard tensor pin_memory does not match with sharded_tensor_metadata! ' + f'Local shard tensor pin_memory does not match with tensor_properties! ' f'local shard tensor pin_memory: {local_shard_tensor.is_pinned()}, ' - f'sharded_tensor_metadata pin_memory: {sharded_tensor_metadata.pin_memory}' + f'tensor_properties pin_memory: {tensor_properties.pin_memory}' ) if local_shard_tensor.device != local_device: @@ -385,18 +400,18 @@ def _init_from_local_shards( f'local shard metadata placement device: {local_device}' ) - if local_shard_tensor.dtype != sharded_tensor_metadata.dtype: + if local_shard_tensor.dtype != tensor_properties.dtype: raise ValueError( - f'Local shard tensor dtype does not match with sharded_tensor_metadata! ' + f'Local shard tensor dtype does not match with tensor_properties! ' f'local shard tensor dtype: {local_shard_tensor.dtype}, ' - f'sharded_tensor_metadata dtype: {sharded_tensor_metadata.dtype}' + f'tensor_properties dtype: {tensor_properties.dtype}' ) - if local_shard_tensor.requires_grad != sharded_tensor_metadata.requires_grad: + if local_shard_tensor.requires_grad != tensor_properties.requires_grad: raise ValueError( - f'Local shard tensor requires_grad does not match with sharded_tensor_metadata! ' + f'Local shard tensor requires_grad does not match with tensor_properties! ' f'local shard tensor requires_grad: {local_shard_tensor.requires_grad}, ' - f'sharded_tensor_metadata requires_grad: {sharded_tensor_metadata.requires_grad}' + f'tensor_properties requires_grad: {tensor_properties.requires_grad}' ) # check if shards_metadata have overlap shards @@ -459,14 +474,7 @@ def _init_chunked(self, dims, tensor_init_params: TensorInitParams, ): # Build overall metadata self._metadata = ShardedTensorMetadata( - shards_metadata, - dims, - tensor_init_params.dtype, - tensor_init_params.layout, - tensor_init_params.requires_grad, - tensor_init_params.memory_format, - tensor_init_params.pin_memory, - ) + shards_metadata, dims, tensor_init_params.tensor_properties, ) def _init_enumerable(self, dims, tensor_init_params: TensorInitParams): # Validate the sharding spec is compatible with the tensor. @@ -488,14 +496,7 @@ def _init_enumerable(self, dims, tensor_init_params: TensorInitParams): # Build overall metadata self._metadata = ShardedTensorMetadata( - shards_metadata, - dims, - tensor_init_params.dtype, - tensor_init_params.layout, - tensor_init_params.requires_grad, - tensor_init_params.memory_format, - tensor_init_params.pin_memory, - ) + shards_metadata, dims, tensor_init_params.tensor_properties, ) def _parse_and_validate_remote_device(self, remote_device: torch.distributed._remote_device): @@ -555,14 +556,14 @@ def is_pinned(self) -> bool: """ Returns True if the sharded tensor (each local shard) resides in pinned memory. """ - return self._metadata.pin_memory + return self._metadata.tensor_properties.pin_memory def is_contiguous(self) -> bool: """ Returns True if the sharded tensor (each local shard) is contiguous in memory in the order specified by memory format. """ - return self._metadata.memory_format == torch.contiguous_format + return self._metadata.tensor_properties.memory_format == torch.contiguous_format @property def shape(self): @@ -570,15 +571,15 @@ def shape(self): @property def requires_grad(self): - return self._metadata.requires_grad + return self._metadata.tensor_properties.requires_grad @property def dtype(self): - return self._metadata.dtype + return self._metadata.tensor_properties.dtype @property def layout(self): - return self._metadata.layout + return self._metadata.tensor_properties.layout def _register_remote_shards(self, remote_shards: List[rpc.RRef[Shard]], rpc_rank: int): self._remote_shards[rpc_rank] = remote_shards @@ -667,21 +668,21 @@ def __setstate__(self, state): def _create_tensor_from_params(*size, local_device, tensor_init_params: TensorInitParams): """ Helper to construct tensor from size, device and common params. """ - if tensor_init_params.create_op == CreateOp.ONES: - return torch.ones(*size, - dtype=tensor_init_params.dtype, - layout=tensor_init_params.layout, - device=local_device, - pin_memory=tensor_init_params.pin_memory, - requires_grad=tensor_init_params.requires_grad,) - elif tensor_init_params.create_op == CreateOp.EMPTY: - return torch.empty(*size, - dtype=tensor_init_params.dtype, - layout=tensor_init_params.layout, - device=local_device, - requires_grad=tensor_init_params.requires_grad, - # Note memory_format param is not accepted by torch.ones - memory_format=tensor_init_params.memory_format, - pin_memory=tensor_init_params.pin_memory,) + create_op = tensor_init_params.create_op + dtype = tensor_init_params.tensor_properties.dtype + layout = tensor_init_params.tensor_properties.layout + requires_grad = tensor_init_params.tensor_properties.requires_grad + memory_format = tensor_init_params.tensor_properties.memory_format + pin_memory = tensor_init_params.tensor_properties.pin_memory + + if create_op == CreateOp.ONES: + return torch.ones(*size, dtype=dtype, layout=layout, + device=local_device, pin_memory=pin_memory, + requires_grad=requires_grad,) + elif create_op == CreateOp.EMPTY: + return torch.empty(*size, dtype=dtype, layout=layout, + device=local_device, requires_grad=requires_grad, + # NB: memory_format param is not accepted by torch.ones + memory_format=memory_format, pin_memory=pin_memory,) else: raise ValueError(f'Unsupported create_op: {tensor_init_params.create_op}') From 699c764d2ef1e489ef2766d360701f61f602a7d7 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 24 Aug 2021 12:19:16 -0700 Subject: [PATCH 181/530] Revert D30513613: Removing tensor.data usage in utils with tensor set_ method Test Plan: revert-hammer Differential Revision: D30513613 (https://github.com/pytorch/pytorch/commit/d08a36f831cbcb4516fc1b68e3e3deff8ab45aba) Original commit changeset: 402efb9c30fa fbshipit-source-id: 911c66a9852de77dc5274b5fb373258c0c97739a --- torch/distributed/algorithms/model_averaging/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch/distributed/algorithms/model_averaging/utils.py b/torch/distributed/algorithms/model_averaging/utils.py index 5d796e885d2ae..44ee422b9e92d 100644 --- a/torch/distributed/algorithms/model_averaging/utils.py +++ b/torch/distributed/algorithms/model_averaging/utils.py @@ -29,6 +29,5 @@ def average_parameters( offset = 0 for p in params_it2: - with torch.no_grad(): - p.set_(flat_params[offset : offset + p.numel()].view_as(p)) + p.data = flat_params[offset : offset + p.numel()].view_as(p) offset += p.numel() From 4a0776100e347c887378b676f3deabd14b88d7b2 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Tue, 24 Aug 2021 12:43:27 -0700 Subject: [PATCH 182/530] Migrate legacy lstsq from THC to ATen (CUDA) (#63504) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63504 Closes gh-24592 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D30441304 Pulled By: ngimel fbshipit-source-id: ec176596f54bc084af48a73d1dbb0dcb82fec593 --- BUILD.bazel | 1 - aten/src/ATen/LegacyTHFunctionsCUDA.h | 5 -- aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp | 77 ----------------- .../ATen/native/cuda/BatchLinearAlgebra.cu | 78 +++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 4 +- aten/src/THC/CMakeLists.txt | 3 - aten/src/THC/THCTensorMath.h | 3 - aten/src/THC/THCTensorMathMagma.cpp | 16 ---- aten/src/THC/THCTensorMathMagma.h | 20 ----- aten/src/THC/generic/THCTensorMathMagma.cpp | 83 ------------------- aten/src/THC/generic/THCTensorMathMagma.h | 17 ---- 11 files changed, 80 insertions(+), 227 deletions(-) delete mode 100644 aten/src/THC/THCTensorMathMagma.h delete mode 100644 aten/src/THC/generic/THCTensorMathMagma.cpp delete mode 100644 aten/src/THC/generic/THCTensorMathMagma.h diff --git a/BUILD.bazel b/BUILD.bazel index 5acbe4082d38e..afdd4699b160f 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -393,7 +393,6 @@ filegroup( "aten/src/THC/THCTensor.cu.cc", "aten/src/THC/THCTensorCopy.cu.cc", "aten/src/THC/THCTensorMath.cu.cc", - "aten/src/THC/THCTensorMathMagma.cu.cc", "aten/src/THC/THCTensorMathPairwise.cu.cc", "aten/src/THC/THCTensorMathScan.cu.cc", "aten/src/THC/THCTensorScatterGather.cu.cc", diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h index 1a20e0bb8fa0b..41cbdd6f4ffe1 100644 --- a/aten/src/ATen/LegacyTHFunctionsCUDA.h +++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h @@ -18,12 +18,7 @@ namespace native { namespace legacy { namespace cuda { -std::tuple _th_gels_out(const Tensor & self, const Tensor & A, Tensor & res1, Tensor & res2); -std::tuple _th_gels(const Tensor & self, const Tensor & A); -Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper); -Tensor _th_potri(const Tensor & self, bool upper); Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src); -Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training); } // namespace th } // namespace legacy diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index 0ad6dc8256ff0..c4e9dfe78cebe 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -39,83 +39,6 @@ namespace { } } -std::tuple _th_gels_out(const Tensor & self, const Tensor & A, Tensor & res1, Tensor & res2) { - TORCH_WARN_ONCE( - "torch.lstsq is deprecated in favor of torch.linalg.lstsq and will be removed in a future PyTorch release.\n", - "torch.linalg.lstsq has reversed arguments and does not return the QR decomposition in " - "the returned tuple (although it returns other information about the problem).\n", - "To get the qr decomposition consider using torch.linalg.qr.\n", - "The returned solution in torch.lstsq stored the residuals of the solution in the ", - "last m - n columns of the returned value whenever m > n. In torch.linalg.lstsq, the ", - "residuals in the field 'residuals' of the returned named tuple.\n", - "The unpacking of the solution, as in\n", - "X, _ = torch.lstsq(B, A).solution[:A.size(1)]\n", - "should be replaced with\n", - "X = torch.linalg.lstsq(A, B).solution" - ); - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_); - break; - } - case ScalarType::Float: { - auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_); - break; - } - default: - AT_ERROR("_th_gels_out not supported on CUDAType for ", dispatch_scalar_type); - } - return std::tuple(res1, res2); -} -std::tuple _th_gels(const Tensor & self, const Tensor & A) { - TORCH_WARN_ONCE( - "torch.lstsq is deprecated in favor of torch.linalg.lstsq and will be removed in a future PyTorch release.\n", - "torch.linalg.lstsq has reversed arguments and does not return the QR decomposition in " - "the returned tuple (although it returns other information about the problem).\n", - "To get the qr decomposition consider using torch.linalg.qr.\n", - "The returned solution in torch.lstsq stored the residuals of the solution in the ", - "last m - n columns of the returned value whenever m > n. In torch.linalg.lstsq, the ", - "residuals in the field 'residuals' of the returned named tuple.\n", - "The unpacking of the solution, as in\n", - "X, _ = torch.lstsq(B, A).solution[:A.size(1)]\n", - "should be replaced with\n", - "X = torch.linalg.lstsq(A, B).solution" - ); - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto res1_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto res1 = Tensor(c10::intrusive_ptr::reclaim(res1_)); - auto res2_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto res2 = Tensor(c10::intrusive_ptr::reclaim(res2_)); - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type); - auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type); - auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_); - break; - } - default: - AT_ERROR("_th_gels not supported on CUDAType for ", dispatch_scalar_type); - } - return std::tuple(res1, res2); -} Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 0dae7a2aa3c11..4e806f000c5ae 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -3114,6 +3114,84 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul REGISTER_DISPATCH(lstsq_stub, &lstsq_kernel); +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ legacy_lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +std::tuple legacy_lstsq_cuda(const Tensor &B, const Tensor &A) { + TORCH_WARN_ONCE( + "torch.lstsq is deprecated in favor of torch.linalg.lstsq and will be removed in a future PyTorch release.\n", + "torch.linalg.lstsq has reversed arguments and does not return the QR decomposition in " + "the returned tuple (although it returns other information about the problem).\n", + "To get the qr decomposition consider using torch.linalg.qr.\n", + "The returned solution in torch.lstsq stored the residuals of the solution in the ", + "last m - n columns of the returned value whenever m > n. In torch.linalg.lstsq, the ", + "residuals in the field 'residuals' of the returned named tuple.\n", + "The unpacking of the solution, as in\n", + "X, _ = torch.lstsq(B, A).solution[:A.size(1)]\n", + "should be replaced with\n", + "X = torch.linalg.lstsq(A, B).solution" + ); + +#ifndef USE_MAGMA + TORCH_CHECK(false, "solve: MAGMA library not found in " + "compilation. Please rebuild with MAGMA."); +#else + const auto dtype = A.scalar_type(); + TORCH_CHECK(B.scalar_type() == dtype, "exepected A and B dtypes to match but found ", + dtype, " and ", B.scalar_type()); + TORCH_CHECK(A.numel() > 0 && A.dim() == 2, "A should be (non-empty) 2 dimensional"); + TORCH_CHECK(B.numel() > 0 && B.dim() == 2, "B should be (non-empty) 2 dimensional"); + auto a_sizes = A.sizes(); + auto b_sizes = B.sizes(); + TORCH_CHECK(a_sizes[0] == b_sizes[0], "Expected A and b to have same size " + "at dim 0, but A has ", a_sizes[0], " rows and B has ", b_sizes[0], " rows"); + TORCH_CHECK(a_sizes[0] >= a_sizes[1], "Expected A with shape (m x n) to have " + "m >= n. The case for m < n is not implemented yet."); + + Tensor A_working = cloneBatchedColumnMajor(A); + Tensor B_working = cloneBatchedColumnMajor(B); + + int64_t m = a_sizes[0]; + int64_t n = a_sizes[1]; + int64_t nrhs = b_sizes[1]; + + int info; + AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "legacy_lstsq_cuda", [&] { + scalar_t *a_data = A_working.data_ptr(); + scalar_t *b_data = B_working.data_ptr(); + scalar_t wkopt; + magmaGels(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info); + + const auto hwork_size = static_cast(wkopt); + scalar_t *hwork = nullptr; + ALLOCATE_ARRAY(hwork, scalar_t, hwork_size); + + magmaGels(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, hwork_size, &info); + }); + + TORCH_CHECK(info == 0, "MAGMA gels : Argument %d : illegal value", -info); + return std::tuple(B_working, A_working); +#endif // USE_MAGMA +} + +std::tuple legacy_lstsq_out_cuda( + const Tensor& B, const Tensor& A, Tensor& B_out, Tensor& A_out) { + const auto dtype = A.scalar_type(); + TORCH_CHECK(B.scalar_type() == dtype, "exepected A and B dtypes to match but found ", + A.scalar_type(), " and ", B.scalar_type()); + TORCH_CHECK(A_out.scalar_type() == dtype, "A_out to have scalar type ", dtype, + " but found", A_out.scalar_type()); + TORCH_CHECK(B_out.scalar_type() == dtype, "A_out to have scalar type ", dtype, + " but found", B_out.scalar_type()); + Tensor A_tmp, B_tmp; + std::tie(B_tmp, A_tmp) = native::legacy_lstsq_cuda(B, A); + resize_output(A_out, A_tmp.sizes()); + A_out.copy_(A_tmp); + resize_output(B_out, B_tmp.sizes()); + B_out.copy_(B_tmp); + return std::tuple(B_out, A_out); +} + + }} // namespace at::native #undef ALLOCATE_ARRAY diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 9bce764b1ee1a..4f7d7e66a7d5e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6658,13 +6658,13 @@ - func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) dispatch: CPU: legacy_lstsq_out - CUDA: legacy::cuda::_th_gels_out + CUDA: legacy_lstsq_out_cuda - func: lstsq(Tensor self, Tensor A) -> (Tensor solution, Tensor QR) variants: method, function dispatch: CPU: legacy_lstsq - CUDA: legacy::cuda::_th_gels + CUDA: legacy_lstsq_cuda - func: triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient) dispatch: diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt index 786506027ea8f..f34b040e484ce 100644 --- a/aten/src/THC/CMakeLists.txt +++ b/aten/src/THC/CMakeLists.txt @@ -66,7 +66,6 @@ install(FILES THCNumerics.cuh THCTensorInfo.cuh THCTensorTypeUtils.cuh - THCTensorMathMagma.h THCThrustAllocator.cuh # See Note [TH abstraction violation] THCTensor.hpp @@ -88,8 +87,6 @@ install(FILES generic/THCTensorCopy.h generic/THCTensorMath.h generic/THCTensorMath.cu - generic/THCTensorMathMagma.h - generic/THCTensorMathMagma.cpp generic/THCTensorMathPairwise.h generic/THCTensorMathPairwise.cu DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THC/generic") diff --git a/aten/src/THC/THCTensorMath.h b/aten/src/THC/THCTensorMath.h index 422a423959457..b70d4d14b02d9 100644 --- a/aten/src/THC/THCTensorMath.h +++ b/aten/src/THC/THCTensorMath.h @@ -13,9 +13,6 @@ #include #include -#include -#include - #include #include diff --git a/aten/src/THC/THCTensorMathMagma.cpp b/aten/src/THC/THCTensorMathMagma.cpp index ca0cc8a621282..43607531bd60e 100644 --- a/aten/src/THC/THCTensorMathMagma.cpp +++ b/aten/src/THC/THCTensorMathMagma.cpp @@ -1,23 +1,10 @@ #include -#include -#include -#include -#include -#include -#include -#include #include #ifdef USE_MAGMA #include #endif -#ifndef DIVUP -#define DIVUP(x, y) (((x) + (y) - 1) / (y)) -#endif - -#define NoMagma(name) "No CUDA implementation of '" #name "'. Install MAGMA and rebuild cutorch (http://icl.cs.utk.edu/magma/)" - namespace { void _THCMagma_init() { #ifdef USE_MAGMA @@ -31,6 +18,3 @@ struct Initializer { }; } initializer; } // anonymous namespace - -#include -#include diff --git a/aten/src/THC/THCTensorMathMagma.h b/aten/src/THC/THCTensorMathMagma.h deleted file mode 100644 index 1fb5821afce56..0000000000000 --- a/aten/src/THC/THCTensorMathMagma.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef THC_TENSOR_MATH_MAGMA_CUH -#define THC_TENSOR_MATH_MAGMA_CUH - -#ifdef USE_MAGMA -#include -#endif - -#ifdef USE_MAGMA -template -static inline T* th_magma_malloc_pinned(size_t n) -{ - void* ptr; - if (MAGMA_SUCCESS != magma_malloc_pinned(&ptr, n * sizeof(T))) - THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", n/268435456); - return reinterpret_cast(ptr); -} - -#endif - -#endif // THC_TENSOR_MATH_MAGMA_CUH diff --git a/aten/src/THC/generic/THCTensorMathMagma.cpp b/aten/src/THC/generic/THCTensorMathMagma.cpp deleted file mode 100644 index 0d94fc320e53b..0000000000000 --- a/aten/src/THC/generic/THCTensorMathMagma.cpp +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THC/generic/THCTensorMathMagma.cpp" -#else - -#include - -#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) - -static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, THCTensor *src) -{ - THAssert(src->dim() == 2); - if (self == src && self->stride(0) == 1 && self->stride(1) == self->size(0)) - { - THCTensor_(retain)(state, self); - return self; - } - - if (self == src) - self = THCTensor_(new)(state); - else - THCTensor_(retain)(state, self); - - int64_t size[2] = { src->size(0), src->size(1) }; - int64_t stride[2] = { 1, src->size(0) }; - - THCTensor_(resizeNd)(state, self, 2, size, stride); - THCTensor_(copy)(state, self, src); - return self; -} - -void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) -{ -#ifdef USE_MAGMA - THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); - THArgCheck(!b_->is_empty() && b_->dim() == 2, 1, "b should be (non-empty) 2 dimensional"); - TORCH_CHECK(a_->size(0) == b_->size(0), "Expected A and b to have same size " - "at dim 0, but A has ", a_->size(0), " rows and B has ", b_->size(0), " rows"); - THArgCheck(a_->size(0) >= a_->size(1), 2, "Expected A with shape (m x n) to have " - "m >= n. The case for m < n is not implemented yet."); - - THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); - THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); - scalar_t *a_data = THCTensor_(data)(state, a); - scalar_t *b_data = THCTensor_(data)(state, b); - - int64_t m = a->size(0); - int64_t n = a->size(1); - int64_t nrhs = b->size(1); - scalar_t wkopt; - - int info; - { - at::native::MagmaStreamSyncGuard guard; -#if defined(THC_REAL_IS_FLOAT) - magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info); -#else - magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info); -#endif - - scalar_t *hwork = th_magma_malloc_pinned((size_t)wkopt); - -#if defined(THC_REAL_IS_FLOAT) - magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info); -#else - magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info); -#endif - - magma_free_pinned(hwork); - } - - if (info != 0) - THError("MAGMA gels : Argument %d : illegal value", -info); - - THCTensor_(freeCopyTo)(state, a, ra_); - THCTensor_(freeCopyTo)(state, b, rb_); -#else - THError(NoMagma(gels)); -#endif -} - -#endif - -#endif diff --git a/aten/src/THC/generic/THCTensorMathMagma.h b/aten/src/THC/generic/THCTensorMathMagma.h deleted file mode 100644 index 585d02ceff7a7..0000000000000 --- a/aten/src/THC/generic/THCTensorMathMagma.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THC/generic/THCTensorMathMagma.h" -#else - -#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) - -// MAGMA (i.e. CUDA implementation of LAPACK functions) -TORCH_CUDA_CU_API void THCTensor_(gels)( - THCState* state, - THCTensor* rb_, - THCTensor* ra_, - THCTensor* b_, - THCTensor* a_); - -#endif // defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) - -#endif From 5be17ec1fca4fcce5464cd679ee3f3dd6f102059 Mon Sep 17 00:00:00 2001 From: soulitzer Date: Tue, 24 Aug 2021 13:02:27 -0700 Subject: [PATCH 183/530] Do not modify saved variables in-place for spectral norm during power iteration (#62293) Summary: Interestingly enough, the original code did have a mechanism that aims to prevent this very issue: but it performs a clone AFTER modifying u and v in-place. This wouldn't work though because we can later use the cloned u and v in operations that save for backward, and the next time we execute forward, we modify the same cloned u and v in-place. So if the idea is that we want to avoid modifying saved variable in-place we should clone it BEFORE the in-place operation. Pull Request resolved: https://github.com/pytorch/pytorch/pull/62293 Reviewed By: bdhirsh Differential Revision: D30489750 Pulled By: soulitzer fbshipit-source-id: cbe8dea885aef97adda8481f7a822e5bd91f7889 --- test/test_nn.py | 3 +++ torch/nn/utils/parametrizations.py | 9 +++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test/test_nn.py b/test/test_nn.py index 07a2b48cc6a20..43e105a676ced 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -4220,6 +4220,9 @@ def fn(input): out1 = wrapped_m(input) return out0 + out1 + # Make sure we can compute gradients wrt to all the parameters in the case + # of double forward + fn(input.clone().requires_grad_()).sum().backward() gradcheck(fn, (input.clone().requires_grad_(),), check_batched_grad=False) # test removing diff --git a/torch/nn/utils/parametrizations.py b/torch/nn/utils/parametrizations.py index 7941f41f19cac..de3d5c7144f9e 100644 --- a/torch/nn/utils/parametrizations.py +++ b/torch/nn/utils/parametrizations.py @@ -84,6 +84,7 @@ def _power_method(self, weight_mat: torch.Tensor, n_power_iterations: int) -> No # Precondition assert weight_mat.ndim > 1 + for _ in range(n_power_iterations): # Spectral norm of weight equals to `u^T W v`, where `u` and `v` # are the first left and right singular vectors. @@ -92,9 +93,6 @@ def _power_method(self, weight_mat: torch.Tensor, n_power_iterations: int) -> No dim=0, eps=self.eps, out=self._u) # type: ignore[has-type] self._v = F.normalize(torch.mv(weight_mat.t(), self._u), dim=0, eps=self.eps, out=self._v) # type: ignore[has-type] - # See above on why we need to clone - self._u = self._u.clone(memory_format=torch.contiguous_format) - self._v = self._v.clone(memory_format=torch.contiguous_format) def forward(self, weight: torch.Tensor) -> torch.Tensor: if weight.ndim == 1: @@ -104,10 +102,13 @@ def forward(self, weight: torch.Tensor) -> torch.Tensor: weight_mat = self._reshape_weight_to_matrix(weight) if self.training: self._power_method(weight_mat, self.n_power_iterations) + # See above on why we need to clone + u = self._u.clone(memory_format=torch.contiguous_format) + v = self._v.clone(memory_format=torch.contiguous_format) # The proper way of computing this should be through F.bilinear, but # it seems to have some efficiency issues: # https://github.com/pytorch/pytorch/issues/58093 - sigma = torch.dot(self._u, torch.mv(weight_mat, self._v)) + sigma = torch.dot(u, torch.mv(weight_mat, v)) return weight / sigma def right_inverse(self, value: torch.Tensor) -> torch.Tensor: From 4e37a015c7ed7ac4a4f57057cb726f53b94e693b Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 24 Aug 2021 13:44:52 -0700 Subject: [PATCH 184/530] [FX] Fix _replicate_for_data_parallel (#63821) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63821 Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D30502115 Pulled By: jamesr66a fbshipit-source-id: 0f004f95def6e1ba21ccbeab40cb0a739a0ad20c --- test/test_fx.py | 15 +++++++++++++++ torch/fx/graph_module.py | 5 +++++ 2 files changed, 20 insertions(+) diff --git a/test/test_fx.py b/test/test_fx.py index c55e97dc7da84..27f64e1cd1827 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -2296,6 +2296,21 @@ def forward(self, x): r"Call using an FX-traced Module, line .* of the " r"traced Module's generated forward function:") + def test_graph_module_replicate_for_dp(self): + class Foo(torch.nn.Module): + def forward(self, x): + return torch.relu(x) + + gm = torch.fx.symbolic_trace(Foo()) + + x = torch.randn(5, 3) + out = gm(x) + + replica = gm._replicate_for_data_parallel() + out_replica = replica(x) + + torch.testing.assert_allclose(out_replica, out) + def test_ast_rewriter_rewrites_assert(self): class M(torch.nn.Module): def forward(self, x: torch.Tensor, y: int, z: int): diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index b87aeaaa78a03..c91857342ffcd 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -656,6 +656,11 @@ def __str__(self) -> str: orig_str = super().__str__() return '\n'.join([orig_str, self._code]) + def _replicate_for_data_parallel(self): + new_gm = self.__copy__() + new_gm._is_replica = True + return new_gm + # workarounds for issues in __torch_function__ # WAR for __torch_function__ not handling tensor lists, From 865d127a66e961b12f895747c3c59bd7503223dd Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Tue, 24 Aug 2021 14:13:04 -0700 Subject: [PATCH 185/530] .github: Enable with-ssh for Windows (#63440) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63440 Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: janeyx99 Differential Revision: D30521460 Pulled By: seemethere fbshipit-source-id: e987e170e73fb4f9d9f024bed0e58404ed206848 --- .github/scripts/kill_active_ssh_sessions.ps1 | 11 +++++++ .github/scripts/wait_for_ssh_to_drain.ps1 | 17 +++++++++++ .github/templates/windows_ci_workflow.yml.j2 | 24 +++++++++++++++ ...rated-periodic-win-vs2019-cuda11.3-py3.yml | 30 +++++++++++++++++++ .../generated-win-vs2019-cpu-py3.yml | 30 +++++++++++++++++++ .../generated-win-vs2019-cuda10.1-py3.yml | 30 +++++++++++++++++++ .../generated-win-vs2019-cuda11.1-py3.yml | 30 +++++++++++++++++++ 7 files changed, 172 insertions(+) create mode 100644 .github/scripts/kill_active_ssh_sessions.ps1 create mode 100644 .github/scripts/wait_for_ssh_to_drain.ps1 diff --git a/.github/scripts/kill_active_ssh_sessions.ps1 b/.github/scripts/kill_active_ssh_sessions.ps1 new file mode 100644 index 0000000000000..09cc63e94bc1f --- /dev/null +++ b/.github/scripts/kill_active_ssh_sessions.ps1 @@ -0,0 +1,11 @@ +function Get-SSH-Sessions { + Get-Process sshd -IncludeUserName | + Where-Object UserName -notLike "*SYSTEM*" | + Select-Object Id +} + +$runningSessions = Get-SSH-Sessions + +foreach ($session in $runningSessions) { + Stop-Process -id $session.Id +} diff --git a/.github/scripts/wait_for_ssh_to_drain.ps1 b/.github/scripts/wait_for_ssh_to_drain.ps1 new file mode 100644 index 0000000000000..ab3ab41f355ce --- /dev/null +++ b/.github/scripts/wait_for_ssh_to_drain.ps1 @@ -0,0 +1,17 @@ +function Get-SSH-Users { + # Gets ssh sessions for all users not named SYSTEM + Get-CimInstance -ClassName Win32_Process -Filter "Name = 'sshd.exe'" | + Get-CimAssociatedInstance -Association Win32_SessionProcess | + Get-CimAssociatedInstance -Association Win32_LoggedOnUser | + Where-Object {$_.Name -ne 'SYSTEM'} | + Measure-Object +} + +$usersLoggedOn = Get-SSH-Users + +Write-Output "Holding runner until all ssh sessions have logged out" +while ($usersLoggedOn.Count -gt 0) { + $usersLoggedOn = Get-SSH-Users + Write-Output "." + Start-Sleep -s 5 +} diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index f00f4b19a903d..6756bf4720ac6 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -4,6 +4,20 @@ {# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #} {%- set squid_no_proxy = "localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%} +{%- macro wait_and_kill_ssh() -%} + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 +{%- endmacro -%} + # Template is at: .github/templates/windows_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: !{{ build_environment }} @@ -76,6 +90,10 @@ jobs: http_proxy: "!{{ squid_proxy }}" https_proxy: "!{{ squid_proxy }}" steps: + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch uses: actions/checkout@v2 with: @@ -123,6 +141,7 @@ jobs: if-no-files-found: error name: ${{ env.BUILD_ENVIRONMENT }} path: C:\${{ github.run_id }}\build-results + !{{ wait_and_kill_ssh() }} - name: Cleanup build-results and workspaces if: always() shell: bash @@ -193,6 +212,10 @@ jobs: # deep clone, to allow use of git merge-base fetch-depth: 0 !{{ common.display_ec2_information() }} + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -251,6 +274,7 @@ jobs: if-no-files-found: error path: pytorch-${{ github.run_id }}/test-reports-*.zip + !{{ wait_and_kill_ssh() }} - name: Cleanup workspace if: always() shell: bash diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml index 407aace6e4aee..6d1eff302dba9 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml @@ -51,6 +51,10 @@ jobs: http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" steps: + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch uses: actions/checkout@v2 with: @@ -99,6 +103,17 @@ jobs: if-no-files-found: error name: ${{ env.BUILD_ENVIRONMENT }} path: C:\${{ github.run_id }}\build-results + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 - name: Cleanup build-results and workspaces if: always() shell: bash @@ -163,6 +178,10 @@ jobs: shell: bash run: | .github/scripts/display_ec2_information.sh + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -219,6 +238,17 @@ jobs: if-no-files-found: error path: pytorch-${{ github.run_id }}/test-reports-*.zip + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 - name: Cleanup workspace if: always() shell: bash diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 35f9feccaf26c..53acdd8a961b7 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -51,6 +51,10 @@ jobs: http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" steps: + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch uses: actions/checkout@v2 with: @@ -91,6 +95,17 @@ jobs: if-no-files-found: error name: ${{ env.BUILD_ENVIRONMENT }} path: C:\${{ github.run_id }}\build-results + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 - name: Cleanup build-results and workspaces if: always() shell: bash @@ -155,6 +170,10 @@ jobs: shell: bash run: | .github/scripts/display_ec2_information.sh + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -203,6 +222,17 @@ jobs: if-no-files-found: error path: pytorch-${{ github.run_id }}/test-reports-*.zip + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 - name: Cleanup workspace if: always() shell: bash diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index 26b703500b0d1..a3447bc41f616 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -53,6 +53,10 @@ jobs: http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" steps: + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch uses: actions/checkout@v2 with: @@ -101,6 +105,17 @@ jobs: if-no-files-found: error name: ${{ env.BUILD_ENVIRONMENT }} path: C:\${{ github.run_id }}\build-results + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 - name: Cleanup build-results and workspaces if: always() shell: bash @@ -165,6 +180,10 @@ jobs: shell: bash run: | .github/scripts/display_ec2_information.sh + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -221,6 +240,17 @@ jobs: if-no-files-found: error path: pytorch-${{ github.run_id }}/test-reports-*.zip + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 - name: Cleanup workspace if: always() shell: bash diff --git a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml index d4175aca5f02d..2b3a30c6187fd 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml @@ -53,6 +53,10 @@ jobs: http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" steps: + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch uses: actions/checkout@v2 with: @@ -101,6 +105,17 @@ jobs: if-no-files-found: error name: ${{ env.BUILD_ENVIRONMENT }} path: C:\${{ github.run_id }}\build-results + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 - name: Cleanup build-results and workspaces if: always() shell: bash @@ -165,6 +180,10 @@ jobs: shell: bash run: | .github/scripts/display_ec2_information.sh + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -221,6 +240,17 @@ jobs: if-no-files-found: error path: pytorch-${{ github.run_id }}/test-reports-*.zip + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 - name: Cleanup workspace if: always() shell: bash From 41ffec07ce8abfffd3f4f450ae442a8f5982a074 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Tue, 24 Aug 2021 15:32:42 -0700 Subject: [PATCH 186/530] Add a common autograd TLS state (#63860) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63860 Test Plan: Imported from OSS Reviewed By: heitorschueroff Differential Revision: D30513253 Pulled By: albanD fbshipit-source-id: 97d76ed54dfbdf4ba3fc7051ce3b9bb636cefb4b --- aten/src/ATen/ThreadLocalState.cpp | 22 +++++++++++------ aten/src/ATen/ThreadLocalState.h | 5 +--- c10/core/AutogradState.cpp | 19 +++++++++++++++ c10/core/AutogradState.h | 39 ++++++++++++++++++++++++++++++ c10/core/GradMode.cpp | 7 +++--- c10/core/InferenceMode.cpp | 8 +----- c10/core/InferenceMode.h | 19 +++++++-------- 7 files changed, 87 insertions(+), 32 deletions(-) create mode 100644 c10/core/AutogradState.cpp create mode 100644 c10/core/AutogradState.h diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index ba7be1a06b8a1..fc4b8fa9c27ec 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -12,15 +12,12 @@ namespace at { ThreadLocalState::ThreadLocalState(bool keep_grad_mode) : dispatch_key_(c10::impl::tls_local_dispatch_key_set()), debug_info_(c10::ThreadLocalDebugInfo::current()), - inference_mode_enabled_(c10::InferenceMode::is_enabled()) { + autograd_tls_(c10::AutogradState::get_tls_state()) { rf_tls_ = at::get_record_function_tls_(); saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks(); #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) keep_grad_mode_ = keep_grad_mode; - if (keep_grad_mode_) { - grad_mode_enabled_ = GradMode::is_enabled(); - } #endif bumped_record_all_functions_ = at::checkRecordAllFunctions(); } @@ -28,10 +25,23 @@ ThreadLocalState::ThreadLocalState(bool keep_grad_mode) /* static */ void ThreadLocalState::setThreadLocalState( const ThreadLocalState& state) { + // Note that setting the InferenceMode TLS in this function is ONLY ok because we always + // restore the dispatch key set TLS at the same time. #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) if (state.keep_grad_mode_) { - GradMode::set_enabled(state.grad_mode_enabled_); + c10::AutogradState::set_tls_state(state.autograd_tls_); + } else { + auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(), + /* inference_mode */ state.autograd_tls_.get_inference_mode()); + c10::AutogradState::set_tls_state(new_state); } +#else + // The mobile build explicitly ignore grad_mode but fails if we propagate + // its value across threads or set it to a fixed value. + // So we have to make sure the grad_mode value is not changed here. + auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(), + /* inference_mode */ state.autograd_tls_.get_inference_mode()); + c10::AutogradState::set_tls_state(new_state); #endif at::set_record_function_tls_(state.rf_tls_); @@ -43,8 +53,6 @@ void ThreadLocalState::setThreadLocalState( c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_); c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_); - - c10::InferenceMode::_set_enabled(state.inference_mode_enabled_); } } // namespace at diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index f30f5e3442cc1..4942399cbd6d7 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -35,14 +35,11 @@ class TORCH_API ThreadLocalState { // RecordFunction TLS RecordFunctionTLS rf_tls_; + AutogradState autograd_tls_; #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) bool keep_grad_mode_ = true; - bool grad_mode_enabled_; #endif - // TLS for InferenceMode - bool inference_mode_enabled_; - // TLS for saved tensors default hooks std::pair saved_tensors_default_hooks_; diff --git a/c10/core/AutogradState.cpp b/c10/core/AutogradState.cpp new file mode 100644 index 0000000000000..9684a76b78564 --- /dev/null +++ b/c10/core/AutogradState.cpp @@ -0,0 +1,19 @@ +#include + +namespace c10 { + +namespace { +// By default, grad mode is enabled and inference mode is disabled +thread_local AutogradState autograd_state_tls = + AutogradState(/* grad_mode */ true, /* inference_mode */ false); +} // namespace + +AutogradState& AutogradState::get_tls_state() { + return autograd_state_tls; +} + +void AutogradState::set_tls_state(AutogradState state) { + autograd_state_tls = state; +} + +} // namespace c10 diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h new file mode 100644 index 0000000000000..1447594433fe4 --- /dev/null +++ b/c10/core/AutogradState.h @@ -0,0 +1,39 @@ +#pragma once + +#include + +#include + +namespace c10 { + +// Structure used to pack all the thread local boolean +// flags used by autograd +struct C10_API AutogradState { + static AutogradState& get_tls_state(); + static void set_tls_state(AutogradState state); + + AutogradState(bool grad_mode, bool inference_mode) + : grad_mode_(grad_mode), inference_mode_(inference_mode) {} + + void set_grad_mode(bool enabled) { + grad_mode_ = enabled; + } + + void set_inference_mode(bool enabled) { + inference_mode_ = enabled; + } + + bool get_grad_mode() const { + return grad_mode_; + } + + bool get_inference_mode() const { + return inference_mode_; + } + + private: + bool grad_mode_ : 1; + bool inference_mode_ : 1; +}; + +} // namespace c10 diff --git a/c10/core/GradMode.cpp b/c10/core/GradMode.cpp index 32747a6698afa..a5db198083b2b 100644 --- a/c10/core/GradMode.cpp +++ b/c10/core/GradMode.cpp @@ -1,16 +1,15 @@ +#include #include #include namespace c10 { -thread_local bool GradMode_enabled = true; - bool GradMode::is_enabled() { - return GradMode_enabled; + return AutogradState::get_tls_state().get_grad_mode(); } void GradMode::set_enabled(bool enabled) { - GradMode_enabled = enabled; + AutogradState::get_tls_state().set_grad_mode(enabled); } } // namespace c10 diff --git a/c10/core/InferenceMode.cpp b/c10/core/InferenceMode.cpp index b588ab4da54b5..59eca760cf504 100644 --- a/c10/core/InferenceMode.cpp +++ b/c10/core/InferenceMode.cpp @@ -2,18 +2,12 @@ #include namespace c10 { -thread_local bool InferenceMode_enabled = false; - // Invariant: // is_enabled() == // !c10::impl::tls_is_dispatch_key_included(DispatchKey::ADInplaceOrView); // InferenceMode::is_enabled() is in perf critical path (TensorImpl constructor) // so it worths a separate TLS to skip the DispatchKeySet check. bool InferenceMode::is_enabled() { - return InferenceMode_enabled; -} - -void InferenceMode::_set_enabled(bool enabled) { - InferenceMode_enabled = enabled; + return AutogradState::get_tls_state().get_inference_mode(); } } // namespace c10 diff --git a/c10/core/InferenceMode.h b/c10/core/InferenceMode.h index 7a9c2c593a453..9748d6eccfb54 100644 --- a/c10/core/InferenceMode.h +++ b/c10/core/InferenceMode.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -50,10 +51,12 @@ struct TORCH_API InferenceMode { // are applicable to InferenceMode as well, e.g. // `tensorTypeInCurrentExecutionContext` in interpreter.cpp. InferenceMode(bool enabled = true) - : prev_mode(InferenceMode::is_enabled()), - prev_keyset(c10::impl::tls_local_dispatch_key_set()), - grad_mode(at::AutoGradMode(!enabled)) { - _set_enabled(enabled); + : prev_mode(AutogradState::get_tls_state()), + prev_keyset(c10::impl::tls_local_dispatch_key_set()) { + // Enabling inference mode means disabling grad mode + // And disabling inference mode means enabling grad mode + AutogradState::set_tls_state( + AutogradState(/* grad_mode */ !enabled, /* inference_mode */ enabled)); DispatchKeySet included = enabled ? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView) : prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView); @@ -67,17 +70,13 @@ struct TORCH_API InferenceMode { } ~InferenceMode() { - _set_enabled(prev_mode); + AutogradState::set_tls_state(prev_mode); c10::impl::_force_tls_local_dispatch_key_set(prev_keyset); } static bool is_enabled(); - // _set_enabled() is not user facing and should be only used in - // ThreadLocalState.cpp. - static void _set_enabled(bool enabled); private: - bool prev_mode; + AutogradState prev_mode; c10::impl::LocalDispatchKeySet prev_keyset; - at::AutoGradMode grad_mode; }; } // namespace c10 From 6d58c830072eda05786a336c252b624c105a6fac Mon Sep 17 00:00:00 2001 From: Xiaodong Wang Date: Tue, 24 Aug 2021 15:45:59 -0700 Subject: [PATCH 187/530] Turn off layer norm in jit symbolic differentiation (#63816) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63816 Test Plan: Confirmed this can rescue the NE: https://www.internalfb.com/mast/job/torchx_xdwang-SparseNNApplication_72cf593d Reviewed By: ngimel Differential Revision: D30498746 fbshipit-source-id: 4a387f32ee2f70685de6104459c7f21bfbddc187 --- torch/csrc/jit/runtime/symbolic_script.cpp | 2 +- torch/testing/_internal/jit_metaprogramming_utils.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/torch/csrc/jit/runtime/symbolic_script.cpp b/torch/csrc/jit/runtime/symbolic_script.cpp index 29ce74a7d3ef7..6f2acca134738 100644 --- a/torch/csrc/jit/runtime/symbolic_script.cpp +++ b/torch/csrc/jit/runtime/symbolic_script.cpp @@ -1141,7 +1141,7 @@ const std::vector functions = { return output, backward - def layer_norm(input : Tensor, + def layer_norm_disabled(input : Tensor, normalized_shape : List[int], weight : Optional[Tensor], bias : Optional[Tensor], diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py index 350866cdbf083..75b1615d065d5 100644 --- a/torch/testing/_internal/jit_metaprogramming_utils.py +++ b/torch/testing/_internal/jit_metaprogramming_utils.py @@ -144,14 +144,14 @@ 'with_only_weight_inference', (True, 'aten::_batch_norm_impl_index')), ('instance_norm', (S, S, S), (non_differentiable(torch.zeros(S)), non_differentiable(torch.ones(S))),), ('layer_norm', (S, S, S, S), ([5],), '', - (True, ['aten::native_layer_norm'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), ('layer_norm', (S, S, S, S), ([5], non_differentiable(torch.rand(S)),), 'with_only_weight', - (True, ['aten::native_layer_norm'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), ('layer_norm', (S, S, S, S), ([5], None, non_differentiable(torch.rand(S)),), 'with_only_bias', - (True, ['aten::native_layer_norm'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), ('layer_norm', (S, S, S, S), ([5], non_differentiable(torch.rand(S)), non_differentiable(torch.rand(S))), 'with_weight_and_bias', - (True, ['aten::native_layer_norm'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index', 'aten::addcmul'])), ('group_norm', (S, S, S), (1, torch.rand(5),),), ('local_response_norm', (S, S, S), (2, ),), ('nll_loss', F.log_softmax(torch.randn(3, 5), dim=0), (torch.tensor([1, 0, 4]),), '',), From 956c8fa01ee6122122d96043f9b192fd106eb139 Mon Sep 17 00:00:00 2001 From: Harut Movsisyan Date: Tue, 24 Aug 2021 16:20:13 -0700 Subject: [PATCH 188/530] Microbenchmarking matrix mult (einsum, torch.mult, torch.mm) (#63654) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63654 Test Plan: ``` > buck run mode/opt caffe2/benchmarks/operator_benchmark/pt:matrix_mult_test # ---------------------------------------- # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking PyTorch: einsum_bmm # Mode: Eager # Name: einsum_bmm_B4_M5_N3_K2_cpu # Input: B: 4, M: 5, N: 3, K: 2, device: cpu Forward Execution Time (us) : 27.970 # Benchmarking PyTorch: einsum_bmm # Mode: Eager # Name: einsum_bmm_B32_M25_N20_K30_cpu # Input: B: 32, M: 25, N: 20, K: 30, device: cpu Forward Execution Time (us) : 41.830 # Benchmarking PyTorch: einsum_bmm # Mode: Eager # Name: einsum_bmm_B128_M100_N120_K110_cpu # Input: B: 128, M: 100, N: 120, K: 110, device: cpu Forward Execution Time (us) : 499.114 # Benchmarking PyTorch: bmm # Mode: Eager # Name: bmm_B4_M5_N3_K2_cpu # Input: B: 4, M: 5, N: 3, K: 2, device: cpu Forward Execution Time (us) : 6.268 # Benchmarking PyTorch: bmm # Mode: Eager # Name: bmm_B32_M25_N20_K30_cpu # Input: B: 32, M: 25, N: 20, K: 30, device: cpu Forward Execution Time (us) : 12.676 # Benchmarking PyTorch: bmm # Mode: Eager # Name: bmm_B128_M100_N120_K110_cpu # Input: B: 128, M: 100, N: 120, K: 110, device: cpu Forward Execution Time (us) : 438.219 # Benchmarking PyTorch: einsum_elementwise # Mode: Eager # Name: einsum_elementwise_B4_M5_N3_cpu # Input: B: 4, M: 5, N: 3, device: cpu Forward Execution Time (us) : 7.657 # Benchmarking PyTorch: einsum_elementwise # Mode: Eager # Name: einsum_elementwise_B32_M25_N20_cpu # Input: B: 32, M: 25, N: 20, device: cpu Forward Execution Time (us) : 18.523 # Benchmarking PyTorch: einsum_elementwise # Mode: Eager # Name: einsum_elementwise_B100_M90_N110_cpu # Input: B: 100, M: 90, N: 110, device: cpu Forward Execution Time (us) : 55.103 # Benchmarking PyTorch: mul # Mode: Eager # Name: mul_B4_M5_N3_cpu # Input: B: 4, M: 5, N: 3, device: cpu Forward Execution Time (us) : 2.501 # Benchmarking PyTorch: mul # Mode: Eager # Name: mul_B32_M25_N20_cpu # Input: B: 32, M: 25, N: 20, device: cpu Forward Execution Time (us) : 10.589 # Benchmarking PyTorch: mul # Mode: Eager # Name: mul_B100_M90_N110_cpu # Input: B: 100, M: 90, N: 110, device: cpu Forward Execution Time (us) : 50.102 Reviewed By: ajyu Differential Revision: D30455179 fbshipit-source-id: 9f2d92b2d2b860f41a8e59be2cc086d75b587f7b --- .../operator_benchmark/pt/matrix_mult_test.py | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 benchmarks/operator_benchmark/pt/matrix_mult_test.py diff --git a/benchmarks/operator_benchmark/pt/matrix_mult_test.py b/benchmarks/operator_benchmark/pt/matrix_mult_test.py new file mode 100644 index 0000000000000..ad7d42318140d --- /dev/null +++ b/benchmarks/operator_benchmark/pt/matrix_mult_test.py @@ -0,0 +1,119 @@ +import operator_benchmark as op_bench +import torch + +""" +Microbenchmarks for batch matrix mult with einsum and torch.bmm. +""" + +batch_mm_configs_short = op_bench.config_list( + attr_names=["B", "M", "N", "K"], + attrs=[ + [4, 5, 3, 2], + [32, 25, 20, 30], + [128, 100, 120, 110], + ], + cross_product_configs={ + 'device': ['cpu', 'cuda'], + }, + tags=["short"], +) + +batch_mm_configs_long = op_bench.config_list( + attr_names=["B", "M", "N", "K"], + attrs=[ + [128, 256, 128, 256], + [512, 1024, 1024, 512], + ], + cross_product_configs={ + 'device': ['cpu', 'cuda'], + }, + tags=["long"], +) + +batch_mm_op_list = op_bench.op_list( + attr_names=['op_name', 'op_func'], + attrs=[ + ['einsum_bmm', torch.einsum], + ['bmm', torch.bmm], + ], +) + +class BatchMatrixMultBenchmark(op_bench.TorchBenchmarkBase): + def init(self, B, M, N, K, device, op_func): + self.inputs = { + "input_one": torch.rand(B, M, N, device=device), + "input_two": torch.rand(B, N, K, device=device) + } + self.op_func = op_func + + def forward(self, input_one, input_two): + if self.op_func.__name__ == "einsum": + return torch.einsum('bij,bjk->bik', input_one, input_two) + else: + return torch.bmm(input_one, input_two) + + +""" +Microbenchmarks for element-wise matrix mult with einsum and torch.mul. +""" + +batch_elementwise_configs_short = op_bench.config_list( + attr_names=["B", "M", "N"], + attrs=[ + [4, 5, 3], + [32, 25, 20], + [100, 90, 110], + ], + cross_product_configs={ + 'device': ['cpu', 'cuda'], + }, + tags=["short"], +) + + +batch_elementwise_configs_long = op_bench.cross_product_configs( + B=[128, 512, 1024], + M=[128, 512, 1024], + N=[128, 512, 1024], + device=['cpu', 'cuda'], + tags=['long'] +) + +batch_elementwise_op_list = op_bench.op_list( + attr_names=['op_name', 'op_func'], + attrs=[ + ['einsum_elementwise', torch.einsum], + ['mul', torch.mul], + ], +) + +class BatchElementWiseBenchmark(op_bench.TorchBenchmarkBase): + def init(self, B, M, N, device, op_func): + self.inputs = { + "input_one": torch.rand(B, M, N, device=device), + "input_two": torch.rand(B, M, N, device=device) + } + self.op_func = op_func + + def forward(self, input_one, input_two): + if self.op_func.__name__ == "einsum": + return torch.einsum('bij,bij->bij', input_one, input_two) + else: + return torch.mul(input_one, input_two) + + +op_bench.generate_pt_tests_from_op_list( + batch_mm_op_list, + batch_mm_configs_short + batch_mm_configs_long, + BatchMatrixMultBenchmark, +) + +op_bench.generate_pt_tests_from_op_list( + batch_elementwise_op_list, + batch_elementwise_configs_short + batch_elementwise_configs_long, + BatchElementWiseBenchmark, +) + + +if __name__ == "__main__": + op_bench.benchmark_runner.main() From 6fa646ad547f5ea9975f59cbece7e287959503fe Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Tue, 24 Aug 2021 17:06:18 -0700 Subject: [PATCH 189/530] [StaticRuntime] Fix bug in HasInplaceOp (#63842) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63842 Reviewed By: mikeiovine Differential Revision: D30506914 fbshipit-source-id: b2e358cfb991dacdb295b61bbc37beb36b73b852 --- benchmarks/static_runtime/test_scripts.h | 16 ++++++++++++++++ benchmarks/static_runtime/test_static_runtime.cc | 1 + torch/csrc/jit/runtime/static/passes.cpp | 4 +++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index c82dd57752bd6..90f93b20c94c0 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -138,6 +138,22 @@ const auto reshape_inplace_script = R"JIT( return (d, e, f) )JIT"; +const auto reshape_inplace_script_1 = R"JIT( + def forward(self, inp: Tensor, shape: List[int], flag: bool): + if flag: + a = inp + inp + b = a.reshape(shape) + c = b.sigmoid() + else: + a = inp * inp + b = a.sigmoid_() + c = b.reshape(shape) + d = c + c + e = a + a + f = b + b + return (d, e, f) +)JIT"; + const auto sigmoid_inplace_script = R"JIT( def forward(self, inp: Tensor): a = torch.sigmoid(inp, out=inp).clone() diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 701231e7720d1..f6ec677bbb7bc 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -69,6 +69,7 @@ Node* getNodeWithKind(const StaticModule& smodule, const std::string& kind) { TEST(StaticRuntime, InPlace) { EXPECT_TRUE(testHasInplaceOp(reshape_inplace_script)); + EXPECT_TRUE(testHasInplaceOp(reshape_inplace_script_1)); EXPECT_TRUE(testHasInplaceOp(sigmoid_inplace_script)); EXPECT_FALSE(testHasInplaceOp(sigmoid_out_script)); } diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp index 2e9eb5746d276..c8e1107199528 100644 --- a/torch/csrc/jit/runtime/static/passes.cpp +++ b/torch/csrc/jit/runtime/static/passes.cpp @@ -12,7 +12,9 @@ namespace { bool HasInplaceOp(Block* block, const AliasDb& alias_db) { for (auto* node : block->nodes()) { for (Block* sub_block : node->blocks()) { - return HasInplaceOp(sub_block, alias_db); + if (HasInplaceOp(sub_block, alias_db)) { + return true; + } } auto inputs = node->inputs(); // check if node modifies inputs (both inplace ops and certain out variants From 5b28e3c18359ef863946f540717f80fd1dcaa193 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 24 Aug 2021 18:20:43 -0700 Subject: [PATCH 190/530] [quant][graphmode][fx] Add reference option support for binary ops (#62698) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62698 We also removed the special handling in match_utils for binary ops Test Plan: python test/test_quantize.py TestQuantizeFx python test/test_quantize.py TestQuantizeFxOps Imported from OSS Reviewed By: vkuzo Differential Revision: D30093781 fbshipit-source-id: 58cc972de8211a80dd4d111e25dc4ad36057933f --- test/quantization/fx/test_numeric_suite_fx.py | 4 +- torch/quantization/fx/convert.py | 15 +- torch/quantization/fx/match_utils.py | 67 ++------ torch/quantization/fx/prepare.py | 4 +- .../quantization/fx/quantization_patterns.py | 145 +++++++++--------- torch/quantization/ns/mappings.py | 1 + 6 files changed, 99 insertions(+), 137 deletions(-) diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py index d605eba34d922..61062fba781e5 100644 --- a/test/quantization/fx/test_numeric_suite_fx.py +++ b/test/quantization/fx/test_numeric_suite_fx.py @@ -646,7 +646,6 @@ def _op_is_unmatchable(op): # these ops do not have quantized equivalents ops_to_skip = [ torch.bmm, - torch.sum, torch.div, torch.sub, operator.truediv, @@ -662,6 +661,9 @@ def _op_is_unmatchable(op): # RNNDynamicQuantizeHandler pass elif qhandler_cls == qp.DefaultNodeQuantizeHandler: + # torch.sum does not have quantized equivalents + if base_op == torch.sum: + continue self.assertTrue( _op_in_base_sets_of_related_ops(base_op), f"{base_op} not in sets of related ops") diff --git a/torch/quantization/fx/convert.py b/torch/quantization/fx/convert.py index 976ca0c6aeca7..671c2704d7da7 100644 --- a/torch/quantization/fx/convert.py +++ b/torch/quantization/fx/convert.py @@ -335,11 +335,18 @@ def node_arg_is_quantized(node_arg: Any) -> bool: else: return False - def is_output_quantized(node: Node, obj: QuantizeHandler, qconfig: QConfigAny, modules: Dict[str, torch.nn.Module]) -> bool: + def is_output_quantized( + node: Node, obj: QuantizeHandler, qconfig: QConfigAny, + modules: Dict[str, torch.nn.Module], is_reference=False) -> bool: """ Check if output node is quantized or not """ assert modules is not None - # by default the output for a quantizable node is expected to be quantized - quantized = True + # for some ops the output is quantized only when `is_reference` is True + # and when `is_reference` is False, it has limited qconfig + # support, for example `add` + # ideally this check should not happen here, it should happen either in + # prepare or during lowering, we don't need this check + # after the default path is changed to produce reference patterns + quantized = obj.is_output_quantized(qconfig, is_reference) # Need to get correct quantized/non-quantized state forn the output # of FixedQParamsQuantizeHandler @@ -454,7 +461,7 @@ def insert_quantize_node(node: Node, modules: Dict[str, torch.nn.Module]) -> Non node, qconfig, modules, quantized_graph, node_name_to_scope, load_arg, is_reference=is_reference, convert_custom_config_dict=convert_custom_config_dict) if not is_observed_standalone_module_node: - quantized = is_output_quantized(node, obj, qconfig, modules) + quantized = is_output_quantized(node, obj, qconfig, modules, is_reference) if quantized: env[node.name][activation_dtype(qconfig)] = result diff --git a/torch/quantization/fx/match_utils.py b/torch/quantization/fx/match_utils.py index dd8501c9b8bf1..4aa9275870c26 100644 --- a/torch/quantization/fx/match_utils.py +++ b/torch/quantization/fx/match_utils.py @@ -9,9 +9,6 @@ QuantizeHandler, CustomModuleQuantizeHandler, StandaloneModuleQuantizeHandler, - BinaryOpQuantizeHandler, - binary_op_supported_dtypes, - binary_reference_op_supported_dtypes, ) from ..qconfig import ( QConfigAny, @@ -19,7 +16,6 @@ from .graph_module import ( is_observed_standalone_module, ) -from ..utils import get_qconfig_dtypes from typing import Any, Dict, List, Callable, Optional, Tuple, Set @@ -135,60 +131,15 @@ def record_match(pattern, node, matched): if node.name not in match_map and node.name not in all_matched: for pattern, value in patterns.items(): if is_match(modules, node, pattern): - skip_this_match = False - if value is BinaryOpQuantizeHandler: - - # to properly check for dtype support, we need to - # navigate to the base node of an add-relu or mul-relu - # pattern - base_node = node - if ( - (node.op == 'call_function' and - node.target is torch.nn.functional.relu) or - (node.op == 'call_module' and - isinstance(modules[node.target], torch.nn.ReLU)) - ): - base_node = node.args[0] - - this_node_qconfig = \ - qconfig_map[base_node.name] - if this_node_qconfig: - dtypes = get_qconfig_dtypes(this_node_qconfig) - # TODO(future PR): update the pattern to quantize - # handler logic to take this into account. - - - # This needs to handle 3 cases - # 1) op and dtype is in either [is_ref or non-ref] list -> don't skip - # 2) op is not in either list (i.e. relu) -> don't skip - # 3) op is in non-ref list, but not for dtype, and op+dtype not in is_ref list -> skip - - # note: the value of is_reference is unknown at prepare, so we have to cover both cases - # handle is_reference = False - skip_match_not_is_reference = ( - (base_node.target in binary_op_supported_dtypes) and - (dtypes not in binary_op_supported_dtypes[base_node.target]) - ) - - # handle is_reference = True - supported_is_reference = ( - (base_node.target in binary_reference_op_supported_dtypes) and - (dtypes in binary_reference_op_supported_dtypes[base_node.target]) - ) - - # only skip if not reference says skip and is_reference doesn't support - skip_this_match = skip_match_not_is_reference and not supported_is_reference - - if not skip_this_match: - matched: List[Any] = [] - record_match(pattern, node, matched) - for n in matched: - match_map[n.name] = ( - node, matched, pattern, value(node, modules), # type: ignore[operator] - qconfig_map[n.name]) - all_matched.add(n.name) - # break after finding the first match - break + matched: List[Any] = [] + record_match(pattern, node, matched) + for n in matched: + match_map[n.name] = ( + node, matched, pattern, value(node, modules), # type: ignore[operator] + qconfig_map[n.name]) + all_matched.add(n.name) + # break after finding the first match + break # add custom module instances to the match result assert modules is not None diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py index 29600b8797c52..86abac2d20991 100644 --- a/torch/quantization/fx/prepare.py +++ b/torch/quantization/fx/prepare.py @@ -87,7 +87,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union def is_activation_post_process_node(node: Node, modules: Dict[str, torch.nn.Module]) -> bool: - return node.op == "call_module" and \ + return isinstance(node, torch.fx.Node) and node.op == "call_module" and \ is_activation_post_process(modules[str(node.target)]) def node_arg_is_weight(node: Node, arg: Any) -> bool: @@ -772,6 +772,8 @@ def maybe_make_input_output_share_observers( # we need to navigate up to the first observer iteration_guard = 0 while not is_activation_post_process_node(first_arg_arg, modules): + if not isinstance(first_arg_arg, Node): + return False # did not find an activation_post_process for the op if first_arg_arg.op == "placeholder": return False diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index a68eea2bbf44c..1ce43cadc8e98 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -155,6 +155,15 @@ def get_activation_ctr( """ return qconfig.activation + def is_output_quantized(self, qconfig, is_reference): + """ Returns true if the output node of convert is quantized + when is_reference is False, we would return float node when a certain dtype + combination is not supported (since fbgemm/qnnpack only support certain dtype + combinations), so the output may be float, but when is_reference is True, + we support all dtype combinations so the output will always be quantized. + """ + return True + @abstractmethod def convert(self, @@ -180,34 +189,52 @@ def convert(self, # tuple (activation_dtype, weight_dtype, compute_dtype) # these are supported types for common binary ops like add/mul etc. -binary_op_all_dtypes = [ +all_dtypes = [ (torch.quint8, torch.qint8, None), (torch.float16, torch.float16, None), ] -binary_op_float16_dtypes = [ +fp16_dtypes = [ (torch.float16, torch.float16, None) ] -binary_op_int8_dtypes = [ +int8_dtypes = [ (torch.quint8, torch.qint8, None), ] binary_op_supported_dtypes : Dict[Union[Callable, str], List[Tuple[torch.dtype, torch.dtype, None]]] = { - operator.add: binary_op_all_dtypes, - torch.add: binary_op_all_dtypes, - operator.mul: binary_op_all_dtypes, - torch.mul: binary_op_all_dtypes, - torch.bmm: binary_op_float16_dtypes, - torch.sub: binary_op_float16_dtypes, - operator.sub: binary_op_float16_dtypes, - torch.div: binary_op_float16_dtypes, - operator.truediv: binary_op_float16_dtypes, - torch.sum: binary_op_float16_dtypes + operator.add: all_dtypes, + torch.add: all_dtypes, + operator.mul: all_dtypes, + torch.mul: all_dtypes, + torch.bmm: fp16_dtypes, + torch.sub: fp16_dtypes, + operator.sub: fp16_dtypes, + torch.div: fp16_dtypes, + operator.truediv: fp16_dtypes, } -binary_reference_op_supported_dtypes : Dict[Union[Callable, str], List[Tuple[torch.dtype, torch.dtype, None]]] = { - torch.bmm: binary_op_int8_dtypes, - operator.add: binary_op_int8_dtypes, - torch.add: binary_op_int8_dtypes, - operator.mul: binary_op_int8_dtypes, - torch.mul: binary_op_int8_dtypes, + +default_op_supported_dtypes = { + torch.nn.ConvTranspose1d: int8_dtypes, + torch.nn.ConvTranspose2d: int8_dtypes, + torch.nn.ELU: int8_dtypes, + torch.nn.LeakyReLU: int8_dtypes, + torch.nn.Hardswish: int8_dtypes, + torch.nn.InstanceNorm1d: int8_dtypes, + torch.nn.InstanceNorm2d: int8_dtypes, + torch.nn.InstanceNorm3d: int8_dtypes, + torch.nn.LayerNorm: all_dtypes, + torch.nn.SiLU: fp16_dtypes, + torch.nn.Mish: fp16_dtypes, + torch.nn.GELU: int8_dtypes, + torch.nn.Softmax: int8_dtypes, + torch.nn.functional.elu: int8_dtypes, + torch.nn.functional.hardswish: int8_dtypes, + torch.nn.functional.instance_norm: int8_dtypes, + torch.nn.functional.layer_norm: all_dtypes, + torch.nn.functional.leaky_relu: int8_dtypes, + torch.nn.functional.silu: fp16_dtypes, + torch.nn.functional.mish: fp16_dtypes, + torch.nn.functional.gelu: int8_dtypes, + torch.nn.functional.softmax: int8_dtypes, + torch.sum: fp16_dtypes, } QAT_CONV_MODULE_CLASSES = \ @@ -266,7 +293,6 @@ def _get_name(): @register_quant_pattern(torch.sub) @register_quant_pattern(torch.mul) @register_quant_pattern(torch.div) -@register_quant_pattern(torch.sum) @register_quant_pattern(torch.bmm) @register_quant_pattern((torch.nn.ReLU, operator.add)) @register_quant_pattern((torch.nn.ReLU, operator.mul)) @@ -344,6 +370,13 @@ def input_output_observed(self): # for x + y where x and y are scalars, we do not observe anything return self.num_tensor_args > 0 + def is_output_quantized(self, qconfig, is_reference): + dtypes = get_qconfig_dtypes(qconfig) + if not is_reference: + return self.binary_op in binary_op_supported_dtypes and \ + dtypes in binary_op_supported_dtypes[self.binary_op] + return True + def convert(self, node: Node, qconfig: QConfigAny, @@ -361,11 +394,14 @@ def convert(self, dtypes = get_qconfig_dtypes(qconfig) - if is_reference and self.binary_op in binary_reference_op_supported_dtypes and \ - dtypes in binary_reference_op_supported_dtypes[self.binary_op]: - if dtypes in binary_op_int8_dtypes: - # make sure both inputs are quantized to torch.quint8 - load_arg(quantized={0: torch.quint8, 1: torch.quint8})(self.binary_op_node.args) + if is_reference: + act_dtype = activation_dtype(qconfig) + if act_dtype == torch.float: + return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) + else: + if self.num_tensor_args == 2: + # make sure both inputs are quantized to act_dtype + load_arg(quantized={0: act_dtype, 1: act_dtype})(self.binary_op_node.args) args = load_arg(quantized=torch.float)(self.binary_op_node.args) kwargs = load_arg(quantized=torch.float)(self.binary_op_node.kwargs) op_out = quantized_graph.node_copy(self.binary_op_node, load_arg(quantized=torch.float)) @@ -384,12 +420,6 @@ def modified_load_arg(n: Node): return quantize_node( op_out, activation_post_process, node, modules, quantized_graph, node_name_to_scope, is_input=False) - else: - warnings.warn( - "No implementation found for dtype combination: {}" - "for op {} with is_reference={} despite it being listed as supported" - "this should not happen".format(dtypes, self.binary_op, is_reference)) - return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) elif not is_reference and self.binary_op in binary_op_supported_dtypes and \ dtypes in binary_op_supported_dtypes[self.binary_op]: if dtypes in [(torch.quint8, torch.qint8, None)]: @@ -445,15 +475,10 @@ def modified_load_arg(n: Node): "dtype combination: {} is not " "supported by {} for is_reference={}. " "Supported non-reference dtype combinations are: {} " - "Supported reference dtype combinations are: {}" "".format(dtypes, self.binary_op, is_reference, - binary_op_supported_dtypes[self.binary_op], - ( - [] if self.binary_op not in binary_reference_op_supported_dtypes.keys() - else binary_reference_op_supported_dtypes[self.binary_op] - ) + binary_op_supported_dtypes[self.binary_op] ) ) if self.relu_node: @@ -1226,6 +1251,7 @@ def convert(self, # until they receive a proper fp16 kernel. To use the reference pattern, use a custom qconfig # @register_quant_pattern(torch.nn.functional.gelu) # @register_quant_pattern(torch.nn.functional.softmax) +@register_quant_pattern(torch.sum) class DefaultNodeQuantizeHandler(QuantizeHandler): """ Common quantized op, first input and first output will be quantized """ @@ -1239,6 +1265,13 @@ def __init__( elif node.op == "call_module": self.op = type(modules[str(node.target)]) + def is_output_quantized(self, qconfig, is_reference): + dtypes = get_qconfig_dtypes(qconfig) + if not is_reference: + return self.op in default_op_supported_dtypes and \ + dtypes in default_op_supported_dtypes[self.op] + return True + def convert(self, node: Node, qconfig: QConfigAny, @@ -1256,46 +1289,12 @@ def convert(self, convert_custom_config_dict = {} additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) - all_dtypes = [ - (torch.quint8, torch.qint8, None), - (torch.float16, torch.float16, None) - ] - int8_dtypes = [ - (torch.quint8, torch.qint8, None) - ] - fp16_dtypes = [ - (torch.float16, torch.float16, None) - ] - supported_dtypes = { - torch.nn.ConvTranspose1d: int8_dtypes, - torch.nn.ConvTranspose2d: int8_dtypes, - torch.nn.ELU: int8_dtypes, - torch.nn.LeakyReLU: int8_dtypes, - torch.nn.Hardswish: int8_dtypes, - torch.nn.InstanceNorm1d: int8_dtypes, - torch.nn.InstanceNorm2d: int8_dtypes, - torch.nn.InstanceNorm3d: int8_dtypes, - torch.nn.LayerNorm: all_dtypes, - torch.nn.SiLU: fp16_dtypes, - torch.nn.Mish: fp16_dtypes, - torch.nn.GELU: int8_dtypes, - torch.nn.Softmax: int8_dtypes, - torch.nn.functional.elu: int8_dtypes, - torch.nn.functional.hardswish: int8_dtypes, - torch.nn.functional.instance_norm: int8_dtypes, - torch.nn.functional.layer_norm: all_dtypes, - torch.nn.functional.leaky_relu: int8_dtypes, - torch.nn.functional.silu: fp16_dtypes, - torch.nn.functional.mish: fp16_dtypes, - torch.nn.functional.gelu: int8_dtypes, - torch.nn.functional.softmax: int8_dtypes, - } dtypes = get_qconfig_dtypes(qconfig) - if not is_reference and dtypes not in supported_dtypes[self.op]: + if not is_reference and dtypes not in default_op_supported_dtypes[self.op]: warnings.warn( "dtype combination: {} is not " "supported by {} " - "supported dtype combinations are: {}".format(dtypes, self.op, supported_dtypes[self.op])) + "supported dtype combinations are: {}".format(dtypes, self.op, default_op_supported_dtypes[self.op])) return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) # TODO: make helper functions for (torch.quint8, torch.qint8, None) if not is_reference: diff --git a/torch/quantization/ns/mappings.py b/torch/quantization/ns/mappings.py index 2a7c859347f3d..399ddca22668e 100644 --- a/torch/quantization/ns/mappings.py +++ b/torch/quantization/ns/mappings.py @@ -419,6 +419,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: # uncomment below # operator.add, # operator.mul, + torch.sum, ]) FUNS_IO_TYPE_FP16: Set[NSNodeTargetType] = set() From d454c9e76e19f51ad95644509ae497f49bcd1924 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Tue, 24 Aug 2021 18:48:25 -0700 Subject: [PATCH 191/530] Migrate THCTensor_copyIgnoringOverlaps to ATen (#63505) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63505 This isn't a public operator, just a helper function used in CUDA_tensor_apply. Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D30441305 Pulled By: ngimel fbshipit-source-id: 84fabc701cbd8479e02d80f373a3dd62d70df2ce --- aten/src/ATen/LegacyTHFunctionsCUDA.h | 26 ----- aten/src/ATen/cuda/CUDAApplyUtils.cuh | 18 +-- aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp | 104 ------------------ aten/src/ATen/native/Copy.cpp | 16 +++ aten/src/ATen/native/Copy.h | 2 + .../ATen/native/cuda/DistributionBernoulli.cu | 1 - .../native/cuda/DistributionCauchyKernel.cu | 1 - .../cuda/DistributionExponentialKernel.cu | 1 - .../cuda/DistributionGeometricKernel.cu | 1 - .../cuda/DistributionLogNormalKernel.cu | 1 - .../ATen/native/cuda/DistributionNormal.cu | 1 - .../native/cuda/DistributionRandomKernel.cu | 1 - aten/src/ATen/native/cuda/Distributions.cu | 1 - .../src/ATen/native/cuda/MultinomialKernel.cu | 1 - aten/src/ATen/native/cuda/Sort.cu | 1 - aten/src/ATen/native/cuda/TensorTopK.cu | 1 - .../src/ATen/native/cuda/UpSampleNearest3d.cu | 1 - .../ATen/templates/RegisterDispatchKey.cpp | 1 - tools/codegen/gen.py | 3 - tools/codegen/gen_backend_stubs.py | 1 - 20 files changed, 23 insertions(+), 160 deletions(-) delete mode 100644 aten/src/ATen/LegacyTHFunctionsCUDA.h delete mode 100644 aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h deleted file mode 100644 index 41cbdd6f4ffe1..0000000000000 --- a/aten/src/ATen/LegacyTHFunctionsCUDA.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace c10 { -class Scalar; -} -namespace at { -struct Generator; -class Tensor; -struct Type; -} // namespace at - -namespace at { -namespace native { -namespace legacy { -namespace cuda { - -Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src); - -} // namespace th -} // namespace legacy -} // namespace native -} // namespace at diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index 2617870eea519..2b1538ec15ade 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include @@ -453,13 +453,11 @@ inline bool CUDA_tensor_apply2(at::Tensor a, if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { // Must perform in contiguous space - oldA = a; - a = a.contiguous(); + oldA = std::exchange(a, a.contiguous()); } if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) { // Must perform in contiguous space - oldB = b; - b = b.contiguous(); + oldB = std::exchange(b, b.contiguous()); } // It is possible that the tensor dimensions are able to be collapsed, @@ -547,17 +545,11 @@ inline bool CUDA_tensor_apply2(at::Tensor a, #undef HANDLE_A_CASE if (oldA.defined()) { - // Ignore overlaps when copying back; if we use copy - // instead, it will recursively try and invoke ourselves to make - // oldA contiguous. - at::native::legacy::cuda::_th_copy_ignoring_overlaps_(oldA, a); + at::native::copy_ignoring_overlaps(oldA, a); } if (oldB.defined()) { - // Ignore overlaps when copying back; if we use copy - // instead, it will recursively try and invoke ourselves to make - // oldB contiguous. - at::native::legacy::cuda::_th_copy_ignoring_overlaps_(oldB, b); + at::native::copy_ignoring_overlaps(oldB, b); } return true; diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp deleted file mode 100644 index c4e9dfe78cebe..0000000000000 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ /dev/null @@ -1,104 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#undef THNN_ -#undef THCIndexTensor_ -#include -#include -#include -#include - -namespace at { -namespace native { -namespace legacy { -namespace cuda { - -namespace { - ScalarType infer_scalar_type(const Tensor & t) { - return t.scalar_type(); - } - ScalarType infer_scalar_type(const TensorList & tl) { - TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); - return tl[0].scalar_type(); - } - - TensorOptions options(ScalarType s) { - return TensorOptions().dtype(s) - .device(DeviceType::CUDA) - .layout(kStrided); - } - - Allocator* allocator() { - return at::cuda::getCUDADeviceAllocator(); - } -} - -Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaByteTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaCharTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaIntTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaLongTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaShortTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaHalfTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_); - break; - } - default: - AT_ERROR("_th_copy_ignoring_overlaps_ not supported on CUDAType for ", dispatch_scalar_type); - } - return self; -} - -} // namespace th -} // namespace legacy -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 1b8538ec07601..7fa952d020ef9 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -253,6 +253,22 @@ Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) { return self; } +void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src) { + // Called when we are copying into an overlapping index `dst`, but we don't + // care which writer wins. Hacky but it works. This is only used by + // CUDA_tensor_apply2 in case that there are write overlaps. + // FIXME: really, overlapping writes should be illegal/an error in Torch + auto iter = TensorIteratorConfig() + .add_output(dst) + .add_input(src) + .resize_outputs(false) + .set_check_mem_overlap(false) + .check_all_same_dtype(true) + .check_all_same_device(true) + .build(); + copy_stub(iter.device_type(), iter, /*non_blocking=*/false); +} + DEFINE_DISPATCH(copy_stub); } // namespace native diff --git a/aten/src/ATen/native/Copy.h b/aten/src/ATen/native/Copy.h index 2dfd9e9f4922b..938466102b469 100644 --- a/aten/src/ATen/native/Copy.h +++ b/aten/src/ATen/native/Copy.h @@ -13,5 +13,7 @@ using copy_fn = void (*)(TensorIterator&, bool non_blocking); DECLARE_DISPATCH(copy_fn, copy_stub); +TORCH_API void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src); + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cuda/DistributionBernoulli.cu b/aten/src/ATen/native/cuda/DistributionBernoulli.cu index 3acf87c3c4b40..0baaf2e049b04 100644 --- a/aten/src/ATen/native/cuda/DistributionBernoulli.cu +++ b/aten/src/ATen/native/cuda/DistributionBernoulli.cu @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu index 35a1e6ef5a98c..6f43ee664cb2c 100644 --- a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu index b4cf288bcb7b8..6e1823032a789 100644 --- a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu index eb71ab3231f12..9086e2a35c8d3 100644 --- a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu index 89b9c04b3a687..9497cf83cc405 100644 --- a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionNormal.cu b/aten/src/ATen/native/cuda/DistributionNormal.cu index da647277c1762..32d223c5d0a93 100644 --- a/aten/src/ATen/native/cuda/DistributionNormal.cu +++ b/aten/src/ATen/native/cuda/DistributionNormal.cu @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu index 8d6614b9010d8..57d0701329d91 100644 --- a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index cf1281d320b14..a48a3778305ab 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index 3912af58e1d99..65c45e7027964 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -1,6 +1,5 @@ #include #include -#include #include #include #include diff --git a/aten/src/ATen/native/cuda/Sort.cu b/aten/src/ATen/native/cuda/Sort.cu index f53f7b478dadf..83fce65d33b6c 100644 --- a/aten/src/ATen/native/cuda/Sort.cu +++ b/aten/src/ATen/native/cuda/Sort.cu @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index c0bc353110b6f..121208dd58dc2 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -1,7 +1,6 @@ #include #include #include -#include #include #include #include diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu index 522225b5fd85a..6270bba9eafee 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu @@ -1,6 +1,5 @@ #include #include -#include #include #include #include diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp index c702a68063c31..1abc3ee391ae2 100644 --- a/aten/src/ATen/templates/RegisterDispatchKey.cpp +++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp @@ -33,7 +33,6 @@ #include #include $extra_cuda_headers -$legacy_th_headers $external_backend_headers $namespaced_headers diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index c0ce886c3d50a..44bb3b4f87e1d 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -1096,9 +1096,6 @@ def make_file_manager(install_dir: str) -> FileManager: fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: { 'extra_cuda_headers': extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else '', - 'legacy_th_headers': - '#include ' if dispatch_key == DispatchKey.CUDA else - '', 'external_backend_headers': '', 'namespaced_headers': f'#include ' if dispatch_key in functions_keys else '', 'DispatchKey': dispatch_key, diff --git a/tools/codegen/gen_backend_stubs.py b/tools/codegen/gen_backend_stubs.py index a712a239ad565..51f81c702e122 100644 --- a/tools/codegen/gen_backend_stubs.py +++ b/tools/codegen/gen_backend_stubs.py @@ -227,7 +227,6 @@ def make_file_manager(install_dir: str) -> FileManager: for dispatch_key in [backend_dispatch_key, autograd_dispatch_key]: fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: { 'extra_cuda_headers': '', - 'legacy_th_headers': '', 'external_backend_headers': f'#include "{output_dir}/{backend_key}NativeFunctions.h"', 'namespaced_headers': '', 'DispatchKey': dispatch_key, From 543130511a97a0aab57a6b2345156cc638b8cf7f Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Tue, 24 Aug 2021 18:52:29 -0700 Subject: [PATCH 192/530] [nnc] Disable erf and erfc (#63775) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63775 These introduce small accuracy differences that cause some internal tests to fail, and it's not worth fixing the tests right now because they're slower than the ATen ops anyways. ghstack-source-id: 136526229 Test Plan: ``` buck test mode/dev //aml/eccv/mcm/training:tests -- --exact 'aml/eccv/mcm/training:tests - test_build_torch_script_model (aml.eccv.mcm.training.tests.publish_helper_tests.TransformerPredictorPublishHelperTests)' ``` Reviewed By: navahgar Differential Revision: D30484557 fbshipit-source-id: 095a9c810539a499105b76e1d96843dbc61b0079 --- test/test_jit_fuser_te.py | 3 +++ torch/csrc/jit/passes/tensorexpr_fuser.cpp | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 5e8204a4c7b14..f2dce12673d51 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1274,8 +1274,11 @@ def apply(fn): lambda x: torch.threshold(x, 0, -10), lambda x: torch.clamp(x, -10, 10), ] + gpu_only = {torch.erf, torch.erfc} sizes = [(1,), (2,), (4, 4)] for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes): + if op in gpu_only and device == "cpu": + continue try: x = self.data_for(dtype, device, size=size) fn = apply(op) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index d4add03506c4f..3f0cd14668169 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -948,6 +948,14 @@ class TensorExprFuser { "aten::conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor", "aten::matmul(Tensor self, Tensor other) -> Tensor", }; + static const OperatorSet gpu_only_operator_set{ + // On CPU, these are slower and less accurate than ATen kernels, because + // ATen is able to use MKL-VML, whereas the fuser currently can't. The + // fuser uses sleef instead because sleef provides functions that operate + // on vectors, instead of large buffers. + "aten::erf(Tensor self) -> Tensor", + "aten::erfc(Tensor self) -> Tensor", + }; static const OperatorSet pow{ "aten::pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor", }; @@ -1026,6 +1034,17 @@ class TensorExprFuser { } } + // Operator is only supported on GPU. + if (node->isMemberOf(gpu_only_operator_set)) { + auto device = tensorexpr::pickDeviceType(node->inputs()); + if (!device) { + device = tensorexpr::pickDeviceType(node->outputs()); + } + if (!device || !device->is_cuda()) { + return false; + } + } + if (node->kind() == aten::to) { // only support same-device conversion auto device = tensorexpr::pickDeviceType(node->inputs()); From 1787b905c4a571ff1ae09ddc56ce56cb04e52136 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Tue, 24 Aug 2021 18:52:29 -0700 Subject: [PATCH 193/530] Don't switch executors mid test (#63830) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63830 It's really not safe to change the executor out from under models that may have already been partially compiled. ghstack-source-id: 136526228 Test Plan: ``` DEBUG=1 CFLAGS="-fsanitize=address" CXXFLAGS="-fsanitize=address" USE_LLVM=$(realpath ../llvm-project/install) CMAKE_PREFIX_PATH=$CONDA_PREFIX python setup.py install LD_PRELOAD=/lib64/libasan.so.5 numactl -C3 pytest -v --cov --cov-report xml:test/coverage.xml --cov-append onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset11 -s ``` Reviewed By: desertfire Differential Revision: D30504489 fbshipit-source-id: 188581cb53f0cf5bd3442d1e9d46e8c0c7e124f8 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 9d56c1169dec2..ffeef00cc9ac6 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -531,10 +531,6 @@ def test_faster_rcnn(self): dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5) def test_paste_mask_in_image(self): - # disable profiling - torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) - masks = torch.rand(10, 1, 26, 26) boxes = torch.rand(10, 4) boxes[:, 2:] += torch.rand(10, 2) @@ -582,10 +578,6 @@ def test_mask_rcnn(self): "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) def test_heatmaps_to_keypoints(self): - # disable profiling - torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) - maps = torch.rand(10, 1, 26, 26) rois = torch.rand(10, 4) from torchvision.models.detection.roi_heads import heatmaps_to_keypoints From 8dda299d9631e0f6e121dcb9f8f94bbdd8435515 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Tue, 24 Aug 2021 18:52:29 -0700 Subject: [PATCH 194/530] Re-apply: [nnc] Support thread level parallelism in fused kernels (#63776) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63776 I reverted this out of an abundance of caution because some test failures occurred, but they were all due to precision issues fixed lower in this stack. Let's try again. I've rolled the elimination of the allow-parallelism-in-fusions toggle into this diff since they're pretty tightly coupled. ghstack-source-id: 136529847 Test Plan: CI Reviewed By: huiguoo Differential Revision: D30484555 fbshipit-source-id: 38fd33520f710585d1130c365a8c60c9ce794a59 --- test/cpp/tensorexpr/test_kernel.cpp | 28 +++++++ test/cpp/tensorexpr/test_te_fuser_pass.cpp | 6 +- test/jit/test_profiler.py | 3 - test/test_jit_fuser_te.py | 5 -- test/test_tensorexpr.py | 4 - torch/csrc/jit/passes/tensorexpr_fuser.cpp | 19 +---- torch/csrc/jit/passes/tensorexpr_fuser.h | 2 - torch/csrc/jit/python/init.cpp | 2 - torch/csrc/jit/tensorexpr/kernel.cpp | 87 ++++++++++++++++++++++ torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 24 ++++-- torch/csrc/jit/tensorexpr/llvm_jit.h | 8 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 7 ++ 12 files changed, 148 insertions(+), 47 deletions(-) diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index e14282f258893..8cdf2ef90df11 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -198,6 +198,34 @@ TEST_F(Kernel, _3) { } } +TEST_F(Kernel, ParallelStrided) { + const auto graph_string = R"IR( + graph(%0 : Float(5, 3, 40005, strides=[120015, 40005, 1], device=cpu), + %1 : Float(5, 3, 40005, strides=[960120, 160020, 2], device=cpu)): + %2 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %1) + %3 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %2) + return (%3))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + + auto a = at::rand({5, 3, 40005}, TensorOptions(kCPU).dtype(at::kFloat)); + auto b = at::rand({10, 6, 80010}, TensorOptions(kCPU).dtype(at::kFloat)) + .index( + {Slice(None, None, 2), + Slice(None, None, 2), + Slice(None, None, 2)}); + auto ref = a * (a * b); + auto o = at::zeros_like(ref); + TensorExprKernel k(graph); + std::vector inputs = {a, b}; + std::vector stack = fmap(inputs); + k.run(stack); + o = stack[0].toTensor(); + for (size_t i = 0; i < 5 * 3; i++) { + CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]); + } +} + TEST_F(Kernel, DISABLED_Shape_Inference) { // disabled: doesn't do stride propagation, and isn't being used currently diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index 723a8fef81bea..b82d383bc99b0 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -14,19 +14,15 @@ namespace jit { using namespace torch::jit::tensorexpr; struct WithCPUFuser { - WithCPUFuser(bool val = true) - : cpuFuserEnabled(canFuseOnCPU()), parallel(texprParallelCPUEnabled()) { + WithCPUFuser(bool val = true) : cpuFuserEnabled(canFuseOnCPU()) { overrideCanFuseOnCPU(val); - setTexprParallelCPUEnabled(true); } ~WithCPUFuser() { overrideCanFuseOnCPU(cpuFuserEnabled); - setTexprParallelCPUEnabled(parallel); } bool cpuFuserEnabled; - bool parallel; }; TEST(TEFuserPass, FuserPass_1) { diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py index aa8be0518385f..b9ed9d0b78eb5 100644 --- a/test/jit/test_profiler.py +++ b/test/jit/test_profiler.py @@ -29,8 +29,6 @@ def setUp(self): torch._C._debug_set_fusion_group_inlining(False) self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) - self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() - torch._C._jit_set_texpr_parallel_cpu_enabled(True) def tearDown(self): torch._C._jit_set_profiling_executor(self.prev_exec) @@ -42,7 +40,6 @@ def tearDown(self): torch._C._jit_set_texpr_reductions_enabled(self.old_reduction_enabled) torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining) torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) - torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def test_tensor_type_not_determined_by_inputs(self): @torch.jit.script diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index f2dce12673d51..014f142cf1443 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -85,10 +85,6 @@ def setUp(self): self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) - # TODO: CPU fuser currently is disabled when multithreading. - self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() - torch._C._jit_set_texpr_parallel_cpu_enabled(True) - self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] self.int_dtypes = [ torch.int8, @@ -116,7 +112,6 @@ def tearDown(self): torch._C._jit_set_texpr_fuser_enabled(self.texpr_fuser_state) torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) - torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def assertLastGraphAllFused(self): self.assertAllFused(torch.jit.last_executed_optimized_graph()) diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 6353113a1ec4c..47c7e689aa6a4 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -24,9 +24,6 @@ def setUp(self): torch._C._debug_set_fusion_group_inlining(False) self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu() torch._C._jit_set_te_must_use_llvm_cpu(False) - # TODO: CPU fuser currently is disabled when multithreading. - self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled() - torch._C._jit_set_texpr_parallel_cpu_enabled(True) self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] @@ -39,7 +36,6 @@ def tearDown(self): torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuser_state) torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining) torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu) - torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel) def assertLastGraphAllFused(self): self.assertAllFused(torch.jit.last_executed_optimized_graph()) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 3f0cd14668169..085291afbdcf8 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -1,6 +1,5 @@ #include -#include #include #include #include @@ -250,15 +249,6 @@ bool isSupported(Node* node) { } // namespace tensorexpr static bool texpr_fuser_enabled_ = true; -static bool texpr_parallel_cpu_enabled = false; - -bool texprParallelCPUEnabled() { - return texpr_parallel_cpu_enabled; -} - -void setTexprParallelCPUEnabled(bool val) { - texpr_parallel_cpu_enabled = val; -} void setTensorExprFuserEnabled(bool val) { texpr_fuser_enabled_ = val; @@ -898,14 +888,7 @@ class TensorExprFuser { return false; } if (device->is_cpu()) { - // CPU fusion is only supported for single-thread. - if (!canFuseOnCPU()) { - return false; - } - if (at::get_num_threads() == 1 || texprParallelCPUEnabled()) { - return true; - } - return false; + return canFuseOnCPU(); } else if (device->is_cuda()) { return canFuseOnGPU(); } else if (device->is_xpu()) { diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.h b/torch/csrc/jit/passes/tensorexpr_fuser.h index 3f6538b7e587a..254aebd91d12f 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.h +++ b/torch/csrc/jit/passes/tensorexpr_fuser.h @@ -24,8 +24,6 @@ TORCH_API void setTensorExprFuserEnabled(bool val); TORCH_API bool tensorExprFuserEnabled(); TORCH_API bool setTexprReductionsEnabled(bool value); TORCH_API bool texprReductionsEnabled(); -TORCH_API bool texprParallelCPUEnabled(); -TORCH_API void setTexprParallelCPUEnabled(bool val); TORCH_API void RemoveProfileNodesAndSpecializeTypes( std::shared_ptr& graph); diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index baea47d63ed18..645fea2274fb2 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -714,8 +714,6 @@ void initJITBindings(PyObject* module) { .def("_jit_texpr_set_fallback_allowed", &tensorexpr::setFallbackAllowed) .def("_jit_set_texpr_reductions_enabled", &setTexprReductionsEnabled) .def("_jit_texpr_reductions_enabled", &texprReductionsEnabled) - .def("_jit_set_texpr_parallel_cpu_enabled", &setTexprParallelCPUEnabled) - .def("_jit_texpr_parallel_cpu_enabled", &texprParallelCPUEnabled) .def( "_jit_set_te_generate_block_code", [](bool gen_block_code) { diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index fed5e1e139d3d..d53e857d75a48 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -2487,6 +2488,86 @@ void fuseAllLoops(StmtPtr st) { } } +// Compute the trip count of a loop if it is a constant. +c10::optional tripCount(ForPtr loop) { + auto tc = IRSimplifier::simplify( + cast(ExprHandle(loop->stop()) - ExprHandle(loop->start()))); + if (auto val = to(tc.node())) { + return val->value(); + } + return c10::nullopt; +} + +// Prune innermost loops until iterations satisfies a minimum grain size. +static void pruneByGrainSize(std::vector& loops) { + constexpr int64_t minGrainSize = 32768; + int64_t grainSize = 1; + for (int64_t i = loops.size(); i > 0; i--) { + auto tc = tripCount(loops[i - 1]); + if (!tc) { + break; + } + grainSize *= *tc; + if (grainSize < minGrainSize) { + loops.pop_back(); + } + } +} + +// Retain enough outermost loops to fill the number of threads. +static void pruneByThreadCount(std::vector& loops) { + int64_t trips = 1; + auto threads = at::get_num_threads(); + auto it = loops.begin(); + for (; it != loops.end(); it++) { + if (trips >= threads) { + break; + } + auto tc = tripCount(*it); + if (!tc) { + break; + } + trips *= *tc; + } + loops.erase(it, loops.end()); +} + +// Flatten and parallelize outer loops, subject to a minimum number of elements +// in the inner loop, and a maximum level of thread-level parallelism in the +// outer loops. +template +static void parallelizeOuterLoops(LoopNest& l, Bufs&& bufs) { + for (auto const& buf : bufs) { + auto loops = l.getLoopStmtsFor(buf); + pruneByGrainSize(loops); + pruneByThreadCount(loops); + + // There are no loops to parallelize; give up. + if (loops.size() == 0) { + continue; + } + // The loop nest contains a reduction; give up. + auto reductions = NodeFinder::find(loops[0]); + if (reductions.size() > 0) { + continue; + } + // The loop nest has loop carried dependences; give up. + if (LoopNest::hasLoopCarriedDependence(loops[0])) { + continue; + } + // Try to flatten the outer loops and parallelize them if successful. + ForPtr flattened = nullptr; + if (loops.size() == 1) { + flattened = loops[0]; + } else { + LoopNest::flatten(loops, &flattened); + } + if (flattened) { + flattened->set_parallel(); + } + } +} + StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { torch::jit::tensorexpr::LoopNest l(st, bufOutputs_); GRAPH_DEBUG("Original Stmt:\n", std::to_string(l.root_stmt()), "\n"); @@ -2528,6 +2609,8 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { if (backendType == kLLVMCodeGen) { fuseAllLoops(l.root_stmt()); GRAPH_DEBUG("after fuse", *l.root_stmt()); + parallelizeOuterLoops(l, bufOutputs_); + GRAPH_DEBUG("after parallelize", *l.root_stmt()); } if (backendType == kCudaCodeGen) { @@ -2602,9 +2685,13 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { } l.prepareForCodegen(); + GRAPH_DEBUG("after prepareForCodegen", *l.root_stmt()); + l.simplify(); + GRAPH_DEBUG("after simplification", *l.root_stmt()); if (backendType == kLLVMCodeGen && !hasReduction) { l.vectorizeInnerLoops(); + GRAPH_DEBUG("after vectorization", *l.root_stmt()); } StmtPtr stmt = l.root_stmt(); diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index 4ab2d53cc4942..5346d3668ec7e 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -274,15 +274,24 @@ class LLVMCodeGenImpl : public IRVisitor { } }; +extern "C" { typedef void (*ParallelCallee)(int index, int8_t* packed_data); -void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data) { +void DispatchParallel( + int8_t* func, + int start, + int stop, + int8_t* packed_data) noexcept { // TODO: preserve the func type. - ParallelCallee callee = reinterpret_cast(func); - at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) { - for (int index = f_begin; index < f_end; index++) { - callee(index, packed_data); - } - }); + try { + ParallelCallee callee = reinterpret_cast(func); + at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) { + for (int index = f_begin; index < f_end; index++) { + callee(index, packed_data); + } + }); + } catch (...) { + } +} } } // namespace tensorexpr @@ -1288,6 +1297,7 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) { module_->getOrInsertFunction("DispatchParallel", dispatcher_fntype); llvm::Function* dispatcher = llvm::cast(dispatcher_callee.getCallee()); + dispatcher->addFnAttr(llvm::Attribute::NoUnwind); irb_.CreateCall( dispatcher, {func_value, start, stop, packed_caller_args_ptr}); value_ = llvm::ConstantInt::get(IntTy_, 0); diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.h b/torch/csrc/jit/tensorexpr/llvm_jit.h index 30ad5317a1b3c..8585900abc8d6 100644 --- a/torch/csrc/jit/tensorexpr/llvm_jit.h +++ b/torch/csrc/jit/tensorexpr/llvm_jit.h @@ -17,7 +17,13 @@ namespace torch { namespace jit { namespace tensorexpr { -void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data); +extern "C" { +void DispatchParallel( + int8_t* func, + int start, + int stop, + int8_t* packed_data) noexcept; +} inline std::string formatError(llvm::Error&& err, const char* msg) { static constexpr char* defaultErrorMsg = "Unexpected failure in LLVM JIT"; diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 190499998b289..d3a4b919bef33 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -179,6 +179,13 @@ class Vectorizer : public IRMutator { }); } + ExprPtr mutate(ModPtr v) override { + std::vector inputs = {v->lhs(), v->rhs()}; + return try_vectorize(v, inputs, [&]() { + return ExprHandle(inputs[0]) % ExprHandle(inputs[1]); + }); + } + ExprPtr mutate(AndPtr v) override { std::vector inputs = {v->lhs(), v->rhs()}; return try_vectorize(v, inputs, [&]() { From 58ef99bd5aaf94c2cf5744b938ba4774773eb98d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 24 Aug 2021 18:55:23 -0700 Subject: [PATCH 195/530] TST Adds pickle testing for ModuleInfo (#63736) Summary: Follow up to https://github.com/pytorch/pytorch/pull/61935 This PR adds `test_pickle` to `test_modules`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63736 Reviewed By: heitorschueroff Differential Revision: D30522462 Pulled By: jbschlosser fbshipit-source-id: a03b66ea0d81c6d0845c4fddf0ddc3714bbf0ab1 --- test/test_modules.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/test/test_modules.py b/test/test_modules.py index bb0fe5f1f9689..52520dad080de 100644 --- a/test/test_modules.py +++ b/test/test_modules.py @@ -1,3 +1,5 @@ +import tempfile + import torch from torch.testing._internal.common_device_type import instantiate_device_type_tests from torch.testing._internal.common_modules import module_db, modules @@ -108,6 +110,36 @@ def test_factory_kwargs(self, device, dtype, module_info): buffer.dtype, dtype, f'Buffer {name} is of dtype {buffer.dtype} instead of the expected dtype {dtype}') + @modules(module_db) + def test_pickle(self, device, dtype, module_info): + # Test that module can be pickled and unpickled. + module_cls = module_info.module_cls + module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, + requires_grad=False) + for module_input in module_inputs: + if module_input.forward_input is None: + continue + + args, kwargs = module_input.constructor_input.args, module_input.constructor_input.kwargs + + with freeze_rng_state(): + # === Instantiate the module. === + args, kwargs = module_input.constructor_input.args, module_input.constructor_input.kwargs + m = module_cls(*args, **kwargs) + m.to(device).to(dtype) + + # === Do forward pass. === + args, kwargs = module_input.forward_input.args, module_input.forward_input.kwargs + output = m(*args, **kwargs) + + # === Check unpickled module gives the same output. === + with tempfile.TemporaryFile() as f: + torch.save(m, f) + f.seek(0) + m_copy = torch.load(f) + output_from_copy = m_copy(*args, **kwargs) + self.assertEqual(output, output_from_copy) + instantiate_device_type_tests(TestModule, globals()) From 544af391b5649c8c407fa36b36631a2307997a09 Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Tue, 24 Aug 2021 19:00:33 -0700 Subject: [PATCH 196/530] Allow arbitrary objects in state_dicts (#62976) Summary: Fixes https://github.com/pytorch/pytorch/issues/62094 Introduces functionality for adding arbitrary objects to module state_dicts. To take advantage of this, the following functions can be defined on a module: * `get_extra_state(self) -> dict` - Returns a dict defining any extra state this module wants to save * `set_extra_state(self, state)` - Subsumes the given state within the module In the details, a sub-dictionary is stored in the state_dict under the key `_extra_state` for each module that requires extra state. Pull Request resolved: https://github.com/pytorch/pytorch/pull/62976 Reviewed By: heitorschueroff Differential Revision: D30518657 Pulled By: jbschlosser fbshipit-source-id: 5fb35ab8e3d36f35e3e96dcd4498f8c917d1f386 --- test/test_nn.py | 86 ++++++++++++++++++++++++++++++++++++++ torch/jit/_script.py | 2 + torch/nn/modules/module.py | 51 +++++++++++++++++++++- 3 files changed, 138 insertions(+), 1 deletion(-) diff --git a/test/test_nn.py b/test/test_nn.py index 43e105a676ced..d577493fd531c 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -5465,6 +5465,92 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, self.assertEqual(mm[0].param[0].item(), 10) self.assertEqual(mm[0].sub.weight[0, 0].item(), 555) + def test_extra_state(self): + + class SubModule(torch.nn.Module): + def __init__(self, foo): + super().__init__() + self.foo = foo + + def get_extra_state(self): + return { + 'foo': self.foo + } + + def set_extra_state(self, state): + self.foo = state['foo'] + + class MyModule(torch.nn.Module): + def __init__(self, foo, bar): + super().__init__() + self.sub = SubModule(foo) + self.bar = bar + + def get_extra_state(self): + return { + 'bar': self.bar + } + + def set_extra_state(self, state): + self.bar = state['bar'] + + # Ensure state_dict contains the extra state by loading it into another module. + m = MyModule(3, 'something') + m2 = MyModule(5, 'something else') + m2.load_state_dict(m.state_dict()) + self.assertEqual(m.state_dict(), m2.state_dict()) + self.assertEqual(m2.bar, m.bar) + self.assertEqual(m2.sub.foo, m.sub.foo) + + def test_extra_state_non_dict(self): + + class MyModule(torch.nn.Module): + def __init__(self, foo): + super().__init__() + self.foo = foo + + def get_extra_state(self): + return self.foo + + def set_extra_state(self, state): + self.foo = state + + # Test various types of extra state. + for state in ('something', 5, MyModule(3)): + m = MyModule(state) + m2 = MyModule('something else') + m2.load_state_dict(m.state_dict()) + self.assertEqual(m.state_dict(), m2.state_dict()) + self.assertEqual(m.foo, m2.foo) + + def test_extra_state_missing_set_extra_state(self): + + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def get_extra_state(self): + return { + 'foo': 5 + } + + m = MyModule() + with self.assertRaisesRegex(RuntimeError, 'Unexpected key'): + m.load_state_dict(m.state_dict()) + + def test_extra_state_missing_get_extra_state(self): + + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def set_extra_state(self): + pass + + m = MyModule() + with self.assertRaisesRegex(RuntimeError, 'Missing key'): + m.load_state_dict(m.state_dict()) + def test_parameter_assignment(self): l = nn.Linear(5, 5) diff --git a/torch/jit/_script.py b/torch/jit/_script.py index 0c3e5ef7f0726..3d173ae27bd01 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -912,6 +912,8 @@ def _get_methods(cls): "_tracing_name", "eval", "train", + "get_extra_state", + "set_extra_state" } def _make_fail(name): diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 2376422117306..28b220e24037f 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -46,6 +46,8 @@ def _addindent(s_, numSpaces): _global_forward_pre_hooks: Dict[int, Callable] = OrderedDict() _global_forward_hooks: Dict[int, Callable] = OrderedDict() +_EXTRA_STATE_KEY_SUFFIX = '_extra_state' + def register_module_forward_pre_hook(hook: Callable[..., None]) -> RemovableHandle: r"""Registers a forward pre-hook common to all modules. @@ -528,6 +530,41 @@ def get_buffer(self, target: str) -> "Tensor": return buffer + def get_extra_state(self) -> Any: + """ + Returns any extra state to include in the module's state_dict. + Implement this and a corresponding :func:`set_extra_state` for your module + if you need to store extra state. This function is called when building the + module's `state_dict()`. + + Note that extra state should be pickleable to ensure working serialization + of the state_dict. We only provide provide backwards compatibility guarantees + for serializing Tensors; other objects may break backwards compatibility if + their serialized pickled form changes. + + Returns: + object: Any extra state to store in the module's state_dict + """ + raise RuntimeError( + "Reached a code path in Module.get_extra_state() that should never be called. " + "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.md " + "to report this bug.") + + def set_extra_state(self, state: Any): + """ + This function is called from :func:`load_state_dict` to handle any extra state + found within the `state_dict`. Implement this function and a corresponding + :func:`get_extra_state` for your module if you need to store extra state within its + `state_dict`. + + Args: + state (dict): Extra state from the `state_dict` + """ + raise RuntimeError( + "Reached a code path in Module.set_extra_state() that should never be called. " + "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.md " + "to report this bug.") + def _apply(self, fn): for module in self.children(): module._apply(fn) @@ -1228,6 +1265,9 @@ def _save_to_state_dict(self, destination, prefix, keep_vars): for name, buf in self._buffers.items(): if buf is not None and name not in self._non_persistent_buffers_set: destination[prefix + name] = buf if keep_vars else buf.detach() + extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX + if getattr(self.__class__, "get_extra_state", Module.get_extra_state) is not Module.get_extra_state: + destination[extra_state_key] = self.get_extra_state() # The user can pass an optional arbitrary mappable object to `state_dict`, in which case `state_dict` returns # back that same object. But if they pass nothing, an `OrederedDict` is created and returned. @@ -1365,9 +1405,18 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, elif strict: missing_keys.append(key) + extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX + if getattr(self.__class__, "set_extra_state", Module.set_extra_state) is not Module.set_extra_state: + if extra_state_key in state_dict: + self.set_extra_state(state_dict[extra_state_key]) + elif strict: + missing_keys.append(extra_state_key) + elif strict and (extra_state_key in state_dict): + unexpected_keys.append(extra_state_key) + if strict: for key in state_dict.keys(): - if key.startswith(prefix): + if key.startswith(prefix) and key != extra_state_key: input_name = key[len(prefix):] input_name = input_name.split('.', 1)[0] # get the name of param/buffer/child if input_name not in self._modules and input_name not in local_state: From ba126df61448ca3442ec77374bc32f43fcdd9773 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 24 Aug 2021 19:03:07 -0700 Subject: [PATCH 197/530] TST Adds more modules into common module tests (#62999) Summary: This PR moves some modules into `common_modules` to see what it looks like. While migrating some no batch modules into `common_modules`, I noticed that `desc` is not used for the name. This means we can not use `-k` to filter tests. This PR moves the sample generation into `_parametrize_test`, and passes in the already generated `module_input` into users of `modules(modules_db)`. I can see this is a little different from opsinfo and would be happy to revert to the original implementation of `modules`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/62999 Reviewed By: heitorschueroff Differential Revision: D30522737 Pulled By: jbschlosser fbshipit-source-id: 7ed1aeb3753fc97a4ad6f1a3c789727c78e1bc73 --- torch/testing/_internal/common_modules.py | 100 +++++++++++++++++++++- 1 file changed, 98 insertions(+), 2 deletions(-) diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py index 088e66f962592..99525a7b68756 100644 --- a/torch/testing/_internal/common_modules.py +++ b/torch/testing/_internal/common_modules.py @@ -5,8 +5,8 @@ from torch.testing import floating_types from torch.testing._internal.common_device_type import ( _TestParametrizer, _dtype_test_suffix, _update_param_kwargs, skipIf) -from torch.testing._internal.common_nn import nllloss_reference -from torch.testing._internal.common_utils import make_tensor +from torch.testing._internal.common_nn import nllloss_reference, get_reduction +from torch.testing._internal.common_utils import make_tensor, freeze_rng_state from types import ModuleType from typing import List, Tuple, Type, Set, Dict @@ -46,6 +46,7 @@ class modules(_TestParametrizer): """ PROTOTYPE: Decorator for specifying a list of modules over which to run a test. """ + def __init__(self, module_info_list): self.module_info_list = module_info_list @@ -199,8 +200,103 @@ def reference_fn(m, p, i, t, constructor_kwargs=constructor_kwargs): return module_inputs +def no_batch_dim_reference_fn(m, p, *args, **kwargs): + """Reference function for modules supporting no batch dimensions. + + The module is passed the input and target in batched form with a single item. + The output is squeezed to compare with the no-batch input. + """ + single_batch_input_args = [input.unsqueeze(0) for input in args] + with freeze_rng_state(): + return m(*single_batch_input_args).squeeze(0) + + +def no_batch_dim_reference_criterion_fn(m, *args, **kwargs): + """Reference function for criterion supporting no batch dimensions.""" + output = no_batch_dim_reference_fn(m, *args, **kwargs) + reduction = get_reduction(m) + if reduction == 'none': + return output.squeeze(0) + # reduction is 'sum' or 'mean' which results in a 0D tensor + return output + + +def generate_regression_criterion_inputs(make_input): + return [ + ModuleInput( + constructor_input=FunctionInput(reduction=reduction), + forward_input=FunctionInput(make_input(size=(4, )), make_input(size=4,)), + reference_fn=no_batch_dim_reference_criterion_fn, + desc='no_batch_dim_{}'.format(reduction) + ) for reduction in ['none', 'mean', 'sum']] + + +def module_inputs_torch_nn_AvgPool1d(module_info, device, dtype, requires_grad, **kwargs): + make_input = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) + + return [ + ModuleInput(constructor_input=FunctionInput(kernel_size=2), + forward_input=FunctionInput(make_input(size=(3, 6))), + desc='no_batch_dim', + reference_fn=no_batch_dim_reference_fn)] + + +def module_inputs_torch_nn_ELU(module_info, device, dtype, requires_grad, **kwargs): + make_input = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) + + return [ + ModuleInput(constructor_input=FunctionInput(alpha=2.), + forward_input=FunctionInput(make_input(size=(3, 2, 5))), + reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2 * (i.exp() - 1))), + ModuleInput(constructor_input=FunctionInput(alpha=2.), + forward_input=FunctionInput(make_input(size=())), + desc='scalar'), + ModuleInput(constructor_input=FunctionInput(), + forward_input=FunctionInput(make_input(size=(3,))), + desc='no_batch_dim', + reference_fn=no_batch_dim_reference_fn)] + + +def module_inputs_torch_nn_CELU(module_info, device, dtype, requires_grad, **kwargs): + make_input = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) + + return [ + ModuleInput(constructor_input=FunctionInput(alpha=2.), + forward_input=FunctionInput(make_input(size=(3, 2, 5))), + reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2. * ((.5 * i).exp() - 1))), + ModuleInput(constructor_input=FunctionInput(alpha=2.), + forward_input=FunctionInput(make_input(size=())), + reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2 * (i.exp() - 1)), + desc='scalar'), + ModuleInput(constructor_input=FunctionInput(alpha=2.), + forward_input=FunctionInput(make_input(size=(3,))), + desc='no_batch_dim', + reference_fn=no_batch_dim_reference_fn)] + + +def module_inputs_torch_nn_L1Loss(module_info, device, dtype, requires_grad, **kwargs): + make_input = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) + + return [ + ModuleInput(constructor_input=FunctionInput(), + forward_input=FunctionInput(make_input(size=(2, 3, 4)), + make_input(size=(2, 3, 4))), + reference_fn=lambda m, p, i, t: 1. / i.numel() * sum((a - b).abs().sum() + for a, b in zip(i, t))), + ModuleInput(constructor_input=FunctionInput(), + forward_input=FunctionInput(make_input(size=()), make_input(size=())), + reference_fn=lambda m, p, i, t: 1. / i.numel() * (i - t).abs().sum(), + desc='scalar')] + generate_regression_criterion_inputs(make_input) + + # Database of ModuleInfo entries in alphabetical order. module_db: List[ModuleInfo] = [ + ModuleInfo(torch.nn.AvgPool1d, + module_inputs_func=module_inputs_torch_nn_AvgPool1d), + ModuleInfo(torch.nn.ELU, + module_inputs_func=module_inputs_torch_nn_ELU), + ModuleInfo(torch.nn.L1Loss, + module_inputs_func=module_inputs_torch_nn_L1Loss), ModuleInfo(torch.nn.Linear, module_inputs_func=module_inputs_torch_nn_Linear), ModuleInfo(torch.nn.NLLLoss, From e69a1398cbe534874060460faf36af21d24ce6e7 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Tue, 24 Aug 2021 19:37:54 -0700 Subject: [PATCH 198/530] compute reduction intermediate buffer size in elements (#63885) Summary: Fixes https://github.com/pytorch/pytorch/issues/63869 `iter` strides are in bytes, and we are additionally multiplying size computed using those strides by `sizeof(arg_t)`. Computing `output_memory_size` in elements should be enough. This doesn't fix the still real problem of allocating large intermediate tensor, but it makes this tensor smaller by typically a factor of 4. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63885 Reviewed By: mruberry Differential Revision: D30526034 Pulled By: ngimel fbshipit-source-id: 0aca7f887974b7776e380463bbd82d32a5786ee8 --- aten/src/ATen/native/cuda/Reduce.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 8c423061a79f6..161a896094976 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -923,6 +923,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id for (int dim = 0; dim < iter.ndim(); dim++) { output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), sizeof(out_scalar_t), (char*) iter.data_ptr(0), From c8527bc39837e6c1e00fb770c0e158508279ba2c Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 24 Aug 2021 21:05:14 -0700 Subject: [PATCH 199/530] [qunat][graphmode][fx] Add a separate lower_to_native_backend function for relu (#62861) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62861 This PR adds a lower_to_native_backend function to lower a quantized reference model to a model that uses fbgemm/qnnpack ops. We'll gradually add support and remove the fbgemm/qnnpack specific handling in quantization_patterns.py Test Plan: python test/test_quantization.py TestQuantizeFx python test/test_quantization.py TestQuantizeFxOps Imported from OSS Reviewed By: vkuzo Differential Revision: D30165828 fbshipit-source-id: de1149cd7e7c1840c17c251cd4d35004afd015b7 --- test/quantization/fx/test_quantize_fx.py | 22 +++++++++++++ .../fx/_lower_to_native_backend.py | 14 +++++++++ torch/quantization/fx/convert.py | 3 ++ torch/quantization/fx/lower_to_fbgemm.py | 8 +++++ torch/quantization/fx/lower_to_qnnpack.py | 8 +++++ .../quantization/fx/quantization_patterns.py | 4 ++- ...ntized_fusion_patterns_and_replacements.py | 31 +++++++++++++++++++ 7 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 torch/quantization/fx/_lower_to_native_backend.py create mode 100644 torch/quantization/fx/lower_to_fbgemm.py create mode 100644 torch/quantization/fx/lower_to_qnnpack.py create mode 100644 torch/quantization/fx/quantized_fusion_patterns_and_replacements.py diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index bf15a06831bac..1bc6b610d1662 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -2861,6 +2861,28 @@ def forward(self, inputs: torch.Tensor, state: List[torch.Tensor]): if n.target == "lstm": self.assertEqual(type(n.args[1]), tuple) + def test_lowering(self): + class M(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.relu(x) + + m = M().eval() + m = prepare_fx(m, {"": default_qconfig}) + m_copy = copy.deepcopy(m) + m = convert_fx(m) + m_ref = convert_fx(m_copy, is_reference=True) + node_occurrence = { + ns.call_function(torch.quantize_per_tensor): 1, + ns.call_method("dequantize"): 1 + } + node_occurrence_ref = { + ns.call_function(torch.quantize_per_tensor): 2, + ns.call_method("dequantize"): 2 + } + + self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence) + self.checkGraphModuleNodes(m_ref, expected_node_occurrence=node_occurrence_ref) + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops diff --git a/torch/quantization/fx/_lower_to_native_backend.py b/torch/quantization/fx/_lower_to_native_backend.py new file mode 100644 index 0000000000000..a5518996bc44e --- /dev/null +++ b/torch/quantization/fx/_lower_to_native_backend.py @@ -0,0 +1,14 @@ +from torch.fx import subgraph_rewriter +from .graph_module import QuantizedGraphModule +from .quantized_fusion_patterns_and_replacements import get_fbgemm_patterns_and_replacements + +def _lower_to_native_backend(model: QuantizedGraphModule) -> QuantizedGraphModule: + """ Lower a quantized reference model (with reference quantized operator patterns) + to the native backend in PyTorch (fbgemm/qnnpack), both backends shares the same + operator signature so they can be lowered with the same function + """ + module_dict = dict(model.named_modules()) + for pattern, replacement in get_fbgemm_patterns_and_replacements(): + subgraph_rewriter.replace_pattern(model, pattern, replacement) + model.graph.lint() + return model diff --git a/torch/quantization/fx/convert.py b/torch/quantization/fx/convert.py index 671c2704d7da7..867b0b24cf7ad 100644 --- a/torch/quantization/fx/convert.py +++ b/torch/quantization/fx/convert.py @@ -45,6 +45,8 @@ activation_dtype, ) +from .lower_to_fbgemm import lower_to_fbgemm + # weight prepacking ops WEIGHT_PREPACK_OPS = { torch._ops.ops.quantized.linear_prepack, @@ -535,4 +537,5 @@ def load_arg_remove(a: Argument) -> Argument: model = QuantizedGraphModule(model, act_post_process_removed_graph, preserved_attributes) if not is_reference: model = fold_weight(model, node_name_to_scope) + model = lower_to_fbgemm(model) return model diff --git a/torch/quantization/fx/lower_to_fbgemm.py b/torch/quantization/fx/lower_to_fbgemm.py new file mode 100644 index 0000000000000..fc76d135ee809 --- /dev/null +++ b/torch/quantization/fx/lower_to_fbgemm.py @@ -0,0 +1,8 @@ +from ._lower_to_native_backend import _lower_to_native_backend +from .graph_module import QuantizedGraphModule + +def lower_to_fbgemm(model: QuantizedGraphModule) -> QuantizedGraphModule: + """ Lower a quantized reference model (with reference quantized operator patterns) + to fbgemm + """ + return _lower_to_native_backend(model) diff --git a/torch/quantization/fx/lower_to_qnnpack.py b/torch/quantization/fx/lower_to_qnnpack.py new file mode 100644 index 0000000000000..0a0ea9cd248cd --- /dev/null +++ b/torch/quantization/fx/lower_to_qnnpack.py @@ -0,0 +1,8 @@ +from ._lower_to_native_backend import _lower_to_native_backend +from .graph_module import QuantizedGraphModule + +def lower_to_qnnpack(model: QuantizedGraphModule) -> QuantizedGraphModule: + """ Lower a quantized reference model (with reference quantized operator patterns) + to qnnpack + """ + return _lower_to_native_backend(model) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 1ce43cadc8e98..1a7d714136501 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -1496,7 +1496,9 @@ def convert(self, load_arg: Callable, is_reference: bool = False, convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if is_reference: + # always produce reference pattern for relu + is_relu = node.op == "call_function" and node.target == torch.nn.functional.relu + if is_reference or is_relu: # when activation dtype is torch.float, the node does not require # observation # e.g. dynamic quantization or weight_only quantization diff --git a/torch/quantization/fx/quantized_fusion_patterns_and_replacements.py b/torch/quantization/fx/quantized_fusion_patterns_and_replacements.py new file mode 100644 index 0000000000000..07c109ec4f922 --- /dev/null +++ b/torch/quantization/fx/quantized_fusion_patterns_and_replacements.py @@ -0,0 +1,31 @@ +import torch + +def relu_inplace_pattern(x, scale, zero_point): + x = x.dequantize() + x = torch.nn.functional.relu(x, inplace=True) + x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) + return x + +def relu_non_inplace_pattern(x, scale, zero_point): + x = x.dequantize() + x = torch.nn.functional.relu(x, inplace=False) + x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) + return x + +def relu_replacement(x, scale, zero_point): + x = torch.nn.functional.relu(x) + return x + + +def _get_all_patterns_and_replacements(): + return [ + (relu_inplace_pattern, relu_replacement), + (relu_non_inplace_pattern, relu_replacement) + ] + + +def get_fbgemm_patterns_and_replacements(): + return _get_all_patterns_and_replacements() + +def get_qnnpack_patterns_and_replacements(): + return _get_all_patterns_and_replacements() From d388a1a5df0e0255c07dc123e65002ec7396ad05 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Tue, 24 Aug 2021 21:21:57 -0700 Subject: [PATCH 200/530] [TensorExpr] LLVMCodegen: Use addFnAttr instead of addAttribute which was deleted. (#63886) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63886 cc gmagogsfm Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D30523135 Pulled By: ZolotukhinM fbshipit-source-id: 62e125f917b2a0153eb30879d93cf956587a05e0 --- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index 5346d3668ec7e..a93fd64df0a68 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -424,9 +424,7 @@ LLVMCodeGenImpl::LLVMCodeGenImpl( llvm::FunctionType* fntype = llvm::FunctionType::get(retTy, params, false); fn_ = llvm::Function::Create( fntype, llvm::Function::PrivateLinkage, "pytorch", module_.get()); - fn_->addAttribute( - llvm::AttributeList::AttrIndex::FunctionIndex, - llvm::Attribute::AlwaysInline); + fn_->addFnAttr(llvm::Attribute::AlwaysInline); for (const auto i : c10::irange(args.size())) { if (!args[i].isVar()) { fn_->addParamAttr(i, llvm::Attribute::NoAlias); From 0301c3bc01329613c29c59cffa2c77f3ae2d0829 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 24 Aug 2021 21:28:40 -0700 Subject: [PATCH 201/530] [quant][graphmode][fx] Make maxpool and flatten produce the reference pattern (#63501) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63501 Currently some of the ops are considered as working with both float and quantized input, so we may have things like "quant - some_op - dequant" this might not work well with the backend, we may consider change everything to produce "quant - dequant - some_op - quant - dequant" instead in the future, this PR fixes it for maxpool and flatten only to unblock resnet benchmarking on TensorRT Test Plan: python test/test_quantization.py TestQuantizeFxOps Imported from OSS Reviewed By: mruberry Differential Revision: D30402788 fbshipit-source-id: 892c5ff6552775070e2c1453f65846590fb12735 --- torch/quantization/fx/quantization_patterns.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 1a7d714136501..09ca190a73668 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -1447,6 +1447,9 @@ def convert(self, @register_quant_pattern(torch.nn.AvgPool3d) @register_quant_pattern(torch.nn.Dropout) @register_quant_pattern(torch.nn.Hardtanh) +@register_quant_pattern(torch.nn.MaxPool1d) +@register_quant_pattern(torch.nn.MaxPool2d) +@register_quant_pattern(torch.nn.MaxPool3d) @register_quant_pattern(torch.nn.ReLU) @register_quant_pattern(torch.nn.ReLU6) @register_quant_pattern(torch.adaptive_avg_pool1d) @@ -1456,12 +1459,16 @@ def convert(self, @register_quant_pattern(torch.nn.functional.hardtanh) @register_quant_pattern(torch.nn.functional.hardtanh_) @register_quant_pattern(torch.nn.functional.interpolate) +@register_quant_pattern(torch.nn.functional.max_pool1d) +@register_quant_pattern(torch.nn.functional.max_pool2d) +@register_quant_pattern(torch.nn.functional.max_pool3d) @register_quant_pattern(torch.nn.functional.relu) @register_quant_pattern(torch.nn.functional.relu6) @register_quant_pattern(torch.avg_pool1d) @register_quant_pattern(torch._C._nn.avg_pool2d) @register_quant_pattern(torch._C._nn.avg_pool3d) @register_quant_pattern(torch.clamp) +@register_quant_pattern(torch.flatten) @register_quant_pattern(torch.max) @register_quant_pattern(torch.mean) @register_quant_pattern(torch.min) @@ -1556,15 +1563,8 @@ def convert(self, # module attribute like module._QUANTIZED_INPUT_INDEXES return quantized_graph.node_copy(node, load_arg(quantized=None)) -@register_quant_pattern(torch.nn.MaxPool1d) -@register_quant_pattern(torch.nn.MaxPool2d) -@register_quant_pattern(torch.nn.MaxPool3d) @register_quant_pattern(torch.nn.Identity) -@register_quant_pattern(torch.nn.functional.max_pool1d) -@register_quant_pattern(torch.nn.functional.max_pool2d) -@register_quant_pattern(torch.nn.functional.max_pool3d) @register_quant_pattern(torch.chunk) -@register_quant_pattern(torch.flatten) @register_quant_pattern(torch.transpose) @register_quant_pattern(torch.repeat_interleave) @register_quant_pattern(torch.sort) From 10dfa58eba055a1bbc1cc89df033cd2815cbb403 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 24 Aug 2021 21:33:12 -0700 Subject: [PATCH 202/530] [fx2trt] Add a test for quantized resnet18 (#63446) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63446 Add a test for quantized resnet18 running in TensorRT Test Plan: buck run mode/opt -c python.package_style=inplace caffe2:fx2trt_quantized_resnet_test Reviewed By: 842974287 Differential Revision: D30384746 fbshipit-source-id: 1a863877711618cd23d887694269ed9e44ee606c --- .../fx2trt/converters/acc_ops_converters.py | 15 +-- .../fx2trt/example/quantized_resnet_test.py | 117 ++++++++++++++++++ 2 files changed, 121 insertions(+), 11 deletions(-) create mode 100644 torch/fx/experimental/fx2trt/example/quantized_resnet_test.py diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index 566359bf2af0d..33a817d4ccdb5 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -1300,15 +1300,11 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name): if q_zero_point != 0: raise RuntimeError(f"Only support zero_point == 0, get {q_zero_point}") - # temporarily set q_scale to 1 to make sure the q_scale is different - # for quantize and dequantize to avoid the error - # TODO: follow up with nvidia TensorRT team to repro and fix the problem - q_scale = 1 scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([float(q_scale)], dtype=np.float32))) scale_layer.name = input_val.name + ".quant.scale" scale = scale_layer.get_output(0) - assert trt.__version__ > "8.0", "Explicit quantize op is only supported in " - "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ + # assert trt.__version__ > "8.0", "Explicit quantize op is only supported in " + # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ layer = network.add_quantize(input=input_val, scale=scale) layer.axis = 0 layer.name = input_val.name + ".quant" @@ -1316,9 +1312,6 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name): @tensorrt_converter(acc_ops.dequantize) def acc_ops_dequantize(network, target, args, kwargs, name): - """ - Currently just a no-op. - """ input_val = kwargs["input"] if not isinstance(input_val, trt.tensorrt.ITensor): @@ -1339,8 +1332,8 @@ def acc_ops_dequantize(network, target, args, kwargs, name): scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([q_scale], dtype=np.float32))) scale_layer.name = input_val.name + ".dequant.scale" scale = scale_layer.get_output(0) - assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in " - "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ + # assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in " + # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ layer = network.add_dequantize(input=input_val, scale=scale) layer.name = input_val.name + ".dequant" layer.axis = 0 diff --git a/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py b/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py new file mode 100644 index 0000000000000..39553dfd9dfb6 --- /dev/null +++ b/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py @@ -0,0 +1,117 @@ +import torch.fx +import torchvision.models as models +from torch.fx.experimental.fx2trt.fx2trt import TRTInterpreter, InputTensorSpec, TRTModule +from torch.quantization.quantize_fx import prepare_fx, convert_fx +import torch.fx.experimental.fx_acc.acc_tracer as acc_tracer +import copy +from torch.fx.passes import shape_prop +from torch.fx.experimental.normalize import NormalizeArgs + +rn18 = models.resnet18().eval() + +def build_fp16_trt(rn18): + rn18 = copy.deepcopy(rn18) + rn18 = acc_tracer.trace(rn18, [torch.randn(1, 3, 224, 224)]) + interp = TRTInterpreter(rn18, [InputTensorSpec([3, 224, 224], torch.float, has_batch_dim=False)]) + engine, input_names, output_names = interp.run(fp16_mode=True) + return TRTModule(engine, input_names, output_names) + +@torch.no_grad() +def build_int8_trt(rn18): + rn18 = copy.deepcopy(rn18) + data = torch.randn(1, 3, 224, 224) + # data = torch.randn(1, 64, 10, 10) + # TensorRT only supports symmetric quantization + qconfig = torch.quantization.QConfig( + activation=torch.quantization.observer.HistogramObserver.with_args( + qscheme=torch.per_tensor_symmetric, dtype=torch.qint8 + ), + weight=torch.quantization.default_weight_observer + ) + prepared = prepare_fx(rn18, {"": qconfig}) + for _ in range(10): + prepared(data) + quantized_rn18 = convert_fx(prepared, is_reference=True) + print("quantized model:", quantized_rn18) + + quantized_rn18 = acc_tracer.trace(quantized_rn18, [data]) + interp = TRTInterpreter(quantized_rn18, [InputTensorSpec(data.shape[1:], torch.float, has_batch_dim=False)]) + engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True) + return TRTModule(engine, input_names, output_names) + +@torch.no_grad() +def build_int8_trt_implicit_quant(rn18): + rn18 = copy.deepcopy(rn18) + data = torch.randn(1, 3, 224, 224) + # Quantization + qconfig = torch.quantization.QConfig( + activation=torch.quantization.observer.HistogramObserver.with_args( + qscheme=torch.per_tensor_symmetric, reduce_range=True + ), + weight=torch.quantization.default_per_channel_weight_observer + ) + prepared = prepare_fx(rn18, {"": qconfig}) + for _ in range(10): + prepared(data) + quantized_rn18 = convert_fx(prepared, is_reference=True) + + # Build trt int8 model + traced_rn18 = torch.fx.symbolic_trace(quantized_rn18) + shape_prop.ShapeProp(traced_rn18).propagate(data) + traced_rn18 = NormalizeArgs(traced_rn18).transform() + interp = TRTInterpreter(traced_rn18, InputTensorSpec.from_tensors([data])) + engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True, strict_type_constraints=True) + trt_mod = TRTModule(engine, input_names, output_names) + return trt_mod + +class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 3, 3, padding=1) + + def forward(self, x): + out = self.conv(x) + # out = torch.nn.functional.relu(out) + out += x + out += out + out = torch.nn.functional.relu(out) + return out + +# rn18 = M().eval() +# rn18 = rn18.layer1 +int8_trt = build_int8_trt(rn18) +implicit_int8_trt = build_int8_trt_implicit_quant(rn18) +fp16_trt = build_fp16_trt(rn18) +x = torch.randn(5, 3, 224, 224, device="cuda") +rn18 = rn18.cuda() + +import time +NITER = 100 + +torch.cuda.synchronize() +s = time.time() +for _ in range(NITER): + fp16_trt(x) + torch.cuda.synchronize() +print('trt fp16 time (ms/iter)', (time.time() - s) / NITER * 1000) + +torch.cuda.synchronize() +s = time.time() +for _ in range(NITER): + int8_trt(x) + torch.cuda.synchronize() +print('trt int8 time (ms/iter)', (time.time() - s) / NITER * 1000) + +torch.cuda.synchronize() +s = time.time() +for _ in range(NITER): + implicit_int8_trt(x) + torch.cuda.synchronize() +print('trt implicit int8 time (ms/iter)', (time.time() - s) / NITER * 1000) + +torch.cuda.synchronize() +s = time.time() +for _ in range(NITER): + rn18(x) + torch.cuda.synchronize() +print('PyTorch time (ms/iter)', (time.time() - s) / NITER * 1000) From 839eaa2e91556ecd4532596b4fef18a1c3f6e1c1 Mon Sep 17 00:00:00 2001 From: Linbin Yu Date: Wed, 25 Aug 2021 00:42:03 -0700 Subject: [PATCH 203/530] Revert D30384746: [fx2trt] Add a test for quantized resnet18 Test Plan: revert-hammer Differential Revision: D30384746 (https://github.com/pytorch/pytorch/commit/10dfa58eba055a1bbc1cc89df033cd2815cbb403) Original commit changeset: 1a8638777116 fbshipit-source-id: b93235323e229b391f5456f6e3543988062dd0d4 --- .../fx2trt/converters/acc_ops_converters.py | 15 ++- .../fx2trt/example/quantized_resnet_test.py | 117 ------------------ 2 files changed, 11 insertions(+), 121 deletions(-) delete mode 100644 torch/fx/experimental/fx2trt/example/quantized_resnet_test.py diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index 33a817d4ccdb5..566359bf2af0d 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -1300,11 +1300,15 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name): if q_zero_point != 0: raise RuntimeError(f"Only support zero_point == 0, get {q_zero_point}") + # temporarily set q_scale to 1 to make sure the q_scale is different + # for quantize and dequantize to avoid the error + # TODO: follow up with nvidia TensorRT team to repro and fix the problem + q_scale = 1 scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([float(q_scale)], dtype=np.float32))) scale_layer.name = input_val.name + ".quant.scale" scale = scale_layer.get_output(0) - # assert trt.__version__ > "8.0", "Explicit quantize op is only supported in " - # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ + assert trt.__version__ > "8.0", "Explicit quantize op is only supported in " + "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ layer = network.add_quantize(input=input_val, scale=scale) layer.axis = 0 layer.name = input_val.name + ".quant" @@ -1312,6 +1316,9 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name): @tensorrt_converter(acc_ops.dequantize) def acc_ops_dequantize(network, target, args, kwargs, name): + """ + Currently just a no-op. + """ input_val = kwargs["input"] if not isinstance(input_val, trt.tensorrt.ITensor): @@ -1332,8 +1339,8 @@ def acc_ops_dequantize(network, target, args, kwargs, name): scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([q_scale], dtype=np.float32))) scale_layer.name = input_val.name + ".dequant.scale" scale = scale_layer.get_output(0) - # assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in " - # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ + assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in " + "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ layer = network.add_dequantize(input=input_val, scale=scale) layer.name = input_val.name + ".dequant" layer.axis = 0 diff --git a/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py b/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py deleted file mode 100644 index 39553dfd9dfb6..0000000000000 --- a/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py +++ /dev/null @@ -1,117 +0,0 @@ -import torch.fx -import torchvision.models as models -from torch.fx.experimental.fx2trt.fx2trt import TRTInterpreter, InputTensorSpec, TRTModule -from torch.quantization.quantize_fx import prepare_fx, convert_fx -import torch.fx.experimental.fx_acc.acc_tracer as acc_tracer -import copy -from torch.fx.passes import shape_prop -from torch.fx.experimental.normalize import NormalizeArgs - -rn18 = models.resnet18().eval() - -def build_fp16_trt(rn18): - rn18 = copy.deepcopy(rn18) - rn18 = acc_tracer.trace(rn18, [torch.randn(1, 3, 224, 224)]) - interp = TRTInterpreter(rn18, [InputTensorSpec([3, 224, 224], torch.float, has_batch_dim=False)]) - engine, input_names, output_names = interp.run(fp16_mode=True) - return TRTModule(engine, input_names, output_names) - -@torch.no_grad() -def build_int8_trt(rn18): - rn18 = copy.deepcopy(rn18) - data = torch.randn(1, 3, 224, 224) - # data = torch.randn(1, 64, 10, 10) - # TensorRT only supports symmetric quantization - qconfig = torch.quantization.QConfig( - activation=torch.quantization.observer.HistogramObserver.with_args( - qscheme=torch.per_tensor_symmetric, dtype=torch.qint8 - ), - weight=torch.quantization.default_weight_observer - ) - prepared = prepare_fx(rn18, {"": qconfig}) - for _ in range(10): - prepared(data) - quantized_rn18 = convert_fx(prepared, is_reference=True) - print("quantized model:", quantized_rn18) - - quantized_rn18 = acc_tracer.trace(quantized_rn18, [data]) - interp = TRTInterpreter(quantized_rn18, [InputTensorSpec(data.shape[1:], torch.float, has_batch_dim=False)]) - engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True) - return TRTModule(engine, input_names, output_names) - -@torch.no_grad() -def build_int8_trt_implicit_quant(rn18): - rn18 = copy.deepcopy(rn18) - data = torch.randn(1, 3, 224, 224) - # Quantization - qconfig = torch.quantization.QConfig( - activation=torch.quantization.observer.HistogramObserver.with_args( - qscheme=torch.per_tensor_symmetric, reduce_range=True - ), - weight=torch.quantization.default_per_channel_weight_observer - ) - prepared = prepare_fx(rn18, {"": qconfig}) - for _ in range(10): - prepared(data) - quantized_rn18 = convert_fx(prepared, is_reference=True) - - # Build trt int8 model - traced_rn18 = torch.fx.symbolic_trace(quantized_rn18) - shape_prop.ShapeProp(traced_rn18).propagate(data) - traced_rn18 = NormalizeArgs(traced_rn18).transform() - interp = TRTInterpreter(traced_rn18, InputTensorSpec.from_tensors([data])) - engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True, strict_type_constraints=True) - trt_mod = TRTModule(engine, input_names, output_names) - return trt_mod - -class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 3, 3, padding=1) - - def forward(self, x): - out = self.conv(x) - # out = torch.nn.functional.relu(out) - out += x - out += out - out = torch.nn.functional.relu(out) - return out - -# rn18 = M().eval() -# rn18 = rn18.layer1 -int8_trt = build_int8_trt(rn18) -implicit_int8_trt = build_int8_trt_implicit_quant(rn18) -fp16_trt = build_fp16_trt(rn18) -x = torch.randn(5, 3, 224, 224, device="cuda") -rn18 = rn18.cuda() - -import time -NITER = 100 - -torch.cuda.synchronize() -s = time.time() -for _ in range(NITER): - fp16_trt(x) - torch.cuda.synchronize() -print('trt fp16 time (ms/iter)', (time.time() - s) / NITER * 1000) - -torch.cuda.synchronize() -s = time.time() -for _ in range(NITER): - int8_trt(x) - torch.cuda.synchronize() -print('trt int8 time (ms/iter)', (time.time() - s) / NITER * 1000) - -torch.cuda.synchronize() -s = time.time() -for _ in range(NITER): - implicit_int8_trt(x) - torch.cuda.synchronize() -print('trt implicit int8 time (ms/iter)', (time.time() - s) / NITER * 1000) - -torch.cuda.synchronize() -s = time.time() -for _ in range(NITER): - rn18(x) - torch.cuda.synchronize() -print('PyTorch time (ms/iter)', (time.time() - s) / NITER * 1000) From 87a661c79f486f2fcc11f125a86a2b5f0ec53d83 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Wed, 25 Aug 2021 07:15:18 -0700 Subject: [PATCH 204/530] Revert D30526034: [pytorch][PR] compute reduction intermediate buffer size in elements Test Plan: revert-hammer Differential Revision: D30526034 (https://github.com/pytorch/pytorch/commit/e69a1398cbe534874060460faf36af21d24ce6e7) Original commit changeset: 0aca7f887974 fbshipit-source-id: a22472723818d6fe0c11a6e134080df1ac408038 --- aten/src/ATen/native/cuda/Reduce.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 161a896094976..8c423061a79f6 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -923,7 +923,6 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id for (int dim = 0; dim < iter.ndim(); dim++) { output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); } - output_memory_size /= iter.element_size(0); //iter.strides is in bytes owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), sizeof(out_scalar_t), (char*) iter.data_ptr(0), From 3926fdbaa46d47483cec310f245ac60acf3ee13b Mon Sep 17 00:00:00 2001 From: zhouzhuojie Date: Wed, 25 Aug 2021 08:50:00 -0700 Subject: [PATCH 205/530] [skip ci] Add generated comment to ruleset json (#63896) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63896 Reviewed By: heitorschueroff Differential Revision: D30529820 Pulled By: zhouzhuojie fbshipit-source-id: 7529803af23ea36a7bcb673cd399da80da8e3feb --- .github/generated-ciflow-ruleset.json | 1 + .github/scripts/generate_ci_workflows.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index 80b2cabfff788..70aa7767483e3 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -1,4 +1,5 @@ { + "__comment": "@generated DO NOT EDIT MANUALLY, Generation script: .github/scripts/generate_ci_workflows.py", "label_rules": { "ciflow/all": [ "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index fce50ac7811e5..097974d88acf1 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -90,7 +90,9 @@ def add_label_rule(self, labels: Set[str], workflow_name: str) -> None: self.label_rules[label] = {workflow_name} def generate_json(self) -> None: + GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file output = { + "__comment": f"@{GENERATED} DO NOT EDIT MANUALLY, Generation script: .github/scripts/generate_ci_workflows.py", "version": self.version, "label_rules": { label: sorted(list(workflows)) From 8c897d254dda8c3f667a85e05a4a31739a2d85c5 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Wed, 25 Aug 2021 09:00:13 -0700 Subject: [PATCH 206/530] Swap CUDA 11.1 and 11.3 in CI to make 11.1 periodic (#63900) Summary: Preparing for supporting 11.3 in the next release. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63900 Reviewed By: malfet Differential Revision: D30541437 Pulled By: janeyx99 fbshipit-source-id: a7297da7f7818a4291b1c321d62d76fc2c0f1f90 --- .github/generated-ciflow-ruleset.json | 48 +++++++++---------- .github/scripts/generate_ci_workflows.py | 24 +++++----- ...orch-linux-xenial-cuda11.3-py3.6-gcc7.yml} | 10 ++-- ...ated-linux-xenial-cuda11.3-py3.6-gcc7.yml} | 16 +++---- ...orch-linux-xenial-cuda11.1-py3.6-gcc7.yml} | 10 ++-- ...odic-linux-xenial-cuda11.1-py3.6-gcc7.yml} | 16 +++---- ...ated-periodic-win-vs2019-cuda11.1-py3.yml} | 14 +++--- ... => generated-win-vs2019-cuda11.3-py3.yml} | 14 +++--- 8 files changed, 76 insertions(+), 76 deletions(-) rename .github/workflows/{generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml => generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml} (96%) rename .github/workflows/{generated-linux-xenial-cuda11.1-py3.6-gcc7.yml => generated-linux-xenial-cuda11.3-py3.6-gcc7.yml} (97%) rename .github/workflows/{generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml => generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml} (96%) rename .github/workflows/{generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml => generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml} (97%) rename .github/workflows/{generated-periodic-win-vs2019-cuda11.3-py3.yml => generated-periodic-win-vs2019-cuda11.1-py3.yml} (97%) rename .github/workflows/{generated-win-vs2019-cuda11.1-py3.yml => generated-win-vs2019-cuda11.3-py3.yml} (97%) diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index 70aa7767483e3..d13561190d01f 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -3,19 +3,19 @@ "label_rules": { "ciflow/all": [ "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", - "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + "libtorch-linux-xenial-cuda11.3-py3.6-gcc7", "linux-bionic-cuda10.2-py3.9-gcc7", "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-cuda10.2-py3.6-gcc7", - "linux-xenial-cuda11.1-py3.6-gcc7", + "linux-xenial-cuda11.3-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", - "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", - "periodic-linux-xenial-cuda11.3-py3.6-gcc7", - "periodic-win-vs2019-cuda11.3-py3", + "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + "periodic-linux-xenial-cuda11.1-py3.6-gcc7", + "periodic-win-vs2019-cuda11.1-py3", "win-vs2019-cpu-py3", "win-vs2019-cuda10.1-py3", - "win-vs2019-cuda11.1-py3" + "win-vs2019-cuda11.3-py3" ], "ciflow/bazel": [ "linux-xenial-py3.6-gcc7-bazel-test" @@ -31,19 +31,19 @@ ], "ciflow/cuda": [ "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", - "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + "libtorch-linux-xenial-cuda11.3-py3.6-gcc7", "linux-bionic-cuda10.2-py3.9-gcc7", "linux-xenial-cuda10.2-py3.6-gcc7", - "linux-xenial-cuda11.1-py3.6-gcc7", - "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", - "periodic-linux-xenial-cuda11.3-py3.6-gcc7", - "periodic-win-vs2019-cuda11.3-py3", + "linux-xenial-cuda11.3-py3.6-gcc7", + "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + "periodic-linux-xenial-cuda11.1-py3.6-gcc7", + "periodic-win-vs2019-cuda11.1-py3", "win-vs2019-cuda10.1-py3", - "win-vs2019-cuda11.1-py3" + "win-vs2019-cuda11.3-py3" ], "ciflow/default": [ "linux-bionic-py3.8-gcc9-coverage", - "linux-xenial-cuda11.1-py3.6-gcc7", + "linux-xenial-cuda11.3-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", "win-vs2019-cpu-py3", @@ -51,35 +51,35 @@ ], "ciflow/libtorch": [ "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", - "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", - "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7" + "libtorch-linux-xenial-cuda11.3-py3.6-gcc7", + "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7" ], "ciflow/linux": [ "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", - "libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + "libtorch-linux-xenial-cuda11.3-py3.6-gcc7", "linux-bionic-cuda10.2-py3.9-gcc7", "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-cuda10.2-py3.6-gcc7", - "linux-xenial-cuda11.1-py3.6-gcc7", + "linux-xenial-cuda11.3-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", - "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", - "periodic-linux-xenial-cuda11.3-py3.6-gcc7" + "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + "periodic-linux-xenial-cuda11.1-py3.6-gcc7" ], "ciflow/scheduled": [ - "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", - "periodic-linux-xenial-cuda11.3-py3.6-gcc7", - "periodic-win-vs2019-cuda11.3-py3" + "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + "periodic-linux-xenial-cuda11.1-py3.6-gcc7", + "periodic-win-vs2019-cuda11.1-py3" ], "ciflow/slow": [ "linux-bionic-cuda10.2-py3.9-gcc7", "linux-xenial-cuda10.2-py3.6-gcc7" ], "ciflow/win": [ - "periodic-win-vs2019-cuda11.3-py3", + "periodic-win-vs2019-cuda11.1-py3", "win-vs2019-cpu-py3", "win-vs2019-cuda10.1-py3", - "win-vs2019-cuda11.1-py3" + "win-vs2019-cuda11.3-py3" ] }, "version": "v1" diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 097974d88acf1..e24c2e5af3893 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -214,8 +214,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ), CIWorkflow( arch="windows", - build_environment="win-vs2019-cuda11.1-py3", - cuda_version="11.1", + build_environment="win-vs2019-cuda11.3-py3", + cuda_version="11.3", test_runner_type=WINDOWS_CUDA_TEST_RUNNER, num_test_shards=2, on_pull_request=True, @@ -227,8 +227,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ), CIWorkflow( arch="windows", - build_environment="periodic-win-vs2019-cuda11.3-py3", - cuda_version="11.3", + build_environment="periodic-win-vs2019-cuda11.1-py3", + cuda_version="11.1", test_runner_type=WINDOWS_CUDA_TEST_RUNNER, num_test_shards=2, is_scheduled="45 0,4,8,12,16,20 * * *", @@ -337,8 +337,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ), CIWorkflow( arch="linux", - build_environment="linux-xenial-cuda11.1-py3.6-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", + build_environment="linux-xenial-cuda11.3-py3.6-gcc7", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, num_test_shards=2, on_pull_request=True, @@ -349,8 +349,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ), CIWorkflow( arch="linux", - build_environment="libtorch-linux-xenial-cuda11.1-py3.6-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", + build_environment="libtorch-linux-xenial-cuda11.3-py3.6-gcc7", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, is_libtorch=True, on_pull_request=True, @@ -362,8 +362,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ), CIWorkflow( arch="linux", - build_environment="periodic-linux-xenial-cuda11.3-py3.6-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", + build_environment="periodic-linux-xenial-cuda11.1-py3.6-gcc7", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, num_test_shards=2, is_scheduled="45 0,4,8,12,16,20 * * *", @@ -376,8 +376,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ), CIWorkflow( arch="linux", - build_environment="periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", + build_environment="periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, is_libtorch=True, is_scheduled="45 0,4,8,12,16,20 * * *", diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml similarity index 96% rename from .github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml rename to .github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index ba59027969b7a..95261026f3862 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -2,7 +2,7 @@ # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py -name: libtorch-linux-xenial-cuda11.1-py3.6-gcc7 +name: libtorch-linux-xenial-cuda11.3-py3.6-gcc7 on: pull_request: @@ -14,8 +14,8 @@ on: workflow_dispatch: env: - BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda11.1-py3.6-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7 + BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda11.3-py3.6-gcc7 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 @@ -26,7 +26,7 @@ env: ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" concurrency: - group: libtorch-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} + group: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} cancel-in-progress: true jobs: @@ -120,7 +120,7 @@ jobs: needs: [calculate-docker-image, ciflow_should_run] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} - JOB_BASE_NAME: libtorch-linux-xenial-cuda11.1-py3.6-gcc7-build + JOB_BASE_NAME: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-build steps: - name: Log in to ECR run: | diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml similarity index 97% rename from .github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml rename to .github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 4275cc31ebddd..3273cb0395437 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -2,7 +2,7 @@ # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-cuda11.1-py3.6-gcc7 +name: linux-xenial-cuda11.3-py3.6-gcc7 on: pull_request: @@ -14,8 +14,8 @@ on: workflow_dispatch: env: - BUILD_ENVIRONMENT: linux-xenial-cuda11.1-py3.6-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7 + BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 @@ -26,7 +26,7 @@ env: ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" concurrency: - group: linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} + group: linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} cancel-in-progress: true jobs: @@ -120,7 +120,7 @@ jobs: needs: [calculate-docker-image, ciflow_should_run] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-cuda11.1-py3.6-gcc7-build + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-build steps: - name: Log in to ECR run: | @@ -256,7 +256,7 @@ jobs: runs-on: ${{ matrix.runner }} env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-cuda11.1-py3.6-gcc7-test + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-test TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} @@ -323,7 +323,7 @@ jobs: env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: - BUILD_ENVIRONMENT: linux-xenial-cuda11.1-py3.6-gcc7-${{ matrix.config }} + BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7-${{ matrix.config }} PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then @@ -468,7 +468,7 @@ jobs: env: AWS_DEFAULT_REGION: us-east-1 CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-cuda11.1-py3.6-gcc7-test + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-test CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml similarity index 96% rename from .github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml rename to .github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml index 1f4c6d270ec54..de0aa4bb3333c 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -2,7 +2,7 @@ # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7 +name: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7 on: pull_request: @@ -12,8 +12,8 @@ on: workflow_dispatch: env: - BUILD_ENVIRONMENT: periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + BUILD_ENVIRONMENT: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 @@ -24,7 +24,7 @@ env: ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" concurrency: - group: periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} + group: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} cancel-in-progress: true jobs: @@ -118,7 +118,7 @@ jobs: needs: [calculate-docker-image, ciflow_should_run] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} - JOB_BASE_NAME: periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7-build + JOB_BASE_NAME: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-build steps: - name: Log in to ECR run: | diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml similarity index 97% rename from .github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml rename to .github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 0282b206a117c..3f1b5b4a85f68 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -2,7 +2,7 @@ # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-linux-xenial-cuda11.3-py3.6-gcc7 +name: periodic-linux-xenial-cuda11.1-py3.6-gcc7 on: pull_request: @@ -12,8 +12,8 @@ on: workflow_dispatch: env: - BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.3-py3.6-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.6-gcc7 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 @@ -24,7 +24,7 @@ env: ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" concurrency: - group: periodic-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} + group: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} cancel-in-progress: true jobs: @@ -118,7 +118,7 @@ jobs: needs: [calculate-docker-image, ciflow_should_run] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} - JOB_BASE_NAME: periodic-linux-xenial-cuda11.3-py3.6-gcc7-build + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.6-gcc7-build steps: - name: Log in to ECR run: | @@ -254,7 +254,7 @@ jobs: runs-on: ${{ matrix.runner }} env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} - JOB_BASE_NAME: periodic-linux-xenial-cuda11.3-py3.6-gcc7-test + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.6-gcc7-test TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} @@ -321,7 +321,7 @@ jobs: env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: - BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.3-py3.6-gcc7-${{ matrix.config }} + BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ matrix.config }} PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then @@ -466,7 +466,7 @@ jobs: env: AWS_DEFAULT_REGION: us-east-1 CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-linux-xenial-cuda11.3-py3.6-gcc7-test + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.6-gcc7-test CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml similarity index 97% rename from .github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml rename to .github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index 6d1eff302dba9..4bf74faae1843 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -2,7 +2,7 @@ # Template is at: .github/templates/windows_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-win-vs2019-cuda11.3-py3 +name: periodic-win-vs2019-cuda11.1-py3 on: pull_request: @@ -12,9 +12,9 @@ on: workflow_dispatch: env: - BUILD_ENVIRONMENT: periodic-win-vs2019-cuda11.3-py3 + BUILD_ENVIRONMENT: periodic-win-vs2019-cuda11.1-py3 BUILD_WHEEL: 1 - CUDA_VERSION: "11.3" + CUDA_VERSION: "11.1" IN_CI: 1 INSTALL_WINDOWS_SDK: 1 PYTHON_VERSION: "3.8" @@ -29,7 +29,7 @@ env: USE_CUDA: 1 concurrency: - group: periodic-win-vs2019-cuda11.3-py3-${{ github.event.pull_request.number || github.sha }} + group: periodic-win-vs2019-cuda11.1-py3-${{ github.event.pull_request.number || github.sha }} cancel-in-progress: true jobs: @@ -47,7 +47,7 @@ jobs: working-directory: pytorch-${{ github.run_id }} needs: [ciflow_should_run] env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.3-py3-build + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-build http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" steps: @@ -150,7 +150,7 @@ jobs: test: env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.3-py3-test + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} TEST_CONFIG: ${{ matrix.config }} @@ -316,7 +316,7 @@ jobs: env: AWS_DEFAULT_REGION: us-east-1 CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-win-vs2019-cuda11.3-py3-test + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} diff --git a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml similarity index 97% rename from .github/workflows/generated-win-vs2019-cuda11.1-py3.yml rename to .github/workflows/generated-win-vs2019-cuda11.3-py3.yml index 2b3a30c6187fd..1b423008fe5fd 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -2,7 +2,7 @@ # Template is at: .github/templates/windows_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py -name: win-vs2019-cuda11.1-py3 +name: win-vs2019-cuda11.3-py3 on: pull_request: @@ -14,9 +14,9 @@ on: workflow_dispatch: env: - BUILD_ENVIRONMENT: win-vs2019-cuda11.1-py3 + BUILD_ENVIRONMENT: win-vs2019-cuda11.3-py3 BUILD_WHEEL: 1 - CUDA_VERSION: "11.1" + CUDA_VERSION: "11.3" IN_CI: 1 INSTALL_WINDOWS_SDK: 1 PYTHON_VERSION: "3.8" @@ -31,7 +31,7 @@ env: USE_CUDA: 1 concurrency: - group: win-vs2019-cuda11.1-py3-${{ github.event.pull_request.number || github.sha }} + group: win-vs2019-cuda11.3-py3-${{ github.event.pull_request.number || github.sha }} cancel-in-progress: true jobs: @@ -49,7 +49,7 @@ jobs: working-directory: pytorch-${{ github.run_id }} needs: [ciflow_should_run] env: - JOB_BASE_NAME: win-vs2019-cuda11.1-py3-build + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-build http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" steps: @@ -152,7 +152,7 @@ jobs: test: env: - JOB_BASE_NAME: win-vs2019-cuda11.1-py3-test + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} TEST_CONFIG: ${{ matrix.config }} @@ -318,7 +318,7 @@ jobs: env: AWS_DEFAULT_REGION: us-east-1 CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cuda11.1-py3-test + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} From 01c35115d82c7e782304ea68ee23cccb735f5432 Mon Sep 17 00:00:00 2001 From: Ansley Ussery Date: Wed, 25 Aug 2021 09:01:50 -0700 Subject: [PATCH 207/530] Fix bug in `check_empty_containers` (#63492) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63492 Test Plan: Imported from OSS Reviewed By: bdhirsh Differential Revision: D30402749 Pulled By: ansley fbshipit-source-id: 7de533355fe91ca4f45b2bafc3bfb205a028c1ed --- test/jit/test_isinstance.py | 9 +++++++++ test/test_jit.py | 1 - torch/_jit_internal.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/test/jit/test_isinstance.py b/test/jit/test_isinstance.py index 93b2605748516..5fd2b87965607 100644 --- a/test/jit/test_isinstance.py +++ b/test/jit/test_isinstance.py @@ -310,3 +310,12 @@ def fn(x: Any): x: int = 2 fn(x) self.assertEqual(len(w), 0) + + def test_empty_container_special_cases(self): + # Should not throw "Boolean value of Tensor with no values is + # ambiguous" error + torch._jit_internal.check_empty_containers(torch.Tensor([])) + + # Should not throw "Boolean value of Tensor with more than + # one value is ambiguous" error + torch._jit_internal.check_empty_containers(torch.rand(2, 3)) diff --git a/test/test_jit.py b/test/test_jit.py index 06afe656a8d3c..28de1722dde47 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -5944,7 +5944,6 @@ def test_bool_arith_not(lhs): self.assertEqual(test_bool_arith_not(torch.zeros(3)), 1) self.assertTrue(str(test_bool_arith_not.graph).count('if') == 0) - def test_conditional_casting(self): def test_bool_cast_tensor(x): if x: diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index bd7b616996a24..418607add7373 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -1133,7 +1133,7 @@ def check_args_exist(target_type) -> None: def check_empty_containers(obj) -> None: - if not obj: + if obj == [] or obj == {} or obj == (): warnings.warn("The inner type of a container is lost when " "calling torch.jit.isinstance in eager mode. For " "example, List[int] would become list and " From 34ed16ffef80cf11c86a1f48d1b4930b71e19866 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Wed, 25 Aug 2021 09:04:28 -0700 Subject: [PATCH 208/530] Temporary fix for remote gpu execution issue (#63899) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63899 See: T99020845 Test Plan: sandcastle Reviewed By: heitorschueroff Differential Revision: D30527384 fbshipit-source-id: ce9933e5e181322c02d4ed17f3fdaabe4c5ba29e --- torch/testing/_internal/common_device_type.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 8ec6e71d121ff..b5d61273afd3e 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -12,7 +12,7 @@ import torch from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \ skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \ - IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, DeterministicGuard, TEST_SKIP_NOARCH + IS_SANDCASTLE, IS_FBCODE, DeterministicGuard, TEST_SKIP_NOARCH from torch.testing._internal.common_cuda import _get_torch_cuda_version from torch.testing import \ (get_all_dtypes) @@ -469,13 +469,9 @@ def get_device_type_test_bases(): test_bases: List[Any] = list() if IS_SANDCASTLE or IS_FBCODE: - if IS_REMOTE_GPU: - # Skip if sanitizer is enabled - if not TEST_WITH_ASAN and not TEST_WITH_TSAN and not TEST_WITH_UBSAN: - test_bases.append(CUDATestBase) - else: - test_bases.append(CPUTestBase) - test_bases.append(MetaTestBase) + # temporarily disable IS_REMOTE_GPU, see T99020845 + test_bases.append(CPUTestBase) + test_bases.append(MetaTestBase) else: test_bases.append(CPUTestBase) if not TEST_SKIP_NOARCH: From 5ab356ffe6001cc54d99096f1981ad41d9b69e93 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 25 Aug 2021 09:24:27 -0700 Subject: [PATCH 209/530] Update CMake minimum version to 3.10 (#63660) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63660 Test Plan: Imported from OSS Reviewed By: janeyx99, mruberry Differential Revision: D30543878 fbshipit-source-id: a7d938807653f39727f2cc7d7ca167200567b6a0 --- .circleci/docker/build.sh | 16 +++++++++++++++- .circleci/docker/common/install_cmake.sh | 3 +++ .circleci/docker/common/install_conda.sh | 4 ++-- .circleci/docker/ubuntu-cuda/Dockerfile | 6 ++++++ .jenkins/pytorch/build-asan.sh | 3 +++ .jenkins/pytorch/build.sh | 2 ++ CMakeLists.txt | 8 ++++---- c10/CMakeLists.txt | 2 +- torch/CMakeLists.txt | 2 +- 9 files changed, 37 insertions(+), 9 deletions(-) diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 2b916a19ae117..18d19ae5d586f 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -78,11 +78,13 @@ TRAVIS_DL_URL_PREFIX="https://s3.amazonaws.com/travis-python-archives/binaries/u case "$image" in pytorch-linux-xenial-py3.8) ANACONDA_PYTHON_VERSION=3.8 + CMAKE_VERSION=3.10.3 GCC_VERSION=7 # Do not install PROTOBUF, DB, and VISION as a test ;; pytorch-linux-xenial-py3.6-gcc5.4) ANACONDA_PYTHON_VERSION=3.6 + CMAKE_VERSION=3.10.3 GCC_VERSION=5 PROTOBUF=yes DB=yes @@ -91,11 +93,13 @@ case "$image" in ;; pytorch-linux-xenial-py3.6-gcc7.2) ANACONDA_PYTHON_VERSION=3.6 + CMAKE_VERSION=3.10.3 GCC_VERSION=7 # Do not install PROTOBUF, DB, and VISION as a test ;; pytorch-linux-xenial-py3.6-gcc7) ANACONDA_PYTHON_VERSION=3.6 + CMAKE_VERSION=3.10.3 GCC_VERSION=7 PROTOBUF=yes DB=yes @@ -105,6 +109,7 @@ case "$image" in CUDA_VERSION=10.2 CUDNN_VERSION=7 ANACONDA_PYTHON_VERSION=3.6 + CMAKE_VERSION=3.10.3 GCC_VERSION=7 PROTOBUF=yes DB=yes @@ -115,6 +120,7 @@ case "$image" in CUDA_VERSION=11.1 CUDNN_VERSION=8 ANACONDA_PYTHON_VERSION=3.6 + CMAKE_VERSION=3.10.3 GCC_VERSION=7 PROTOBUF=yes DB=yes @@ -125,6 +131,7 @@ case "$image" in CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names CUDNN_VERSION=8 ANACONDA_PYTHON_VERSION=3.6 + CMAKE_VERSION=3.10.3 GCC_VERSION=7 PROTOBUF=yes DB=yes @@ -134,6 +141,7 @@ case "$image" in pytorch-linux-xenial-py3-clang5-asan) ANACONDA_PYTHON_VERSION=3.6 CLANG_VERSION=5.0 + CMAKE_VERSION=3.10.3 PROTOBUF=yes DB=yes VISION=yes @@ -141,6 +149,7 @@ case "$image" in pytorch-linux-xenial-py3-clang7-asan) ANACONDA_PYTHON_VERSION=3.6 CLANG_VERSION=7 + CMAKE_VERSION=3.10.3 PROTOBUF=yes DB=yes VISION=yes @@ -148,6 +157,7 @@ case "$image" in pytorch-linux-xenial-py3-clang7-onnx) ANACONDA_PYTHON_VERSION=3.6 CLANG_VERSION=7 + CMAKE_VERSION=3.10.3 PROTOBUF=yes DB=yes VISION=yes @@ -155,16 +165,17 @@ case "$image" in pytorch-linux-xenial-py3-clang5-android-ndk-r19c) ANACONDA_PYTHON_VERSION=3.6 CLANG_VERSION=5.0 + CMAKE_VERSION=3.10.3 LLVMDEV=yes PROTOBUF=yes ANDROID=yes ANDROID_NDK_VERSION=r19c GRADLE_VERSION=6.8.3 - CMAKE_VERSION=3.7.0 NINJA_VERSION=1.9.0 ;; pytorch-linux-xenial-py3.6-clang7) ANACONDA_PYTHON_VERSION=3.6 + CMAKE_VERSION=3.10.3 CLANG_VERSION=7 PROTOBUF=yes DB=yes @@ -244,6 +255,9 @@ case "$image" in DB=yes VISION=yes echo "image '$image' did not match an existing build configuration" + if [[ "$image" == *xenial* ]]; then + CMAKE_VERSION=3.10.3 + fi if [[ "$image" == *py* ]]; then extract_version_from_image_name py ANACONDA_PYTHON_VERSION fi diff --git a/.circleci/docker/common/install_cmake.sh b/.circleci/docker/common/install_cmake.sh index 3ef71031db38f..5aa564d7c478c 100755 --- a/.circleci/docker/common/install_cmake.sh +++ b/.circleci/docker/common/install_cmake.sh @@ -4,6 +4,9 @@ set -ex [ -n "$CMAKE_VERSION" ] +# Remove system cmake install so it won't get used instead +apt-get remove cmake -y + # Turn 3.6.3 into v3.6 path=$(echo "${CMAKE_VERSION}" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/') file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz" diff --git a/.circleci/docker/common/install_conda.sh b/.circleci/docker/common/install_conda.sh index 86dbb153b2925..f12ae38aa58bd 100755 --- a/.circleci/docker/common/install_conda.sh +++ b/.circleci/docker/common/install_conda.sh @@ -69,8 +69,8 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then } # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README - # DO NOT install cmake here as it would install a version newer than 3.5, but - # we want to pin to version 3.5. + # DO NOT install cmake here as it would install a version newer than 3.10, but + # we want to pin to version 3.10. SCIPY_VERSION=1.1.0 if [ "$ANACONDA_PYTHON_VERSION" = "3.9" ]; then # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source diff --git a/.circleci/docker/ubuntu-cuda/Dockerfile b/.circleci/docker/ubuntu-cuda/Dockerfile index 003538f576bd5..84075db161358 100644 --- a/.circleci/docker/ubuntu-cuda/Dockerfile +++ b/.circleci/docker/ubuntu-cuda/Dockerfile @@ -65,6 +65,12 @@ ADD ./common/install_openssl.sh install_openssl.sh ENV OPENSSL_ROOT_DIR /opt/openssl RUN bash ./install_openssl.sh +# (optional) Install non-default CMake version +ARG CMAKE_VERSION +ADD ./common/install_cmake.sh install_cmake.sh +RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi +RUN rm install_cmake.sh + # Install ccache/sccache (do this last, so we get priority in PATH) ADD ./common/install_cache.sh install_cache.sh ENV PATH /opt/cache/bin:$PATH diff --git a/.jenkins/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh index 37dfeebdbd332..8d0bcd2555342 100755 --- a/.jenkins/pytorch/build-asan.sh +++ b/.jenkins/pytorch/build-asan.sh @@ -16,6 +16,9 @@ clang --version # detect_leaks=0: Python is very leaky, so we need suppress it # symbolize=1: Gives us much better errors when things go wrong export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_odr_violation=0 +if [ -n "$(which conda)" ]; then + export CMAKE_PREFIX_PATH=/opt/conda +fi # FIXME: Remove the hardcoded "-pthread" option. # With asan build, the cmake thread CMAKE_HAVE_LIBC_CREATE[1] checking will diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index f6ac52aed99c4..d7b66e7c9177e 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -88,6 +88,8 @@ if ! which conda; then else export USE_MKLDNN=0 fi +else + export CMAKE_PREFIX_PATH=/opt/conda fi if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d1653ffaded3..db38d592c55b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.10 FATAL_ERROR) #cmake_policy(SET CMP0022 NEW) #cmake_policy(SET CMP0023 NEW) @@ -323,9 +323,9 @@ option(WERROR "Build with -Werror supported by the compiler" OFF) if(USE_CCACHE) find_program(CCACHE_PROGRAM ccache) if(CCACHE_PROGRAM) - set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") - set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") - set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "C compiler launcher") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "CXX compiler launcher") + set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "CUDA compiler launcher") else() message(STATUS "Could not find ccache. Consider installing ccache to speed up compilation.") endif() diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 3d2d4352ffef4..23a0e024d35ed 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.10 FATAL_ERROR) project(c10 CXX) set(CMAKE_CXX_STANDARD 14) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 90504f025c4a3..761605fadcce8 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -2,7 +2,7 @@ # Now it only builds the Torch python bindings. if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - cmake_minimum_required(VERSION 3.5 FATAL_ERROR) + cmake_minimum_required(VERSION 3.10 FATAL_ERROR) project(torch CXX C) find_package(torch REQUIRED) option(USE_CUDA "Use CUDA" ON) From c1dfd58715c73dba3c089b2993e62d03a8647407 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Wed, 25 Aug 2021 09:35:26 -0700 Subject: [PATCH 210/530] Minor OptionalTensorRef updates (#63611) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63611 A few minor updates to `OptionalTensorRef`: 1. use `Tensor`'s `unsafe_borrow_t` constructor which avoids an unnecesary `nullptr` check. 2. copy constructor cannot defer to the `const Tensor&` constructor because it checks the tensor is defined, and so would fail for disengaged optionals. 3. use copy-swap idiom to avoid issues with self-assignment. `x = x` should be a no-op, but the old version would clear `x`. 4. Add pointer-like access for consistency with `optional` and `MaybeOwned` Test Plan: Imported from OSS Reviewed By: bdhirsh Differential Revision: D30484704 Pulled By: ezyang fbshipit-source-id: 738f4bd22359eaecd0a519a04e89a4b44d92da5b --- aten/src/ATen/core/Tensor.h | 24 +++++++++++++----------- aten/src/ATen/templates/TensorBody.h | 2 ++ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index fb0f86952bea4..fa2479c800c05 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -6,28 +6,22 @@ namespace at { class TORCH_API OptionalTensorRef { public: - OptionalTensorRef() {} + OptionalTensorRef() = default; ~OptionalTensorRef() { ref_.unsafeReleaseTensorImpl(); } OptionalTensorRef(const Tensor& src) - : ref_(c10::intrusive_ptr( - src.unsafeGetTensorImpl(), - c10::raw::DontIncreaseRefcount{})) { + : ref_(Tensor::unsafe_borrow_t{}, src) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src.defined()); } OptionalTensorRef(const OptionalTensorRef& rhs) - : OptionalTensorRef(rhs.ref_) {} + : ref_(Tensor::unsafe_borrow_t{}, rhs.ref_) {} - OptionalTensorRef& operator=(const OptionalTensorRef& rhs) { - // Need to call unsafeReleaseTensorImpl on ref_ since we are reassigning it - // (which does not call the destructor). - ref_.unsafeReleaseTensorImpl(); - ref_ = Tensor(c10::intrusive_ptr( - rhs.ref_.unsafeGetTensorImpl(), c10::raw::DontIncreaseRefcount{})); + OptionalTensorRef& operator=(OptionalTensorRef rhs) { + std::swap(ref_, rhs.ref_); return *this; } @@ -39,6 +33,14 @@ class TORCH_API OptionalTensorRef { return ref_; } + const Tensor& operator*() const & { + return ref_; + } + + const Tensor* operator->() const & { + return &ref_; + } + operator bool() const { return ref_.defined(); } diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index a6e6583c7b19c..95312ff5d10f3 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -52,6 +52,7 @@ struct Node; namespace at { +class OptionalTensorRef; class Tensor; using TensorList = ArrayRef; @@ -96,6 +97,7 @@ class TORCH_API Tensor { explicit Tensor(unsafe_borrow_t, const Tensor& rhs) : impl_(c10::intrusive_ptr::reclaim(rhs.impl_.get())) {} friend MaybeOwnedTraits; + friend OptionalTensorRef; public: Tensor(){}; From 83b132b112c2e035a23dcab4a88393209c4325ee Mon Sep 17 00:00:00 2001 From: Karen Zhou Date: Wed, 25 Aug 2021 09:55:02 -0700 Subject: [PATCH 211/530] [pruner] add support for pruning BatchNorm2d (#63519) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63519 If the pruner should be pruning biases along with weights, then if the model has BatchNorm2d following pruned Conv2d layers, then the corresponding channels of the BatchNorm must also be pruned. Specifically, they need to zeroed out, rather than fully removed, since in eager mode, the dimensions between layers need to be preserved. To do this, we add a pruning parametrization called `ZeroesParametrization` which zeroes out pruned channels, rather than removing them. The user must provide in the config, a tuple of the Conv2d and BatchNorm layers that go together. The `prepare` method will add the tuple to the `module_groups`; then it will add a PruningParametrization to the Conv2d layer, and a ZeroesParametrization to BatchNorm, and then set their pruned sets to be the same set. That way, during `step`, both masks are updated with the same pruned indices. ghstack-source-id: 136562278 Test Plan: `buck test mode/dev-nosan //caffe2/test:ao -- TestBasePruner` https://pxl.cl/1N1P6 Reviewed By: z-a-f Differential Revision: D30349855 fbshipit-source-id: 3199d3688d5a70963f9b32d7a8fdac3962ae6a65 --- test/ao/sparsity/test_pruner.py | 157 ++++++++++---- torch/ao/sparsity/__init__.py | 1 + .../experimental/pruner/base_pruner.py | 200 +++++++++++++++--- .../experimental/pruner/parametrization.py | 13 ++ 4 files changed, 298 insertions(+), 73 deletions(-) diff --git a/test/ao/sparsity/test_pruner.py b/test/ao/sparsity/test_pruner.py index c358df6ac95ae..663c6f033a9ce 100644 --- a/test/ao/sparsity/test_pruner.py +++ b/test/ao/sparsity/test_pruner.py @@ -4,7 +4,7 @@ import torch from torch import nn -from torch.ao.sparsity import BasePruner, PruningParametrization +from torch.ao.sparsity import BasePruner, PruningParametrization, ZeroesParametrization from torch.nn.utils import parametrize from torch.testing._internal.common_utils import TestCase @@ -13,8 +13,13 @@ DEVICES = {"cpu", "cuda" if torch.cuda.is_available() else "cpu"} +NEEDS_ZEROS = { # these layers should have pruned indices zero-ed, not removed + nn.BatchNorm2d +} + class Linear(nn.Module): + r"""Model with Linear layers, in Sequential and outside, without biases""" def __init__(self): super().__init__() self.seq = nn.Sequential( @@ -29,6 +34,7 @@ def forward(self, x): class LinearB(nn.Module): + r"""Model with Linear layers, in Sequential and outside, with biases""" def __init__(self): super().__init__() self.seq = nn.Sequential( @@ -43,6 +49,8 @@ def forward(self, x): class MultipleLinear(nn.Module): + r"""Model with multiple Linear layers, in Sequential and outside, without biases + and with activation functions""" def __init__(self): super().__init__() self.seq = nn.Sequential( @@ -61,6 +69,8 @@ def forward(self, x): class MultipleLinearB(nn.Module): + r"""Model with multiple Linear layers, in Sequential and outside, with biases + and with activation functions""" def __init__(self): super().__init__() self.seq = nn.Sequential( @@ -79,6 +89,8 @@ def forward(self, x): class MultipleLinearMixed(nn.Module): + r"""Model with multiple Linear layers, in Sequential and outside, some with biases + and with activation functions""" def __init__(self): super().__init__() self.seq = nn.Sequential( @@ -97,6 +109,7 @@ def forward(self, x): class Conv2dA(nn.Module): + r"""Model with Conv2d layers, in Sequential and outside, without biases""" def __init__(self): super().__init__() self.seq = nn.Sequential( @@ -111,6 +124,7 @@ def forward(self, x): class Conv2dB(nn.Module): + r"""Model with Conv2d layers, in Sequential and outside, with biases""" def __init__(self): super().__init__() self.seq = nn.Sequential( @@ -125,6 +139,7 @@ def forward(self, x): class Conv2dC(nn.Module): + r"""Model with Conv2d layers, in Sequential and outside, with and without biases""" def __init__(self): super().__init__() self.seq = nn.Sequential( @@ -138,6 +153,24 @@ def forward(self, x): return x +class Conv2dBN(nn.Module): + r"""Model with Conv2d layers and BatchNorms""" + def __init__(self): + super().__init__() + self.seq = nn.Sequential( + nn.Conv2d(1, 32, 3, 1, bias=True), + nn.BatchNorm2d(32) + ) + self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=True) + self.bn = nn.BatchNorm2d(64) + + def forward(self, x): + x = self.seq(x) + x = self.conv2d(x) + x = self.bn(x) + return x + + class SimplePruner(BasePruner): def update_mask(self, layer, **kwargs): layer.parametrizations.weight[0].pruned_outputs.add(1) @@ -150,35 +183,66 @@ def update_mask(self, layer, **kwargs): class TestBasePruner(TestCase): def _check_pruner_prepared(self, model, pruner, device): - for g in pruner.module_groups: - module = g['module'] - assert module.weight.device == device - # Check mask exists - assert hasattr(module, 'mask') - # Check parametrization exists and is correct - assert parametrize.is_parametrized(module) - assert hasattr(module, "parametrizations") - # Assume that this is the 1st/only parametrization - assert type(module.parametrizations.weight[0]) == PruningParametrization + for config in pruner.module_groups: + modules = [] + if type(config['module']) is tuple: + for module in config['module']: + modules.append(module) + else: + module = config['module'] + modules.append(module) + for module in modules: + assert module.weight.device == device + # Check mask exists + assert hasattr(module, 'mask') + # Check parametrization exists and is correct + assert parametrize.is_parametrized(module) + assert hasattr(module, "parametrizations") + # Assume that this is the 1st/only parametrization + if isinstance(module, tuple(NEEDS_ZEROS)): + assert type(module.parametrizations.weight[0]) == ZeroesParametrization + else: + assert type(module.parametrizations.weight[0]) == PruningParametrization def _check_pruner_mask_squashed(self, model, pruner, device): - for g in pruner.module_groups: - module = g['module'] - assert module.weight.device == device - assert not hasattr(module, "parametrizations") - assert not hasattr(module, 'mask') + for config in pruner.module_groups: + modules = [] + if type(config['module']) is tuple: + for module in config['module']: + modules.append(module) + else: + module = config['module'] + modules.append(module) + for module in modules: + assert module.weight.device == device + assert not hasattr(module, "parametrizations") + assert not hasattr(module, 'mask') def _check_pruner_valid_before_step(self, model, pruner, device): - for g in pruner.module_groups: - module = g['module'] - assert module.weight.device == device - assert module.parametrizations.weight[0].pruned_outputs == set() + for config in pruner.module_groups: + modules = [] + if type(config['module']) is tuple: + for module in config['module']: + modules.append(module) + else: + module = config['module'] + modules.append(module) + for module in modules: + assert module.weight.device == device + assert module.parametrizations.weight[0].pruned_outputs == set() def _check_pruner_valid_after_step(self, model, pruner, pruned_set, device): - for g in pruner.module_groups: - module = g['module'] - assert module.weight.device == device - assert module.parametrizations.weight[0].pruned_outputs == pruned_set + for config in pruner.module_groups: + modules = [] + if type(config['module']) is tuple: + for module in config['module']: + modules.append(module) + else: + module = config['module'] + modules.append(module) + for module in modules: + assert module.weight.device == device + assert module.parametrizations.weight[0].pruned_outputs == pruned_set def _test_constructor_on_device(self, model, device): self.assertRaisesRegex(TypeError, 'with abstract methods update_mask', @@ -218,19 +282,23 @@ def test_prepare_linear(self): for model in models: self._test_prepare_linear_on_device(model, torch.device(device)) - def _test_prepare_conv2d_on_device(self, model, device): + def _test_prepare_conv2d_on_device(self, model, config, device): model = model.to(device) x = torch.ones((1, 1, 28, 28)) pruner = SimplePruner(None) - pruner.prepare(model, None) + pruner.prepare(model, config) self._check_pruner_prepared(model, pruner, device) assert model(x).shape == (1, 64, 24, 24) def test_prepare_conv2d(self): - models = [Conv2dA(), Conv2dB(), Conv2dC()] + bn_model = Conv2dBN() + bn_config = [(bn_model.seq[0], bn_model.seq[1]), (bn_model.conv2d, bn_model.bn)] + + models = [Conv2dA(), Conv2dB(), Conv2dC(), bn_model] + configs = [None, None, None, bn_config] for device in DEVICES: - for model in models: - self._test_prepare_conv2d_on_device(model, torch.device(device)) + for model, config in zip(models, configs): + self._test_prepare_conv2d_on_device(model, config, torch.device(device)) def _test_squash_mask_linear_on_device(self, model, device): model = model.to(device) @@ -247,20 +315,24 @@ def test_squash_mask_linear(self): for model in models: self._test_squash_mask_linear_on_device(model, torch.device(device)) - def _test_squash_mask_conv2d_on_device(self, model, device): + def _test_squash_mask_conv2d_on_device(self, model, config, device): model = model.to(device) x = torch.ones((1, 1, 28, 28)) pruner = SimplePruner(None) - pruner.prepare(model, None) + pruner.prepare(model, config) pruner.squash_mask() self._check_pruner_mask_squashed(model, pruner, device) assert model(x).shape == (1, 64, 24, 24) def test_squash_mask_conv2d(self): - models = [Conv2dA(), Conv2dB(), Conv2dC()] + bn_model = Conv2dBN() + bn_config = [(bn_model.seq[0], bn_model.seq[1]), (bn_model.conv2d, bn_model.bn)] + + models = [Conv2dA(), Conv2dB(), Conv2dC(), bn_model] + configs = [None, None, None, bn_config] for device in DEVICES: - for model in models: - self._test_squash_mask_conv2d_on_device(model, torch.device(device)) + for model, config in zip(models, configs): + self._test_squash_mask_conv2d_on_device(model, config, torch.device(device)) def _test_step_linear_on_device(self, model, is_basic, device): model = model.to(device) @@ -288,18 +360,25 @@ def test_step_linear(self): for model in complex_models: self._test_step_linear_on_device(model, False, torch.device(device)) - def _test_step_conv2d_on_device(self, model, device): + def _test_step_conv2d_on_device(self, model, config, device): model = model.to(device) x = torch.ones((1, 1, 28, 28)) pruner = SimplePruner(None) - pruner.prepare(model, None) + pruner.prepare(model, config) self._check_pruner_valid_before_step(model, pruner, device) pruner.step() + if type(model) is Conv2dBN: + assert model.seq[1].parametrizations.weight[0].pruned_outputs == model.seq[0].parametrizations.weight[0].pruned_outputs + assert model.bn.parametrizations.weight[0].pruned_outputs == model.conv2d.parametrizations.weight[0].pruned_outputs self._check_pruner_valid_after_step(model, pruner, {1}, device) assert model(x).shape == (1, 64, 24, 24) def test_step_conv2d(self): - models = [Conv2dA(), Conv2dB(), Conv2dC()] + bn_model = Conv2dBN() + bn_config = [(bn_model.seq[0], bn_model.seq[1]), (bn_model.conv2d, bn_model.bn)] + + models = [Conv2dA(), Conv2dB(), Conv2dC(), bn_model] + configs = [None, None, None, bn_config] for device in DEVICES: - for model in models: - self._test_step_conv2d_on_device(model, torch.device(device)) + for model, config in zip(models, configs): + self._test_step_conv2d_on_device(model, config, torch.device(device)) diff --git a/torch/ao/sparsity/__init__.py b/torch/ao/sparsity/__init__.py index 06854a42cf9ce..80aa30814eac8 100644 --- a/torch/ao/sparsity/__init__.py +++ b/torch/ao/sparsity/__init__.py @@ -19,6 +19,7 @@ # Parametrizations from .experimental.pruner.parametrization import PruningParametrization +from .experimental.pruner.parametrization import ZeroesParametrization from .experimental.pruner.parametrization import ActivationReconstruction from .experimental.pruner.parametrization import BiasHook diff --git a/torch/ao/sparsity/experimental/pruner/base_pruner.py b/torch/ao/sparsity/experimental/pruner/base_pruner.py index a8a7b69141be3..6baeb6efda849 100644 --- a/torch/ao/sparsity/experimental/pruner/base_pruner.py +++ b/torch/ao/sparsity/experimental/pruner/base_pruner.py @@ -1,4 +1,6 @@ +import copy +import warnings import abc import torch @@ -7,13 +9,18 @@ from torch.nn.modules.container import ModuleDict, ModuleList -from .parametrization import PruningParametrization, ActivationReconstruction, BiasHook +from .parametrization import PruningParametrization, ZeroesParametrization, ActivationReconstruction, BiasHook -from torch.ao.sparsity import BaseSparsifier, fqn_to_module +from torch.ao.sparsity import BaseSparsifier, module_to_fqn, fqn_to_module -SUPPORTED_MODULES = { +SUPPORTED_MODULES = { # added to config if None given nn.Linear, - nn.Conv2d + nn.Conv2d, + nn.BatchNorm2d, # will need manual update to match conv2d +} + +NEEDS_ZEROS = { # these layers should have pruned indices zero-ed, not removed + nn.BatchNorm2d } @@ -44,45 +51,170 @@ def _prepare(self, use_path=False, *args, **kwargs): self.bias_handles = [] for config in self.module_groups: + modules = [] if use_path: - module = fqn_to_module(self.model, config['fqn']) + if type(config['module']) is tuple: # (Conv2d, BN) + for fqn in config['fqn']: + module = fqn_to_module(self.model, fqn) + modules.append(module) + else: + module = fqn_to_module(self.model, config['fqn']) + modules.append(module) else: - module = config['module'] - - if getattr(module, 'mask', None) is None: - module.register_buffer('mask', torch.tensor(module.weight.shape[0])) - param = config.get('parametrization', PruningParametrization) - parametrize.register_parametrization(module, 'weight', - param(module.mask), - unsafe=True) - - assert isinstance(module.parametrizations, ModuleDict) # make mypy happy - assert isinstance(module.parametrizations.weight, ModuleList) - if isinstance(module, tuple(SUPPORTED_MODULES)): - self.activation_handles.append(module.register_forward_hook( - ActivationReconstruction(module.parametrizations.weight[0]) - )) + if type(config['module']) is tuple: + for module in config['module']: + modules.append(module) + else: + module = config['module'] + modules.append(module) + + for module in modules: + if not isinstance(module, tuple(NEEDS_ZEROS)): + # add pruning parametrization and forward hooks + if getattr(module, 'mask', None) is None: + module.register_buffer('mask', torch.tensor(module.weight.shape[0])) + param = config.get('parametrization', PruningParametrization) + parametrize.register_parametrization(module, 'weight', param(module.mask), unsafe=True) + + assert isinstance(module.parametrizations, ModuleDict) # make mypy happy + assert isinstance(module.parametrizations.weight, ModuleList) + if isinstance(module, tuple(SUPPORTED_MODULES)): + self.activation_handles.append(module.register_forward_hook( + ActivationReconstruction(module.parametrizations.weight[0]) + )) + else: + raise NotImplementedError("This module type is not supported yet.") + + else: # needs zeros + if getattr(module, 'mask', None) is None: + module.register_buffer('mask', torch.tensor(module.weight.shape[0])) + param = config.get('parametrization', ZeroesParametrization) + parametrize.register_parametrization(module, 'weight', param(module.mask), unsafe=True) + + if module.bias is not None: + module.register_parameter('_bias', nn.Parameter(module.bias.detach())) + module.bias = None + self.bias_handles.append(module.register_forward_hook(BiasHook(module.parametrizations.weight[0], self.prune_bias))) + + if len(modules) == 2: # (Conv2d, BN) + # should have the same set of pruned outputs + modules[1].parametrizations.weight[0].pruned_outputs = modules[0].parametrizations.weight[0].pruned_outputs + + + def prepare(self, model, config): + r"""Prepares a model, by adding the parametrizations and forward post-hooks. + Note:: + The model is modified inplace. If you need to preserve the original + model, use copy.deepcopy. + + Args: + - model [nn.Module]: model to configure. The model itself is not saved + but used for the state_dict saving / loading. + - config [list]: configuration elements could either be instances of + nn.Module or dict maps. The dicts must have a key 'module' with the + value being an instance of a nn.Module. + """ + self.model = model # TODO: Need to figure out how to load without this. + self.config = config + + # If no config -- try getting all the supported layers + if self.config is None: + # Add all models to the config + self.config = [] + stack = [model] + while stack: + module = stack.pop() + for name, child in module.named_children(): + if type(child) in SUPPORTED_MODULES: + self.config.append(child) + else: + if type(child) in NEEDS_ZEROS and self.prune_bias: + warnings.warn(f"Models with {type(child)} layers have config provided by user.") + stack.append(child) + + for module_config in self.config: + if type(module_config) is tuple: + first_layer, next_layer = module_config + assert isinstance(first_layer, nn.Conv2d) and isinstance(next_layer, nn.BatchNorm2d) + module_config = {'module': module_config} + local_args = copy.deepcopy(self.defaults) + local_args.update(module_config) + fqn_list = [] + for module in local_args['module']: + module_fqn = module_to_fqn(model, module) + if module_fqn and module_fqn[0] == '.': + module_fqn = module_fqn[1:] + fqn_list.append(module_fqn) + local_args['fqn'] = fqn_list else: - raise NotImplementedError("This module type is not supported yet.") + if isinstance(module_config, nn.Module): + module_config = {'module': module_config} + local_args = copy.deepcopy(self.defaults) + local_args.update(module_config) + module = local_args['module'] + module_fqn = module_to_fqn(model, module) + if module_fqn and module_fqn[0] == '.': + module_fqn = module_fqn[1:] + local_args['fqn'] = module_fqn + + self.module_groups.append(local_args) - if module.bias is not None: - module.register_parameter('_bias', nn.Parameter(module.bias.detach())) - module.bias = None - self.bias_handles.append(module.register_forward_hook(BiasHook(module.parametrizations.weight[0], self.prune_bias))) + self._prepare() def squash_mask(self, use_path=False, *args, **kwargs): for config in self.module_groups: + modules = [] if use_path: - module = fqn_to_module(self.model, config['fqn']) + if type(config['module']) is tuple: # (Conv2d, BN) + for fqn in config['fqn']: + module = fqn_to_module(self.model, fqn) + modules.append(module) + else: + module = fqn_to_module(self.model, config['fqn']) + modules.append(module) else: - module = config['module'] - parametrize.remove_parametrizations(module, 'weight', - leave_parametrized=True) - if getattr(module._parameters, 'mask', None): - del module._parameters['mask'] - elif getattr(module._buffers, 'mask', None): - del module._buffers['mask'] - delattr(module, 'mask') + if type(config['module']) is tuple: + for module in config['module']: + modules.append(module) + else: + module = config['module'] + modules.append(module) + + for module in modules: + parametrize.remove_parametrizations(module, 'weight', + leave_parametrized=True) + if getattr(module._parameters, 'mask', None): + del module._parameters['mask'] + elif getattr(module._buffers, 'mask', None): + del module._buffers['mask'] + delattr(module, 'mask') + + def step(self, use_path=False): + if not self.enable_mask_update: + return + with torch.no_grad(): + for config in self.module_groups: + modules = [] + if use_path: + if type(config['module']) is tuple: # (Conv2d, BN) + for fqn in config['fqn']: + module = fqn_to_module(self.model, fqn) + modules.append(module) + else: + module = fqn_to_module(self.model, config['fqn']) + modules.append(module) + else: + if type(config['module']) is tuple: + for module in config['module']: + modules.append(module) + else: + module = config['module'] + modules.append(module) + + # only need to update the first module in modules if len(modules) > 1 + # since they should share the same set of pruned outputs + module = modules[0] + self.update_mask(module, **config) @abc.abstractmethod def update_mask(self, layer, **kwargs): diff --git a/torch/ao/sparsity/experimental/pruner/parametrization.py b/torch/ao/sparsity/experimental/pruner/parametrization.py index 696b16e1edccc..0ee937a4a8ae4 100644 --- a/torch/ao/sparsity/experimental/pruner/parametrization.py +++ b/torch/ao/sparsity/experimental/pruner/parametrization.py @@ -14,6 +14,19 @@ def forward(self, x): return x[list(valid_outputs)] +class ZeroesParametrization(nn.Module): + r"""Zero out pruned channels instead of removing. + E.g. used for Batch Norm pruning, which should match previous Conv2d layer.""" + def __init__(self, original_outputs): + super().__init__() + self.original_outputs = set(range(original_outputs.item())) + self.pruned_outputs = set() # Will contain indicies of outputs to prune + + def forward(self, x): + x.data[list(self.pruned_outputs)] = 0 + return x + + class ActivationReconstruction: def __init__(self, parametrization): self.param = parametrization From eebac46282a9166ae330816a7203da13e7b272ad Mon Sep 17 00:00:00 2001 From: Karen Zhou Date: Wed, 25 Aug 2021 09:55:02 -0700 Subject: [PATCH 212/530] [pruner] add getter for pruned outputs in base pruner (#63520) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63520 Rather than having to call `module.parametrizations.weight[0].pruned_outputs` each time we need to access the set of pruned indices, we add a getter `get_module_pruned_outputs` which takes the module as an argument and returns the set. This is used for testing. ghstack-source-id: 136561130 Test Plan: ` buck test mode/dev-nosan //caffe2/test:ao -- TestBasePruner` https://pxl.cl/1N4gK Reviewed By: z-a-f Differential Revision: D30374558 fbshipit-source-id: e38dfee0879cadde52b942e899a3d8d7151ee493 --- test/ao/sparsity/test_pruner.py | 4 ++-- .../ao/sparsity/experimental/pruner/base_pruner.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/test/ao/sparsity/test_pruner.py b/test/ao/sparsity/test_pruner.py index 663c6f033a9ce..55364536b6191 100644 --- a/test/ao/sparsity/test_pruner.py +++ b/test/ao/sparsity/test_pruner.py @@ -368,8 +368,8 @@ def _test_step_conv2d_on_device(self, model, config, device): self._check_pruner_valid_before_step(model, pruner, device) pruner.step() if type(model) is Conv2dBN: - assert model.seq[1].parametrizations.weight[0].pruned_outputs == model.seq[0].parametrizations.weight[0].pruned_outputs - assert model.bn.parametrizations.weight[0].pruned_outputs == model.conv2d.parametrizations.weight[0].pruned_outputs + assert pruner.get_module_pruned_outputs(model.seq[1]) == pruner.get_module_pruned_outputs(model.seq[0]) + assert pruner.get_module_pruned_outputs(model.bn) == pruner.get_module_pruned_outputs(model.conv2d) self._check_pruner_valid_after_step(model, pruner, {1}, device) assert model(x).shape == (1, 64, 24, 24) diff --git a/torch/ao/sparsity/experimental/pruner/base_pruner.py b/torch/ao/sparsity/experimental/pruner/base_pruner.py index 6baeb6efda849..6017e8f53ae69 100644 --- a/torch/ao/sparsity/experimental/pruner/base_pruner.py +++ b/torch/ao/sparsity/experimental/pruner/base_pruner.py @@ -189,6 +189,19 @@ def squash_mask(self, use_path=False, *args, **kwargs): del module._buffers['mask'] delattr(module, 'mask') + def get_module_pruned_outputs(self, module): + r"""Returns the set of pruned indices of module""" + assert parametrize.is_parametrized(module) # can only get pruned indices of pruned module + modules = {config['module'] for config in self.module_groups} + module_list = set() + for m in modules: + if type(m) is tuple: + module_list.update(m) + else: + module_list.add(m) + assert module in module_list # check that module is in pruner.module_groups + return module.parametrizations.weight[0].pruned_outputs # assume only one parametrization attached + def step(self, use_path=False): if not self.enable_mask_update: return From 6324d98e9e736214ee7a161482b02aaf8cebbc9d Mon Sep 17 00:00:00 2001 From: riship Date: Wed, 25 Aug 2021 09:56:41 -0700 Subject: [PATCH 213/530] bf16 Error message cleanup as well as addition of is_bf16_supported (#63798) Summary: ngimel Pull Request resolved: https://github.com/pytorch/pytorch/pull/63798 Reviewed By: heitorschueroff Differential Revision: D30526187 Pulled By: ngimel fbshipit-source-id: c484aec14638097c96c720095d3491249b6b2d14 --- torch/autocast_mode.py | 12 ++++++------ torch/cpu/amp/autocast_mode.py | 4 ++-- torch/cuda/__init__.py | 8 ++++++++ torch/cuda/amp/autocast_mode.py | 4 ++-- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/torch/autocast_mode.py b/torch/autocast_mode.py index ec9fdb0326d62..97d51b8f1ca7b 100644 --- a/torch/autocast_mode.py +++ b/torch/autocast_mode.py @@ -80,7 +80,7 @@ def forward(self, input): c_float32 = torch.rand((8, 8), device="cpu") d_float32 = torch.rand((8, 8), device="cpu") - with autocast(fast_dtype=torch.bfloat16, device_type="cpu"): + with autocast(dtype=torch.bfloat16, device_type="cpu"): # torch.mm is on autocast's list of ops that should run in bfloat16. # Inputs are float32, but the op runs in bfloat16 and produces bfloat16 output. # No manual casts are required. @@ -125,7 +125,7 @@ def forward(self, input): Args: device_type(string, required): Whether to use 'cuda' or 'cpu' device enabled(bool, optional, default=True)": Whether autocasting should be enabled in the region. - fast_dtype(torch_dtype, optional): Whether to use torch.float16 or torch.bfloat16 + dtype(torch_dtype, optional): Whether to use torch.float16 or torch.bfloat16 """ def __init__(self, device_type, enabled=True, **kwargs): self.device = device_type @@ -139,9 +139,9 @@ def __init__(self, device_type, enabled=True, **kwargs): warnings.warn('User provided device_type of \'cuda\', but CUDA is not available. Disabling') enabled = False for key, value in kwargs.items(): - if key == 'fast_dtype': + if key == 'dtype': self.fast_dtype = value - if not (key == 'fast_dtype'): + if not (key == 'dtype'): raise RuntimeError('Unrecognized optional argument supplied to autocast context manager: ' + str(key)) if self.device == 'cpu': @@ -152,8 +152,8 @@ def __init__(self, device_type, enabled=True, **kwargs): warnings.warn(error_message) enabled = False if self.device == 'cuda': - if self.fast_dtype == torch.bfloat16 and torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8: - raise RuntimeError('Current CUDA Device does not support bfloat16. Switching fast_dtype to float16.') + if self.fast_dtype == torch.bfloat16 and not torch.cuda.is_bf16_supported(): + raise RuntimeError('Current CUDA Device does not support bfloat16. Please switch dtype to float16.') self._enabled = enabled def __enter__(self): diff --git a/torch/cpu/amp/autocast_mode.py b/torch/cpu/amp/autocast_mode.py index 027ef382f1599..08ea200a2bdc4 100644 --- a/torch/cpu/amp/autocast_mode.py +++ b/torch/cpu/amp/autocast_mode.py @@ -5,5 +5,5 @@ class autocast(torch.autocast_mode.autocast): See :class:`torch.autocast`. ``torch.cpu.amp.autocast(args...)`` is equivalent to ``torch.autocast("cpu", args...)`` """ - def __init__(self, enabled=True, fast_dtype=torch.float16): - super().__init__("cpu", enabled=enabled, fast_dtype=fast_dtype) + def __init__(self, enabled=True, dtype=torch.float16): + super().__init__("cpu", enabled=enabled, dtype=dtype) diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 29e112fc67abd..d5a9cbb52f34f 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -78,6 +78,14 @@ def is_available() -> bool: # be initialized return torch._C._cuda_getDeviceCount() > 0 +def is_bf16_supported(): + r"""Returns a bool indicating if the current CUDA device supports dtype bfloat16""" + cu_vers = torch.version.cuda + if cu_vers is not None: + cuda_maj_decide = int(cu_vers.split(',')[0]) >= 11 + else: + cuda_maj_decide = False + return torch.cuda.get_device_properties(torch.cuda.current_device()).major >= 8 and cuda_maj_decide def _sleep(cycles): torch._C._cuda_sleep(cycles) diff --git a/torch/cuda/amp/autocast_mode.py b/torch/cuda/amp/autocast_mode.py index e9bfe06a0a352..ca8a2fcaf29d5 100644 --- a/torch/cuda/amp/autocast_mode.py +++ b/torch/cuda/amp/autocast_mode.py @@ -13,8 +13,8 @@ class autocast(torch.autocast_mode.autocast): See :class:`torch.autocast`. ``torch.cuda.amp.autocast(args...)`` is equivalent to ``torch.autocast("cuda", args...)`` """ - def __init__(self, enabled=True, fast_dtype=torch.float16): - super().__init__("cuda", enabled=enabled, fast_dtype=fast_dtype) + def __init__(self, enabled=True, dtype=torch.float16): + super().__init__("cuda", enabled=enabled, dtype=dtype) # Casts Tensors and containers of Tensors. Special-cases passthroughs for strings and np.ndarrays, which From c06dfd7c26102ac2436ca25609c92fa794e972ca Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Wed, 25 Aug 2021 10:22:17 -0700 Subject: [PATCH 214/530] [fx2trt] Check input device in TRTModule (#63893) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63893 Add a check to ensure all the inputs are on cuda device. Test Plan: CI Reviewed By: kflu, houseroad Differential Revision: D30525265 fbshipit-source-id: 6e50b70fd535defc1f802d51e8bb991b2dd73741 --- torch/fx/experimental/fx2trt/fx2trt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py index 0e7cc24c18be5..ede99fd6f1700 100644 --- a/torch/fx/experimental/fx2trt/fx2trt.py +++ b/torch/fx/experimental/fx2trt/fx2trt.py @@ -86,6 +86,7 @@ def forward(self, *inputs): bindings: List[Any] = [None] * (len(self.input_names) + len(self.output_names)) for i, input_name in enumerate(self.input_names): + assert inputs[i].is_cuda, f"{i}th input is not on cuda device." idx = self.engine.get_binding_index(input_name) bindings[idx] = contiguous_inputs[i].data_ptr() From ab954cb0d176a5632f123ac19d9469e6f863d39a Mon Sep 17 00:00:00 2001 From: albanD Date: Wed, 25 Aug 2021 11:07:24 -0700 Subject: [PATCH 215/530] clean up engine.cpp thread state (#63115) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63115 This actually changes: - callbacks now run with proper grad mode even in worker threads - graphtask's Future callbacks now run with proper TLS when erroring out from a worker thread Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D30388100 Pulled By: albanD fbshipit-source-id: 7ae9c461c2f0040548dd9e1e314f25e8da0c2e67 --- torch/csrc/autograd/engine.cpp | 11 ++++++----- .../csrc/distributed/autograd/engine/dist_engine.cpp | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 252a74b4c07c7..de2078d2d6432 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -407,7 +407,12 @@ auto Engine::thread_main(const std::shared_ptr& graph_task) -> void { } if (task.fn_ && !local_graph_task->has_error_.load()) { + // Set the ThreadLocalState before calling the function. + // NB: The ThreadLocalStateGuard doesn't set the grad_mode because GraphTask + // always saves ThreadLocalState without grad_mode. + at::ThreadLocalStateGuard tls_guard(local_graph_task->thread_locals_); AutoGradMode grad_mode(local_graph_task->grad_mode_); + try { // The guard sets the thread_local current_graph_task on construction // and restores it on exit. The current_graph_task variable helps @@ -575,6 +580,7 @@ void GraphTask::exec_post_processing() { // NB: The ThreadLocalStateGuard doesn't set the grad_mode because GraphTask // always saves ThreadLocalState without grad_mode. at::ThreadLocalStateGuard tls_guard(this->thread_locals_); + AutoGradMode grad_mode(this->grad_mode_); // WARNING: Don't use a range-for loop here because more callbacks may be // added in between callback calls, so iterators may become invalidated. @@ -764,11 +770,6 @@ void Engine::evaluate_function( Node* func, InputBuffer& inputs, const std::shared_ptr& cpu_ready_queue) { - // Set the ThreadLocalState before calling the function. - // NB: The ThreadLocalStateGuard doesn't set the grad_mode because GraphTask - // always saves ThreadLocalState without grad_mode. - at::ThreadLocalStateGuard tls_guard(graph_task->thread_locals_); - // The InputBuffer::adds that supplied incoming grads took pains to // ensure they're safe to consume in the context of the present // func's stream (if applicable). So we guard onto that stream diff --git a/torch/csrc/distributed/autograd/engine/dist_engine.cpp b/torch/csrc/distributed/autograd/engine/dist_engine.cpp index 76f2eaebe5f77..4a3b3fff2e20b 100644 --- a/torch/csrc/distributed/autograd/engine/dist_engine.cpp +++ b/torch/csrc/distributed/autograd/engine/dist_engine.cpp @@ -359,6 +359,7 @@ void DistEngine::execute_graph_task_until_ready_queue_empty( continue; } if (task.fn_ && !local_graph_task->has_error_.load()) { + at::ThreadLocalStateGuard tls_guard(local_graph_task->thread_locals_); AutoGradMode grad_mode(local_graph_task->grad_mode_); try { GraphTaskGuard guard(local_graph_task); From 8a22d4fa5c5953bcb83293b5349b5f79ae08f193 Mon Sep 17 00:00:00 2001 From: Aayush Prakash Date: Wed, 25 Aug 2021 11:11:08 -0700 Subject: [PATCH 216/530] [Reland] Replacing the p.data acccess in utils with tensor.set_ . Passes both test_post_localSGD_optimizer_pari and test_periodic_model_averager tests (#63895) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63895 When updating the model parameter, updating `parameter.data` is no longer recommended, because this `data` field will be deprecated in the future. The replacement is `tensor.set_`. ghstack-source-id: 136593433 Test Plan: buck test mode/dev-nosan //caffe2/test/distributed:distributed_nccl_spawn -- test_periodic_model_averager buck test mode/dev-nosan //caffe2/test/distributed:distributed_nccl_spawn -- test_post_localSGD_optimizer_parity Reviewed By: SciPioneer Differential Revision: D30526178 fbshipit-source-id: a1ac0ec3665d8623edd5bf94f01c1132daff5c00 --- torch/distributed/algorithms/model_averaging/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/distributed/algorithms/model_averaging/utils.py b/torch/distributed/algorithms/model_averaging/utils.py index 44ee422b9e92d..ce1fb65401ad2 100644 --- a/torch/distributed/algorithms/model_averaging/utils.py +++ b/torch/distributed/algorithms/model_averaging/utils.py @@ -29,5 +29,6 @@ def average_parameters( offset = 0 for p in params_it2: - p.data = flat_params[offset : offset + p.numel()].view_as(p) + with torch.no_grad(): + p.set_(flat_params[offset : offset + p.numel()].view_as(p).type_as(p)) # type: ignore[call-overload] offset += p.numel() From a2399a76e18b31747019ead8f80e2a4eb53e8223 Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Wed, 25 Aug 2021 11:12:57 -0700 Subject: [PATCH 217/530] [Static Runtime] Moved NNC operator definitions to separate files. (#63838) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63838 Refactored NNC operator definitions code into separate files. Made `TEWrapper` a class with a fixed set of methods and added separate definitions for them based on `TORCH_ENABLE_LLVM` to keep the same functionality as before. Test Plan: Build and ran Static Runtime tests. Reviewed By: hlu1 Differential Revision: D30405467 fbshipit-source-id: 606ef852bb820d5e23a0f8af1bf5dc122e90bceb --- tools/build_variables.bzl | 1 + torch/csrc/jit/runtime/static/ops.cpp | 193 +------------------ torch/csrc/jit/runtime/static/te_wrapper.cpp | 184 ++++++++++++++++++ torch/csrc/jit/runtime/static/te_wrapper.h | 33 ++++ 4 files changed, 219 insertions(+), 192 deletions(-) create mode 100644 torch/csrc/jit/runtime/static/te_wrapper.cpp create mode 100644 torch/csrc/jit/runtime/static/te_wrapper.h diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 157c30663ce1c..0d888ea8a4cb0 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -329,6 +329,7 @@ core_sources_full = core_sources_full_mobile + [ "torch/csrc/jit/runtime/static/native_ops.cpp", "torch/csrc/jit/runtime/static/ops.cpp", "torch/csrc/jit/runtime/static/passes.cpp", + "torch/csrc/jit/runtime/static/te_wrapper.cpp", "torch/csrc/jit/tensorexpr/external_functions.cpp", "torch/csrc/jit/tensorexpr/external_functions_codegen.cpp", ] diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index e6af641083fc2..d9fb9bad080f0 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -493,198 +494,6 @@ REGISTER_OPERATOR_FUNCTOR(aten::leaky_relu, aten_leaky_relu, [](Node* n) -> SROp }; }); -namespace { - -// Use the width of an AVX-512 vector by default; this happens to work OK -// for AVX2 as well. Some ops benefit from using multiple AVX ports, in -// which case they are vectorized by twice this constant. An exception is -// logit, since it contains FP divide, which is single-ported. -static constexpr int kVectorWidth = 16; - -#ifdef TORCH_ENABLE_LLVM - -struct TEWrapper { - std::unique_ptr cg; - TEWrapper() = default; - void update(std::unique_ptr&& cg_) { - cg = std::move(cg_); - } - - void call(const std::vector& args) { - cg->call_raw(args); - } - - inline bool supports(const at::Tensor& t) { - return t.is_contiguous() && t.dtype().Match(); - } -}; - -void optimizePointwise( - tensorexpr::LoopNest* ln, - tensorexpr::Tensor target, - int width) { - using namespace torch::jit::tensorexpr; - std::vector loops = ln->getLoopStmtsFor(target); - ForPtr inner, tail; - TORCH_CHECK(loops.size() > 0, "No loops created for pointwise op"); - ln->splitWithTail(loops[0], width, &inner, &tail); - ln->vectorize(inner); -} - -std::shared_ptr wrapTECompute( - std::shared_ptr wrap, - tensorexpr::Placeholder& in, - tensorexpr::Tensor out, - tensorexpr::VarHandle& dim, - int width = kVectorWidth) { - using namespace torch::jit::tensorexpr; - LoopNest ln({out}); - optimizePointwise(&ln, out, width); - ln.prepareForCodegen(); - StmtPtr s = ln.root_stmt(); - s = tensorexpr::IRSimplifier::simplify(s); - std::vector args; - args.emplace_back(out); - args.emplace_back(in); - args.emplace_back(dim); - auto cg = std::make_unique(s, args); - wrap->update(std::move(cg)); - return wrap; -}; - -#else - -struct TEWrapper { - TEWrapper() = default; - template - void operator()(const Ts&... ts) { - DCHECK(0 && "Invalid call"); - } - void call(const std::vector& args) { - DCHECK(0 && "Invalid call"); - } - - inline bool supports(const at::Tensor& t) { - return false; - } -}; - -std::shared_ptr wrapTECompute( - std::shared_ptr wrap, - tensorexpr::Placeholder& in, - tensorexpr::Tensor out, - tensorexpr::VarHandle& dim, - int width = kVectorWidth) { - return wrap; -}; - -#endif - -std::mutex& getNNCCacheMutex() { - static std::mutex nncCacheMutex; - return nncCacheMutex; -} - -std::unordered_map>& getNNCCache() { - static std::unordered_map> nncCache; - return nncCache; -} - -std::shared_ptr lookupNNCCache(NodeKind kind) { - std::lock_guard lock(getNNCCacheMutex()); - auto it = getNNCCache().find(kind); - if (it != getNNCCache().end()) { - return it->second; - } - return nullptr; -} - -void updateNNCCache(NodeKind kind, std::shared_ptr code) { - std::lock_guard lock(getNNCCacheMutex()); - getNNCCache()[kind] = code; -} - -} // namespace - -std::shared_ptr createLogit(c10::optional clamp) { - using namespace torch::jit::tensorexpr; - // TODO: Use NNC cache for this op. - auto wrap = std::make_shared(); - auto N = VarHandle("N", kInt); - Placeholder A("A", kFloat, {N}); - tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { - auto A_elem = [&]() { - if (!clamp) { - return A.load(i); - } else { - auto elem = A.load(i); - auto min = FloatImm::make(*clamp); - auto max = FloatImm::make(1.0f - *clamp); - elem = CompareSelect::make(elem, min, min, elem, kLT); - return CompareSelect::make(elem, max, max, elem, kGT); - } - }(); - return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem)); - }); - return wrapTECompute(wrap, A, B, N); -} - -std::shared_ptr createRelu() { - using namespace torch::jit::tensorexpr; - auto wrap = lookupNNCCache(aten::relu); - if (wrap) { - return wrap; - } - wrap = std::make_shared(); - auto N = VarHandle("N", kInt); - Placeholder A("A", kFloat, {N}); - tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { - auto zero = FloatImm::make(0.f); - auto a = A.load(i); - return ifThenElse(a < zero, zero, a); - }); - wrap = wrapTECompute(wrap, A, B, N); - updateNNCCache(aten::relu, wrap); - return wrap; -} - -std::shared_ptr createTanh() { - using namespace torch::jit::tensorexpr; - auto wrap = lookupNNCCache(aten::tanh); - if (wrap) { - return wrap; - } - wrap = std::make_shared(); - auto N = VarHandle("N", kInt); - Placeholder A("A", kFloat, {N}); - tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { - auto a = A.load(i); - return fast_tanh(a); - }); - wrap = wrapTECompute(wrap, A, B, N); - updateNNCCache(aten::tanh, wrap); - return wrap; -} - -std::shared_ptr createSigmoid() { - using namespace torch::jit::tensorexpr; - auto wrap = lookupNNCCache(aten::sigmoid); - if (wrap) { - return wrap; - } - wrap = std::make_shared(); - auto N = VarHandle("N", kInt); - Placeholder A("A", kFloat, {N}); - Tensor B = - Compute("B", {N}, [&](const VarHandle& i) { return sigmoid(A.load(i)); }); - // NNC uses sleef for vectorizing sigmoid, which comes in an 8-wide flavor - // (Sleef_expf8). - constexpr int kSleefWidth = 8; - wrap = wrapTECompute(wrap, A, B, N, kSleefWidth); - updateNNCCache(aten::sigmoid, wrap); - return wrap; -} - REGISTER_OPERATOR_FUNCTOR(aten::relu, aten_relu, [](Node* n) -> SROperator { if (!n->matches(torch::schema("aten::relu(Tensor self) -> Tensor"))) { LogAndDumpSchema(n); diff --git a/torch/csrc/jit/runtime/static/te_wrapper.cpp b/torch/csrc/jit/runtime/static/te_wrapper.cpp new file mode 100644 index 0000000000000..413d44348a5c0 --- /dev/null +++ b/torch/csrc/jit/runtime/static/te_wrapper.cpp @@ -0,0 +1,184 @@ +#include + +#include +#include + +namespace torch { +namespace jit { + +using namespace torch::jit::tensorexpr; + +// Use the width of an AVX-512 vector by default; this happens to work OK for +// AVX2 as well. Some ops benefit from using multiple AVX ports, in which case +// they are vectorized by twice this constant. An exception is logit, since it +// contains FP divide, which is single-ported. +static constexpr int kVectorWidth = 16; + +#ifdef TORCH_ENABLE_LLVM + +void TEWrapper::update(std::unique_ptr&& cg_) { + cg = std::move(cg_); +} + +void TEWrapper::call(const std::vector& args) { + cg->call_raw(args); +} + +bool TEWrapper::supports(const at::Tensor& t) { + return t.is_contiguous() && t.dtype().Match(); +} + +void optimizePointwise(LoopNest* ln, Tensor target, int width) { + std::vector loops = ln->getLoopStmtsFor(target); + ForPtr inner, tail; + TORCH_CHECK(loops.size() > 0, "No loops created for pointwise op"); + ln->splitWithTail(loops[0], width, &inner, &tail); + ln->vectorize(inner); +} + +std::shared_ptr wrapTECompute( + std::shared_ptr wrap, + Placeholder& in, + Tensor out, + VarHandle& dim, + int width = kVectorWidth) { + LoopNest ln({out}); + optimizePointwise(&ln, out, width); + ln.prepareForCodegen(); + StmtPtr s = ln.root_stmt(); + s = IRSimplifier::simplify(s); + std::vector args; + args.emplace_back(out); + args.emplace_back(in); + args.emplace_back(dim); + auto cg = std::make_unique(s, args); + wrap->update(std::move(cg)); + return wrap; +}; + +#else + +void TEWrapper::call(const std::vector& args) { + DCHECK(0 && "Invalid call"); +} + +bool TEWrapper::supports(const at::Tensor& t) { + return false; +} + +std::shared_ptr wrapTECompute( + std::shared_ptr wrap, + Placeholder& in, + Tensor out, + VarHandle& dim, + int width = kVectorWidth) { + return wrap; +}; + +#endif + +namespace { + +std::mutex& getNNCCacheMutex() { + static std::mutex nncCacheMutex; + return nncCacheMutex; +} + +std::unordered_map>& getNNCCache() { + static std::unordered_map> nncCache; + return nncCache; +} + +std::shared_ptr lookupNNCCache(NodeKind kind) { + std::lock_guard lock(getNNCCacheMutex()); + auto it = getNNCCache().find(kind); + if (it != getNNCCache().end()) { + return it->second; + } + return nullptr; +} + +void updateNNCCache(NodeKind kind, std::shared_ptr code) { + std::lock_guard lock(getNNCCacheMutex()); + getNNCCache()[kind] = code; +} + +} // namespace + +std::shared_ptr createLogit(c10::optional clamp) { + // TODO: Use NNC cache for this op. + auto wrap = std::make_shared(); + auto N = VarHandle("N", kInt); + Placeholder A("A", kFloat, {N}); + Tensor B = Compute("B", {N}, [&](const VarHandle& i) { + auto A_elem = [&]() { + if (!clamp) { + return A.load(i); + } else { + auto elem = A.load(i); + auto min = FloatImm::make(*clamp); + auto max = FloatImm::make(1.0f - *clamp); + elem = CompareSelect::make(elem, min, min, elem, kLT); + return CompareSelect::make(elem, max, max, elem, kGT); + } + }(); + return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem)); + }); + return wrapTECompute(wrap, A, B, N); +} + +std::shared_ptr createRelu() { + auto wrap = lookupNNCCache(aten::relu); + if (wrap) { + return wrap; + } + wrap = std::make_shared(); + auto N = VarHandle("N", kInt); + Placeholder A("A", kFloat, {N}); + Tensor B = Compute("B", {N}, [&](const VarHandle& i) { + auto zero = FloatImm::make(0.f); + auto a = A.load(i); + return ifThenElse(a < zero, zero, a); + }); + wrap = wrapTECompute(wrap, A, B, N); + updateNNCCache(aten::relu, wrap); + return wrap; +} + +std::shared_ptr createTanh() { + auto wrap = lookupNNCCache(aten::tanh); + if (wrap) { + return wrap; + } + wrap = std::make_shared(); + auto N = VarHandle("N", kInt); + Placeholder A("A", kFloat, {N}); + Tensor B = Compute("B", {N}, [&](const VarHandle& i) { + auto a = A.load(i); + return fast_tanh(a); + }); + wrap = wrapTECompute(wrap, A, B, N); + updateNNCCache(aten::tanh, wrap); + return wrap; +} + +std::shared_ptr createSigmoid() { + auto wrap = lookupNNCCache(aten::sigmoid); + if (wrap) { + return wrap; + } + wrap = std::make_shared(); + auto N = VarHandle("N", kInt); + Placeholder A("A", kFloat, {N}); + Tensor B = + Compute("B", {N}, [&](const VarHandle& i) { return sigmoid(A.load(i)); }); + // NNC uses sleef for vectorizing sigmoid, which comes in an 8-wide flavor + // (Sleef_expf8). + constexpr int kSleefWidth = 8; + wrap = wrapTECompute(wrap, A, B, N, kSleefWidth); + updateNNCCache(aten::sigmoid, wrap); + return wrap; +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/runtime/static/te_wrapper.h b/torch/csrc/jit/runtime/static/te_wrapper.h new file mode 100644 index 0000000000000..776602dc3edec --- /dev/null +++ b/torch/csrc/jit/runtime/static/te_wrapper.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +class TEWrapper { + public: + TEWrapper() = default; + void call(const std::vector& args); + bool supports(const at::Tensor& t); +#ifdef TORCH_ENABLE_LLVM + void update(std::unique_ptr&& cg_); +#endif + + private: +#ifdef TORCH_ENABLE_LLVM + std::unique_ptr cg; +#endif +}; + +std::shared_ptr createLogit(c10::optional clamp); +std::shared_ptr createRelu(); +std::shared_ptr createTanh(); +std::shared_ptr createSigmoid(); + +} // namespace jit +} // namespace torch From dde07cad6f029c2727487544b851dfd74945efb1 Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Wed, 25 Aug 2021 11:12:57 -0700 Subject: [PATCH 218/530] [Static Runtime] Added a variable for clamp in the NNC code for Logit. (#63839) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63839 Replaced the use of a constant for clamp in the NNC code for Logit with a variable. This makes it easier to enable caching for Logit. There is no performance difference with this change, as shown in the micro-benchmarks below. ``` Logit NNC Benchmark Time (ns) const-clamp var-clamp logit_nnc_sleef/64 550 543 logit_nnc_sleef/512 3514 3517 logit_nnc_sleef/8192 85537 82900 logit_nnc_sleef/32768 347635 337016 logit_nnc_fast/64 173 167 logit_nnc_fast/512 829 866 logit_nnc_fast/8192 13286 13069 logit_nnc_fast/32768 51116 53429 logit_nnc_vml/64 146 164 logit_nnc_vml/512 773 783 logit_nnc_vml/8192 11556 11563 logit_nnc_vml/32768 44815 46720 ``` Test Plan: SR unit tests and the inline_cvr model. Reviewed By: bertmaher Differential Revision: D30405466 fbshipit-source-id: adb891fdae5746439931ce5f43165291fec08f52 --- torch/csrc/jit/runtime/static/ops.cpp | 8 ++-- torch/csrc/jit/runtime/static/te_wrapper.cpp | 41 ++++++++------------ torch/csrc/jit/runtime/static/te_wrapper.h | 2 +- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index d9fb9bad080f0..140fdf188a951 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -579,8 +579,9 @@ REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator { ? c10::make_optional(static_cast(clamp_d.value())) : c10::nullopt; } - auto te = clamp ? createLogit(clamp) : nullptr; - return [te](ProcessedNode* p_node) { + auto te = clamp ? createLogit() : nullptr; + float clamp_value = clamp ? *clamp : 0.0f; + return [te, clamp_value](ProcessedNode* p_node) { const auto& in0_t = p_node->Input(0).toTensor(); if (p_node->Output(0).isNone()) { p_node->Output(0) = create_empty_from(in0_t); @@ -594,7 +595,8 @@ REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator { } else { at::native::resize_(out_t, in0_t.sizes(), c10::nullopt); int64_t nn = in0_t.numel(); - te->call({out_t.data_ptr(), in0_t.data_ptr(), &nn}); + float c = clamp_value; + te->call({out_t.data_ptr(), in0_t.data_ptr(), &nn, &c}); } }; }); diff --git a/torch/csrc/jit/runtime/static/te_wrapper.cpp b/torch/csrc/jit/runtime/static/te_wrapper.cpp index 413d44348a5c0..9c3cbe9ac5941 100644 --- a/torch/csrc/jit/runtime/static/te_wrapper.cpp +++ b/torch/csrc/jit/runtime/static/te_wrapper.cpp @@ -38,23 +38,19 @@ void optimizePointwise(LoopNest* ln, Tensor target, int width) { std::shared_ptr wrapTECompute( std::shared_ptr wrap, - Placeholder& in, Tensor out, - VarHandle& dim, + std::vector args, int width = kVectorWidth) { LoopNest ln({out}); optimizePointwise(&ln, out, width); ln.prepareForCodegen(); StmtPtr s = ln.root_stmt(); s = IRSimplifier::simplify(s); - std::vector args; - args.emplace_back(out); - args.emplace_back(in); - args.emplace_back(dim); + args.insert(args.begin(), out); auto cg = std::make_unique(s, args); wrap->update(std::move(cg)); return wrap; -}; +} #else @@ -68,12 +64,11 @@ bool TEWrapper::supports(const at::Tensor& t) { std::shared_ptr wrapTECompute( std::shared_ptr wrap, - Placeholder& in, Tensor out, - VarHandle& dim, + std::vector args, int width = kVectorWidth) { return wrap; -}; +} #endif @@ -105,26 +100,24 @@ void updateNNCCache(NodeKind kind, std::shared_ptr code) { } // namespace -std::shared_ptr createLogit(c10::optional clamp) { +std::shared_ptr createLogit() { // TODO: Use NNC cache for this op. auto wrap = std::make_shared(); auto N = VarHandle("N", kInt); + auto C = VarHandle("C", kFloat); Placeholder A("A", kFloat, {N}); Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto A_elem = [&]() { - if (!clamp) { - return A.load(i); - } else { - auto elem = A.load(i); - auto min = FloatImm::make(*clamp); - auto max = FloatImm::make(1.0f - *clamp); - elem = CompareSelect::make(elem, min, min, elem, kLT); - return CompareSelect::make(elem, max, max, elem, kGT); - } + auto elem = A.load(i); + auto one = FloatImm::make(1.0f); + const auto& min = C; + auto max = one - C; + elem = CompareSelect::make(elem, min, min, elem, kLT); + return CompareSelect::make(elem, max, max, elem, kGT); }(); return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem)); }); - return wrapTECompute(wrap, A, B, N); + return wrapTECompute(wrap, B, {A, N, C}); } std::shared_ptr createRelu() { @@ -140,7 +133,7 @@ std::shared_ptr createRelu() { auto a = A.load(i); return ifThenElse(a < zero, zero, a); }); - wrap = wrapTECompute(wrap, A, B, N); + wrap = wrapTECompute(wrap, B, {A, N}); updateNNCCache(aten::relu, wrap); return wrap; } @@ -157,7 +150,7 @@ std::shared_ptr createTanh() { auto a = A.load(i); return fast_tanh(a); }); - wrap = wrapTECompute(wrap, A, B, N); + wrap = wrapTECompute(wrap, B, {A, N}); updateNNCCache(aten::tanh, wrap); return wrap; } @@ -175,7 +168,7 @@ std::shared_ptr createSigmoid() { // NNC uses sleef for vectorizing sigmoid, which comes in an 8-wide flavor // (Sleef_expf8). constexpr int kSleefWidth = 8; - wrap = wrapTECompute(wrap, A, B, N, kSleefWidth); + wrap = wrapTECompute(wrap, B, {A, N}, kSleefWidth); updateNNCCache(aten::sigmoid, wrap); return wrap; } diff --git a/torch/csrc/jit/runtime/static/te_wrapper.h b/torch/csrc/jit/runtime/static/te_wrapper.h index 776602dc3edec..0a5f3d8532990 100644 --- a/torch/csrc/jit/runtime/static/te_wrapper.h +++ b/torch/csrc/jit/runtime/static/te_wrapper.h @@ -24,7 +24,7 @@ class TEWrapper { #endif }; -std::shared_ptr createLogit(c10::optional clamp); +std::shared_ptr createLogit(); std::shared_ptr createRelu(); std::shared_ptr createTanh(); std::shared_ptr createSigmoid(); From 64d605bab82792bd1f89cb896302e59313466884 Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Wed, 25 Aug 2021 11:12:57 -0700 Subject: [PATCH 219/530] [Static Runtime] Added caching for the NNC code generated for Logit. (#63840) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63840 Added NNC generated code for Logit to the cache. ``` Logit NNC Benchmark Time (ns) w/o cache w/ cache logit_nnc_sleef/64 543 536 logit_nnc_sleef/512 3517 3465 logit_nnc_sleef/8192 88483 85881 logit_nnc_sleef/32768 337016 323090 logit_nnc_fast/64 167 163 logit_nnc_fast/512 866 817 logit_nnc_fast/8192 13069 12801 logit_nnc_fast/32768 53429 52530 logit_nnc_vml/64 164 151 logit_nnc_vml/512 783 769 logit_nnc_vml/8192 11563 11674 logit_nnc_vml/32768 46720 46452 ``` Test Plan: Unit tests and inline_cvr model. Reviewed By: hlu1 Differential Revision: D30405424 fbshipit-source-id: 938b1b74758e2612ae151bac890c5f8ebbc42d50 --- torch/csrc/jit/runtime/static/te_wrapper.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/runtime/static/te_wrapper.cpp b/torch/csrc/jit/runtime/static/te_wrapper.cpp index 9c3cbe9ac5941..d8b494c9d4a23 100644 --- a/torch/csrc/jit/runtime/static/te_wrapper.cpp +++ b/torch/csrc/jit/runtime/static/te_wrapper.cpp @@ -101,8 +101,11 @@ void updateNNCCache(NodeKind kind, std::shared_ptr code) { } // namespace std::shared_ptr createLogit() { - // TODO: Use NNC cache for this op. - auto wrap = std::make_shared(); + auto wrap = lookupNNCCache(aten::logit); + if (wrap) { + return wrap; + } + wrap = std::make_shared(); auto N = VarHandle("N", kInt); auto C = VarHandle("C", kFloat); Placeholder A("A", kFloat, {N}); @@ -117,7 +120,9 @@ std::shared_ptr createLogit() { }(); return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem)); }); - return wrapTECompute(wrap, B, {A, N, C}); + wrap = wrapTECompute(wrap, B, {A, N, C}); + updateNNCCache(aten::logit, wrap); + return wrap; } std::shared_ptr createRelu() { From 67d8e7b659b19e1ee68208b28bfa7dba73375dbc Mon Sep 17 00:00:00 2001 From: driazati Date: Wed, 25 Aug 2021 11:19:49 -0700 Subject: [PATCH 220/530] Reformat run_test.py (#63808) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63808 `black run_test.py` Test Plan: Imported from OSS Reviewed By: seemethere Differential Revision: D30497437 Pulled By: driazati fbshipit-source-id: 41b29b73f41fa4bb15fce5eaa69f8efe614e02f7 --- test/run_test.py | 1209 +++++++++++++++++++++++++--------------------- 1 file changed, 650 insertions(+), 559 deletions(-) diff --git a/test/run_test.py b/test/run_test.py index f3b7bf72bbbba..ecc93fe03aa30 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -14,7 +14,13 @@ import torch from torch.utils import cpp_extension -from torch.testing._internal.common_utils import FILE_SCHEMA, IS_IN_CI, TEST_WITH_ROCM, shell, set_cwd +from torch.testing._internal.common_utils import ( + FILE_SCHEMA, + IS_IN_CI, + TEST_WITH_ROCM, + shell, + set_cwd, +) import torch.distributed as dist from typing import Dict, Optional, List @@ -29,243 +35,245 @@ get_reordered_tests, get_test_case_configs, ) + HAVE_TEST_SELECTION_TOOLS = True except ImportError: HAVE_TEST_SELECTION_TOOLS = False - print("Unable to import test_selections from tools/testing. Running without test selection stats...") + print( + "Unable to import test_selections from tools/testing. Running without test selection stats..." + ) TESTS = [ - 'test_import_time', - 'test_public_bindings', - 'test_type_hints', - 'test_ao_sparsity', - 'test_autograd', - 'benchmark_utils/test_benchmark_utils', - 'test_binary_ufuncs', - 'test_buffer_protocol', - 'test_bundled_inputs', - 'test_complex', - 'test_cpp_api_parity', - 'test_cpp_extensions_aot_no_ninja', - 'test_cpp_extensions_aot_ninja', - 'test_cpp_extensions_jit', - 'distributed/test_c10d_common', - 'distributed/test_c10d_gloo', - 'distributed/test_c10d_nccl', - 'distributed/test_jit_c10d', - 'distributed/test_c10d_spawn_gloo', - 'distributed/test_c10d_spawn_nccl', - 'distributed/test_store', - 'distributed/test_pg_wrapper', - 'distributed/algorithms/test_join', - 'test_cuda', - 'test_jit_cuda_fuser', - 'test_cuda_primary_ctx', - 'test_dataloader', - 'test_datapipe', - 'distributed/test_data_parallel', - 'distributed/test_distributed_spawn', - 'distributions/test_constraints', - 'distributions/test_distributions', - 'test_dispatch', - 'test_foreach', - 'test_indexing', - 'test_jit', - 'test_linalg', - 'test_logging', - 'test_mkldnn', - 'test_model_dump', - 'test_module_init', - 'test_modules', - 'test_multiprocessing', - 'test_multiprocessing_spawn', - 'distributed/test_nccl', - 'test_native_functions', - 'test_numba_integration', - 'test_nn', - 'test_ops', - 'test_optim', - 'test_functional_optim', - 'test_pytree', - 'test_mobile_optimizer', - 'test_set_default_mobile_cpu_allocator', - 'test_xnnpack_integration', - 'test_vulkan', - 'test_sparse', - 'test_sparse_csr', - 'test_quantization', - 'test_pruning_op', - 'test_spectral_ops', - 'test_serialization', - 'test_shape_ops', - 'test_show_pickle', - 'test_sort_and_select', - 'test_tensor_creation_ops', - 'test_testing', - 'test_torch', - 'test_type_info', - 'test_unary_ufuncs', - 'test_utils', - 'test_view_ops', - 'test_vmap', - 'test_namedtuple_return_api', - 'test_numpy_interop', - 'test_jit_profiling', - 'test_jit_legacy', - 'test_jit_fuser_legacy', - 'test_tensorboard', - 'test_namedtensor', - 'test_reductions', - 'test_type_promotion', - 'test_jit_disabled', - 'test_function_schema', - 'test_overrides', - 'test_jit_fuser_te', - 'test_tensorexpr', - 'test_tensorexpr_pybind', - 'test_openmp', - 'test_profiler', + "test_import_time", + "test_public_bindings", + "test_type_hints", + "test_ao_sparsity", + "test_autograd", + "benchmark_utils/test_benchmark_utils", + "test_binary_ufuncs", + "test_buffer_protocol", + "test_bundled_inputs", + "test_complex", + "test_cpp_api_parity", + "test_cpp_extensions_aot_no_ninja", + "test_cpp_extensions_aot_ninja", + "test_cpp_extensions_jit", + "distributed/test_c10d_common", + "distributed/test_c10d_gloo", + "distributed/test_c10d_nccl", + "distributed/test_jit_c10d", + "distributed/test_c10d_spawn_gloo", + "distributed/test_c10d_spawn_nccl", + "distributed/test_store", + "distributed/test_pg_wrapper", + "distributed/algorithms/test_join", + "test_cuda", + "test_jit_cuda_fuser", + "test_cuda_primary_ctx", + "test_dataloader", + "test_datapipe", + "distributed/test_data_parallel", + "distributed/test_distributed_spawn", + "distributions/test_constraints", + "distributions/test_distributions", + "test_dispatch", + "test_foreach", + "test_indexing", + "test_jit", + "test_linalg", + "test_logging", + "test_mkldnn", + "test_model_dump", + "test_module_init", + "test_modules", + "test_multiprocessing", + "test_multiprocessing_spawn", + "distributed/test_nccl", + "test_native_functions", + "test_numba_integration", + "test_nn", + "test_ops", + "test_optim", + "test_functional_optim", + "test_pytree", + "test_mobile_optimizer", + "test_set_default_mobile_cpu_allocator", + "test_xnnpack_integration", + "test_vulkan", + "test_sparse", + "test_sparse_csr", + "test_quantization", + "test_pruning_op", + "test_spectral_ops", + "test_serialization", + "test_shape_ops", + "test_show_pickle", + "test_sort_and_select", + "test_tensor_creation_ops", + "test_testing", + "test_torch", + "test_type_info", + "test_unary_ufuncs", + "test_utils", + "test_view_ops", + "test_vmap", + "test_namedtuple_return_api", + "test_numpy_interop", + "test_jit_profiling", + "test_jit_legacy", + "test_jit_fuser_legacy", + "test_tensorboard", + "test_namedtensor", + "test_reductions", + "test_type_promotion", + "test_jit_disabled", + "test_function_schema", + "test_overrides", + "test_jit_fuser_te", + "test_tensorexpr", + "test_tensorexpr_pybind", + "test_openmp", + "test_profiler", "distributed/test_launcher", - 'distributed/nn/jit/test_instantiator', - 'distributed/rpc/test_faulty_agent', - 'distributed/rpc/test_tensorpipe_agent', - 'distributed/rpc/cuda/test_tensorpipe_agent', - 'test_determination', - 'test_futures', - 'test_fx', - 'test_fx_experimental', - 'test_functional_autograd_benchmark', - 'test_package', - 'test_license', - 'distributed/pipeline/sync/skip/test_api', - 'distributed/pipeline/sync/skip/test_gpipe', - 'distributed/pipeline/sync/skip/test_inspect_skip_layout', - 'distributed/pipeline/sync/skip/test_leak', - 'distributed/pipeline/sync/skip/test_portal', - 'distributed/pipeline/sync/skip/test_stash_pop', - 'distributed/pipeline/sync/skip/test_tracker', - 'distributed/pipeline/sync/skip/test_verify_skippables', - 'distributed/pipeline/sync/test_balance', - 'distributed/pipeline/sync/test_bugs', - 'distributed/pipeline/sync/test_checkpoint', - 'distributed/pipeline/sync/test_copy', - 'distributed/pipeline/sync/test_deferred_batch_norm', - 'distributed/pipeline/sync/test_dependency', - 'distributed/pipeline/sync/test_inplace', - 'distributed/pipeline/sync/test_microbatch', - 'distributed/pipeline/sync/test_phony', - 'distributed/pipeline/sync/test_pipe', - 'distributed/pipeline/sync/test_pipeline', - 'distributed/pipeline/sync/test_stream', - 'distributed/pipeline/sync/test_transparency', - 'distributed/pipeline/sync/test_worker', - 'distributed/optim/test_zero_redundancy_optimizer', - 'distributed/elastic/timer/api_test', - 'distributed/elastic/timer/local_timer_example', - 'distributed/elastic/timer/local_timer_test', - 'distributed/elastic/events/lib_test', - 'distributed/elastic/metrics/api_test', - 'distributed/elastic/utils/logging_test', - 'distributed/elastic/utils/util_test', - 'distributed/elastic/utils/distributed_test', - 'distributed/elastic/multiprocessing/api_test', - 'distributed/_sharding_spec/test_sharding_spec', - 'distributed/_sharded_tensor/test_sharded_tensor', + "distributed/nn/jit/test_instantiator", + "distributed/rpc/test_faulty_agent", + "distributed/rpc/test_tensorpipe_agent", + "distributed/rpc/cuda/test_tensorpipe_agent", + "test_determination", + "test_futures", + "test_fx", + "test_fx_experimental", + "test_functional_autograd_benchmark", + "test_package", + "test_license", + "distributed/pipeline/sync/skip/test_api", + "distributed/pipeline/sync/skip/test_gpipe", + "distributed/pipeline/sync/skip/test_inspect_skip_layout", + "distributed/pipeline/sync/skip/test_leak", + "distributed/pipeline/sync/skip/test_portal", + "distributed/pipeline/sync/skip/test_stash_pop", + "distributed/pipeline/sync/skip/test_tracker", + "distributed/pipeline/sync/skip/test_verify_skippables", + "distributed/pipeline/sync/test_balance", + "distributed/pipeline/sync/test_bugs", + "distributed/pipeline/sync/test_checkpoint", + "distributed/pipeline/sync/test_copy", + "distributed/pipeline/sync/test_deferred_batch_norm", + "distributed/pipeline/sync/test_dependency", + "distributed/pipeline/sync/test_inplace", + "distributed/pipeline/sync/test_microbatch", + "distributed/pipeline/sync/test_phony", + "distributed/pipeline/sync/test_pipe", + "distributed/pipeline/sync/test_pipeline", + "distributed/pipeline/sync/test_stream", + "distributed/pipeline/sync/test_transparency", + "distributed/pipeline/sync/test_worker", + "distributed/optim/test_zero_redundancy_optimizer", + "distributed/elastic/timer/api_test", + "distributed/elastic/timer/local_timer_example", + "distributed/elastic/timer/local_timer_test", + "distributed/elastic/events/lib_test", + "distributed/elastic/metrics/api_test", + "distributed/elastic/utils/logging_test", + "distributed/elastic/utils/util_test", + "distributed/elastic/utils/distributed_test", + "distributed/elastic/multiprocessing/api_test", + "distributed/_sharding_spec/test_sharding_spec", + "distributed/_sharded_tensor/test_sharded_tensor", ] # Tests need to be run with pytest. USE_PYTEST_LIST = [ - 'distributed/pipeline/sync/skip/test_api', - 'distributed/pipeline/sync/skip/test_gpipe', - 'distributed/pipeline/sync/skip/test_inspect_skip_layout', - 'distributed/pipeline/sync/skip/test_leak', - 'distributed/pipeline/sync/skip/test_portal', - 'distributed/pipeline/sync/skip/test_stash_pop', - 'distributed/pipeline/sync/skip/test_tracker', - 'distributed/pipeline/sync/skip/test_verify_skippables', - 'distributed/pipeline/sync/test_balance', - 'distributed/pipeline/sync/test_bugs', - 'distributed/pipeline/sync/test_checkpoint', - 'distributed/pipeline/sync/test_copy', - 'distributed/pipeline/sync/test_deferred_batch_norm', - 'distributed/pipeline/sync/test_dependency', - 'distributed/pipeline/sync/test_inplace', - 'distributed/pipeline/sync/test_microbatch', - 'distributed/pipeline/sync/test_phony', - 'distributed/pipeline/sync/test_pipe', - 'distributed/pipeline/sync/test_pipeline', - 'distributed/pipeline/sync/test_stream', - 'distributed/pipeline/sync/test_transparency', - 'distributed/pipeline/sync/test_worker', - 'distributions/test_constraints', - 'distributions/test_transforms', - 'distributions/test_utils', - 'test_typing', + "distributed/pipeline/sync/skip/test_api", + "distributed/pipeline/sync/skip/test_gpipe", + "distributed/pipeline/sync/skip/test_inspect_skip_layout", + "distributed/pipeline/sync/skip/test_leak", + "distributed/pipeline/sync/skip/test_portal", + "distributed/pipeline/sync/skip/test_stash_pop", + "distributed/pipeline/sync/skip/test_tracker", + "distributed/pipeline/sync/skip/test_verify_skippables", + "distributed/pipeline/sync/test_balance", + "distributed/pipeline/sync/test_bugs", + "distributed/pipeline/sync/test_checkpoint", + "distributed/pipeline/sync/test_copy", + "distributed/pipeline/sync/test_deferred_batch_norm", + "distributed/pipeline/sync/test_dependency", + "distributed/pipeline/sync/test_inplace", + "distributed/pipeline/sync/test_microbatch", + "distributed/pipeline/sync/test_phony", + "distributed/pipeline/sync/test_pipe", + "distributed/pipeline/sync/test_pipeline", + "distributed/pipeline/sync/test_stream", + "distributed/pipeline/sync/test_transparency", + "distributed/pipeline/sync/test_worker", + "distributions/test_constraints", + "distributions/test_transforms", + "distributions/test_utils", + "test_typing", "distributed/elastic/events/lib_test", "distributed/elastic/agent/server/test/api_test", ] WINDOWS_BLOCKLIST = [ - 'distributed/nn/jit/test_instantiator', - 'distributed/rpc/test_faulty_agent', - 'distributed/rpc/test_tensorpipe_agent', - 'distributed/rpc/cuda/test_tensorpipe_agent', - 'distributed/pipeline/sync/skip/test_api', - 'distributed/pipeline/sync/skip/test_gpipe', - 'distributed/pipeline/sync/skip/test_inspect_skip_layout', - 'distributed/pipeline/sync/skip/test_leak', - 'distributed/pipeline/sync/skip/test_portal', - 'distributed/pipeline/sync/skip/test_stash_pop', - 'distributed/pipeline/sync/skip/test_tracker', - 'distributed/pipeline/sync/skip/test_verify_skippables', - 'distributed/pipeline/sync/test_balance', - 'distributed/pipeline/sync/test_bugs', - 'distributed/pipeline/sync/test_checkpoint', - 'distributed/pipeline/sync/test_copy', - 'distributed/pipeline/sync/test_deferred_batch_norm', - 'distributed/pipeline/sync/test_dependency', - 'distributed/pipeline/sync/test_inplace', - 'distributed/pipeline/sync/test_microbatch', - 'distributed/pipeline/sync/test_phony', - 'distributed/pipeline/sync/test_pipe', - 'distributed/pipeline/sync/test_pipeline', - 'distributed/pipeline/sync/test_stream', - 'distributed/pipeline/sync/test_transparency', - 'distributed/pipeline/sync/test_worker', + "distributed/nn/jit/test_instantiator", + "distributed/rpc/test_faulty_agent", + "distributed/rpc/test_tensorpipe_agent", + "distributed/rpc/cuda/test_tensorpipe_agent", + "distributed/pipeline/sync/skip/test_api", + "distributed/pipeline/sync/skip/test_gpipe", + "distributed/pipeline/sync/skip/test_inspect_skip_layout", + "distributed/pipeline/sync/skip/test_leak", + "distributed/pipeline/sync/skip/test_portal", + "distributed/pipeline/sync/skip/test_stash_pop", + "distributed/pipeline/sync/skip/test_tracker", + "distributed/pipeline/sync/skip/test_verify_skippables", + "distributed/pipeline/sync/test_balance", + "distributed/pipeline/sync/test_bugs", + "distributed/pipeline/sync/test_checkpoint", + "distributed/pipeline/sync/test_copy", + "distributed/pipeline/sync/test_deferred_batch_norm", + "distributed/pipeline/sync/test_dependency", + "distributed/pipeline/sync/test_inplace", + "distributed/pipeline/sync/test_microbatch", + "distributed/pipeline/sync/test_phony", + "distributed/pipeline/sync/test_pipe", + "distributed/pipeline/sync/test_pipeline", + "distributed/pipeline/sync/test_stream", + "distributed/pipeline/sync/test_transparency", + "distributed/pipeline/sync/test_worker", "distributed/elastic/agent/server/test/api_test", - 'distributed/elastic/multiprocessing/api_test', - 'distributed/_sharded_tensor/test_sharded_tensor', + "distributed/elastic/multiprocessing/api_test", + "distributed/_sharded_tensor/test_sharded_tensor", ] ROCM_BLOCKLIST = [ - 'distributed/nn/jit/test_instantiator', - 'distributed/rpc/test_faulty_agent', - 'distributed/rpc/test_tensorpipe_agent', - 'distributed/rpc/cuda/test_tensorpipe_agent', - 'distributed/_sharded_tensor/test_sharded_tensor', - 'test_determination', - 'test_multiprocessing', - 'test_jit_legacy', - 'test_type_hints', - 'test_openmp', + "distributed/nn/jit/test_instantiator", + "distributed/rpc/test_faulty_agent", + "distributed/rpc/test_tensorpipe_agent", + "distributed/rpc/cuda/test_tensorpipe_agent", + "distributed/_sharded_tensor/test_sharded_tensor", + "test_determination", + "test_multiprocessing", + "test_jit_legacy", + "test_type_hints", + "test_openmp", ] RUN_PARALLEL_BLOCKLIST = [ - 'test_cpp_extensions_jit', - 'test_jit_disabled', - 'test_mobile_optimizer', - 'test_multiprocessing', - 'test_multiprocessing_spawn', - 'test_namedtuple_return_api', - 'test_overrides', - 'test_show_pickle', - 'test_tensorexpr', - 'test_cuda_primary_ctx', -] + [test for test in TESTS if test.startswith('distributed/')] - -WINDOWS_COVERAGE_BLOCKLIST = [ -] + "test_cpp_extensions_jit", + "test_jit_disabled", + "test_mobile_optimizer", + "test_multiprocessing", + "test_multiprocessing_spawn", + "test_namedtuple_return_api", + "test_overrides", + "test_show_pickle", + "test_tensorexpr", + "test_cuda_primary_ctx", +] + [test for test in TESTS if test.startswith("distributed/")] + +WINDOWS_COVERAGE_BLOCKLIST = [] # These tests are slow enough that it's worth calculating whether the patch @@ -273,76 +281,76 @@ # run with --determine-from, we use another generated list based on this one and the # previous test stats. TARGET_DET_LIST = [ - 'distributions/test_distributions', - 'test_nn', - 'test_autograd', - 'test_cpp_extensions_jit', - 'test_jit_legacy', - 'test_dataloader', - 'test_overrides', - 'test_linalg', - 'test_jit', - 'test_jit_profiling', - 'test_torch', - 'test_binary_ufuncs', - 'test_numpy_interop', - 'test_reductions', - 'test_shape_ops', - 'test_sort_and_select', - 'test_testing', - 'test_view_ops', - 'distributed/nn/jit/test_instantiator', - 'distributed/rpc/test_tensorpipe_agent', - 'distributed/rpc/cuda/test_tensorpipe_agent', - 'distributed/algorithms/ddp_comm_hooks/test_ddp_hooks', - 'distributed/test_distributed_spawn', - 'test_cuda', - 'test_cuda_primary_ctx', - 'test_cpp_extensions_aot_ninja', - 'test_cpp_extensions_aot_no_ninja', - 'test_serialization', - 'test_optim', - 'test_utils', - 'test_multiprocessing', - 'test_tensorboard', - 'distributed/test_c10d_common', - 'distributed/test_c10d_gloo', - 'distributed/test_c10d_nccl', - 'distributed/test_jit_c10d', - 'distributed/test_c10d_spawn_gloo', - 'distributed/test_c10d_spawn_nccl', - 'distributed/test_store', - 'distributed/test_pg_wrapper', - 'test_quantization', - 'test_pruning_op', - 'test_determination', - 'test_futures', - 'distributed/pipeline/sync/skip/test_api', - 'distributed/pipeline/sync/skip/test_gpipe', - 'distributed/pipeline/sync/skip/test_inspect_skip_layout', - 'distributed/pipeline/sync/skip/test_leak', - 'distributed/pipeline/sync/skip/test_portal', - 'distributed/pipeline/sync/skip/test_stash_pop', - 'distributed/pipeline/sync/skip/test_tracker', - 'distributed/pipeline/sync/skip/test_verify_skippables', - 'distributed/pipeline/sync/test_balance', - 'distributed/pipeline/sync/test_bugs', - 'distributed/pipeline/sync/test_checkpoint', - 'distributed/pipeline/sync/test_copy', - 'distributed/pipeline/sync/test_deferred_batch_norm', - 'distributed/pipeline/sync/test_dependency', - 'distributed/pipeline/sync/test_inplace', - 'distributed/pipeline/sync/test_microbatch', - 'distributed/pipeline/sync/test_phony', - 'distributed/pipeline/sync/test_pipe', - 'distributed/pipeline/sync/test_pipeline', - 'distributed/pipeline/sync/test_stream', - 'distributed/pipeline/sync/test_transparency', - 'distributed/pipeline/sync/test_worker', + "distributions/test_distributions", + "test_nn", + "test_autograd", + "test_cpp_extensions_jit", + "test_jit_legacy", + "test_dataloader", + "test_overrides", + "test_linalg", + "test_jit", + "test_jit_profiling", + "test_torch", + "test_binary_ufuncs", + "test_numpy_interop", + "test_reductions", + "test_shape_ops", + "test_sort_and_select", + "test_testing", + "test_view_ops", + "distributed/nn/jit/test_instantiator", + "distributed/rpc/test_tensorpipe_agent", + "distributed/rpc/cuda/test_tensorpipe_agent", + "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks", + "distributed/test_distributed_spawn", + "test_cuda", + "test_cuda_primary_ctx", + "test_cpp_extensions_aot_ninja", + "test_cpp_extensions_aot_no_ninja", + "test_serialization", + "test_optim", + "test_utils", + "test_multiprocessing", + "test_tensorboard", + "distributed/test_c10d_common", + "distributed/test_c10d_gloo", + "distributed/test_c10d_nccl", + "distributed/test_jit_c10d", + "distributed/test_c10d_spawn_gloo", + "distributed/test_c10d_spawn_nccl", + "distributed/test_store", + "distributed/test_pg_wrapper", + "test_quantization", + "test_pruning_op", + "test_determination", + "test_futures", + "distributed/pipeline/sync/skip/test_api", + "distributed/pipeline/sync/skip/test_gpipe", + "distributed/pipeline/sync/skip/test_inspect_skip_layout", + "distributed/pipeline/sync/skip/test_leak", + "distributed/pipeline/sync/skip/test_portal", + "distributed/pipeline/sync/skip/test_stash_pop", + "distributed/pipeline/sync/skip/test_tracker", + "distributed/pipeline/sync/skip/test_verify_skippables", + "distributed/pipeline/sync/test_balance", + "distributed/pipeline/sync/test_bugs", + "distributed/pipeline/sync/test_checkpoint", + "distributed/pipeline/sync/test_copy", + "distributed/pipeline/sync/test_deferred_batch_norm", + "distributed/pipeline/sync/test_dependency", + "distributed/pipeline/sync/test_inplace", + "distributed/pipeline/sync/test_microbatch", + "distributed/pipeline/sync/test_phony", + "distributed/pipeline/sync/test_pipe", + "distributed/pipeline/sync/test_pipeline", + "distributed/pipeline/sync/test_stream", + "distributed/pipeline/sync/test_transparency", + "distributed/pipeline/sync/test_worker", ] # the JSON file to store the S3 test stats -TEST_TIMES_FILE = '.pytorch-test-times.json' +TEST_TIMES_FILE = ".pytorch-test-times.json" # if a test file takes longer than 5 min, we add it to TARGET_DET_LIST SLOW_TEST_THRESHOLD = 300 @@ -353,28 +361,27 @@ if dist.is_available(): - DISTRIBUTED_TESTS_CONFIG['test'] = { - 'WORLD_SIZE': '1' - } + DISTRIBUTED_TESTS_CONFIG["test"] = {"WORLD_SIZE": "1"} if not TEST_WITH_ROCM and dist.is_mpi_available(): - DISTRIBUTED_TESTS_CONFIG['mpi'] = { - 'WORLD_SIZE': '3', - 'TEST_REPORT_SOURCE_OVERRIDE': 'dist-mpi' + DISTRIBUTED_TESTS_CONFIG["mpi"] = { + "WORLD_SIZE": "3", + "TEST_REPORT_SOURCE_OVERRIDE": "dist-mpi", } if dist.is_nccl_available(): - DISTRIBUTED_TESTS_CONFIG['nccl'] = { - 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3', - 'TEST_REPORT_SOURCE_OVERRIDE': 'dist-nccl' + DISTRIBUTED_TESTS_CONFIG["nccl"] = { + "WORLD_SIZE": "2" if torch.cuda.device_count() == 2 else "3", + "TEST_REPORT_SOURCE_OVERRIDE": "dist-nccl", } if dist.is_gloo_available(): - DISTRIBUTED_TESTS_CONFIG['gloo'] = { - 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3', - 'TEST_REPORT_SOURCE_OVERRIDE': 'dist-gloo' + DISTRIBUTED_TESTS_CONFIG["gloo"] = { + "WORLD_SIZE": "2" if torch.cuda.device_count() == 2 else "3", + "TEST_REPORT_SOURCE_OVERRIDE": "dist-gloo", } # https://stackoverflow.com/questions/2549939/get-signal-names-from-numbers-in-python -SIGNALS_TO_NAMES_DICT = {getattr(signal, n): n for n in dir(signal) - if n.startswith('SIG') and '_' not in n} +SIGNALS_TO_NAMES_DICT = { + getattr(signal, n): n for n in dir(signal) if n.startswith("SIG") and "_" not in n +} CPP_EXTENSIONS_ERROR = """ Ninja (https://ninja-build.org) is required for some of the C++ extensions @@ -385,18 +392,20 @@ PYTORCH_COLLECT_COVERAGE = bool(os.environ.get("PYTORCH_COLLECT_COVERAGE")) -ENABLE_PR_HISTORY_REORDERING = bool(os.environ.get("ENABLE_PR_HISTORY_REORDERING", "0") == "1") +ENABLE_PR_HISTORY_REORDERING = bool( + os.environ.get("ENABLE_PR_HISTORY_REORDERING", "0") == "1" +) JIT_EXECUTOR_TESTS = [ - 'test_jit_cuda_fuser', - 'test_jit_profiling', - 'test_jit_legacy', - 'test_jit_fuser_legacy', + "test_jit_cuda_fuser", + "test_jit_profiling", + "test_jit_legacy", + "test_jit_fuser_legacy", ] DISTRIBUTED_TESTS = [ - 'distributed/test_distributed_fork', - 'distributed/test_distributed_spawn', + "distributed/test_distributed_fork", + "distributed/test_distributed_spawn", ] # Dictionary matching test modules (in TESTS) to lists of test cases (within that test_module) that would be run when @@ -411,7 +420,7 @@ # The file from which the SPECIFIED_TEST_CASES_DICT will be filled, a CSV of test cases that would be run when # options.run_specified_test_cases is enabled. -SPECIFIED_TEST_CASES_FILE: str = '.pytorch_specified_test_cases.csv' +SPECIFIED_TEST_CASES_FILE: str = ".pytorch_specified_test_cases.csv" def print_to_stderr(message): @@ -421,15 +430,18 @@ def print_to_stderr(message): def get_test_case_args(test_module, using_pytest) -> List[str]: args = [] # if test_module not specified or specified with '__all__' then run all tests - if test_module not in SPECIFIED_TEST_CASES_DICT or '__all__' in SPECIFIED_TEST_CASES_DICT[test_module]: + if ( + test_module not in SPECIFIED_TEST_CASES_DICT + or "__all__" in SPECIFIED_TEST_CASES_DICT[test_module] + ): return args if using_pytest: - args.append('-k') - args.append(' or '.join(SPECIFIED_TEST_CASES_DICT[test_module])) + args.append("-k") + args.append(" or ".join(SPECIFIED_TEST_CASES_DICT[test_module])) else: for test in SPECIFIED_TEST_CASES_DICT[test_module]: - args.append('-k') + args.append("-k") args.append(test) return args @@ -437,59 +449,70 @@ def get_test_case_args(test_module, using_pytest) -> List[str]: def get_executable_command(options, allow_pytest, disable_coverage=False): if options.coverage and not disable_coverage: - executable = ['coverage', 'run', '--parallel-mode', '--source=torch'] + executable = ["coverage", "run", "--parallel-mode", "--source=torch"] else: executable = [sys.executable] if options.pytest: if allow_pytest: - executable += ['-m', 'pytest'] + executable += ["-m", "pytest"] else: - print_to_stderr('Pytest cannot be used for this test. Falling back to unittest.') + print_to_stderr( + "Pytest cannot be used for this test. Falling back to unittest." + ) return executable -def run_test(test_module, test_directory, options, launcher_cmd=None, extra_unittest_args=None): +def run_test( + test_module, test_directory, options, launcher_cmd=None, extra_unittest_args=None +): unittest_args = options.additional_unittest_args.copy() if options.verbose: unittest_args.append(f'-{"v"*options.verbose}') # in case of pytest if test_module in RUN_PARALLEL_BLOCKLIST: - unittest_args = [arg for arg in unittest_args if not arg.startswith('--run-parallel')] + unittest_args = [ + arg for arg in unittest_args if not arg.startswith("--run-parallel") + ] if extra_unittest_args: assert isinstance(extra_unittest_args, list) unittest_args.extend(extra_unittest_args) # If using pytest, replace -f with equivalent -x if options.pytest: - unittest_args = [arg if arg != '-f' else '-x' for arg in unittest_args] + unittest_args = [arg if arg != "-f" else "-x" for arg in unittest_args] elif IS_IN_CI: # use the downloaded test cases configuration, not supported in pytest - unittest_args.extend(['--import-slow-tests', '--import-disabled-tests']) + unittest_args.extend(["--import-slow-tests", "--import-disabled-tests"]) # Multiprocessing related tests cannot run with coverage. # Tracking issue: https://github.com/pytorch/pytorch/issues/50661 - disable_coverage = sys.platform == 'win32' and test_module in WINDOWS_COVERAGE_BLOCKLIST + disable_coverage = ( + sys.platform == "win32" and test_module in WINDOWS_COVERAGE_BLOCKLIST + ) # Extra arguments are not supported with pytest - executable = get_executable_command(options, allow_pytest=not extra_unittest_args, - disable_coverage=disable_coverage) + executable = get_executable_command( + options, allow_pytest=not extra_unittest_args, disable_coverage=disable_coverage + ) # TODO: move this logic into common_utils.py instead of passing in "-k" individually # The following logic for running specified tests will only run for non-distributed tests, as those are dispatched # to test_distributed and not run_test (this function) if options.run_specified_test_cases: - unittest_args.extend(get_test_case_args(test_module, 'pytest' in executable)) + unittest_args.extend(get_test_case_args(test_module, "pytest" in executable)) # Can't call `python -m unittest test_*` here because it doesn't run code # in `if __name__ == '__main__': `. So call `python test_*.py` instead. - argv = [test_module + '.py'] + unittest_args + argv = [test_module + ".py"] + unittest_args command = (launcher_cmd or []) + executable + argv - print_to_stderr('Executing {} ... [{}]'.format(command, datetime.now())) + print_to_stderr("Executing {} ... [{}]".format(command, datetime.now())) return shell(command, test_directory) def test_cuda_primary_ctx(test_module, test_directory, options): - return run_test(test_module, test_directory, options, extra_unittest_args=['--subprocess']) + return run_test( + test_module, test_directory, options, extra_unittest_args=["--subprocess"] + ) def _test_cpp_extensions_aot(test_directory, options, use_ninja): @@ -501,46 +524,52 @@ def _test_cpp_extensions_aot(test_directory, options, use_ninja): return 1 # Wipe the build folder, if it exists already - cpp_extensions_test_dir = os.path.join(test_directory, 'cpp_extensions') - cpp_extensions_test_build_dir = os.path.join(cpp_extensions_test_dir, 'build') + cpp_extensions_test_dir = os.path.join(test_directory, "cpp_extensions") + cpp_extensions_test_build_dir = os.path.join(cpp_extensions_test_dir, "build") if os.path.exists(cpp_extensions_test_build_dir): shutil.rmtree(cpp_extensions_test_build_dir) # Build the test cpp extensions modules shell_env = os.environ.copy() - shell_env['USE_NINJA'] = str(1 if use_ninja else 0) - cmd = [sys.executable, 'setup.py', 'install', '--root', './install'] + shell_env["USE_NINJA"] = str(1 if use_ninja else 0) + cmd = [sys.executable, "setup.py", "install", "--root", "./install"] return_code = shell(cmd, cwd=cpp_extensions_test_dir, env=shell_env) if return_code != 0: return return_code - if sys.platform != 'win32': - return_code = shell(cmd, - cwd=os.path.join(cpp_extensions_test_dir, 'no_python_abi_suffix_test'), - env=shell_env) + if sys.platform != "win32": + return_code = shell( + cmd, + cwd=os.path.join(cpp_extensions_test_dir, "no_python_abi_suffix_test"), + env=shell_env, + ) if return_code != 0: return return_code # "install" the test modules and run tests - python_path = os.environ.get('PYTHONPATH', '') + python_path = os.environ.get("PYTHONPATH", "") from shutil import copyfile - test_module = 'test_cpp_extensions_aot' + ('_ninja' if use_ninja else '_no_ninja') - copyfile(test_directory + '/test_cpp_extensions_aot.py', test_directory + '/' + test_module + '.py') + + test_module = "test_cpp_extensions_aot" + ("_ninja" if use_ninja else "_no_ninja") + copyfile( + test_directory + "/test_cpp_extensions_aot.py", + test_directory + "/" + test_module + ".py", + ) try: - cpp_extensions = os.path.join(test_directory, 'cpp_extensions') - install_directory = '' + cpp_extensions = os.path.join(test_directory, "cpp_extensions") + install_directory = "" # install directory is the one that is named site-packages - for root, directories, _ in os.walk(os.path.join(cpp_extensions, 'install')): + for root, directories, _ in os.walk(os.path.join(cpp_extensions, "install")): for directory in directories: - if '-packages' in directory: + if "-packages" in directory: install_directory = os.path.join(root, directory) - assert install_directory, 'install_directory must not be empty' - os.environ['PYTHONPATH'] = os.pathsep.join([install_directory, python_path]) + assert install_directory, "install_directory must not be empty" + os.environ["PYTHONPATH"] = os.pathsep.join([install_directory, python_path]) return run_test(test_module, test_directory, options) finally: - os.environ['PYTHONPATH'] = python_path - if os.path.exists(test_directory + '/' + test_module + '.py'): - os.remove(test_directory + '/' + test_module + '.py') + os.environ["PYTHONPATH"] = python_path + if os.path.exists(test_directory + "/" + test_module + ".py"): + os.remove(test_directory + "/" + test_module + ".py") def test_cpp_extensions_aot_ninja(test_module, test_directory, options): @@ -553,53 +582,73 @@ def test_cpp_extensions_aot_no_ninja(test_module, test_directory, options): def test_distributed(test_module, test_directory, options): # MPI tests are broken with Python-3.9 - mpi_available = subprocess.call('command -v mpiexec', shell=True) == 0 and sys.version_info < (3, 9) + mpi_available = subprocess.call( + "command -v mpiexec", shell=True + ) == 0 and sys.version_info < (3, 9) if options.verbose and not mpi_available: - print_to_stderr( - 'MPI not available -- MPI backend tests will be skipped') + print_to_stderr("MPI not available -- MPI backend tests will be skipped") config = DISTRIBUTED_TESTS_CONFIG for backend, env_vars in config.items(): - if sys.platform == 'win32' and backend != 'gloo': + if sys.platform == "win32" and backend != "gloo": continue - if backend == 'mpi' and not mpi_available: + if backend == "mpi" and not mpi_available: continue for with_init_file in {True, False}: - if sys.platform == 'win32' and not with_init_file: + if sys.platform == "win32" and not with_init_file: continue tmp_dir = tempfile.mkdtemp() if options.verbose: init_str = "with {} init_method" with_init = init_str.format("file" if with_init_file else "env") print_to_stderr( - 'Running distributed tests for the {} backend {}'.format( - backend, with_init)) - os.environ['TEMP_DIR'] = tmp_dir - os.environ['BACKEND'] = backend - os.environ['INIT_METHOD'] = 'env://' + "Running distributed tests for the {} backend {}".format( + backend, with_init + ) + ) + os.environ["TEMP_DIR"] = tmp_dir + os.environ["BACKEND"] = backend + os.environ["INIT_METHOD"] = "env://" os.environ.update(env_vars) if with_init_file: if test_module == "test_distributed_spawn": - init_method = f'{FILE_SCHEMA}{tmp_dir}/' + init_method = f"{FILE_SCHEMA}{tmp_dir}/" else: - init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file' - os.environ['INIT_METHOD'] = init_method + init_method = f"{FILE_SCHEMA}{tmp_dir}/shared_init_file" + os.environ["INIT_METHOD"] = init_method try: - os.mkdir(os.path.join(tmp_dir, 'barrier')) - os.mkdir(os.path.join(tmp_dir, 'test_dir')) - if backend == 'mpi': + os.mkdir(os.path.join(tmp_dir, "barrier")) + os.mkdir(os.path.join(tmp_dir, "test_dir")) + if backend == "mpi": # test mpiexec for --noprefix option - with open(os.devnull, 'w') as devnull: - allowrunasroot_opt = '--allow-run-as-root' if subprocess.call( - 'mpiexec --allow-run-as-root -n 1 bash -c ""', shell=True, - stdout=devnull, stderr=subprocess.STDOUT) == 0 else '' - noprefix_opt = '--noprefix' if subprocess.call( - f'mpiexec {allowrunasroot_opt} -n 1 --noprefix bash -c ""', shell=True, - stdout=devnull, stderr=subprocess.STDOUT) == 0 else '' - - mpiexec = ['mpiexec', '-n', '3', noprefix_opt, allowrunasroot_opt] - - return_code = run_test(test_module, test_directory, options, - launcher_cmd=mpiexec) + with open(os.devnull, "w") as devnull: + allowrunasroot_opt = ( + "--allow-run-as-root" + if subprocess.call( + 'mpiexec --allow-run-as-root -n 1 bash -c ""', + shell=True, + stdout=devnull, + stderr=subprocess.STDOUT, + ) + == 0 + else "" + ) + noprefix_opt = ( + "--noprefix" + if subprocess.call( + f'mpiexec {allowrunasroot_opt} -n 1 --noprefix bash -c ""', + shell=True, + stdout=devnull, + stderr=subprocess.STDOUT, + ) + == 0 + else "" + ) + + mpiexec = ["mpiexec", "-n", "3", noprefix_opt, allowrunasroot_opt] + + return_code = run_test( + test_module, test_directory, options, launcher_cmd=mpiexec + ) else: return_code = run_test(test_module, test_directory, options) if return_code != 0: @@ -610,15 +659,15 @@ def test_distributed(test_module, test_directory, options): CUSTOM_HANDLERS = { - 'test_cuda_primary_ctx': test_cuda_primary_ctx, - 'test_cpp_extensions_aot_no_ninja': test_cpp_extensions_aot_no_ninja, - 'test_cpp_extensions_aot_ninja': test_cpp_extensions_aot_ninja, - 'distributed/test_distributed_spawn': test_distributed, + "test_cuda_primary_ctx": test_cuda_primary_ctx, + "test_cpp_extensions_aot_no_ninja": test_cpp_extensions_aot_no_ninja, + "test_cpp_extensions_aot_ninja": test_cpp_extensions_aot_ninja, + "distributed/test_distributed_spawn": test_distributed, } def parse_test_module(test): - return test.split('.')[0] + return test.split(".")[0] class TestChoices(list): @@ -631,137 +680,152 @@ def __contains__(self, item): def parse_args(): parser = argparse.ArgumentParser( - description='Run the PyTorch unit test suite', - epilog='where TESTS is any of: {}'.format(', '.join(TESTS)), - formatter_class=argparse.RawTextHelpFormatter) + description="Run the PyTorch unit test suite", + epilog="where TESTS is any of: {}".format(", ".join(TESTS)), + formatter_class=argparse.RawTextHelpFormatter, + ) parser.add_argument( - '-v', - '--verbose', - action='count', + "-v", + "--verbose", + action="count", default=0, - help='print verbose information and test-by-test results') - parser.add_argument( - '--jit', - '--jit', - action='store_true', - help='run all jit tests') + help="print verbose information and test-by-test results", + ) + parser.add_argument("--jit", "--jit", action="store_true", help="run all jit tests") parser.add_argument( - '--distributed-tests', - '--distributed-tests', - action='store_true', - help='run all distributed tests') + "--distributed-tests", + "--distributed-tests", + action="store_true", + help="run all distributed tests", + ) parser.add_argument( - '-pt', '--pytest', action='store_true', - help='If true, use `pytest` to execute the tests. E.g., this runs ' - 'TestTorch with pytest in verbose and coverage mode: ' - 'python run_test.py -vci torch -pt') + "-pt", + "--pytest", + action="store_true", + help="If true, use `pytest` to execute the tests. E.g., this runs " + "TestTorch with pytest in verbose and coverage mode: " + "python run_test.py -vci torch -pt", + ) parser.add_argument( - '-c', '--coverage', action='store_true', help='enable coverage', - default=PYTORCH_COLLECT_COVERAGE) + "-c", + "--coverage", + action="store_true", + help="enable coverage", + default=PYTORCH_COLLECT_COVERAGE, + ) parser.add_argument( - '-i', - '--include', - nargs='+', + "-i", + "--include", + nargs="+", choices=TestChoices(TESTS), default=TESTS, - metavar='TESTS', - help='select a set of tests to include (defaults to ALL tests).' - ' tests must be a part of the TESTS list defined in run_test.py') + metavar="TESTS", + help="select a set of tests to include (defaults to ALL tests)." + " tests must be a part of the TESTS list defined in run_test.py", + ) parser.add_argument( - '-x', - '--exclude', - nargs='+', + "-x", + "--exclude", + nargs="+", choices=TESTS, - metavar='TESTS', + metavar="TESTS", default=[], - help='select a set of tests to exclude') + help="select a set of tests to exclude", + ) parser.add_argument( - '-f', - '--first', + "-f", + "--first", choices=TESTS, - metavar='TESTS', - help='select the test to start from (excludes previous tests)') + metavar="TESTS", + help="select the test to start from (excludes previous tests)", + ) parser.add_argument( - '-l', - '--last', + "-l", + "--last", choices=TESTS, - metavar='TESTS', - help='select the last test to run (excludes following tests)') + metavar="TESTS", + help="select the last test to run (excludes following tests)", + ) parser.add_argument( - '--bring-to-front', - nargs='+', + "--bring-to-front", + nargs="+", choices=TestChoices(TESTS), default=[], - metavar='TESTS', - help='select a set of tests to run first. This can be used in situations' - ' where you want to run all tests, but care more about some set, ' - 'e.g. after making a change to a specific component') + metavar="TESTS", + help="select a set of tests to run first. This can be used in situations" + " where you want to run all tests, but care more about some set, " + "e.g. after making a change to a specific component", + ) parser.add_argument( - '--ignore-win-blocklist', - action='store_true', - help='always run blocklisted windows tests') + "--ignore-win-blocklist", + action="store_true", + help="always run blocklisted windows tests", + ) parser.add_argument( - '--determine-from', - help='File of affected source filenames to determine which tests to run.') + "--determine-from", + help="File of affected source filenames to determine which tests to run.", + ) parser.add_argument( - '--continue-through-error', - action='store_true', - help='Runs the full test suite despite one of the tests failing', - default=strtobool(os.environ.get("CONTINUE_THROUGH_ERROR", "False"))) + "--continue-through-error", + action="store_true", + help="Runs the full test suite despite one of the tests failing", + default=strtobool(os.environ.get("CONTINUE_THROUGH_ERROR", "False")), + ) parser.add_argument( - 'additional_unittest_args', - nargs='*', - help='additional arguments passed through to unittest, e.g., ' - 'python run_test.py -i sparse -- TestSparse.test_factory_size_check') + "additional_unittest_args", + nargs="*", + help="additional arguments passed through to unittest, e.g., " + "python run_test.py -i sparse -- TestSparse.test_factory_size_check", + ) parser.add_argument( - '--export-past-test-times', - nargs='?', + "--export-past-test-times", + nargs="?", type=str, const=TEST_TIMES_FILE, - help='dumps test times from previous S3 stats into a file, format JSON', + help="dumps test times from previous S3 stats into a file, format JSON", ) parser.add_argument( - '--shard', + "--shard", nargs=2, type=int, - help='runs a shard of the tests (taking into account other selections), e.g., ' - '--shard 2 3 will break up the selected tests into 3 shards and run the tests ' - 'in the 2nd shard (the first number should not exceed the second)', + help="runs a shard of the tests (taking into account other selections), e.g., " + "--shard 2 3 will break up the selected tests into 3 shards and run the tests " + "in the 2nd shard (the first number should not exceed the second)", ) parser.add_argument( - '--exclude-jit-executor', - action='store_true', - help='exclude tests that are run for a specific jit config' + "--exclude-jit-executor", + action="store_true", + help="exclude tests that are run for a specific jit config", ) parser.add_argument( - '--exclude-distributed-tests', - action='store_true', - help='exclude distributed tests' + "--exclude-distributed-tests", + action="store_true", + help="exclude distributed tests", ) parser.add_argument( - '--run-specified-test-cases', - nargs='?', + "--run-specified-test-cases", + nargs="?", type=str, const=SPECIFIED_TEST_CASES_FILE, - help='load specified test cases file dumped from previous OSS CI stats, format CSV. ' - ' If all test cases should run for a please add a single row: \n' - ' test_filename,test_case_name\n' - ' ...\n' - ' ,__all__\n' - ' ...\n' - 'how we use the stats will be based on option "--use-specified-test-cases-by".' + help="load specified test cases file dumped from previous OSS CI stats, format CSV. " + " If all test cases should run for a please add a single row: \n" + " test_filename,test_case_name\n" + " ...\n" + " ,__all__\n" + " ...\n" + 'how we use the stats will be based on option "--use-specified-test-cases-by".', ) parser.add_argument( - '--use-specified-test-cases-by', + "--use-specified-test-cases-by", type=str, - choices=['include', 'bring-to-front'], - default='include', + choices=["include", "bring-to-front"], + default="include", help='used together with option "--run-specified-test-cases". When specified test case ' - 'file is set, this option allows the user to control whether to only run the specified test ' - 'modules or to simply bring the specified modules to front and also run the remaining ' - 'modules. Note: regardless of this option, we will only run the specified test cases ' - ' within a specified test module. For unspecified test modules with the bring-to-front ' - 'option, all test cases will be run, as one may expect.', + "file is set, this option allows the user to control whether to only run the specified test " + "modules or to simply bring the specified modules to front and also run the remaining " + "modules. Note: regardless of this option, we will only run the specified test cases " + " within a specified test module. For unspecified test modules with the bring-to-front " + "option, all test cases will be run, as one may expect.", ) return parser.parse_args() @@ -809,7 +873,7 @@ def exclude_tests(exclude_list, selected_tests, exclude_message=None): for test in tests_copy: if test.startswith(exclude_test): if exclude_message is not None: - print_to_stderr('Excluding {} {}'.format(test, exclude_message)) + print_to_stderr("Excluding {} {}".format(test, exclude_message)) selected_tests.remove(test) return selected_tests @@ -817,9 +881,9 @@ def exclude_tests(exclude_list, selected_tests, exclude_message=None): def get_selected_tests(options): # First make sure run specific test cases options are processed. if options.run_specified_test_cases: - if options.use_specified_test_cases_by == 'include': + if options.use_specified_test_cases_by == "include": options.include = list(SPECIFIED_TEST_CASES_DICT.keys()) - elif options.use_specified_test_cases_by == 'bring-to-front': + elif options.use_specified_test_cases_by == "bring-to-front": options.bring_to_front = list(SPECIFIED_TEST_CASES_DICT.keys()) selected_tests = options.include @@ -827,17 +891,20 @@ def get_selected_tests(options): # filter if there's JIT only and distributed only test options if options.jit: selected_tests = list( - filter(lambda test_name: "jit" in test_name, selected_tests)) + filter(lambda test_name: "jit" in test_name, selected_tests) + ) if options.distributed_tests: selected_tests = list( - filter(lambda test_name: test_name in DISTRIBUTED_TESTS, selected_tests)) + filter(lambda test_name: test_name in DISTRIBUTED_TESTS, selected_tests) + ) # process reordering if options.bring_to_front: to_front = set(options.bring_to_front) - selected_tests = options.bring_to_front + list(filter(lambda name: name not in to_front, - selected_tests)) + selected_tests = options.bring_to_front + list( + filter(lambda name: name not in to_front, selected_tests) + ) if options.first: first_index = find_test_index(options.first, selected_tests) @@ -845,7 +912,7 @@ def get_selected_tests(options): if options.last: last_index = find_test_index(options.last, selected_tests, find_last_index=True) - selected_tests = selected_tests[:last_index + 1] + selected_tests = selected_tests[: last_index + 1] # process exclusion if options.exclude_jit_executor: @@ -856,30 +923,36 @@ def get_selected_tests(options): selected_tests = exclude_tests(options.exclude, selected_tests) - if sys.platform == 'win32' and not options.ignore_win_blocklist: - target_arch = os.environ.get('VSCMD_ARG_TGT_ARCH') - if target_arch != 'x64': - WINDOWS_BLOCKLIST.append('cpp_extensions_aot_no_ninja') - WINDOWS_BLOCKLIST.append('cpp_extensions_aot_ninja') - WINDOWS_BLOCKLIST.append('cpp_extensions_jit') - WINDOWS_BLOCKLIST.append('jit') - WINDOWS_BLOCKLIST.append('jit_fuser') + if sys.platform == "win32" and not options.ignore_win_blocklist: + target_arch = os.environ.get("VSCMD_ARG_TGT_ARCH") + if target_arch != "x64": + WINDOWS_BLOCKLIST.append("cpp_extensions_aot_no_ninja") + WINDOWS_BLOCKLIST.append("cpp_extensions_aot_ninja") + WINDOWS_BLOCKLIST.append("cpp_extensions_jit") + WINDOWS_BLOCKLIST.append("jit") + WINDOWS_BLOCKLIST.append("jit_fuser") - selected_tests = exclude_tests(WINDOWS_BLOCKLIST, selected_tests, 'on Windows') + selected_tests = exclude_tests(WINDOWS_BLOCKLIST, selected_tests, "on Windows") elif TEST_WITH_ROCM: - selected_tests = exclude_tests(ROCM_BLOCKLIST, selected_tests, 'on ROCm') + selected_tests = exclude_tests(ROCM_BLOCKLIST, selected_tests, "on ROCm") # sharding if options.shard: assert len(options.shard) == 2, "Unexpected shard format" assert min(options.shard) > 0, "Shards must be positive numbers" which_shard, num_shards = options.shard - assert which_shard <= num_shards, "Selected shard must be less than or equal to total number of shards" - assert num_shards <= len(selected_tests), f"Number of shards must be less than {len(selected_tests)}" + assert ( + which_shard <= num_shards + ), "Selected shard must be less than or equal to total number of shards" + assert num_shards <= len( + selected_tests + ), f"Number of shards must be less than {len(selected_tests)}" # TODO: fix this to use test_times_filename, but currently this is not working # because setting the export arg immeidately halts the test execution. - selected_tests = get_shard_based_on_S3(which_shard, num_shards, selected_tests, TEST_TIMES_FILE) + selected_tests = get_shard_based_on_S3( + which_shard, num_shards, selected_tests, TEST_TIMES_FILE + ) return selected_tests @@ -896,27 +969,27 @@ def test_impact_of_file(filename): CI - CI configuration files """ parts = filename.split(os.sep) - if parts[0] in ['.jenkins', '.circleci']: - return 'CI' - if parts[0] in ['docs', 'scripts', 'CODEOWNERS', 'README.md']: - return 'NONE' - elif parts[0] == 'torch': - if parts[-1].endswith('.py') or parts[-1].endswith('.pyi'): - return 'TORCH' - elif parts[0] == 'caffe2': - if parts[-1].endswith('.py') or parts[-1].endswith('.pyi'): - return 'CAFFE2' - elif parts[0] == 'test': - if parts[-1].endswith('.py') or parts[-1].endswith('.pyi'): - return 'TEST' - - return 'UNKNOWN' + if parts[0] in [".jenkins", ".circleci"]: + return "CI" + if parts[0] in ["docs", "scripts", "CODEOWNERS", "README.md"]: + return "NONE" + elif parts[0] == "torch": + if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"): + return "TORCH" + elif parts[0] == "caffe2": + if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"): + return "CAFFE2" + elif parts[0] == "test": + if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"): + return "TEST" + + return "UNKNOWN" def log_test_reason(file_type, filename, test, options): if options.verbose: print_to_stderr( - 'Determination found {} file {} -- running {}'.format( + "Determination found {} file {} -- running {}".format( file_type, filename, test, @@ -930,37 +1003,37 @@ def get_dep_modules(test): return _DEP_MODULES_CACHE[test] repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - test_location = os.path.join(repo_root, 'test', test + '.py') + test_location = os.path.join(repo_root, "test", test + ".py") finder = modulefinder.ModuleFinder( # Ideally exclude all third party modules, to speed up calculation. excludes=[ - 'scipy', - 'numpy', - 'numba', - 'multiprocessing', - 'sklearn', - 'setuptools', - 'hypothesis', - 'llvmlite', - 'joblib', - 'email', - 'importlib', - 'unittest', - 'urllib', - 'json', - 'collections', + "scipy", + "numpy", + "numba", + "multiprocessing", + "sklearn", + "setuptools", + "hypothesis", + "llvmlite", + "joblib", + "email", + "importlib", + "unittest", + "urllib", + "json", + "collections", # Modules below are excluded because they are hitting https://bugs.python.org/issue40350 # Trigger AttributeError: 'NoneType' object has no attribute 'is_package' - 'mpl_toolkits', - 'google', - 'onnx', + "mpl_toolkits", + "google", + "onnx", # Triggers RecursionError - 'mypy' + "mypy", ], ) # HACK: some platforms default to ascii, so we can't just run_script :( - with open(test_location, 'r', encoding='utf-8') as fp: - finder.load_module('__main__', fp, test_location, ('', 'r', 1)) + with open(test_location, "r", encoding="utf-8") as fp: + finder.load_module("__main__", fp, test_location, ("", "r", 1)) dep_modules = set(finder.modules.keys()) _DEP_MODULES_CACHE[test] = dep_modules @@ -972,45 +1045,44 @@ def determine_target(target_det_list, test, touched_files, options): # Some tests are faster to execute than to determine. if test not in target_det_list: if options.verbose: - print_to_stderr(f'Running {test} without determination') + print_to_stderr(f"Running {test} without determination") return True # HACK: "no_ninja" is not a real module - if test.endswith('_no_ninja'): - test = test[:(-1 * len('_no_ninja'))] - if test.endswith('_ninja'): - test = test[:(-1 * len('_ninja'))] + if test.endswith("_no_ninja"): + test = test[: (-1 * len("_no_ninja"))] + if test.endswith("_ninja"): + test = test[: (-1 * len("_ninja"))] dep_modules = get_dep_modules(test) for touched_file in touched_files: file_type = test_impact_of_file(touched_file) - if file_type == 'NONE': + if file_type == "NONE": continue - elif file_type == 'CI': + elif file_type == "CI": # Force all tests to run if any change is made to the CI # configurations. log_test_reason(file_type, touched_file, test, options) return True - elif file_type == 'UNKNOWN': + elif file_type == "UNKNOWN": # Assume uncategorized source files can affect every test. log_test_reason(file_type, touched_file, test, options) return True - elif file_type in ['TORCH', 'CAFFE2', 'TEST']: + elif file_type in ["TORCH", "CAFFE2", "TEST"]: parts = os.path.splitext(touched_file)[0].split(os.sep) touched_module = ".".join(parts) # test/ path does not have a "test." namespace - if touched_module.startswith('test.'): - touched_module = touched_module.split('test.')[1] - if ( - touched_module in dep_modules - or touched_module == test.replace('/', '.') + if touched_module.startswith("test."): + touched_module = touched_module.split("test.")[1] + if touched_module in dep_modules or touched_module == test.replace( + "/", "." ): log_test_reason(file_type, touched_file, test, options) return True # If nothing has determined the test has run, don't run the test. if options.verbose: - print_to_stderr(f'Determination is skipping {test}') + print_to_stderr(f"Determination is skipping {test}") return False @@ -1019,20 +1091,21 @@ def run_test_module(test: str, test_directory: str, options) -> Optional[str]: test_module = parse_test_module(test) # Printing the date here can help diagnose which tests are slow - print_to_stderr('Running {} ... [{}]'.format(test, datetime.now())) + print_to_stderr("Running {} ... [{}]".format(test, datetime.now())) handler = CUSTOM_HANDLERS.get(test_module, run_test) return_code = handler(test_module, test_directory, options) assert isinstance(return_code, int) and not isinstance( - return_code, bool), 'Return code should be an integer' + return_code, bool + ), "Return code should be an integer" if return_code == 0: return None - message = f'{test} failed!' + message = f"{test} failed!" if return_code < 0: # subprocess.Popen returns the child process' exit signal as # return code -N, where N is the signal number. signal_name = SIGNALS_TO_NAMES_DICT[-return_code] - message += f' Received signal: {signal_name}' + message += f" Received signal: {signal_name}" return message @@ -1042,44 +1115,60 @@ def main(): # TODO: move this export & download function in tools/ folder test_times_filename = options.export_past_test_times if test_times_filename: - print(f'Exporting past test times from S3 to {test_times_filename}, no tests will be run.') + print( + f"Exporting past test times from S3 to {test_times_filename}, no tests will be run." + ) export_S3_test_times(test_times_filename) return specified_test_cases_filename = options.run_specified_test_cases if specified_test_cases_filename: - print(f'Loading specified test cases to run from {specified_test_cases_filename}.') + print( + f"Loading specified test cases to run from {specified_test_cases_filename}." + ) global SPECIFIED_TEST_CASES_DICT - SPECIFIED_TEST_CASES_DICT = get_specified_test_cases(specified_test_cases_filename, TESTS) + SPECIFIED_TEST_CASES_DICT = get_specified_test_cases( + specified_test_cases_filename, TESTS + ) test_directory = os.path.dirname(os.path.abspath(__file__)) selected_tests = get_selected_tests(options) if options.verbose: - print_to_stderr('Selected tests: {}'.format(', '.join(selected_tests))) + print_to_stderr("Selected tests: {}".format(", ".join(selected_tests))) if options.coverage and not PYTORCH_COLLECT_COVERAGE: - shell(['coverage', 'erase']) + shell(["coverage", "erase"]) if options.determine_from is not None and os.path.exists(options.determine_from): - slow_tests = get_slow_tests_based_on_S3(TESTS, TARGET_DET_LIST, SLOW_TEST_THRESHOLD) - print('Added the following tests to target_det tests as calculated based on S3:') + slow_tests = get_slow_tests_based_on_S3( + TESTS, TARGET_DET_LIST, SLOW_TEST_THRESHOLD + ) + print( + "Added the following tests to target_det tests as calculated based on S3:" + ) print(slow_tests) - with open(options.determine_from, 'r') as fh: + with open(options.determine_from, "r") as fh: touched_files = [ - os.path.normpath(name.strip()) for name in fh.read().split('\n') + os.path.normpath(name.strip()) + for name in fh.read().split("\n") if len(name.strip()) > 0 ] # HACK: Ensure the 'test' paths can be traversed by Modulefinder - sys.path.append('test') + sys.path.append("test") selected_tests = [ - test for test in selected_tests - if determine_target(TARGET_DET_LIST + slow_tests, test, touched_files, options) + test + for test in selected_tests + if determine_target( + TARGET_DET_LIST + slow_tests, test, touched_files, options + ) ] - sys.path.remove('test') + sys.path.remove("test") if IS_IN_CI: - selected_tests = get_reordered_tests(selected_tests, ENABLE_PR_HISTORY_REORDERING) + selected_tests = get_reordered_tests( + selected_tests, ENABLE_PR_HISTORY_REORDERING + ) # downloading test cases configuration to local environment get_test_case_configs(dirpath=os.path.dirname(os.path.abspath(__file__))) @@ -1101,6 +1190,7 @@ def main(): finally: if options.coverage: from coverage import Coverage + test_dir = os.path.dirname(os.path.abspath(__file__)) with set_cwd(test_dir): cov = Coverage() @@ -1116,5 +1206,6 @@ def main(): print_to_stderr(err) sys.exit(1) -if __name__ == '__main__': + +if __name__ == "__main__": main() From 3d4aabfc483f274817749c45870a32306b67bfd8 Mon Sep 17 00:00:00 2001 From: zhouzhuojie Date: Wed, 25 Aug 2021 11:30:28 -0700 Subject: [PATCH 221/530] Fix ciflow/all label generation (#63954) Summary: the `ciflow/all` is automatically added but need to be added before we call `gen_root_job_condition`. - fix the order of adding `ciflow/all` - refactor all the string into global constants Pull Request resolved: https://github.com/pytorch/pytorch/pull/63954 Reviewed By: malfet Differential Revision: D30545596 Pulled By: zhouzhuojie fbshipit-source-id: 83ab668f0234488afb855a72e3ebd4503f7f1a78 --- .github/scripts/generate_ci_workflows.py | 68 +++++++++++-------- ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml | 2 +- ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml | 2 +- ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 2 +- ...rated-linux-bionic-py3.8-gcc9-coverage.yml | 2 +- ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 2 +- ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 2 +- .../generated-linux-xenial-py3.6-gcc5.4.yml | 2 +- ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml | 2 +- ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml | 2 +- ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 2 +- ...rated-periodic-win-vs2019-cuda11.1-py3.yml | 2 +- .../generated-win-vs2019-cpu-py3.yml | 2 +- .../generated-win-vs2019-cuda10.1-py3.yml | 2 +- .../generated-win-vs2019-cuda11.3-py3.yml | 2 +- 15 files changed, 53 insertions(+), 43 deletions(-) diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index e24c2e5af3893..946d8da6a29ad 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -38,6 +38,18 @@ LINUX_CPU_TEST_RUNNER, } +LABEL_CIFLOW_ALL = "ciflow/all" +LABEL_CIFLOW_BAZEL = "ciflow/bazel" +LABEL_CIFLOW_COVERAGE = "ciflow/coverage" +LABEL_CIFLOW_CPU = "ciflow/cpu" +LABEL_CIFLOW_CUDA = "ciflow/cuda" +LABEL_CIFLOW_DEFAULT = "ciflow/default" +LABEL_CIFLOW_LIBTORCH = "ciflow/libtorch" +LABEL_CIFLOW_LINUX = "ciflow/linux" +LABEL_CIFLOW_SCHEDULED = "ciflow/scheduled" +LABEL_CIFLOW_SLOW = "ciflow/slow" +LABEL_CIFLOW_WIN = "ciflow/win" + @dataclass class CIFlowConfig: @@ -73,6 +85,7 @@ def __post_init__(self) -> None: if not self.enabled: self.reset_root_job() return + self.labels.add(LABEL_CIFLOW_ALL) self.gen_root_job_condition() @@ -149,10 +162,6 @@ def __post_init__(self) -> None: self.num_test_shards_on_pull_request = 1 else: self.num_test_shards_on_pull_request = self.num_test_shards - - # Add ciflow/all to labels - self.ciflow_config.labels.add('ciflow/all') - self.assert_valid() def assert_valid(self) -> None: @@ -163,18 +172,19 @@ def assert_valid(self) -> None: assert self.test_runner_type in WINDOWS_RUNNERS, err_message if self.ciflow_config.enabled: - # make sure if ciflow/default is set, we then need to set trigger_action_only to False - assert self.ciflow_config.trigger_action_only != ('ciflow/default' in self.ciflow_config.labels) + # make sure if LABEL_CIFLOW_DEFAULT is set, we then need to set trigger_action_only to False + assert self.ciflow_config.trigger_action_only != (LABEL_CIFLOW_DEFAULT in self.ciflow_config.labels) assert self.on_pull_request - assert 'ciflow/all' in self.ciflow_config.labels + assert LABEL_CIFLOW_ALL in self.ciflow_config.labels + assert LABEL_CIFLOW_ALL in self.ciflow_config.root_job_condition if self.arch == 'linux': - assert 'ciflow/linux' in self.ciflow_config.labels + assert LABEL_CIFLOW_LINUX in self.ciflow_config.labels if self.arch == 'windows': - assert 'ciflow/win' in self.ciflow_config.labels + assert LABEL_CIFLOW_WIN in self.ciflow_config.labels if self.test_runner_type in CUDA_RUNNERS: - assert 'ciflow/cuda' in self.ciflow_config.labels + assert LABEL_CIFLOW_CUDA in self.ciflow_config.labels if self.test_runner_type in CPU_RUNNERS: - assert 'ciflow/cpu' in self.ciflow_config.labels + assert LABEL_CIFLOW_CPU in self.ciflow_config.labels def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: output_file_path = GITHUB_DIR / f"workflows/generated-{self.build_environment}.yml" @@ -196,7 +206,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: num_test_shards=2, ciflow_config=CIFlowConfig( enabled=True, - labels={'ciflow/default', 'ciflow/cpu', 'ciflow/win'} + labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CPU, LABEL_CIFLOW_WIN} ), ), CIWorkflow( @@ -209,7 +219,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: num_test_shards=2, ciflow_config=CIFlowConfig( enabled=True, - labels={'ciflow/default', 'ciflow/cuda', 'ciflow/win'} + labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} ), ), CIWorkflow( @@ -222,7 +232,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels={'ciflow/cuda', 'ciflow/win'} + labels={LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} ), ), CIWorkflow( @@ -236,7 +246,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels={'ciflow/scheduled', 'ciflow/win', 'ciflow/cuda'} + labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_WIN, LABEL_CIFLOW_CUDA} ), ), ] @@ -252,7 +262,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: num_test_shards=2, ciflow_config=CIFlowConfig( enabled=True, - labels={'ciflow/default', 'ciflow/linux', 'ciflow/cpu'} + labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU} ), ), # CIWorkflow( @@ -301,7 +311,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels={'ciflow/slow', 'ciflow/linux', 'ciflow/cuda'} + labels={LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA} ), ), CIWorkflow( @@ -319,7 +329,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels=set(['ciflow/slow', 'ciflow/linux', 'ciflow/cuda']), + labels=set([LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), ), ), CIWorkflow( @@ -332,7 +342,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels=set(['ciflow/libtorch', 'ciflow/linux', 'ciflow/cuda']), + labels=set([LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), ), ), CIWorkflow( @@ -344,7 +354,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: on_pull_request=True, ciflow_config=CIFlowConfig( enabled=True, - labels=set(['ciflow/default', 'ciflow/linux', 'ciflow/cuda']), + labels=set([LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), ), ), CIWorkflow( @@ -357,7 +367,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels=set(['ciflow/libtorch', 'ciflow/linux', 'ciflow/cuda']), + labels=set([LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), ), ), CIWorkflow( @@ -371,7 +381,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels={'ciflow/scheduled', 'ciflow/linux', 'ciflow/cuda'} + labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA} ), ), CIWorkflow( @@ -385,7 +395,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_config=CIFlowConfig( enabled=True, trigger_action_only=True, - labels={'ciflow/scheduled', 'ciflow/linux', 'ciflow/libtorch', 'ciflow/cuda'}, + labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_CUDA}, ), ), # CIWorkflow( @@ -416,7 +426,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: num_test_shards=2, ciflow_config=CIFlowConfig( enabled=True, - labels={'ciflow/default', 'ciflow/coverage', 'ciflow/linux', 'ciflow/cpu'}, + labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_COVERAGE, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, ), ), # CIWorkflow( @@ -485,7 +495,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: on_pull_request=True, ciflow_config=CIFlowConfig( enabled=True, - labels={'ciflow/default', 'ciflow/bazel', 'ciflow/cpu', 'ciflow/linux'}, + labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BAZEL, LABEL_CIFLOW_CPU, LABEL_CIFLOW_LINUX}, ), ), ] @@ -517,8 +527,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: ciflow_ruleset.add_label_rule(workflow.ciflow_config.labels, workflow.build_environment) elif workflow.on_pull_request: # If ciflow is disabled but still on_pull_request, we can denote - # it as a special label 'ciflow/default' in the ruleset, which will be later - # turned into an actual 'ciflow/default' label in the workflow. - # During the rollout phase, it has the same effect as 'ciflow/default' - ciflow_ruleset.add_label_rule({'ciflow/default'}, workflow.build_environment) + # it as a special label LABEL_CIFLOW_DEFAULT in the ruleset, which will be later + # turned into an actual LABEL_CIFLOW_DEFAULT label in the workflow. + # During the rollout phase, it has the same effect as LABEL_CIFLOW_DEFAULT + ciflow_ruleset.add_label_rule({LABEL_CIFLOW_DEFAULT}, workflow.build_environment) ciflow_ruleset.generate_json() diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml index f45ed052e3838..72a9c4effeee3 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -32,7 +32,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index 95261026f3862..937a531c977e5 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -32,7 +32,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 61a817ea64bc1..5a4b6c6a56c78 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -32,7 +32,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index f07b8712b6ea1..1226715485f21 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -32,7 +32,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/coverage') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/coverage') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index cb8c6b55b1789..38321b1834b26 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -32,7 +32,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 3273cb0395437..2daf432ae76c8 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -32,7 +32,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index e3be43370a777..3551fe9845218 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -32,7 +32,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index 1827249beae99..9e787e4ba3845 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -32,7 +32,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml index de0aa4bb3333c..09989ef516a7e 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -30,7 +30,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 3f1b5b4a85f68..c87397849106e 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -30,7 +30,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index 4bf74faae1843..2c673ccce0f43 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -35,7 +35,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 53acdd8a961b7..54362c903f7d0 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -35,7 +35,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index a3447bc41f616..c8497bd3029ee 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -37,7 +37,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} steps: - name: noop run: echo running ciflow_should_run diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index 1b423008fe5fd..205758657d9b2 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -37,7 +37,7 @@ concurrency: jobs: ciflow_should_run: runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }} steps: - name: noop run: echo running ciflow_should_run From 72995657681f6173413b5ee7c62bd91212d07e8d Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Wed, 25 Aug 2021 11:53:24 -0700 Subject: [PATCH 222/530] Update torch.distributed.run OMP_NUM_THREADS message to log.warning (#63953) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63953 Closes #61138 Test: `python -m torch.distributed.run --nproc_per_node 2 test.py` Still outputs message `LOGLEVEL=ERROR python -m torch.distributed.run --nproc_per_node 2 test.py` Does not output message anymore cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23 Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D30542997 Pulled By: H-Huang fbshipit-source-id: e7da30dcda51516abf4e56f1f510132e44397027 --- torch/distributed/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index 9fb88fa3a2c96..f21fc4e68808f 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -595,7 +595,7 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str nproc_per_node = determine_local_world_size(args.nproc_per_node) if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1: omp_num_threads = 1 - print( + log.warning( f"*****************************************\n" f"Setting OMP_NUM_THREADS environment variable for each process to be " f"{omp_num_threads} in default, to avoid your system being overloaded, " From b0782f0f328321ab3ede798dfed3c7a143130e31 Mon Sep 17 00:00:00 2001 From: mingfeima Date: Wed, 25 Aug 2021 11:53:52 -0700 Subject: [PATCH 223/530] add BFloat16 support for bernoulli and Dropout on CPU (#56372) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56372 Test Plan: Imported from OSS Reviewed By: heitorschueroff Differential Revision: D28836792 Pulled By: VitalyFedyunin fbshipit-source-id: ede951d172a59276e11383fd767778ab959b5a6b --- aten/src/ATen/native/cpu/DistributionTemplates.h | 6 +++--- aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 2 +- test/test_nn.py | 2 +- test/test_torch.py | 1 + 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 66bd31fa74d45..15b1916b9892c 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -308,7 +308,7 @@ struct ExponentialKernel { template void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) { - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] { // See Note [Acquire lock when using random generators] std::lock_guard lock(generator->mutex_); using self_t = scalar_t; @@ -325,7 +325,7 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) { return static_cast(bernoulli(generator)); }); } else { - AT_DISPATCH_FLOATING_TYPES(p_.scalar_type(), "bernoulli_tensor_cpu_p_", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p_.scalar_type(), "bernoulli_tensor_cpu_p_", [&] { using p_t = scalar_t; cpu_serial_kernel(iter, [&](const p_t p_val) -> self_t { at::bernoulli_distribution bernoulli(p_val); @@ -338,7 +338,7 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) { template void bernoulli_kernel(Tensor& self, double p, RNG generator) { - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, self.scalar_type(), "bernoulli_scalar_cpu_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_scalar_cpu_", [&] { // See Note [Acquire lock when using random generators] std::lock_guard lock(generator->mutex_); auto iter = TensorIterator::borrowing_nullary_op(self); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 007e444d5cd33..f86f0a349dace 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -488,7 +488,7 @@ void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional ge int64_t n = self.numel(); bool contig = self.is_contiguous(); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, self.scalar_type(), "bernoulli_scalar_cpu_", [&] { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_scalar_cpu_", [&] { at::Tensor tmp_int_tensor; if (std::is_same::value && contig) { tmp_int_tensor = self; diff --git a/test/test_nn.py b/test/test_nn.py index d577493fd531c..8c3541aca0716 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -12984,7 +12984,7 @@ def test_Dropout(self, device): self._test_dropout_stride_mean_preserve(nn.Dropout, device) - if self.device_type == 'cuda': + if self.device_type == 'cuda' or self.device_type == 'cpu': input = input.bfloat16() self._test_dropout(nn.Dropout, device, input) diff --git a/test/test_torch.py b/test/test_torch.py index d0f631a2eab52..15e36c83654db 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -4324,6 +4324,7 @@ def test_repeat_interleave(self, device): self.assertEqual(a_with_output.size(), torch.Size([3, 2])) @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False))) + @dtypesIfCPU(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=True))) @dtypesIfCUDA(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False))) def test_bernoulli_p(self, device, dtype): for trivial_p in ([0, 1], [1, 0, 1, 1, 0, 1]): From 7edeead796abf374a713e7855f13b980d7a9c517 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 25 Aug 2021 12:46:09 -0700 Subject: [PATCH 224/530] Add a comment on the potential implicit type up-casting (#63905) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63905 as title ghstack-source-id: 136590703 Test Plan: N/A Reviewed By: mrshenli Differential Revision: D30527929 fbshipit-source-id: 69402bbfa87cfd8fc166ce313cde9736ee072589 --- torch/distributed/algorithms/model_averaging/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch/distributed/algorithms/model_averaging/utils.py b/torch/distributed/algorithms/model_averaging/utils.py index ce1fb65401ad2..a2bbac2a25474 100644 --- a/torch/distributed/algorithms/model_averaging/utils.py +++ b/torch/distributed/algorithms/model_averaging/utils.py @@ -20,6 +20,9 @@ def average_parameters( return params_it1, params_it2 = itertools.tee(params) + # If the input parameters have different data types, + # packing these parameters will trigger an implicit type up-casting. + # The original parameter data types will be restored during the subsequent unpacking. flat_params = torch.cat([p.data.view(-1) for p in params_it1]) flat_params /= dist.get_world_size(group_to_use) # Make sure the allreduce will not conflict with any other ongoing process group. From ab5cf5a1eb17516dddf5162dc7ab3c670c997376 Mon Sep 17 00:00:00 2001 From: driazati Date: Wed, 25 Aug 2021 12:58:24 -0700 Subject: [PATCH 225/530] Move existing target determinator to tools (#63809) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63809 This moves out the modulefinder determinator to `tools/testing` since it is supposed to be CI-only. This also simplifies run_test.py a little bit. Test Plan: Imported from OSS Reviewed By: malfet, seemethere, janeyx99 Differential Revision: D30497438 Pulled By: driazati fbshipit-source-id: 1d203037af5af6a20c1e7812da935e7cbb5cd82f --- test/run_test.py | 232 ++------------------- test/test_determination.py | 2 +- tools/testing/modulefinder_determinator.py | 224 ++++++++++++++++++++ 3 files changed, 241 insertions(+), 217 deletions(-) create mode 100644 tools/testing/modulefinder_determinator.py diff --git a/test/run_test.py b/test/run_test.py index ecc93fe03aa30..d3c661093a6e8 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -4,8 +4,8 @@ import copy from datetime import datetime from distutils.util import strtobool -import modulefinder import os +import pathlib import shutil import signal import subprocess @@ -24,9 +24,11 @@ import torch.distributed as dist from typing import Dict, Optional, List +REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent + try: # using tools/ to optimize test run. - sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")) + sys.path.append(str(REPO_ROOT)) from tools.testing.test_selections import ( export_S3_test_times, get_shard_based_on_S3, @@ -35,6 +37,10 @@ get_reordered_tests, get_test_case_configs, ) + from tools.testing.modulefinder_determinator import ( + should_run_test, + TARGET_DET_LIST, + ) HAVE_TEST_SELECTION_TOOLS = True except ImportError: @@ -276,87 +282,12 @@ WINDOWS_COVERAGE_BLOCKLIST = [] -# These tests are slow enough that it's worth calculating whether the patch -# touched any related files first. This list was manually generated, but for every -# run with --determine-from, we use another generated list based on this one and the -# previous test stats. -TARGET_DET_LIST = [ - "distributions/test_distributions", - "test_nn", - "test_autograd", - "test_cpp_extensions_jit", - "test_jit_legacy", - "test_dataloader", - "test_overrides", - "test_linalg", - "test_jit", - "test_jit_profiling", - "test_torch", - "test_binary_ufuncs", - "test_numpy_interop", - "test_reductions", - "test_shape_ops", - "test_sort_and_select", - "test_testing", - "test_view_ops", - "distributed/nn/jit/test_instantiator", - "distributed/rpc/test_tensorpipe_agent", - "distributed/rpc/cuda/test_tensorpipe_agent", - "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks", - "distributed/test_distributed_spawn", - "test_cuda", - "test_cuda_primary_ctx", - "test_cpp_extensions_aot_ninja", - "test_cpp_extensions_aot_no_ninja", - "test_serialization", - "test_optim", - "test_utils", - "test_multiprocessing", - "test_tensorboard", - "distributed/test_c10d_common", - "distributed/test_c10d_gloo", - "distributed/test_c10d_nccl", - "distributed/test_jit_c10d", - "distributed/test_c10d_spawn_gloo", - "distributed/test_c10d_spawn_nccl", - "distributed/test_store", - "distributed/test_pg_wrapper", - "test_quantization", - "test_pruning_op", - "test_determination", - "test_futures", - "distributed/pipeline/sync/skip/test_api", - "distributed/pipeline/sync/skip/test_gpipe", - "distributed/pipeline/sync/skip/test_inspect_skip_layout", - "distributed/pipeline/sync/skip/test_leak", - "distributed/pipeline/sync/skip/test_portal", - "distributed/pipeline/sync/skip/test_stash_pop", - "distributed/pipeline/sync/skip/test_tracker", - "distributed/pipeline/sync/skip/test_verify_skippables", - "distributed/pipeline/sync/test_balance", - "distributed/pipeline/sync/test_bugs", - "distributed/pipeline/sync/test_checkpoint", - "distributed/pipeline/sync/test_copy", - "distributed/pipeline/sync/test_deferred_batch_norm", - "distributed/pipeline/sync/test_dependency", - "distributed/pipeline/sync/test_inplace", - "distributed/pipeline/sync/test_microbatch", - "distributed/pipeline/sync/test_phony", - "distributed/pipeline/sync/test_pipe", - "distributed/pipeline/sync/test_pipeline", - "distributed/pipeline/sync/test_stream", - "distributed/pipeline/sync/test_transparency", - "distributed/pipeline/sync/test_worker", -] - # the JSON file to store the S3 test stats TEST_TIMES_FILE = ".pytorch-test-times.json" # if a test file takes longer than 5 min, we add it to TARGET_DET_LIST SLOW_TEST_THRESHOLD = 300 -_DEP_MODULES_CACHE: Dict[str, set] = {} - DISTRIBUTED_TESTS_CONFIG = {} @@ -957,136 +888,6 @@ def get_selected_tests(options): return selected_tests -def test_impact_of_file(filename): - """Determine what class of impact this file has on test runs. - - Possible values: - TORCH - torch python code - CAFFE2 - caffe2 python code - TEST - torch test code - UNKNOWN - may affect all tests - NONE - known to have no effect on test outcome - CI - CI configuration files - """ - parts = filename.split(os.sep) - if parts[0] in [".jenkins", ".circleci"]: - return "CI" - if parts[0] in ["docs", "scripts", "CODEOWNERS", "README.md"]: - return "NONE" - elif parts[0] == "torch": - if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"): - return "TORCH" - elif parts[0] == "caffe2": - if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"): - return "CAFFE2" - elif parts[0] == "test": - if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"): - return "TEST" - - return "UNKNOWN" - - -def log_test_reason(file_type, filename, test, options): - if options.verbose: - print_to_stderr( - "Determination found {} file {} -- running {}".format( - file_type, - filename, - test, - ) - ) - - -def get_dep_modules(test): - # Cache results in case of repetition - if test in _DEP_MODULES_CACHE: - return _DEP_MODULES_CACHE[test] - - repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - test_location = os.path.join(repo_root, "test", test + ".py") - finder = modulefinder.ModuleFinder( - # Ideally exclude all third party modules, to speed up calculation. - excludes=[ - "scipy", - "numpy", - "numba", - "multiprocessing", - "sklearn", - "setuptools", - "hypothesis", - "llvmlite", - "joblib", - "email", - "importlib", - "unittest", - "urllib", - "json", - "collections", - # Modules below are excluded because they are hitting https://bugs.python.org/issue40350 - # Trigger AttributeError: 'NoneType' object has no attribute 'is_package' - "mpl_toolkits", - "google", - "onnx", - # Triggers RecursionError - "mypy", - ], - ) - # HACK: some platforms default to ascii, so we can't just run_script :( - with open(test_location, "r", encoding="utf-8") as fp: - finder.load_module("__main__", fp, test_location, ("", "r", 1)) - - dep_modules = set(finder.modules.keys()) - _DEP_MODULES_CACHE[test] = dep_modules - return dep_modules - - -def determine_target(target_det_list, test, touched_files, options): - test = parse_test_module(test) - # Some tests are faster to execute than to determine. - if test not in target_det_list: - if options.verbose: - print_to_stderr(f"Running {test} without determination") - return True - # HACK: "no_ninja" is not a real module - if test.endswith("_no_ninja"): - test = test[: (-1 * len("_no_ninja"))] - if test.endswith("_ninja"): - test = test[: (-1 * len("_ninja"))] - - dep_modules = get_dep_modules(test) - - for touched_file in touched_files: - file_type = test_impact_of_file(touched_file) - if file_type == "NONE": - continue - elif file_type == "CI": - # Force all tests to run if any change is made to the CI - # configurations. - log_test_reason(file_type, touched_file, test, options) - return True - elif file_type == "UNKNOWN": - # Assume uncategorized source files can affect every test. - log_test_reason(file_type, touched_file, test, options) - return True - elif file_type in ["TORCH", "CAFFE2", "TEST"]: - parts = os.path.splitext(touched_file)[0].split(os.sep) - touched_module = ".".join(parts) - # test/ path does not have a "test." namespace - if touched_module.startswith("test."): - touched_module = touched_module.split("test.")[1] - if touched_module in dep_modules or touched_module == test.replace( - "/", "." - ): - log_test_reason(file_type, touched_file, test, options) - return True - - # If nothing has determined the test has run, don't run the test. - if options.verbose: - print_to_stderr(f"Determination is skipping {test}") - - return False - - def run_test_module(test: str, test_directory: str, options) -> Optional[str]: test_module = parse_test_module(test) @@ -1131,7 +932,7 @@ def main(): specified_test_cases_filename, TESTS ) - test_directory = os.path.dirname(os.path.abspath(__file__)) + test_directory = str(REPO_ROOT / "test") selected_tests = get_selected_tests(options) if options.verbose: @@ -1144,10 +945,10 @@ def main(): slow_tests = get_slow_tests_based_on_S3( TESTS, TARGET_DET_LIST, SLOW_TEST_THRESHOLD ) - print( + print_to_stderr( "Added the following tests to target_det tests as calculated based on S3:" ) - print(slow_tests) + print_to_stderr(slow_tests) with open(options.determine_from, "r") as fh: touched_files = [ os.path.normpath(name.strip()) @@ -1155,22 +956,22 @@ def main(): if len(name.strip()) > 0 ] # HACK: Ensure the 'test' paths can be traversed by Modulefinder - sys.path.append("test") + sys.path.append(test_directory) selected_tests = [ test for test in selected_tests - if determine_target( + if should_run_test( TARGET_DET_LIST + slow_tests, test, touched_files, options ) ] - sys.path.remove("test") + sys.path.remove(test_directory) if IS_IN_CI: selected_tests = get_reordered_tests( selected_tests, ENABLE_PR_HISTORY_REORDERING ) # downloading test cases configuration to local environment - get_test_case_configs(dirpath=os.path.dirname(os.path.abspath(__file__))) + get_test_case_configs(dirpath=test_directory) has_failed = False failure_messages = [] @@ -1191,8 +992,7 @@ def main(): if options.coverage: from coverage import Coverage - test_dir = os.path.dirname(os.path.abspath(__file__)) - with set_cwd(test_dir): + with set_cwd(test_directory): cov = Coverage() if PYTORCH_COLLECT_COVERAGE: cov.load() diff --git a/test/test_determination.py b/test/test_determination.py index 6b7fcc0f0d242..277bbd2bc166c 100644 --- a/test/test_determination.py +++ b/test/test_determination.py @@ -30,7 +30,7 @@ def determined_tests(cls, changed_files): return [ test for test in cls.TESTS - if run_test.determine_target(run_test.TARGET_DET_LIST, test, changed_files, DummyOptions()) + if run_test.should_run_test(run_test.TARGET_DET_LIST, test, changed_files, DummyOptions()) ] def test_config_change_only(self): diff --git a/tools/testing/modulefinder_determinator.py b/tools/testing/modulefinder_determinator.py new file mode 100644 index 0000000000000..8acd0ed9cc2f0 --- /dev/null +++ b/tools/testing/modulefinder_determinator.py @@ -0,0 +1,224 @@ +import os +import modulefinder +import sys +import pathlib +import warnings +from typing import Dict, Any, List, Set + +REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent + +# These tests are slow enough that it's worth calculating whether the patch +# touched any related files first. This list was manually generated, but for every +# run with --determine-from, we use another generated list based on this one and the +# previous test stats. +TARGET_DET_LIST = [ + "distributions/test_distributions", + "test_nn", + "test_autograd", + "test_cpp_extensions_jit", + "test_jit_legacy", + "test_dataloader", + "test_overrides", + "test_linalg", + "test_jit", + "test_jit_profiling", + "test_torch", + "test_binary_ufuncs", + "test_numpy_interop", + "test_reductions", + "test_shape_ops", + "test_sort_and_select", + "test_testing", + "test_view_ops", + "distributed/nn/jit/test_instantiator", + "distributed/rpc/test_tensorpipe_agent", + "distributed/rpc/cuda/test_tensorpipe_agent", + "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks", + "distributed/test_distributed_spawn", + "test_cuda", + "test_cuda_primary_ctx", + "test_cpp_extensions_aot_ninja", + "test_cpp_extensions_aot_no_ninja", + "test_serialization", + "test_optim", + "test_utils", + "test_multiprocessing", + "test_tensorboard", + "distributed/test_c10d_common", + "distributed/test_c10d_gloo", + "distributed/test_c10d_nccl", + "distributed/test_jit_c10d", + "distributed/test_c10d_spawn_gloo", + "distributed/test_c10d_spawn_nccl", + "distributed/test_store", + "distributed/test_pg_wrapper", + "test_quantization", + "test_pruning_op", + "test_determination", + "test_futures", + "distributed/pipeline/sync/skip/test_api", + "distributed/pipeline/sync/skip/test_gpipe", + "distributed/pipeline/sync/skip/test_inspect_skip_layout", + "distributed/pipeline/sync/skip/test_leak", + "distributed/pipeline/sync/skip/test_portal", + "distributed/pipeline/sync/skip/test_stash_pop", + "distributed/pipeline/sync/skip/test_tracker", + "distributed/pipeline/sync/skip/test_verify_skippables", + "distributed/pipeline/sync/test_balance", + "distributed/pipeline/sync/test_bugs", + "distributed/pipeline/sync/test_checkpoint", + "distributed/pipeline/sync/test_copy", + "distributed/pipeline/sync/test_deferred_batch_norm", + "distributed/pipeline/sync/test_dependency", + "distributed/pipeline/sync/test_inplace", + "distributed/pipeline/sync/test_microbatch", + "distributed/pipeline/sync/test_phony", + "distributed/pipeline/sync/test_pipe", + "distributed/pipeline/sync/test_pipeline", + "distributed/pipeline/sync/test_stream", + "distributed/pipeline/sync/test_transparency", + "distributed/pipeline/sync/test_worker", +] + +_DEP_MODULES_CACHE: Dict[str, Set[str]] = {} + + +def should_run_test( + target_det_list: List[str], test: str, touched_files: List[str], options: Any +) -> bool: + test = parse_test_module(test) + # Some tests are faster to execute than to determine. + if test not in target_det_list: + if options.verbose: + print_to_stderr(f"Running {test} without determination") + return True + # HACK: "no_ninja" is not a real module + if test.endswith("_no_ninja"): + test = test[: (-1 * len("_no_ninja"))] + if test.endswith("_ninja"): + test = test[: (-1 * len("_ninja"))] + + dep_modules = get_dep_modules(test) + + for touched_file in touched_files: + file_type = test_impact_of_file(touched_file) + if file_type == "NONE": + continue + elif file_type == "CI": + # Force all tests to run if any change is made to the CI + # configurations. + log_test_reason(file_type, touched_file, test, options) + return True + elif file_type == "UNKNOWN": + # Assume uncategorized source files can affect every test. + log_test_reason(file_type, touched_file, test, options) + return True + elif file_type in ["TORCH", "CAFFE2", "TEST"]: + parts = os.path.splitext(touched_file)[0].split(os.sep) + touched_module = ".".join(parts) + # test/ path does not have a "test." namespace + if touched_module.startswith("test."): + touched_module = touched_module.split("test.")[1] + if touched_module in dep_modules or touched_module == test.replace( + "/", "." + ): + log_test_reason(file_type, touched_file, test, options) + return True + + # If nothing has determined the test has run, don't run the test. + if options.verbose: + print_to_stderr(f"Determination is skipping {test}") + + return False + + +def test_impact_of_file(filename: str) -> str: + """Determine what class of impact this file has on test runs. + + Possible values: + TORCH - torch python code + CAFFE2 - caffe2 python code + TEST - torch test code + UNKNOWN - may affect all tests + NONE - known to have no effect on test outcome + CI - CI configuration files + """ + parts = filename.split(os.sep) + if parts[0] in [".jenkins", ".circleci"]: + return "CI" + if parts[0] in ["docs", "scripts", "CODEOWNERS", "README.md"]: + return "NONE" + elif parts[0] == "torch": + if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"): + return "TORCH" + elif parts[0] == "caffe2": + if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"): + return "CAFFE2" + elif parts[0] == "test": + if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"): + return "TEST" + + return "UNKNOWN" + + +def log_test_reason(file_type: str, filename: str, test: str, options: Any) -> None: + if options.verbose: + print_to_stderr( + "Determination found {} file {} -- running {}".format( + file_type, + filename, + test, + ) + ) + + +def get_dep_modules(test: str) -> Set[str]: + # Cache results in case of repetition + if test in _DEP_MODULES_CACHE: + return _DEP_MODULES_CACHE[test] + + test_location = REPO_ROOT / "test" / f"{test}.py" + + # HACK: some platforms default to ascii, so we can't just run_script :( + finder = modulefinder.ModuleFinder( + # Ideally exclude all third party modules, to speed up calculation. + excludes=[ + "scipy", + "numpy", + "numba", + "multiprocessing", + "sklearn", + "setuptools", + "hypothesis", + "llvmlite", + "joblib", + "email", + "importlib", + "unittest", + "urllib", + "json", + "collections", + # Modules below are excluded because they are hitting https://bugs.python.org/issue40350 + # Trigger AttributeError: 'NoneType' object has no attribute 'is_package' + "mpl_toolkits", + "google", + "onnx", + # Triggers RecursionError + "mypy", + ], + ) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + finder.run_script(str(test_location)) + dep_modules = set(finder.modules.keys()) + _DEP_MODULES_CACHE[test] = dep_modules + return dep_modules + + +def parse_test_module(test: str) -> str: + return test.split(".")[0] + + +def print_to_stderr(message: str) -> None: + print(message, file=sys.stderr) From 5b548f6f64ebd7b2187cf7e79043eb7d2e92f2cf Mon Sep 17 00:00:00 2001 From: Priya Ramani Date: Wed, 25 Aug 2021 13:08:12 -0700 Subject: [PATCH 226/530] Shape Propagation Pass: Fix AdaptiveAveragePooling2d (#63629) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63629 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D30461727 Pulled By: priyaramani fbshipit-source-id: 3873d1d636f79185680b82de06174d8de288c941 --- test/jit/test_symbolic_shape_analysis.py | 32 +++++++++++++------ .../jit/runtime/symbolic_shape_registry.cpp | 13 ++++++-- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/test/jit/test_symbolic_shape_analysis.py b/test/jit/test_symbolic_shape_analysis.py index 33dc515c51145..6d4e33cda852f 100644 --- a/test/jit/test_symbolic_shape_analysis.py +++ b/test/jit/test_symbolic_shape_analysis.py @@ -3,7 +3,6 @@ import operator from torch.testing import FileCheck -from typing import List if __name__ == '__main__': @@ -60,15 +59,6 @@ def prop_shapes_on_graph(inp0, inp1): self.assertEqual(output_shape[1], sym2) self.assertEqual(output_shape[2], sym3) - def test_sharing_of_list_len(self): - @torch.jit.script - def foo(x, out: List[int]): - return torch.nn.functional.adaptive_avg_pool2d(x, out) - - self.run_pass("inline", foo.graph) - torch._C._jit_pass_propagate_shapes_on_graph(foo.graph) - FileCheck().check("Tensor(*, *)").check_same("adaptive_avg_pool2d").run(foo.graph) - def test_shared_shape_graph(self): @torch.jit.script def foo(x, y): @@ -165,3 +155,25 @@ def foo2(x, y): inputs[1].setType(inputs[1].type().with_sizes([5, 8, sym1])) torch._C._jit_pass_propagate_shapes_on_graph(graph) self.assertEqual(next(graph.outputs()).type().symbolic_sizes(), [5, 8, sym1]) + + def test_adaptive_avg_pool2d(self): + inps = [ + [(1, 64, 8, 9), (5, 7)], + [(1, 64, 10, 9), (7)], + [(1, 64, 10, 9), (5, None)], + [(1, 8, 4, 3), (None, None)], + [(1, 8, 4, 3), (None, 5)], + ] + + for inp in inps: + t = torch.randn(*inp[0]) + out_size = torch.nn.functional.adaptive_avg_pool2d(t, inp[1]).size() + + def foo(x): + return torch.nn.functional.adaptive_avg_pool2d(x, inp[1]) + + fn = torch.jit.trace(foo, (t,)) + torch._C._jit_erase_non_input_shape_information(fn.graph) + torch._C._jit_pass_peephole(fn.graph) + torch._C._jit_pass_constant_propagation(fn.graph) + self.checkShapeAnalysis(out_size, fn.graph, assert_propagation=True) diff --git a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp index ffc2f44e16dac..d4471998d11e8 100644 --- a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp +++ b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp @@ -36,10 +36,17 @@ const std::string shape_compute_functions = return expandedSizes def adaptive_avg_pool2d(self: List[int], out: List[int]): - # TODO: return out directly, list len refiner would need to - # annotate the List Type with len directly in IR assert len(out) == 2 - return [out[0], out[1]] + assert len(self) == 3 or len(self) == 4 + for i in range (1, len(self)): + assert self[i] != 0 + + shape: List[int] = [] + for i in range(0, len(self) -2): + shape.append(self[i]) + for elem in out: + shape.append(elem) + return shape # TODO: maybe make it customary that extra arguments are unused ? # TODO: return self directly From 52ebe7e14efc3e1ebb5cf974245a4d37f4441e9d Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Wed, 25 Aug 2021 14:34:40 -0700 Subject: [PATCH 227/530] Back out "Temporary fix for remote gpu execution issue" (#63983) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63983 Test for fixes in D30545351. it should resolve the remote execution flag being populated incorrectly issue. Test Plan: CI Reviewed By: malfet, seemethere Differential Revision: D30549443 fbshipit-source-id: b3895909f5cd654ba163b77950872b332fbad3fe --- torch/testing/_internal/common_device_type.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index b5d61273afd3e..8ec6e71d121ff 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -12,7 +12,7 @@ import torch from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \ skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \ - IS_SANDCASTLE, IS_FBCODE, DeterministicGuard, TEST_SKIP_NOARCH + IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, DeterministicGuard, TEST_SKIP_NOARCH from torch.testing._internal.common_cuda import _get_torch_cuda_version from torch.testing import \ (get_all_dtypes) @@ -469,9 +469,13 @@ def get_device_type_test_bases(): test_bases: List[Any] = list() if IS_SANDCASTLE or IS_FBCODE: - # temporarily disable IS_REMOTE_GPU, see T99020845 - test_bases.append(CPUTestBase) - test_bases.append(MetaTestBase) + if IS_REMOTE_GPU: + # Skip if sanitizer is enabled + if not TEST_WITH_ASAN and not TEST_WITH_TSAN and not TEST_WITH_UBSAN: + test_bases.append(CUDATestBase) + else: + test_bases.append(CPUTestBase) + test_bases.append(MetaTestBase) else: test_bases.append(CPUTestBase) if not TEST_SKIP_NOARCH: From b5b9ce146f27624876d64034305c3c033bdfeaf5 Mon Sep 17 00:00:00 2001 From: John Clow Date: Wed, 25 Aug 2021 14:49:06 -0700 Subject: [PATCH 228/530] Small fixes to the Contributing.txt (#63385) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63385 Correcting a mistake for the pytorch uninstall, and adding an extra note for Darwin. Test Plan: Imported from OSS Reviewed By: janeyx99, heitorschueroff Differential Revision: D30530234 fbshipit-source-id: e0f88a1725eeadabfb4b28c1da11e369ee878ab4 --- CONTRIBUTING.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7cf3aecabd7c1..93de9b022ee6f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -87,7 +87,7 @@ lazy.) ```bash -conda -y uninstall pytorch +conda uninstall pytorch -y yes | pip uninstall torch ``` @@ -781,6 +781,8 @@ If you are editing a single file and rebuilding in a tight loop, the time spent linking will dominate. The system linker available in most Linux distributions (GNU `ld`) is quite slow. Use a faster linker, like [lld](https://lld.llvm.org/). +People on Mac, follow [this guide](https://stackoverflow.com/questions/42730345/how-to-install-llvm-for-mac) instead. + The easiest way to use `lld` this is download the [latest LLVM binaries](http://releases.llvm.org/download.html#8.0.0) and run: ``` From 730ce29bafcdf21cc7000dccbbed4c7af500cf27 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 25 Aug 2021 15:00:47 -0700 Subject: [PATCH 229/530] Add note on ifdefing based on CUDA_VERSION for ROCm path (#62850) Summary: CUDA_VERSION and HIP_VERSION follow very unrelated versioning schemes, so it does not make sense to use CUDA_VERSION to determine the ROCm path. This note explicitly addresses it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/62850 Reviewed By: mruberry Differential Revision: D30547562 Pulled By: malfet fbshipit-source-id: 02990fa66a88466c2330ab85f446b25b78545150 --- docs/source/notes/hip.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/source/notes/hip.rst b/docs/source/notes/hip.rst index 20f99cb96c5b0..a9c94e2a4febb 100644 --- a/docs/source/notes/hip.rst +++ b/docs/source/notes/hip.rst @@ -119,6 +119,27 @@ torch.distributed backends Currently, only the "nccl" and "gloo" backends for torch.distributed are supported on ROCm. +.. _cuda-api-to_hip-api-mappings: + +CUDA API to HIP API mappings in C++ +----------------------------------- + +Please refer: https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP_API_Guide.html + +NOTE: The CUDA_VERSION macro, cudaRuntimeGetVersion and cudaDriverGetVersion APIs do not +semantically map to the same values as HIP_VERSION macro, hipRuntimeGetVersion and +hipDriverGetVersion APIs. Please do not use them interchangeably when doing version checks. + +Eg: Instead of +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +If it is desired to not take the code path for ROCm/HIP: +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(USE_ROCM) +If it is desired to take the code path for ROCm/HIP: +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11000) || defined(USE_ROCM) +If it is desired to take the code path for ROCm/HIP only for specific HIP versions: +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11000) || (defined(USE_ROCM) && ROCM_VERSION >= 40300) + + Refer to CUDA Semantics doc --------------------------- From 44ede71751440975e985944503d548e101a42a64 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Wed, 25 Aug 2021 15:05:14 -0700 Subject: [PATCH 230/530] Shard python_torch_functions.cpp (#62187) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62187 This file can take 3 minutes on its own to compile, and after python_functions.cpp is the second limiting factor for compile time of `libtorch_python` on a 32-core threadripper. This splits it into 3 files that take around 1 minute each to compile. Test Plan: Imported from OSS Reviewed By: H-Huang Differential Revision: D29962048 Pulled By: albanD fbshipit-source-id: 99016d75912bff483fe21b130cef43a6882f8c0e --- BUILD.bazel | 4 +- caffe2/CMakeLists.txt | 4 +- tools/autograd/gen_python_functions.py | 60 +- .../templates/python_torch_functions.cpp | 760 +--------------- tools/build_variables.bzl | 9 +- tools/codegen/gen.py | 4 +- torch/csrc/autograd/python_torch_functions.h | 25 + .../python_torch_functions_manual.cpp | 826 ++++++++++++++++++ 8 files changed, 930 insertions(+), 762 deletions(-) create mode 100644 torch/csrc/autograd/python_torch_functions.h create mode 100644 torch/csrc/autograd/python_torch_functions_manual.cpp diff --git a/BUILD.bazel b/BUILD.bazel index afdd4699b160f..a5f20c2020181 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -224,7 +224,9 @@ libtorch_python_generated_sources = [ "torch/csrc/autograd/generated/python_functions_3.cpp", "torch/csrc/autograd/generated/python_functions_4.cpp", "torch/csrc/autograd/generated/python_variable_methods.cpp", - "torch/csrc/autograd/generated/python_torch_functions.cpp", + "torch/csrc/autograd/generated/python_torch_functions_0.cpp", + "torch/csrc/autograd/generated/python_torch_functions_1.cpp", + "torch/csrc/autograd/generated/python_torch_functions_2.cpp", "torch/csrc/autograd/generated/python_nn_functions.cpp", "torch/csrc/autograd/generated/python_fft_functions.cpp", "torch/csrc/autograd/generated/python_linalg_functions.cpp", diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 67ab08f9b0fc5..1662a92268d37 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -397,7 +397,9 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_3.cpp" "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_4.cpp" "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_0.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_1.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_2.cpp" "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.cpp" "${TORCH_SRC_DIR}/csrc/autograd/generated/python_fft_functions.cpp" "${TORCH_SRC_DIR}/csrc/autograd/generated/python_linalg_functions.cpp" diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index d1fb70c8abed3..f61d3d0c0709c 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -164,9 +164,12 @@ def gen(out: str, native_yaml_path: str, deprecated_yaml_path: str, template_pat create_python_bindings( fm, methods, is_py_variable_method, None, 'python_variable_methods.cpp', method=True) + # NOTE: num_shards here must be synced with gatherTorchFunctions in + # torch/csrc/autograd/python_torch_functions_manual.cpp functions = load_signatures(native_functions, deprecated_yaml_path, method=False) - create_python_bindings( - fm, functions, is_py_torch_function, 'torch', 'python_torch_functions.cpp', method=False) + create_python_bindings_sharded( + fm, functions, is_py_torch_function, 'torch', 'python_torch_functions.cpp', + method=False, num_shards=3) create_python_bindings( fm, functions, is_py_nn_function, 'torch.nn', 'python_nn_functions.cpp', method=False) @@ -180,6 +183,16 @@ def gen(out: str, native_yaml_path: str, deprecated_yaml_path: str, template_pat create_python_bindings( fm, functions, is_py_special_function, 'torch.special', 'python_special_functions.cpp', method=False) +def group_filter_overloads( + pairs: Sequence[PythonSignatureNativeFunctionPair], + pred: Callable[[NativeFunction], bool] +) -> Dict[BaseOperatorName, List[PythonSignatureNativeFunctionPair]]: + grouped: Dict[BaseOperatorName, List[PythonSignatureNativeFunctionPair]] = defaultdict(list) + for pair in pairs: + if pred(pair.function): + grouped[pair.function.func.name.name].append(pair) + return grouped + def create_python_bindings( fm: FileManager, pairs: Sequence[PythonSignatureNativeFunctionPair], @@ -194,10 +207,7 @@ def create_python_bindings( py_method_defs: List[str] = [] py_forwards: List[str] = [] - grouped: Dict[BaseOperatorName, List[PythonSignatureNativeFunctionPair]] = defaultdict(list) - for pair in pairs: - if pred(pair.function): - grouped[pair.function.func.name.name].append(pair) + grouped = group_filter_overloads(pairs, pred) for name in sorted(grouped.keys(), key=lambda x: str(x)): overloads = grouped[name] @@ -212,6 +222,44 @@ def create_python_bindings( 'py_method_defs': py_method_defs, }) +def create_python_bindings_sharded( + fm: FileManager, + pairs: Sequence[PythonSignatureNativeFunctionPair], + pred: Callable[[NativeFunction], bool], + module: Optional[str], + filename: str, + *, + method: bool, + num_shards: int +) -> None: + """Generates Python bindings to ATen functions""" + grouped = group_filter_overloads(pairs, pred) + + def key_func(kv: Tuple[BaseOperatorName, List[PythonSignatureNativeFunctionPair]]) -> str: + return str(kv[0]) + + def env_func( + kv: Tuple[BaseOperatorName, List[PythonSignatureNativeFunctionPair]] + ) -> Dict[str, List[str]]: + return { + 'py_forwards': list(forward_decls(kv[0], kv[1], method=method)), + 'py_methods': [method_impl(kv[0], module, kv[1], method=method)], + 'py_method_defs': [method_def(kv[0], module, kv[1], method=method)], + } + + fm.write_sharded( + filename, + grouped.items(), + base_env={ + 'generated_comment': + '@' + f'generated from {fm.template_dir}/{filename}', + }, + key_fn=key_func, + env_callable=env_func, + num_shards=num_shards, + sharded_keys={'py_forwards', 'py_methods', 'py_method_defs'} + ) + def load_signatures( native_functions: List[NativeFunction], deprecated_yaml_path: str, diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 9e02036639516..b45b5f298716b 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -7,7 +7,6 @@ // and also copied into 'torch' module. #include -#include // Undefine the copysign macro so that at::copysign works as intended with MSVC // https://github.com/python/cpython/blob/c60394c7fc9cc09b16e9675a3eeb5844b6d8523f/PC/pyconfig.h#L196 @@ -15,6 +14,7 @@ #undef copysign #endif // _MSC_VER +#include "torch/csrc/autograd/python_torch_functions.h" #include "torch/csrc/autograd/python_variable.h" #include "torch/csrc/autograd/utils/wrap_outputs.h" #include "torch/csrc/Dtype.h" @@ -34,7 +34,6 @@ #include -#include #include #include #include @@ -59,767 +58,28 @@ using at::ArrayRef; using torch::utils::check_out_type_matches; using namespace torch::autograd::utils; -namespace torch { namespace autograd { - -static PyObject* THPVariableFunctionsModule = NULL; - -inline Tensor dispatch_arange(const Scalar& end, Tensor result) { - pybind11::gil_scoped_release no_gil; - return at::arange_out(result, end); -} - -inline Tensor dispatch_arange(const Scalar& end, const TensorOptions& options) { - torch::utils::maybe_initialize_cuda(options); - pybind11::gil_scoped_release no_gil; - return torch::arange(end, options); -} - -inline Tensor dispatch_arange(const Scalar& start, const Scalar& end, const Scalar& step, Tensor result) { - pybind11::gil_scoped_release no_gil; - return at::arange_out(result, start, end, step); -} - -inline Tensor dispatch_arange(const Scalar& start, const Scalar& end, const Scalar& step, const TensorOptions& options) { - torch::utils::maybe_initialize_cuda(options); - pybind11::gil_scoped_release no_gil; - return torch::arange(start, end, step, options); -} - -static PyObject * THPVariable_arange(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - static PythonArgParser parser({ - "arange(Scalar end, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", - "arange(Scalar start, Scalar end, Scalar step=1, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", - }, /*traceable=*/true); - - ParsedArgs<9> parsed_args; - auto r = parser.parse(args, kwargs, parsed_args); - - if(r.has_torch_function()) { - return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); - } - - if (r.idx == 0) { - if (r.isNone(1)) { - auto end = r.scalar(0); - // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) - c10::optional scalarType = r.scalartypeOptional(2); - const auto options = TensorOptions() - .dtype(scalarType) - .device(r.device(4)) - .layout(r.layout(3)) - .requires_grad(r.toBool(6)) - .pinned_memory(r.toBool(5)); - return wrap(dispatch_arange(end, options)); - } else { - TORCH_CHECK(!r.toBool(5), " `pin_memory` and `out` parameters are incompatible"); - check_out_type_matches(r.tensor(1), r.scalartype(2), r.isNone(2), r.layout(3), - r.device(4), r.isNone(4)); - return wrap(dispatch_arange(r.scalar(0), r.tensor(1)).set_requires_grad(r.toBool(6))); - } - } else if (r.idx == 1) { - if (r.isNone(3)) { - auto start = r.scalar(0); - auto end = r.scalar(1); - auto step = r.scalar(2); - // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) - c10::optional scalarType = r.scalartypeOptional(4); - const auto options = TensorOptions() - .dtype(scalarType) - .device(r.device(6)) - .layout(r.layout(5)) - .requires_grad(r.toBool(8)) - .pinned_memory(r.toBool(7)); - return wrap(dispatch_arange(start, end, step, options)); - } else { - TORCH_CHECK(!r.toBool(7), " `pin_memory` and `out` parameters are incompatible"); - check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), r.layout(5), - r.device(6), r.isNone(6)); - return wrap(dispatch_arange(r.scalar(0), r.scalar(1), r.scalar(2), r.tensor(3)).set_requires_grad(r.toBool(8))); - } - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -inline Tensor dispatch_range(const Scalar& start, const Scalar& end, const Scalar& step, Tensor result) { - pybind11::gil_scoped_release no_gil; - OptionalDeviceGuard device_guard(device_of(result)); - return at::range_out(result, start, end, step); -} - -inline Tensor dispatch_range(const Scalar& start, const Scalar& end, const Scalar& step, const TensorOptions& options) { - torch::utils::maybe_initialize_cuda(options); - pybind11::gil_scoped_release no_gil; - DeviceGuard device_guard(options.device()); - return torch::range(start, end, step, options); -} - -static PyObject * THPVariable_range(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - static PythonArgParser parser({ - "range(Scalar start, Scalar end, Scalar step=1, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)", - }); - - ParsedArgs<8> parsed_args; - auto r = parser.parse(args, kwargs, parsed_args); - - if (r.idx == 0) { - auto ret = PyErr_WarnEx( - PyExc_UserWarning, - "torch.range is deprecated and will be removed in a future release " - "because its behavior is inconsistent with Python's range builtin. " - "Instead, use torch.arange, which produces values in [start, end).", - 1); - if (ret != 0) throw python_error(); - if (r.isNone(3)) { - const auto options = TensorOptions() - .dtype(r.scalartype(4)) - .device(r.device(6)) - .layout(r.layout(5)) - .requires_grad(r.toBool(7)); - return wrap(dispatch_range(r.scalar(0), r.scalar(1), r.scalar(2), options)); - } else { - check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), - r.layout(5), r.device(6), r.isNone(6)); - return wrap(dispatch_range(r.scalar(0), r.scalar(1), r.scalar(2), r.tensor(3)).set_requires_grad(r.toBool(7))); - } - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -inline Tensor dispatch_full( - IntArrayRef size, - const Scalar& fill_val, - const TensorOptions& options) { - torch::utils::maybe_initialize_cuda(options); - pybind11::gil_scoped_release no_gil; - return at::full(size, fill_val, options); -} - -inline Tensor dispatch_full( - IntArrayRef size, - const Scalar& fill_val, - c10::optional names, - const TensorOptions& options) { - torch::utils::maybe_initialize_cuda(options); - pybind11::gil_scoped_release no_gil; - return at::full(size, fill_val, names, options); -} - -inline Tensor dispatch_full( - IntArrayRef size, - const Scalar& fill_val, - Tensor result) { - pybind11::gil_scoped_release no_gil; - return at::full_out(result, size, fill_val); -} - -static PyObject * THPVariable_full(PyObject* self, PyObject* args, PyObject* kwargs) { - HANDLE_TH_ERRORS - - static PythonArgParser parser({ - "full(IntArrayRef size, Scalar fill_value, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", - "full(IntArrayRef size, Scalar fill_value, *, DimnameList names=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", - }, /*traceable=*/true); - - // Acquires (common) arguments - ParsedArgs<8> parsed_args; - auto r = parser.parse(args, kwargs, parsed_args); +// NOTE: See [Sharded File] comment in VariableType - if(r.has_torch_function()) { - return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); - } - - auto size = r.intlist(0); - auto fill_val = r.scalar(1); - const auto options = TensorOptions{} - .dtype(r.scalartypeOptional(3)) - .layout(r.layout(4)) - .device(r.device(5)) - .pinned_memory(r.toBool(6)); - - if (r.idx == 0) { - // full - if (r.isNone(2)) { - return wrap(dispatch_full(size, fill_val, options).set_requires_grad(r.toBool(7))); - } - - // full.out - // Validates out tensor and other kwargs - auto result = r.tensor(2); - TORCH_CHECK(!r.toBool(6), " `pin_memory` and `out` parameters are incompatible"); - check_out_type_matches(result, r.scalartype(3), r.isNone(3), r.layout(4), - r.device(5), r.isNone(5)); - - return wrap(dispatch_full(size, fill_val, result).set_requires_grad(r.toBool(7))); - } else if (r.idx == 1) { - // full.names - if (r.isNone(2)) { - return wrap(dispatch_full(size, fill_val, c10::nullopt, options).set_requires_grad(r.toBool(7))); - } - - // Converts from c10::optional to c10::optional - auto raw_names = r.toDimnameListOptional(2); - c10::optional names(*raw_names); - return wrap(dispatch_full(size, fill_val, names, options).set_requires_grad(r.toBool(7))); - } - - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -inline Tensor dispatch_randint(int64_t high, IntArrayRef size, c10::optional generator, Tensor result) { - pybind11::gil_scoped_release no_gil; - return at::randint_out(result, high, size, generator); -} -inline Tensor dispatch_randint(int64_t high, IntArrayRef size, c10::optional generator, const TensorOptions & options) { - torch::utils::maybe_initialize_cuda(options); - pybind11::gil_scoped_release no_gil; - return torch::randint(high, size, generator, options); -} -inline Tensor dispatch_randint(int64_t high, IntArrayRef size, Tensor result) { - pybind11::gil_scoped_release no_gil; - return at::randint_out(result, high, size); -} -inline Tensor dispatch_randint(int64_t high, IntArrayRef size, const TensorOptions & options) { - torch::utils::maybe_initialize_cuda(options); - pybind11::gil_scoped_release no_gil; - return torch::randint(high, size, options); -} -inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, c10::optional generator, Tensor result) { - pybind11::gil_scoped_release no_gil; - return at::randint_out(result, low, high, size, generator); -} -inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, c10::optional generator, const TensorOptions & options) { - torch::utils::maybe_initialize_cuda(options); - pybind11::gil_scoped_release no_gil; - return torch::randint(low, high, size, generator, options); -} -inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, Tensor result) { - pybind11::gil_scoped_release no_gil; - return at::randint_out(result, low, high, size); -} -inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, const TensorOptions & options) { - torch::utils::maybe_initialize_cuda(options); - pybind11::gil_scoped_release no_gil; - return torch::randint(low, high, size, options); -} - -static PyObject * THPVariable_randint(PyObject* self_, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - static PythonArgParser parser({ - "randint(int64_t high, IntArrayRef size, *, Generator generator=None, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)", - "randint(int64_t low, int64_t high, IntArrayRef size, *, Generator generator=None, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)", - }, /*traceable=*/false); - - ParsedArgs<9> parsed_args; - auto r = parser.parse(args, kwargs, parsed_args); - - if(r.has_torch_function()) { - return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); - } - - if (r.idx == 0) { - if (r.isNone(3)) { - auto high = r.toInt64(0); - auto size = r.intlist(1); - auto generator = r.generator(2); - // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) - auto dtype = r.scalartypeWithDefault(4, at::ScalarType::Long); - auto device = r.device(6); - const auto options = TensorOptions() - .dtype(dtype) - .device(device) - .layout(r.layout(5)) - .requires_grad(r.toBool(7)); - return wrap(dispatch_randint(high, size, generator, options)); - } else { - check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), - r.layout(5), r.device(6), r.isNone(6)); - return wrap(dispatch_randint(r.toInt64(0), r.intlist(1), r.generator(2), r.tensor(3)).set_requires_grad(r.toBool(7))); - } - } else if (r.idx == 1) { - if (r.isNone(4)) { - auto low = r.toInt64(0); - auto high = r.toInt64(1); - auto size = r.intlist(2); - auto generator = r.generator(3); - // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) - auto dtype = r.scalartypeWithDefault(5, at::ScalarType::Long); - auto device = r.device(7); - const auto options = TensorOptions() - .dtype(dtype) - .device(device) - .layout(r.layout(6)) - .requires_grad(r.toBool(8)); - return wrap(dispatch_randint(low, high, size, generator, options)); - } else { - check_out_type_matches(r.tensor(4), r.scalartype(5), r.isNone(5), - r.layout(6), r.device(7), r.isNone(7)); - return wrap(dispatch_randint(r.toInt64(0), r.toInt64(1), r.intlist(2), r.generator(3), r.tensor(4)).set_requires_grad(r.toBool(8))); - } - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -// implemented on python object to allow torch.as_tensor to be constructed with arbitrarily nested -// python objects - list, tuple, np array, scalar, etc. -static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - jit::tracer::warn("torch.as_tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::as_tensor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); - END_HANDLE_TH_ERRORS -} - -// implemented on python object here because PyObject currently not natively declarable -// See: ATen/native/README.md for more context -static PyObject * THPVariable_from_numpy(PyObject* module, PyObject* arg) -{ - HANDLE_TH_ERRORS - jit::tracer::warn("torch.from_numpy", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::tensor_from_numpy(arg)); - END_HANDLE_TH_ERRORS -} - -static Tensor dispatch_nonzero(const Tensor & self) { - pybind11::gil_scoped_release no_gil; - OptionalDeviceGuard device_guard(device_of(self)); - return self.nonzero(); -} - -static Tensor dispatch_nonzero(const Tensor & self, Tensor out) { - pybind11::gil_scoped_release no_gil; - OptionalDeviceGuard device_guard(device_of(self)); - return at::nonzero_out(out, self); -} - -static std::vector dispatch_nonzero_numpy(const Tensor & self) { - pybind11::gil_scoped_release no_gil; - OptionalDeviceGuard device_guard(device_of(self)); - return self.nonzero_numpy(); -} - -static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* kwargs); - -static PyObject * THPVariable_sparse_csr_tensor(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - jit::tracer::warn("torch.sparse_csr_tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::sparse_csr_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); - END_HANDLE_TH_ERRORS -} - -static PyObject * THPVariable__sparse_csr_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - jit::tracer::warn("torch._sparse_csr_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::_sparse_csr_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); - END_HANDLE_TH_ERRORS -} - -static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - jit::tracer::warn("torch.sparse_coo_tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); - END_HANDLE_TH_ERRORS -} - -static PyObject * THPVariable__sparse_coo_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - jit::tracer::warn("torch._sparse_coo_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::_sparse_coo_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); - END_HANDLE_TH_ERRORS -} - -// implemented on python object to allow torch.tensor to be constructed with arbitrarily nested -// python objects - list, tuple, np array, scalar, etc. -static PyObject * THPVariable_tensor(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - jit::tracer::warn("torch.tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); - END_HANDLE_TH_ERRORS -} - -static PyObject * THPVariable_get_device(PyObject* self_, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - static PythonArgParser parser({ - "get_device(Tensor input)", - }, /*traceable=*/false); - - ParsedArgs<1> parsed_args; - auto r = parser.parse(args, kwargs, parsed_args); - - if (r.idx == 0) { - return wrap(r.tensor(0).get_device()); - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -static PyObject * THPVariable_frombuffer(PyObject* self_, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - static PythonArgParser parser({ - "frombuffer(PyObject* buffer, *, ScalarType dtype, int64_t count=-1, int64_t offset=0, bool requires_grad=False)", - }, /*traceable=*/false); - - PyObject* ret = nullptr; - ParsedArgs<5> parsed_args; - auto r = parser.parse(args, kwargs, parsed_args); - - if (r.idx == 0) { - auto buffer = r.pyobject(0); - auto dtype = r.scalartype(1); - auto count = r.toInt64(2); - auto offset = r.toInt64(3); - auto requires_grad = r.toBool(4); - - auto elsize = at::elementSize(dtype); - size_t actual_count = 0; - Py_buffer view; - - TORCH_CHECK_VALUE( - PyObject_CheckBuffer(buffer) != 0, - "object does not implement Python buffer protocol."); - - if (PyObject_GetBuffer(buffer, &view, PyBUF_WRITABLE) < 0) { - TORCH_CHECK( - PyObject_GetBuffer(buffer, &view, PyBUF_SIMPLE) >= 0, - "could not retrieve buffer from object"); - TORCH_WARN_ONCE( - "The given buffer is not writable, and PyTorch does " - "not support non-writable tensors. This means you can write to the " - "underlying (supposedly non-writable) buffer using the tensor. " - "You may want to copy the buffer to protect its data or make it writable " - "before converting it to a tensor. This type of warning will be " - "suppressed for the rest of this program."); - PyErr_Clear(); - } - - Py_INCREF(view.obj); - THPObjectPtr obj(view.obj); - - auto len = view.len; - auto buf = view.buf; - PyBuffer_Release(&view); - - TORCH_CHECK_VALUE( - len > 0 && count != 0, - "both buffer length (", len, ") and count (", count, ") must not be 0"); - TORCH_CHECK_VALUE( - offset >= 0 && offset < len, - "offset (", offset, " bytes) must be non-negative and no greater than " - "buffer length (", len, " bytes) minus 1"); - TORCH_CHECK_VALUE( - count > 0 || (len - offset) % elsize == 0, - "buffer length (", len - offset, " bytes) after offset (", offset, " bytes) " - "must be a multiple of element size (", elsize, ")"); - - if (count < 0) { - actual_count = (len - offset) / elsize; - } else { - actual_count = static_cast(count); - } - - TORCH_CHECK_VALUE( - static_cast(offset) + actual_count * elsize <= len, - "requested buffer length (", actual_count, " * ", elsize, " bytes) " - "after offset (", offset, " bytes) must not be greater than actual " - "buffer length (", len, " bytes)"); - - auto offset_buf = static_cast(buf) + offset; - auto options = TensorOptions() - .dtype(dtype) - .device(c10::kCPU); - - auto tensor = at::for_blob(offset_buf, static_cast(actual_count)) - .options(options) - .deleter([obj = obj.release()](void*) { - pybind11::gil_scoped_acquire gil; - Py_DECREF(obj); - }) - .make_tensor(); - tensor.set_requires_grad(requires_grad); - ret = wrap(tensor); - } - - return ret; - - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -static PyObject * THPVariable_numel(PyObject* self_, PyObject* args, PyObject* kwargs); - -// linspace -static PyObject * THPVariable_linspace(PyObject* self_, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - static PythonArgParser parser({ - "linspace(Scalar start, Scalar end, int64_t? steps=None, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", - }, /*traceable=*/true); - - ParsedArgs<9> parsed_args; - auto _r = parser.parse(nullptr, args, kwargs, parsed_args); - if(_r.has_torch_function()) { - return handle_torch_function(_r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); - } - if (_r.isNone(3)) { - // aten::linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - - // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) - // This leads to problem in the operator argument checks, - // when either `start` or `end` is complex and dtype is None - const auto options = TensorOptions() - .dtype(_r.scalartypeOptional(4)) - .device(_r.device(6)) - .layout(_r.layoutOptional(5)) - .requires_grad(_r.toBool(8)) - .pinned_memory(_r.toBool(7)); - torch::utils::maybe_initialize_cuda(options); - - auto dispatch_linspace = [](Scalar start, Scalar end, c10::optional steps, TensorOptions options) -> Tensor { - pybind11::gil_scoped_release no_gil; - return torch::linspace(start, end, steps, options); - }; - return wrap(dispatch_linspace(_r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), options)); - } else { - // aten::linspace.out(Scalar start, Scalar end, int? steps=None, *, Tensor(a!) out) -> Tensor(a!) - check_out_type_matches(_r.tensor(3), _r.scalartype(4), - _r.isNone(4), _r.layoutOptional(5), - _r.device(6), _r.isNone(6)); - - auto dispatch_linspace_out = [](Tensor out, Scalar start, Scalar end, c10::optional steps) -> Tensor { - pybind11::gil_scoped_release no_gil; - return at::linspace_out(out, start, end, steps); - }; - return wrap(dispatch_linspace_out(_r.tensor(3), _r.scalar(0), _r.scalar(1), _r.toInt64Optional(2)).set_requires_grad(_r.toBool(8))); - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -// logspace -static PyObject * THPVariable_logspace(PyObject* self_, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - static PythonArgParser parser({ - "logspace(Scalar start, Scalar end, int64_t? steps=None, double base=10.0, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", - }, /*traceable=*/true); - - ParsedArgs<10> parsed_args; - auto _r = parser.parse(nullptr, args, kwargs, parsed_args); - if(_r.has_torch_function()) { - return handle_torch_function(_r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); - } - if (_r.isNone(4)) { - // aten::logspace(Scalar start, Scalar end, int? steps=None, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - - // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) - // This leads to problem in the operator argument checks, - // when either `start` or `end` is complex and dtype is None - const auto options = TensorOptions() - .dtype(_r.scalartypeOptional(5)) - .device(_r.device(7)) - .layout(_r.layoutOptional(6)) - .requires_grad(_r.toBool(9)) - .pinned_memory(_r.toBool(8)); - torch::utils::maybe_initialize_cuda(options); - - auto dispatch_logspace = [](Scalar start, Scalar end, c10::optional steps, double base, TensorOptions options) -> Tensor { - pybind11::gil_scoped_release no_gil; - return torch::logspace(start, end, steps, base, options); - }; - return wrap(dispatch_logspace(_r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), _r.toDouble(3), options)); - } else { - // aten::logspace.out(Scalar start, Scalar end, int? steps=None, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) - check_out_type_matches(_r.tensor(4), _r.scalartype(5), - _r.isNone(5), _r.layoutOptional(6), - _r.device(7), _r.isNone(7)); - - auto dispatch_logspace_out = [](Tensor out, Scalar start, Scalar end, c10::optional steps, double base) -> Tensor { - pybind11::gil_scoped_release no_gil; - return at::logspace_out(out, start, end, steps, base); - }; - return wrap(dispatch_logspace_out(_r.tensor(4), _r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), _r.toDouble(3)).set_requires_grad(_r.toBool(9))); - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} +namespace torch { namespace autograd { // generated forward declarations start here ${py_forwards} -// Wrapper converts a raised TypeError into returning NotImplemented -// Used to implement binary arithmetic operators -template -static PyObject * TypeError_to_NotImplemented_(PyObject* self, PyObject* args, PyObject* kwargs) { - PyObject* ret = Func(self, args, kwargs); - if (!ret && PyErr_ExceptionMatches(PyExc_TypeError)) { - PyErr_Clear(); - Py_INCREF(Py_NotImplemented); - ret = Py_NotImplemented; - } - return ret; -} - -// XXX: ops that are bound here are not exposed to the C++ api nor the JIT. -// Any new ops added here should be accompanied with a comment why they are not -// being registered through native_functions.yaml, and be tagged cpp / JIT -static PyMethodDef torch_functions[] = { - {"arange", castPyCFunctionWithKeywords(THPVariable_arange), - METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"as_tensor", castPyCFunctionWithKeywords(THPVariable_as_tensor), - METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"dsmm", castPyCFunctionWithKeywords(THPVariable_mm), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"from_numpy", THPVariable_from_numpy, METH_STATIC | METH_O, NULL}, - {"frombuffer", castPyCFunctionWithKeywords(THPVariable_frombuffer), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"full", castPyCFunctionWithKeywords(THPVariable_full), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"hsmm", castPyCFunctionWithKeywords(THPVariable_hspmm), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"linspace", castPyCFunctionWithKeywords(THPVariable_linspace), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"logspace", castPyCFunctionWithKeywords(THPVariable_logspace), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"nonzero", castPyCFunctionWithKeywords(THPVariable_nonzero), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"randint", castPyCFunctionWithKeywords(THPVariable_randint), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"range", castPyCFunctionWithKeywords(THPVariable_range), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"saddmm", castPyCFunctionWithKeywords(THPVariable_sspaddmm), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"sparse_coo_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_coo_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"_sparse_coo_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_coo_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"_validate_sparse_coo_tensor_args", castPyCFunctionWithKeywords(THPVariable__validate_sparse_coo_tensor_args), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"sparse_csr_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_csr_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"_sparse_csr_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_csr_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"_validate_sparse_csr_tensor_args", castPyCFunctionWithKeywords(THPVariable__validate_sparse_csr_tensor_args), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"spmm", castPyCFunctionWithKeywords(THPVariable_mm), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"tensor", castPyCFunctionWithKeywords(THPVariable_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"get_device", castPyCFunctionWithKeywords(THPVariable_get_device), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, - {"numel", castPyCFunctionWithKeywords(THPVariable_numel), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, +static PyMethodDef torch_functions_shard[] = { ${py_method_defs} - {NULL} }; -static PyTypeObject THPVariableFunctions = { - PyVarObject_HEAD_INIT(NULL, 0) - "torch._C._VariableFunctionsClass", /* tp_name */ - 0, /* tp_basicsize */ - 0, /* tp_itemsize */ - 0, /* tp_dealloc */ - 0, /* tp_vectorcall_offset */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_reserved */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - NULL, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - torch_functions, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - 0 /* tp_new */ -}; - -void initTorchFunctions(PyObject* module) { - if (PyType_Ready(&THPVariableFunctions) < 0) { - throw python_error(); - } - Py_INCREF(&THPVariableFunctions); - - // Steals - Py_INCREF(&THPVariableFunctions); - if (PyModule_AddObject(module, "_VariableFunctionsClass", reinterpret_cast(&THPVariableFunctions)) < 0) { - throw python_error(); - } - // PyType_GenericNew returns a new reference - THPVariableFunctionsModule = PyType_GenericNew(&THPVariableFunctions, Py_None, Py_None); - // PyModule_AddObject steals a reference - if (PyModule_AddObject(module, "_VariableFunctions", THPVariableFunctionsModule) < 0) { - throw python_error(); - } +void gatherTorchFunctions${shard_id}(std::vector &torch_functions) { + constexpr size_t num_functions = sizeof(torch_functions_shard) / sizeof(torch_functions_shard[0]); + torch_functions.insert( + torch_functions.end(), + torch_functions_shard, + torch_functions_shard + num_functions); } // generated methods start here ${py_methods} -static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - static PythonArgParser parser({ - "nonzero(Tensor input, *, bool as_tuple=False, Tensor out=None)", - }); - ParsedArgs<3> parsed_args; - auto r = parser.parse(args, kwargs, parsed_args); - - if(r.has_torch_function()){ - return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); - } - - const auto as_tuple = r.toBool(1); - const auto has_out = !r.isNone(2); - - if (as_tuple) { - TORCH_CHECK(!has_out, "nonzero does not support the out kwarg when as_tuple is True"); - return wrap(dispatch_nonzero_numpy(r.tensor(0))); - } - - if (has_out) { - return wrap(dispatch_nonzero(r.tensor(0), r.tensor(2))); - } - - return wrap(dispatch_nonzero(r.tensor(0))); - - END_HANDLE_TH_ERRORS -} - -static PyObject * THPVariable_numel(PyObject* self_, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - static PythonArgParser parser({ - "numel(Tensor input)", - }, /*traceable=*/false); - - ParsedArgs<1> parsed_args; - auto r = parser.parse(args, kwargs, parsed_args); - - if(r.has_torch_function()){ - return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); - } - - if (r.idx == 0) { - return wrap(r.tensor(0).numel()); - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} }} // namespace torch::autograd diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 0d888ea8a4cb0..5f4cc0df522f5 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -23,7 +23,9 @@ GENERATED_CPP = [ "autograd/generated/python_fft_functions.cpp", "autograd/generated/python_linalg_functions.cpp", "autograd/generated/python_special_functions.cpp", - "autograd/generated/python_torch_functions.cpp", + "autograd/generated/python_torch_functions_0.cpp", + "autograd/generated/python_torch_functions_1.cpp", + "autograd/generated/python_torch_functions_2.cpp", "autograd/generated/python_variable_methods.cpp", ] @@ -664,6 +666,7 @@ libtorch_python_core_sources = [ "torch/csrc/autograd/python_function.cpp", "torch/csrc/autograd/python_hook.cpp", "torch/csrc/autograd/python_legacy_variable.cpp", + "torch/csrc/autograd/python_torch_functions_manual.cpp", "torch/csrc/autograd/python_variable.cpp", "torch/csrc/autograd/python_variable_indexing.cpp", "torch/csrc/jit/backends/backend_init.cpp", @@ -760,7 +763,9 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"): "autograd/generated/python_fft_functions.cpp", "autograd/generated/python_linalg_functions.cpp", "autograd/generated/python_special_functions.cpp", - "autograd/generated/python_torch_functions.cpp", + "autograd/generated/python_torch_functions_0.cpp", + "autograd/generated/python_torch_functions_1.cpp", + "autograd/generated/python_torch_functions_2.cpp", "autograd/generated/python_variable_methods.cpp", ]] diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 44bb3b4f87e1d..ffa4ed7a1c70e 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -1,5 +1,5 @@ import os -from typing import List, Dict, Optional, Tuple, Set, Callable, Any, Union, Sequence, TypeVar +from typing import List, Dict, Optional, Tuple, Set, Callable, Any, Union, Sequence, TypeVar, Iterable from typing_extensions import Literal import yaml from collections import OrderedDict, defaultdict, namedtuple @@ -858,7 +858,7 @@ def write(self, filename: str, env_callable: Callable[[], Union[str, Union[str, def write_sharded( self, filename: str, - items: List[T], + items: Iterable[T], *, key_fn: Callable[[T], str], env_callable: Callable[[T], Dict[str, List[str]]], diff --git a/torch/csrc/autograd/python_torch_functions.h b/torch/csrc/autograd/python_torch_functions.h new file mode 100644 index 0000000000000..58257794812ee --- /dev/null +++ b/torch/csrc/autograd/python_torch_functions.h @@ -0,0 +1,25 @@ +#include + +#include + + +namespace torch { namespace autograd { + +extern PyObject* THPVariableFunctionsModule; + +// Wrapper converts a raised TypeError into returning NotImplemented +// Used to implement binary arithmetic operators +template +inline PyObject * TypeError_to_NotImplemented_(PyObject* self, PyObject* args, PyObject* kwargs) { + PyObject* ret = Func(self, args, kwargs); + if (!ret && PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Clear(); + Py_INCREF(Py_NotImplemented); + ret = Py_NotImplemented; + } + return ret; +} + +void initTorchFunctions(); + +}} // namespace torch::autograd diff --git a/torch/csrc/autograd/python_torch_functions_manual.cpp b/torch/csrc/autograd/python_torch_functions_manual.cpp new file mode 100644 index 0000000000000..a54d1017bcee8 --- /dev/null +++ b/torch/csrc/autograd/python_torch_functions_manual.cpp @@ -0,0 +1,826 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +using at::Tensor; +using at::Device; +using at::Layout; +using at::Scalar; +using at::ScalarType; +using at::Backend; +using at::OptionalDeviceGuard; +using at::DeviceGuard; +using at::TensorOptions; +using at::IntArrayRef; +using at::Generator; +using at::TensorList; +using at::Dimname; +using at::DimnameList; +using at::ArrayRef; + +using torch::utils::check_out_type_matches; +using namespace torch::autograd::utils; + +namespace torch { namespace autograd { + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +PyObject* THPVariableFunctionsModule = nullptr; + + +inline Tensor dispatch_arange(const Scalar& end, Tensor result) { + pybind11::gil_scoped_release no_gil; + return at::arange_out(result, end); +} + +inline Tensor dispatch_arange(const Scalar& end, const TensorOptions& options) { + torch::utils::maybe_initialize_cuda(options); + pybind11::gil_scoped_release no_gil; + return torch::arange(end, options); +} + +inline Tensor dispatch_arange(const Scalar& start, const Scalar& end, const Scalar& step, Tensor result) { + pybind11::gil_scoped_release no_gil; + return at::arange_out(result, start, end, step); +} + +inline Tensor dispatch_arange(const Scalar& start, const Scalar& end, const Scalar& step, const TensorOptions& options) { + torch::utils::maybe_initialize_cuda(options); + pybind11::gil_scoped_release no_gil; + return torch::arange(start, end, step, options); +} + +static PyObject * THPVariable_arange(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "arange(Scalar end, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", + "arange(Scalar start, Scalar end, Scalar step=1, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", + }, /*traceable=*/true); + + ParsedArgs<9> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + + if(r.has_torch_function()) { + return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); + } + + if (r.idx == 0) { + if (r.isNone(1)) { + auto end = r.scalar(0); + // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) + c10::optional scalarType = r.scalartypeOptional(2); + const auto options = TensorOptions() + .dtype(scalarType) + .device(r.device(4)) + .layout(r.layout(3)) + .requires_grad(r.toBool(6)) + .pinned_memory(r.toBool(5)); + return wrap(dispatch_arange(end, options)); + } else { + TORCH_CHECK(!r.toBool(5), " `pin_memory` and `out` parameters are incompatible"); + check_out_type_matches(r.tensor(1), r.scalartype(2), r.isNone(2), r.layout(3), + r.device(4), r.isNone(4)); + return wrap(dispatch_arange(r.scalar(0), r.tensor(1)).set_requires_grad(r.toBool(6))); + } + } else if (r.idx == 1) { + if (r.isNone(3)) { + auto start = r.scalar(0); + auto end = r.scalar(1); + auto step = r.scalar(2); + // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) + c10::optional scalarType = r.scalartypeOptional(4); + const auto options = TensorOptions() + .dtype(scalarType) + .device(r.device(6)) + .layout(r.layout(5)) + .requires_grad(r.toBool(8)) + .pinned_memory(r.toBool(7)); + return wrap(dispatch_arange(start, end, step, options)); + } else { + TORCH_CHECK(!r.toBool(7), " `pin_memory` and `out` parameters are incompatible"); + check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), r.layout(5), + r.device(6), r.isNone(6)); + return wrap(dispatch_arange(r.scalar(0), r.scalar(1), r.scalar(2), r.tensor(3)).set_requires_grad(r.toBool(8))); + } + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +inline Tensor dispatch_range(const Scalar& start, const Scalar& end, const Scalar& step, Tensor result) { + pybind11::gil_scoped_release no_gil; + OptionalDeviceGuard device_guard(device_of(result)); + return at::range_out(result, start, end, step); +} + +inline Tensor dispatch_range(const Scalar& start, const Scalar& end, const Scalar& step, const TensorOptions& options) { + torch::utils::maybe_initialize_cuda(options); + pybind11::gil_scoped_release no_gil; + DeviceGuard device_guard(options.device()); + return torch::range(start, end, step, options); +} + +static PyObject * THPVariable_range(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "range(Scalar start, Scalar end, Scalar step=1, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)", + }); + + ParsedArgs<8> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + + if (r.idx == 0) { + auto ret = PyErr_WarnEx( + PyExc_UserWarning, + "torch.range is deprecated and will be removed in a future release " + "because its behavior is inconsistent with Python's range builtin. " + "Instead, use torch.arange, which produces values in [start, end).", + 1); + if (ret != 0) throw python_error(); + if (r.isNone(3)) { + const auto options = TensorOptions() + .dtype(r.scalartype(4)) + .device(r.device(6)) + .layout(r.layout(5)) + .requires_grad(r.toBool(7)); + return wrap(dispatch_range(r.scalar(0), r.scalar(1), r.scalar(2), options)); + } else { + check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), + r.layout(5), r.device(6), r.isNone(6)); + return wrap(dispatch_range(r.scalar(0), r.scalar(1), r.scalar(2), r.tensor(3)).set_requires_grad(r.toBool(7))); + } + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +inline Tensor dispatch_full( + IntArrayRef size, + const Scalar& fill_val, + const TensorOptions& options) { + torch::utils::maybe_initialize_cuda(options); + pybind11::gil_scoped_release no_gil; + return at::full(size, fill_val, options); +} + +inline Tensor dispatch_full( + IntArrayRef size, + const Scalar& fill_val, + c10::optional names, + const TensorOptions& options) { + torch::utils::maybe_initialize_cuda(options); + pybind11::gil_scoped_release no_gil; + return at::full(size, fill_val, names, options); +} + +inline Tensor dispatch_full( + IntArrayRef size, + const Scalar& fill_val, + Tensor result) { + pybind11::gil_scoped_release no_gil; + return at::full_out(result, size, fill_val); +} + +static PyObject * THPVariable_full(PyObject* self, PyObject* args, PyObject* kwargs) { + HANDLE_TH_ERRORS + + static PythonArgParser parser({ + "full(IntArrayRef size, Scalar fill_value, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", + "full(IntArrayRef size, Scalar fill_value, *, DimnameList names=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", + }, /*traceable=*/true); + + // Acquires (common) arguments + ParsedArgs<8> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + + if(r.has_torch_function()) { + return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); + } + + auto size = r.intlist(0); + auto fill_val = r.scalar(1); + const auto options = TensorOptions{} + .dtype(r.scalartypeOptional(3)) + .layout(r.layout(4)) + .device(r.device(5)) + .pinned_memory(r.toBool(6)); + + if (r.idx == 0) { + // full + if (r.isNone(2)) { + return wrap(dispatch_full(size, fill_val, options).set_requires_grad(r.toBool(7))); + } + + // full.out + // Validates out tensor and other kwargs + auto result = r.tensor(2); + TORCH_CHECK(!r.toBool(6), " `pin_memory` and `out` parameters are incompatible"); + check_out_type_matches(result, r.scalartype(3), r.isNone(3), r.layout(4), + r.device(5), r.isNone(5)); + + return wrap(dispatch_full(size, fill_val, result).set_requires_grad(r.toBool(7))); + } else if (r.idx == 1) { + // full.names + if (r.isNone(2)) { + return wrap(dispatch_full(size, fill_val, c10::nullopt, options).set_requires_grad(r.toBool(7))); + } + + // Converts from c10::optional to c10::optional + auto raw_names = r.toDimnameListOptional(2); + c10::optional names(*raw_names); + return wrap(dispatch_full(size, fill_val, names, options).set_requires_grad(r.toBool(7))); + } + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +inline Tensor dispatch_randint(int64_t high, IntArrayRef size, c10::optional generator, Tensor result) { + pybind11::gil_scoped_release no_gil; + return at::randint_out(result, high, size, generator); +} +inline Tensor dispatch_randint(int64_t high, IntArrayRef size, c10::optional generator, const TensorOptions & options) { + torch::utils::maybe_initialize_cuda(options); + pybind11::gil_scoped_release no_gil; + return torch::randint(high, size, generator, options); +} +inline Tensor dispatch_randint(int64_t high, IntArrayRef size, Tensor result) { + pybind11::gil_scoped_release no_gil; + return at::randint_out(result, high, size); +} +inline Tensor dispatch_randint(int64_t high, IntArrayRef size, const TensorOptions & options) { + torch::utils::maybe_initialize_cuda(options); + pybind11::gil_scoped_release no_gil; + return torch::randint(high, size, options); +} +inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, c10::optional generator, Tensor result) { + pybind11::gil_scoped_release no_gil; + return at::randint_out(result, low, high, size, generator); +} +inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, c10::optional generator, const TensorOptions & options) { + torch::utils::maybe_initialize_cuda(options); + pybind11::gil_scoped_release no_gil; + return torch::randint(low, high, size, generator, options); +} +inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, Tensor result) { + pybind11::gil_scoped_release no_gil; + return at::randint_out(result, low, high, size); +} +inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, const TensorOptions & options) { + torch::utils::maybe_initialize_cuda(options); + pybind11::gil_scoped_release no_gil; + return torch::randint(low, high, size, options); +} + +static PyObject * THPVariable_randint(PyObject* self_, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "randint(int64_t high, IntArrayRef size, *, Generator generator=None, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)", + "randint(int64_t low, int64_t high, IntArrayRef size, *, Generator generator=None, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)", + }, /*traceable=*/false); + + ParsedArgs<9> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + + if(r.has_torch_function()) { + return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); + } + + if (r.idx == 0) { + if (r.isNone(3)) { + auto high = r.toInt64(0); + auto size = r.intlist(1); + auto generator = r.generator(2); + // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) + auto dtype = r.scalartypeWithDefault(4, at::ScalarType::Long); + auto device = r.device(6); + const auto options = TensorOptions() + .dtype(dtype) + .device(device) + .layout(r.layout(5)) + .requires_grad(r.toBool(7)); + return wrap(dispatch_randint(high, size, generator, options)); + } else { + check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), + r.layout(5), r.device(6), r.isNone(6)); + return wrap(dispatch_randint(r.toInt64(0), r.intlist(1), r.generator(2), r.tensor(3)).set_requires_grad(r.toBool(7))); + } + } else if (r.idx == 1) { + if (r.isNone(4)) { + auto low = r.toInt64(0); + auto high = r.toInt64(1); + auto size = r.intlist(2); + auto generator = r.generator(3); + // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) + auto dtype = r.scalartypeWithDefault(5, at::ScalarType::Long); + auto device = r.device(7); + const auto options = TensorOptions() + .dtype(dtype) + .device(device) + .layout(r.layout(6)) + .requires_grad(r.toBool(8)); + return wrap(dispatch_randint(low, high, size, generator, options)); + } else { + check_out_type_matches(r.tensor(4), r.scalartype(5), r.isNone(5), + r.layout(6), r.device(7), r.isNone(7)); + return wrap(dispatch_randint(r.toInt64(0), r.toInt64(1), r.intlist(2), r.generator(3), r.tensor(4)).set_requires_grad(r.toBool(8))); + } + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +// implemented on python object to allow torch.as_tensor to be constructed with arbitrarily nested +// python objects - list, tuple, np array, scalar, etc. +static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + jit::tracer::warn("torch.as_tensor", jit::tracer::WARN_CONSTRUCTOR); + return THPVariable_Wrap(torch::utils::as_tensor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + END_HANDLE_TH_ERRORS +} + +// implemented on python object here because PyObject currently not natively declarable +// See: ATen/native/README.md for more context +static PyObject * THPVariable_from_numpy(PyObject* module, PyObject* arg) +{ + HANDLE_TH_ERRORS + jit::tracer::warn("torch.from_numpy", jit::tracer::WARN_CONSTRUCTOR); + return THPVariable_Wrap(torch::utils::tensor_from_numpy(arg)); + END_HANDLE_TH_ERRORS +} + +static Tensor dispatch_nonzero(const Tensor & self) { + pybind11::gil_scoped_release no_gil; + OptionalDeviceGuard device_guard(device_of(self)); + return self.nonzero(); +} + +static Tensor dispatch_nonzero(const Tensor & self, Tensor out) { + pybind11::gil_scoped_release no_gil; + OptionalDeviceGuard device_guard(device_of(self)); + return at::nonzero_out(out, self); +} + +static std::vector dispatch_nonzero_numpy(const Tensor & self) { + pybind11::gil_scoped_release no_gil; + OptionalDeviceGuard device_guard(device_of(self)); + return self.nonzero_numpy(); +} + +static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* kwargs); + +static PyObject * THPVariable_sparse_csr_tensor(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + jit::tracer::warn("torch.sparse_csr_tensor", jit::tracer::WARN_CONSTRUCTOR); + return THPVariable_Wrap(torch::utils::sparse_csr_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + END_HANDLE_TH_ERRORS +} + +static PyObject * THPVariable__sparse_csr_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + jit::tracer::warn("torch._sparse_csr_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR); + return THPVariable_Wrap(torch::utils::_sparse_csr_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + END_HANDLE_TH_ERRORS +} + +static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + jit::tracer::warn("torch.sparse_coo_tensor", jit::tracer::WARN_CONSTRUCTOR); + return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + END_HANDLE_TH_ERRORS +} + +static PyObject * THPVariable__sparse_coo_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + jit::tracer::warn("torch._sparse_coo_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR); + return THPVariable_Wrap(torch::utils::_sparse_coo_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + END_HANDLE_TH_ERRORS +} + +// implemented on python object to allow torch.tensor to be constructed with arbitrarily nested +// python objects - list, tuple, np array, scalar, etc. +static PyObject * THPVariable_tensor(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + jit::tracer::warn("torch.tensor", jit::tracer::WARN_CONSTRUCTOR); + return THPVariable_Wrap(torch::utils::tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + END_HANDLE_TH_ERRORS +} + +static PyObject * THPVariable_get_device(PyObject* self_, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "get_device(Tensor input)", + }, /*traceable=*/false); + + ParsedArgs<1> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + + if (r.idx == 0) { + return wrap(r.tensor(0).get_device()); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +}static PyObject * THPVariable_frombuffer(PyObject* self_, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "frombuffer(PyObject* buffer, *, ScalarType dtype, int64_t count=-1, int64_t offset=0, bool requires_grad=False)", + }, /*traceable=*/false); + + PyObject* ret = nullptr; + ParsedArgs<5> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + + if (r.idx == 0) { + auto buffer = r.pyobject(0); + auto dtype = r.scalartype(1); + auto count = r.toInt64(2); + auto offset = r.toInt64(3); + auto requires_grad = r.toBool(4); + + auto elsize = at::elementSize(dtype); + size_t actual_count = 0; + Py_buffer view; + + TORCH_CHECK_VALUE( + PyObject_CheckBuffer(buffer) != 0, + "object does not implement Python buffer protocol."); + + if (PyObject_GetBuffer(buffer, &view, PyBUF_WRITABLE) < 0) { + TORCH_CHECK( + PyObject_GetBuffer(buffer, &view, PyBUF_SIMPLE) >= 0, + "could not retrieve buffer from object"); + TORCH_WARN_ONCE( + "The given buffer is not writable, and PyTorch does " + "not support non-writable tensors. This means you can write to the " + "underlying (supposedly non-writable) buffer using the tensor. " + "You may want to copy the buffer to protect its data or make it writable " + "before converting it to a tensor. This type of warning will be " + "suppressed for the rest of this program."); + PyErr_Clear(); + } + + Py_INCREF(view.obj); + THPObjectPtr obj(view.obj); + + auto len = view.len; + auto buf = view.buf; + PyBuffer_Release(&view); + + TORCH_CHECK_VALUE( + len > 0 && count != 0, + "both buffer length (", len, ") and count (", count, ") must not be 0"); + TORCH_CHECK_VALUE( + offset >= 0 && offset < len, + "offset (", offset, " bytes) must be non-negative and no greater than " + "buffer length (", len, " bytes) minus 1"); + TORCH_CHECK_VALUE( + count > 0 || (len - offset) % elsize == 0, + "buffer length (", len - offset, " bytes) after offset (", offset, " bytes) " + "must be a multiple of element size (", elsize, ")"); + + if (count < 0) { + actual_count = (len - offset) / elsize; + } else { + actual_count = static_cast(count); + } + + TORCH_CHECK_VALUE( + static_cast(offset) + actual_count * elsize <= len, + "requested buffer length (", actual_count, " * ", elsize, " bytes) " + "after offset (", offset, " bytes) must not be greater than actual " + "buffer length (", len, " bytes)"); + + auto offset_buf = static_cast(buf) + offset; + auto options = TensorOptions() + .dtype(dtype) + .device(c10::kCPU); + + auto tensor = at::for_blob(offset_buf, static_cast(actual_count)) + .options(options) + .deleter([obj = obj.release()](void*) { + pybind11::gil_scoped_acquire gil; + Py_DECREF(obj); + }) + .make_tensor(); + tensor.set_requires_grad(requires_grad); + ret = wrap(tensor); + } + + return ret; + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject * THPVariable_numel(PyObject* self_, PyObject* args, PyObject* kwargs); + +// linspace +static PyObject * THPVariable_linspace(PyObject* self_, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "linspace(Scalar start, Scalar end, int64_t? steps=None, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", + }, /*traceable=*/true); + + ParsedArgs<9> parsed_args; + auto _r = parser.parse(nullptr, args, kwargs, parsed_args); + if(_r.has_torch_function()) { + return handle_torch_function(_r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); + } + if (_r.isNone(3)) { + // aten::linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + + // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) + // This leads to problem in the operator argument checks, + // when either `start` or `end` is complex and dtype is None + const auto options = TensorOptions() + .dtype(_r.scalartypeOptional(4)) + .device(_r.device(6)) + .layout(_r.layoutOptional(5)) + .requires_grad(_r.toBool(8)) + .pinned_memory(_r.toBool(7)); + torch::utils::maybe_initialize_cuda(options); + + auto dispatch_linspace = [](Scalar start, Scalar end, c10::optional steps, TensorOptions options) -> Tensor { + pybind11::gil_scoped_release no_gil; + return torch::linspace(start, end, steps, options); + }; + return wrap(dispatch_linspace(_r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), options)); + } else { + // aten::linspace.out(Scalar start, Scalar end, int? steps=None, *, Tensor(a!) out) -> Tensor(a!) + check_out_type_matches(_r.tensor(3), _r.scalartype(4), + _r.isNone(4), _r.layoutOptional(5), + _r.device(6), _r.isNone(6)); + + auto dispatch_linspace_out = [](Tensor out, Scalar start, Scalar end, c10::optional steps) -> Tensor { + pybind11::gil_scoped_release no_gil; + return at::linspace_out(out, start, end, steps); + }; + return wrap(dispatch_linspace_out(_r.tensor(3), _r.scalar(0), _r.scalar(1), _r.toInt64Optional(2)).set_requires_grad(_r.toBool(8))); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +// logspace +static PyObject * THPVariable_logspace(PyObject* self_, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "logspace(Scalar start, Scalar end, int64_t? steps=None, double base=10.0, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", + }, /*traceable=*/true); + + ParsedArgs<10> parsed_args; + auto _r = parser.parse(nullptr, args, kwargs, parsed_args); + if(_r.has_torch_function()) { + return handle_torch_function(_r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); + } + if (_r.isNone(4)) { + // aten::logspace(Scalar start, Scalar end, int? steps=None, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + + // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) + // This leads to problem in the operator argument checks, + // when either `start` or `end` is complex and dtype is None + const auto options = TensorOptions() + .dtype(_r.scalartypeOptional(5)) + .device(_r.device(7)) + .layout(_r.layoutOptional(6)) + .requires_grad(_r.toBool(9)) + .pinned_memory(_r.toBool(8)); + torch::utils::maybe_initialize_cuda(options); + + auto dispatch_logspace = [](Scalar start, Scalar end, c10::optional steps, double base, TensorOptions options) -> Tensor { + pybind11::gil_scoped_release no_gil; + return torch::logspace(start, end, steps, base, options); + }; + return wrap(dispatch_logspace(_r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), _r.toDouble(3), options)); + } else { + // aten::logspace.out(Scalar start, Scalar end, int? steps=None, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) + check_out_type_matches(_r.tensor(4), _r.scalartype(5), + _r.isNone(5), _r.layoutOptional(6), + _r.device(7), _r.isNone(7)); + + auto dispatch_logspace_out = [](Tensor out, Scalar start, Scalar end, c10::optional steps, double base) -> Tensor { + pybind11::gil_scoped_release no_gil; + return at::logspace_out(out, start, end, steps, base); + }; + return wrap(dispatch_logspace_out(_r.tensor(4), _r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), _r.toDouble(3)).set_requires_grad(_r.toBool(9))); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +// XXX: ops that are bound here are not exposed to the C++ api nor the JIT. +// Any new ops added here should be accompanied with a comment why they are not +// being registered through native_functions.yaml, and be tagged cpp / JIT +// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) +static PyMethodDef torch_functions_manual[] = { + {"arange", castPyCFunctionWithKeywords(THPVariable_arange), + METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"as_tensor", castPyCFunctionWithKeywords(THPVariable_as_tensor), + METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"from_numpy", THPVariable_from_numpy, METH_STATIC | METH_O, nullptr}, + {"frombuffer", castPyCFunctionWithKeywords(THPVariable_frombuffer), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"full", castPyCFunctionWithKeywords(THPVariable_full), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"linspace", castPyCFunctionWithKeywords(THPVariable_linspace), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"logspace", castPyCFunctionWithKeywords(THPVariable_logspace), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"nonzero", castPyCFunctionWithKeywords(THPVariable_nonzero), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"randint", castPyCFunctionWithKeywords(THPVariable_randint), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"range", castPyCFunctionWithKeywords(THPVariable_range), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"sparse_coo_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_coo_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"_sparse_coo_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_coo_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"sparse_csr_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_csr_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"_sparse_csr_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_csr_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"tensor", castPyCFunctionWithKeywords(THPVariable_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"get_device", castPyCFunctionWithKeywords(THPVariable_get_device), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"numel", castPyCFunctionWithKeywords(THPVariable_numel), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, +}; + +static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "nonzero(Tensor input, *, bool as_tuple=False, Tensor out=None)", + }); + ParsedArgs<3> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + + if(r.has_torch_function()){ + return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); + } + + const auto as_tuple = r.toBool(1); + const auto has_out = !r.isNone(2); + + if (as_tuple) { + TORCH_CHECK(!has_out, "nonzero does not support the out kwarg when as_tuple is True"); + return wrap(dispatch_nonzero_numpy(r.tensor(0))); + } + + if (has_out) { + return wrap(dispatch_nonzero(r.tensor(0), r.tensor(2))); + } + + return wrap(dispatch_nonzero(r.tensor(0))); + + END_HANDLE_TH_ERRORS +} + +static PyObject * THPVariable_numel(PyObject* self_, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "numel(Tensor input)", + }, /*traceable=*/false); + + ParsedArgs<1> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + + if(r.has_torch_function()){ + return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); + } + + if (r.idx == 0) { + return wrap(r.tensor(0).numel()); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +// Sharded function definitions +void gatherTorchFunctions_0(std::vector &torch_functions); +void gatherTorchFunctions_1(std::vector &torch_functions); +void gatherTorchFunctions_2(std::vector &torch_functions); + +void gatherTorchFunctions(std::vector &torch_functions) { + constexpr size_t num_functions = sizeof(torch_functions_manual) / sizeof(torch_functions_manual[0]); + torch_functions.assign(torch_functions_manual, + torch_functions_manual + num_functions); + // NOTE: Must be synced with num_shards in tools/autograd/gen_python_functions.py + gatherTorchFunctions_0(torch_functions); + gatherTorchFunctions_1(torch_functions); + gatherTorchFunctions_2(torch_functions); + + static std::array, 4> aliases{{ + // Canonical function, alias name + {"sspaddmm", "saddmm"}, + {"mm", "spmm"}, + {"mm", "dsmm"}, + {"hspmm", "hsmm"} + }}; + + for (const auto& alias : aliases) { + auto it = std::find_if(torch_functions.begin(), torch_functions.end(), + [&](const PyMethodDef& def) { + return strcmp(def.ml_name, alias.first) == 0; + }); + TORCH_INTERNAL_ASSERT( + it != torch_functions.end(), + "Failed to create function alias from ", alias.first, " to ", alias.second); + PyMethodDef alias_def = *it; + alias_def.ml_name = alias.second; + + torch_functions.push_back(alias_def); + } + + torch_functions.push_back({nullptr}); + torch_functions.shrink_to_fit(); +} + +static PyTypeObject THPVariableFunctions = { + PyVarObject_HEAD_INIT(nullptr, 0) + "torch._C._VariableFunctionsClass", /* tp_name */ + 0, /* tp_basicsize */ + 0, /* tp_itemsize */ + nullptr, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + nullptr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + nullptr, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + nullptr, /* tp_doc */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ + nullptr, /* tp_methods */ + nullptr, /* tp_members */ + nullptr, /* tp_getset */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ + 0, /* tp_dictoffset */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ + nullptr /* tp_new */ +}; + +void initTorchFunctions(PyObject *module) { + static std::vector torch_functions; + gatherTorchFunctions(torch_functions); + THPVariableFunctions.tp_methods = torch_functions.data(); + + if (PyType_Ready(&THPVariableFunctions) < 0) { + throw python_error(); + } + Py_INCREF(&THPVariableFunctions); + + // Steals + Py_INCREF(&THPVariableFunctions); + if (PyModule_AddObject(module, "_VariableFunctionsClass", + reinterpret_cast(&THPVariableFunctions)) < 0) { + throw python_error(); + } + // PyType_GenericNew returns a new reference + THPVariableFunctionsModule = PyType_GenericNew(&THPVariableFunctions, Py_None, Py_None); + // PyModule_AddObject steals a reference + if (PyModule_AddObject(module, "_VariableFunctions", THPVariableFunctionsModule) < 0) { + throw python_error(); + } +} + +}} // namespace torch::autograd From ba0e6a1e03d110b7a6b95de7c80651ec623e8135 Mon Sep 17 00:00:00 2001 From: John Clow Date: Wed, 25 Aug 2021 15:27:37 -0700 Subject: [PATCH 231/530] [EASY] Update the clang-tidy error message (#63370) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63370 As shown by this CI run, the actual thing that is incorrect is the prompt. https://github.com/pytorch/pytorch/actions/runs/1137298261 The CI runs the below command instead of the original command. The original command errors out when importing another file on line 1. Trying to fix the code to work with the original command causes the CI to error out. We should actually ask the user to run `python3 -m tools.linter.install.clang_tidy` Test Plan: Imported from OSS Reviewed By: janeyx99, heitorschueroff Differential Revision: D30530216 Pulled By: Gamrix fbshipit-source-id: 2a2b8d539dcc2839e4000c13e82c207fa89bfc9f --- tools/linter/clang_tidy/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/linter/clang_tidy/__main__.py b/tools/linter/clang_tidy/__main__.py index b99c1f5366848..1846916c26f3d 100644 --- a/tools/linter/clang_tidy/__main__.py +++ b/tools/linter/clang_tidy/__main__.py @@ -184,7 +184,8 @@ def main() -> None: f"Could not find '{options.clang_tidy_exe}'\n" + "We provide a custom build of clang-tidy that has additional checks.\n" + "You can install it by running:\n" - + "$ python3 tools/linter/install/clang_tidy.py" + + "$ python3 -m tools.linter.install.clang_tidy \n" + + "from the pytorch folder" ) raise RuntimeError(msg) From 1be1c901aabd3ddcf55af3ee869e611b7f3f43b6 Mon Sep 17 00:00:00 2001 From: David Riazati Date: Wed, 25 Aug 2021 15:54:31 -0700 Subject: [PATCH 232/530] Remove render_test_results job (#63877) Summary: This removes the `render_test_results` job we had before which had been causing some confusion among devs when it failed and isn't really necessary now that we can actually render test results on the PR HUD. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63877 Reviewed By: walterddr, janeyx99 Differential Revision: D30546705 Pulled By: driazati fbshipit-source-id: 55fdafdb6f80924d941ffc15ee10787cb54f34a1 --- .github/scripts/generate_ci_workflows.py | 7 +- .github/templates/bazel_ci_workflow.yml.j2 | 68 +--------------- .github/templates/common.yml.j2 | 25 ++++++ .github/templates/linux_ci_workflow.yml.j2 | 80 ++----------------- .github/templates/windows_ci_workflow.yml.j2 | 71 +--------------- ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 78 ++++-------------- ...rated-linux-bionic-py3.8-gcc9-coverage.yml | 78 ++++-------------- ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 78 ++++-------------- ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 78 ++++-------------- .../generated-linux-xenial-py3.6-gcc5.4.yml | 78 ++++-------------- ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml | 67 +++------------- ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 78 ++++-------------- ...rated-periodic-win-vs2019-cuda11.1-py3.yml | 67 +++------------- .../generated-win-vs2019-cpu-py3.yml | 67 +++------------- .../generated-win-vs2019-cuda10.1-py3.yml | 67 +++------------- .../generated-win-vs2019-cuda11.3-py3.yml | 67 +++------------- 16 files changed, 173 insertions(+), 881 deletions(-) diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 946d8da6a29ad..f1b962521b18d 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -191,8 +191,10 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: with open(output_file_path, "w") as output_file: GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"]) - output_file.write(workflow_template.render(asdict(self))) - output_file.write("\n") + content = workflow_template.render(asdict(self)) + output_file.write(content) + if content[-1] != "\n": + output_file.write("\n") print(output_file_path) @@ -504,6 +506,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: jinja_env = jinja2.Environment( variable_start_string="!{{", loader=jinja2.FileSystemLoader(str(GITHUB_DIR.joinpath("templates"))), + undefined=jinja2.StrictUndefined, ) template_and_workflows = [ (jinja_env.get_template("linux_ci_workflow.yml.j2"), LINUX_WORKFLOWS), diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2 index 016a11bc39277..d25ffe6d8a7e5 100644 --- a/.github/templates/bazel_ci_workflow.yml.j2 +++ b/.github/templates/bazel_ci_workflow.yml.j2 @@ -44,6 +44,7 @@ on: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + !{{ common.display_ec2_information() }} - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -85,9 +86,7 @@ on: -w /var/lib/jenkins/workspace \ "${DOCKER_IMAGE}" \ sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/build.sh' - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py + !{{ common.parse_ref() }} - name: Display and upload binary build size statistics (Click Me) # temporary hack: set CIRCLE_* vars, until we update # tools/stats/print_test_stats.py to natively support GitHub Actions @@ -156,71 +155,10 @@ on: if-no-files-found: error path: test-reports-*.zip + !{{ common.upload_test_statistics(build_environment) }} - name: Clean up docker images if: always() run: | # Prune all of the docker images docker system prune -af {%- endblock %} -{% block render_test_results +%} - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [build-and-test, !{{ ciflow_config.root_job_name }}] - if: ${{ needs.build-and-test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports - path: . - - name: Unzip test reports - run: | - # Should preserve paths so reports should still be in test/test-reports - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: !{{ build_environment }}-build-and-test - CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} - CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} - CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' - run: | - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test -{%- endblock %} diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 12108f1f95f46..bf72898d04c25 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -4,3 +4,28 @@ run: | .github/scripts/display_ec2_information.sh {%- endmacro -%} + +{%- macro parse_ref() -%} + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py +{%- endmacro -%} + +{%- macro upload_test_statistics(build_environment) -%} + - name: Display and upload test statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: !{{ build_environment }}-test + CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} + CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} + CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test +{%- endmacro -%} diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 767760bf24d25..f63685295bbce 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -194,9 +194,7 @@ jobs: -w /var/lib/jenkins/workspace \ "${DOCKER_IMAGE}" \ sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py + !{{ common.parse_ref() }} - name: Display and upload binary build size statistics (Click Me) # temporary hack: set CIRCLE_* vars, until we update # tools/stats/print_test_stats.py to natively support GitHub Actions @@ -424,6 +422,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip + !{{ common.parse_ref() }} + !{{ common.upload_test_statistics(build_environment) }} - name: Hold runner for 2 hours or until ssh sessions have drained # Always hold for active ssh sessions if: always() @@ -437,76 +437,7 @@ jobs: docker system prune -af {% endblock %} {%- endif -%} -{%- if not is_libtorch %} -{% block render_test_results +%} - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, !{{ ciflow_config.root_job_name }}] - if: ${{ needs.test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - !{{ common.display_ec2_information() }} - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - # Should preserve paths so reports should still be in test/test-reports - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: !{{ build_environment }}-test - CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} - CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} - CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' - run: | - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test -{%- endblock %} -{%- endif -%} - {%- if enable_doc_jobs %} - +{%- if enable_doc_jobs %} pytorch_python_doc_build: runs-on: linux.2xlarge needs: [calculate-docker-image, build, !{{ ciflow_config.root_job_name }}] @@ -608,5 +539,4 @@ jobs: run: | # Prune all of the docker images docker system prune -af - - {%- endif -%} +{%- endif -%} diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 6756bf4720ac6..1be7b325306d5 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -275,78 +275,11 @@ jobs: path: pytorch-${{ github.run_id }}/test-reports-*.zip !{{ wait_and_kill_ssh() }} + !{{ common.parse_ref() }} + !{{ common.upload_test_statistics(build_environment) }} - name: Cleanup workspace if: always() shell: bash # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, !{{ ciflow_config.root_job_name }}] -{%- if only_build_on_pull_request %} - if: ${{ github.event_name == 'push' && (needs.test.result != 'skipped' || failure()) }} -{%- else %} - if: ${{ needs.test.result != 'skipped' || failure() }} -{%- endif %} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - # TODO: Make this into a composite step - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - !{{ common.display_ec2_information() }} - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: !{{ build_environment }}-test - CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} - CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} - CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' - run: | - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 5a4b6c6a56c78..0b3dddd3930e5 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -395,70 +395,6 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - # Prune all of the docker images - docker system prune -af - - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, ciflow_should_run] - if: ${{ needs.test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - # Should preserve paths so reports should still be in test/test-reports - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -473,5 +409,19 @@ jobs: CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index 1226715485f21..624e9d0d92c5b 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -399,70 +399,6 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - # Prune all of the docker images - docker system prune -af - - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, ciflow_should_run] - if: ${{ needs.test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - # Should preserve paths so reports should still be in test/test-reports - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -477,5 +413,19 @@ jobs: CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index 38321b1834b26..99a9f1f778f9f 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -395,70 +395,6 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - # Prune all of the docker images - docker system prune -af - - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, ciflow_should_run] - if: ${{ needs.test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - # Should preserve paths so reports should still be in test/test-reports - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -473,5 +409,19 @@ jobs: CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 2daf432ae76c8..be56b56ee715b 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -395,70 +395,6 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - # Prune all of the docker images - docker system prune -af - - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, ciflow_should_run] - if: ${{ needs.test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - # Should preserve paths so reports should still be in test/test-reports - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -473,5 +409,19 @@ jobs: CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index 3551fe9845218..c1b877c7f9c10 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -395,70 +395,6 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - # Prune all of the docker images - docker system prune -af - - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, ciflow_should_run] - if: ${{ needs.test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - # Should preserve paths so reports should still be in test/test-reports - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -473,8 +409,22 @@ jobs: CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # Prune all of the docker images + docker system prune -af pytorch_python_doc_build: runs-on: linux.2xlarge diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index 9e787e4ba3845..7ca389635bf56 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -138,6 +138,10 @@ jobs: with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -250,69 +254,24 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [build-and-test, ciflow_should_run] - if: ${{ needs.build-and-test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports - path: . - - name: Unzip test reports - run: | - # Should preserve paths so reports should still be in test/test-reports - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - name: Display and upload test statistics (Click Me) # temporary hack: set CIRCLE_* vars, until we update # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.6-gcc7-bazel-test-build-and-test + JOB_BASE_NAME: linux-xenial-py3.6-gcc7-bazel-test-test CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Clean up docker images + if: always() + run: | + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index c87397849106e..375c4b65aaf89 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -393,70 +393,6 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - # Prune all of the docker images - docker system prune -af - - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, ciflow_should_run] - if: ${{ needs.test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - # Should preserve paths so reports should still be in test/test-reports - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -471,5 +407,19 @@ jobs: CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index 2c673ccce0f43..ce4540b79cee7 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -249,64 +249,6 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, ciflow_should_run] - if: ${{ needs.test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - # TODO: Make this into a composite step - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -321,5 +263,14 @@ jobs: CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 54362c903f7d0..d868d19d0fc2c 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -233,64 +233,6 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, ciflow_should_run] - if: ${{ needs.test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - # TODO: Make this into a composite step - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -305,5 +247,14 @@ jobs: CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index c8497bd3029ee..4d4550c9ce06b 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -251,64 +251,6 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, ciflow_should_run] - if: ${{ needs.test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - # TODO: Make this into a composite step - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -323,5 +265,14 @@ jobs: CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index 205758657d9b2..c5ae48a888938 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -251,64 +251,6 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* - - # this is a separate step from test because the log files from test are too - # long: basically, GitHub tries to render all of the log files when you click - # through an action causing extreme slowdown on actions that contain too many - # logs (like test); we can always move it back to the other one, but it - # doesn't create the best experience - render_test_results: - needs: [generate-test-matrix, test, ciflow_should_run] - if: ${{ needs.test.result != 'skipped' || failure() }} - runs-on: linux.2xlarge - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }} - fail-fast: false - # TODO: Make this into a composite step - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow tools/stats/print_test_stats.py to use Git commands - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - uses: actions/download-artifact@v2 - name: Download PyTorch Test Reports - with: - name: test-reports-${{ matrix.config }} - path: . - - name: Unzip test reports - run: | - unzip -o 'test-reports-*.zip' - - name: Install dependencies - # boto3 version copied from .circleci/docker/common/install_conda.sh - run: | - pip3 install -r requirements.txt - pip3 install boto3==1.16.34 junitparser rich - - name: Output Test Results (Click Me) - run: | - python3 tools/render_junit.py test - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -323,5 +265,14 @@ jobs: CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* From 57d4c6cf424892888866ed98551f769cb5656623 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 25 Aug 2021 16:42:14 -0700 Subject: [PATCH 233/530] =?UTF-8?q?replace=20`self.assertTrue(torch.allclo?= =?UTF-8?q?se(..))`=20with=20`self.assertEqual(=E2=80=A6)`=20(#63637)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Fixes https://github.com/pytorch/pytorch/issues/63565 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63637 Reviewed By: malfet Differential Revision: D30541266 Pulled By: mruberry fbshipit-source-id: ab461949782c6908a589ea098fcfcf5c3e081ee6 --- test/custom_operator/test_custom_ops.py | 10 ++-- test/jit/test_freezing.py | 12 ++--- test/jit/test_tracer.py | 4 +- test/package/test_directory_reader.py | 2 +- test/package/test_model.py | 8 ++-- test/package/test_package_fx.py | 8 ++-- test/package/test_package_script.py | 16 +++---- .../quantization/core/test_workflow_module.py | 6 +-- test/quantization/core/test_workflow_ops.py | 22 ++++----- test/quantization/fx/test_equalize_fx.py | 10 ++-- test/quantization/fx/test_numeric_suite_fx.py | 6 +-- test/quantization/fx/test_quantize_fx.py | 2 +- test/test_autograd.py | 6 +-- test/test_bundled_images.py | 4 +- test/test_cuda.py | 14 +++--- test/test_jit.py | 6 +-- test/test_nn.py | 46 +++++++++---------- test/test_overrides.py | 11 +++-- test/test_spectral_ops.py | 2 +- torch/testing/_internal/common_jit.py | 2 +- torch/testing/_internal/jit_utils.py | 2 +- 21 files changed, 101 insertions(+), 98 deletions(-) diff --git a/test/custom_operator/test_custom_ops.py b/test/custom_operator/test_custom_ops.py index 3937abde91476..356b4932d49ac 100644 --- a/test/custom_operator/test_custom_ops.py +++ b/test/custom_operator/test_custom_ops.py @@ -44,8 +44,8 @@ def test_calling_custom_op_with_autograd(self): output.sum().backward(go, False, True) grad = torch.ones(5, 5) - self.assertTrue(torch.allclose(x.grad, y + grad)) - self.assertTrue(torch.allclose(y.grad, x + grad * 2)) + self.assertEqual(x.grad, y + grad) + self.assertEqual(y.grad, x + grad * 2) # Test with optional arg. x.grad.zero_() @@ -56,9 +56,9 @@ def test_calling_custom_op_with_autograd(self): go = torch.ones((), requires_grad=True) output.sum().backward(go, False, True) - self.assertTrue(torch.allclose(x.grad, y + grad)) - self.assertTrue(torch.allclose(y.grad, x + grad * 2)) - self.assertTrue(torch.allclose(z.grad, grad)) + self.assertEqual(x.grad, y + grad) + self.assertEqual(y.grad, x + grad * 2) + self.assertEqual(z.grad, grad) def test_calling_custom_op_with_autograd_in_nograd_mode(self): with torch.no_grad(): diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index 8e07af06b70ea..e9317b11412a9 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -1877,7 +1877,7 @@ def forward(self, x): N, C, H, W, = 10, 3, 224, 224 inp = torch.randn(N, C, H, W) self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph) - self.assertTrue(torch.allclose(model(inp), mod(inp))) + self.assertEqual(model(inp), mod(inp)) @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled") def test_pool2d_batchnorm(self): @@ -1901,7 +1901,7 @@ def test_pool2d_batchnorm(self): self.run_pass('dce', mod.graph) self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph) FileCheck().check("aten::to_dense").check_next("return").run(mod.graph) - self.assertTrue(torch.allclose(sub_model(inp), mod(inp))) + self.assertEqual(sub_model(inp), mod(inp)) @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled") def test_pool3d_batchnorm(self): @@ -1925,7 +1925,7 @@ def test_pool3d_batchnorm(self): self.run_pass('dce', mod.graph) self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph) FileCheck().check("aten::to_dense").check_next("return").run(mod.graph) - self.assertTrue(torch.allclose(sub_model(inp), mod(inp))) + self.assertEqual(sub_model(inp), mod(inp)) @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled") @skipIfNoTorchVision @@ -1964,7 +1964,7 @@ def forward(self, x): check_count("aten::to_dense", 1, exactly=True).run(mod.graph)) else: FileCheck().check_count("aten::to_dense", 1, exactly=True).check("aten::layer_norm").run(mod.graph) - self.assertTrue(torch.allclose(sub_model(param[2]), mod(param[2]), 1e-04, 1e-04)) + self.assertEqual(sub_model(param[2]), mod(param[2]), rtol=1e-04, atol=1e-04) @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled") @skipIfNoTorchVision @@ -2003,7 +2003,7 @@ def forward(self, x): inp = torch.randn(N, C, H, W) self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph) FileCheck().check_count("aten::to_dense", 1, exactly=True).run(mod.graph) - self.assertTrue(torch.allclose(sub_model(inp), mod(inp))) + self.assertEqual(sub_model(inp), mod(inp)) @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled") def test_hardswish_hardsigmoid(self): @@ -2030,7 +2030,7 @@ def test_hardswish_hardsigmoid(self): x = torch.rand(size) # `inplace=False` is intentional, otherwise we modify the input # and we aren't testing aten impls anyways - self.assertTrue(torch.allclose(aten_op(x, inplace=False), m(x).to_dense())) + self.assertEqual(aten_op(x, inplace=False), m(x).to_dense()) @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled") def test_scalar_mul(self): diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 247072fb3e94d..1d95dc8d0d8a4 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -163,13 +163,13 @@ def forward(self, x, y): eager_out = mod(*test_inputs) traced_out = traced_func(*test_inputs) self.assertNotWarn(lambda: traced_func(*test_inputs), "Shouldn't throw slicing related warn here") - self.assertTrue(torch.allclose(eager_out, traced_out)) + self.assertEqual(eager_out, traced_out) test_inputs = (torch.randint(0, 50, (50, 50)), torch.tensor(12)) eager_out = mod(*test_inputs) traced_out = traced_func(*test_inputs) self.assertNotWarn(lambda: traced_func(*test_inputs), "Shouldn't throw slicing related warn here") - self.assertTrue(torch.allclose(eager_out, traced_out)) + self.assertEqual(eager_out, traced_out) def test_typeas_trace_check(self): diff --git a/test/package/test_directory_reader.py b/test/package/test_directory_reader.py index 93968d6e1bf92..576a7f0c064cd 100644 --- a/test/package/test_directory_reader.py +++ b/test/package/test_directory_reader.py @@ -61,7 +61,7 @@ def test_loading_pickle(self): importer = PackageImporter(Path(temp_dir) / Path(filename).name) dir_mod = importer.load_pickle("model", "model.pkl") input = torch.rand(1, 3, 224, 224) - self.assertTrue(torch.allclose(dir_mod(input), resnet(input))) + self.assertEqual(dir_mod(input), resnet(input)) def test_loading_module(self): """ diff --git a/test/package/test_model.py b/test/package/test_model.py index f5e08b6bfa83c..dc67ff5d89d2e 100644 --- a/test/package/test_model.py +++ b/test/package/test_model.py @@ -49,7 +49,7 @@ def test_resnet(self): # test that it works input = torch.rand(1, 3, 224, 224) ref = resnet(input) - self.assertTrue(torch.allclose(r2(input), ref)) + self.assertEqual(r2(input), ref) # functions exist also to get at the private modules in each package torchvision = i.import_module("torchvision") @@ -81,7 +81,7 @@ def test_resnet(self): i2 = PackageImporter(f2) r3 = i2.load_pickle("model", "model.pkl") - self.assertTrue(torch.allclose(r3(input), ref)) + self.assertEqual(r3(input), ref) @skipIfNoTorchVision def test_model_save(self): @@ -159,7 +159,7 @@ def load(): r = the_model(input) results.append(r) - self.assertTrue(torch.allclose(*results)) + self.assertEqual(*results) @skipIfNoTorchVision def test_script_resnet(self): @@ -188,7 +188,7 @@ def test_script_resnet(self): loaded = torch.jit.load(f2) input = torch.rand(1, 3, 224, 224) - self.assertTrue(torch.allclose((loaded(input)), resnet(input))) + self.assertEqual(loaded(input), resnet(input)) if __name__ == "__main__": diff --git a/test/package/test_package_fx.py b/test/package/test_package_fx.py index 7f31014a8ec04..64d431c0a3e6b 100644 --- a/test/package/test_package_fx.py +++ b/test/package/test_package_fx.py @@ -36,7 +36,7 @@ def forward(self, x): pi = PackageImporter(f) loaded_traced = pi.load_pickle("model", "model.pkl") input = torch.rand(2, 3) - self.assertTrue(torch.allclose(loaded_traced(input), traced(input))) + self.assertEqual(loaded_traced(input), traced(input)) def test_package_then_fx(self): from package_a.test_module import SimpleTest @@ -52,7 +52,7 @@ def test_package_then_fx(self): loaded = pi.load_pickle("model", "model.pkl") traced = symbolic_trace(loaded) input = torch.rand(2, 3) - self.assertTrue(torch.allclose(loaded(input), traced(input))) + self.assertEqual(loaded(input), traced(input)) def test_package_fx_package(self): from package_a.test_module import SimpleTest @@ -87,7 +87,7 @@ def test_package_fx_package(self): loaded2 = pi2.load_pickle("model", "model.pkl") input = torch.rand(2, 3) - self.assertTrue(torch.allclose(loaded(input), loaded2(input))) + self.assertEqual(loaded(input), loaded2(input)) def test_package_fx_with_imports(self): import package_a.subpackage @@ -158,7 +158,7 @@ def __init__(self, root, graph, info): self.assertEqual(loaded_gm.info, "secret") input_x = torch.randn(3) - self.assertTrue(torch.allclose(loaded_gm(input_x), gm(input_x))) + self.assertEqual(loaded_gm(input_x), gm(input_x)) if __name__ == "__main__": diff --git a/test/package/test_package_script.py b/test/package/test_package_script.py index 3bbaed0501ca1..ecacd79fb6bf7 100644 --- a/test/package/test_package_script.py +++ b/test/package/test_package_script.py @@ -51,7 +51,7 @@ def test_package_interface(self): input = torch.tensor(1) - self.assertTrue(torch.allclose(scripted(input), scripted_loaded(input))) + self.assertEqual(scripted(input), scripted_loaded(input)) def test_different_package_interface(self): """Test a case where the interface defined in the package is @@ -149,7 +149,7 @@ def __init__(self, x): input = torch.rand(2, 3) loaded_script_class = diff_fake.MyScriptClass(input) orig_script_class = fake.MyScriptClass(input) - self.assertTrue(torch.allclose(loaded_script_class.bar, orig_script_class.foo)) + self.assertEqual(loaded_script_class.bar, orig_script_class.foo) def test_save_scriptmodule(self): """ @@ -506,7 +506,7 @@ def test_save_shared_tensors(self): self.assertTrue(len(file_structure.children[".data"].children) == 1) input = torch.rand(2, 3, 4) - self.assertTrue(torch.allclose(loaded_mod_1(input), mod1(input))) + self.assertEqual(loaded_mod_1(input), mod1(input)) def test_load_shared_tensors(self): """ @@ -630,7 +630,7 @@ def test_saving_and_scripting_packaged_mod(self): loaded_mod = importer_0.load_pickle("model", "model.pkl") input = torch.rand(2, 3) - self.assertTrue(torch.allclose(loaded_mod(input), orig_mod(input))) + self.assertEqual(loaded_mod(input), orig_mod(input)) scripted_mod = torch.jit.script(loaded_mod) @@ -643,7 +643,7 @@ def test_saving_and_scripting_packaged_mod(self): importer_1 = PackageImporter(buffer_1) loaded_mod_scripted = importer_1.load_pickle("res", "scripted_mod.pkl") - self.assertTrue(torch.allclose(loaded_mod_scripted(input), orig_mod(input))) + self.assertEqual(loaded_mod_scripted(input), orig_mod(input)) def test_mixing_packaged_and_inline_modules(self): """ @@ -680,7 +680,7 @@ def forward(self, input: str): loaded_imported = importer.load_pickle("model", "imported.pkl") input = torch.rand(2, 3) - self.assertTrue(torch.allclose(loaded_imported(input), imported_mod(input))) + self.assertEqual(loaded_imported(input), imported_mod(input)) self.assertEqual(loaded_inline("input"), inline_mod("input")) @skipIfNoTorchVision @@ -721,8 +721,8 @@ def a_non_torch_leaf(a, b): loaded_imported = importer.load_pickle("model", "imported.pkl") input = torch.rand(2, 3) - self.assertTrue(torch.allclose(loaded_imported(input), imported_mod(input))) - self.assertTrue(torch.allclose(loaded_inline(input), inline_mod(input))) + self.assertEqual(loaded_imported(input), imported_mod(input)) + self.assertEqual(loaded_inline(input), inline_mod(input)) def test_tensor_sharing_pickle(self): """Test that saving a ScriptModule and a separately saving a tensor diff --git a/test/quantization/core/test_workflow_module.py b/test/quantization/core/test_workflow_module.py index 2298653e878f8..b7782ecf9c1bd 100644 --- a/test/quantization/core/test_workflow_module.py +++ b/test/quantization/core/test_workflow_module.py @@ -205,11 +205,11 @@ def test_per_channel_observers(self, qdtype, qscheme, ch_axis, reduce_range): if reduce_range: ref_scales = [s * 255 / 127 for s in ref_scales] ref_zero_points = [math.floor(z / 2) for z in ref_zero_points] - self.assertTrue(torch.allclose(qparams[0], torch.tensor(ref_scales, dtype=qparams[0].dtype), atol=0.0001)) + self.assertEqual(qparams[0], torch.tensor(ref_scales, dtype=qparams[0].dtype), rtol=1e-5, atol=0.0001) if qscheme == torch.per_channel_affine_float_qparams: - self.assertTrue(torch.allclose(qparams[1], torch.tensor(ref_zero_points, dtype=qparams[1].dtype), atol=1)) + self.assertEqual(qparams[1], torch.tensor(ref_zero_points, dtype=qparams[1].dtype), rtol=1e-5, atol=1) else: - self.assertTrue(torch.allclose(qparams[1], torch.tensor(ref_zero_points, dtype=qparams[1].dtype))) + self.assertEqual(qparams[1], torch.tensor(ref_zero_points, dtype=qparams[1].dtype)) # Test for serializability diff --git a/test/quantization/core/test_workflow_ops.py b/test/quantization/core/test_workflow_ops.py index 9fcf5ac138f3e..60cd04345be85 100644 --- a/test/quantization/core/test_workflow_ops.py +++ b/test/quantization/core/test_workflow_ops.py @@ -312,13 +312,13 @@ def test_forward_per_tensor_half_precision_numerics(self): X1 = torch.randn(5, 5).to(torch.float16) Y1 = torch.fake_quantize_per_tensor_affine(X1, scale, zero, mini, maxi) Y1r = _fake_quantize_per_tensor_affine_reference(X1, scale, zero, mini, maxi) - self.assertTrue(torch.allclose(Y1, Y1r, rtol=tolerance, atol=tolerance)) + self.assertEqual(Y1, Y1r, rtol=tolerance, atol=tolerance) # to force overflow X2 = torch.tensor(2**15 + .01).to(torch.float16) Y2 = torch.fake_quantize_per_tensor_affine(X2, scale, zero, mini, maxi) Y2r = _fake_quantize_per_tensor_affine_reference(X2, scale, zero, mini, maxi) - self.assertTrue(torch.allclose(Y2, Y2r, rtol=tolerance, atol=tolerance)) + self.assertEqual(Y2, Y2r, rtol=tolerance, atol=tolerance) scale = 10 @@ -326,7 +326,7 @@ def test_forward_per_tensor_half_precision_numerics(self): X3 = torch.tensor(2**-24).to(torch.float16) Y3 = torch.fake_quantize_per_tensor_affine(X3, scale, zero, mini, maxi) Y3r = _fake_quantize_per_tensor_affine_reference(X3, scale, zero, mini, maxi) - self.assertTrue(torch.allclose(Y3, Y3r, rtol=tolerance, atol=tolerance)) + self.assertEqual(Y3, Y3r, rtol=tolerance, atol=tolerance) def _test_forward_per_tensor_cachemask_impl(self, device): float_types = (torch.float32, torch.float16, torch.float64) @@ -347,7 +347,7 @@ def _test_forward_per_tensor_cachemask_impl(self, device): X, scale, zero_point, quant_min, quant_max) Y_ref = _fake_quantize_per_tensor_affine_reference( X, scale, zero_point, quant_min, quant_max).to(device) - self.assertTrue(torch.allclose(Y_test, Y_ref, rtol=tolerance, atol=tolerance)) + self.assertEqual(Y_test, Y_ref, rtol=tolerance, atol=tolerance) self.assertTrue(Y_test.dtype == float_type) def test_forward_per_tensor_cachemask_cpu(self): @@ -380,14 +380,14 @@ def _test_backward_per_tensor_cachemask_impl(self, device): X, scale, zero_point, quant_min, quant_max) Y_ref = _fake_quantize_per_tensor_affine_reference( X, scale, zero_point, quant_min, quant_max).to(device) - self.assertTrue(torch.allclose(Y_test, Y_ref, rtol=tolerance, atol=tolerance)) + self.assertEqual(Y_test, Y_ref, rtol=tolerance, atol=tolerance) # backward pass dout = torch.rand_like(X, dtype=torch.float).to(device) dX = _fake_quantize_per_tensor_affine_grad_reference( dout, X, scale, zero_point, quant_min, quant_max) Y_test.backward(dout) - self.assertTrue(torch.allclose(dX, X.grad)) + self.assertEqual(dX, X.grad) self.assertTrue(X.grad.dtype == float_type) def test_backward_per_tensor_cachemask_cpu(self): @@ -729,14 +729,14 @@ def test_forward_per_channel_half_precision_numerics(self): X1 = torch.randn(4, 5).to(torch.float16) Y1 = torch.fake_quantize_per_channel_affine(X1, scale, zero, axis, mini, maxi) Y1r = _fake_quantize_per_channel_affine_reference(X1, scale, zero, axis, mini, maxi) - self.assertTrue(torch.allclose(Y1, Y1r, rtol=tolerance, atol=tolerance)) + self.assertEqual(Y1, Y1r, rtol=tolerance, atol=tolerance) # to force overflow X2 = torch.randn(4, 5).to(torch.float16) X2[0, 0] = 2**15 + .01 Y2 = torch.fake_quantize_per_channel_affine(X2, scale, zero, axis, mini, maxi) Y2r = _fake_quantize_per_channel_affine_reference(X2, scale, zero, axis, mini, maxi) - self.assertTrue(torch.allclose(Y2, Y2r, rtol=tolerance, atol=tolerance)) + self.assertEqual(Y2, Y2r, rtol=tolerance, atol=tolerance) scale = torch.zeros(5) + 10 @@ -745,7 +745,7 @@ def test_forward_per_channel_half_precision_numerics(self): X3[0, 0] = 2**-24 Y3 = torch.fake_quantize_per_channel_affine(X3, scale, zero, axis, mini, maxi) Y3r = _fake_quantize_per_channel_affine_reference(X3, scale, zero, axis, mini, maxi) - self.assertTrue(torch.allclose(Y3, Y3r, rtol=tolerance, atol=tolerance)) + self.assertEqual(Y3, Y3r, rtol=tolerance, atol=tolerance) def _test_learnable_forward_per_channel(self, X_base, device, scale_base, zero_point_base, axis): r"""Tests the forward path of the learnable FakeQuantizePerTensorAffine op. @@ -1160,7 +1160,7 @@ def test_fused_obs_fake_quant_backward_op(self, device) -> None: dX = _fake_quantize_per_tensor_affine_grad_reference( dout, x, x_scale, x_zero_point, 0, 255) - self.assertTrue(torch.allclose(dX, x.grad)) + self.assertEqual(dX, x.grad) self.assertTrue(x.grad.dtype == torch.float32) @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),) @@ -1206,7 +1206,7 @@ def test_fused_backward_op_fake_quant_off(self, device) -> None: dX = _fake_quantize_per_tensor_affine_grad_reference( dout, x, x_scale, x_zero_point, 0, 255) - self.assertTrue(torch.allclose(dX, x.grad)) + self.assertEqual(dX, x.grad) self.assertTrue(x.grad.dtype == torch.float32) if __name__ == '__main__': diff --git a/test/quantization/fx/test_equalize_fx.py b/test/quantization/fx/test_equalize_fx.py index 7c17d1296daac..a74b1744e7cc3 100644 --- a/test/quantization/fx/test_equalize_fx.py +++ b/test/quantization/fx/test_equalize_fx.py @@ -217,10 +217,10 @@ def test_input_weight_eq_observer(self, ndim, input_qdtype, input_qscheme, weigh ref_zero_points = -128 if weight_qdtype is torch.qint8 else 0 ref_zero_points = ref_zero_points - np.round(ref_min_weights_scaled / ref_scales) - self.assertTrue(torch.allclose(weight_qparams[0], torch.tensor( - ref_scales, dtype=weight_qparams[0].dtype), atol=0.0001)) - self.assertTrue(torch.allclose(weight_qparams[1], torch.tensor( - ref_zero_points, dtype=weight_qparams[1].dtype), atol=1)) + self.assertEqual(weight_qparams[0], torch.tensor( + ref_scales, dtype=weight_qparams[0].dtype), rtol=1e-5, atol=0.0001) + self.assertEqual(weight_qparams[1], torch.tensor( + ref_zero_points, dtype=weight_qparams[1].dtype), rtol=1e-5, atol=1) def test_input_weight_equalization_prepare(self): """ Tests that graphs created after prepare_fx is as expected @@ -783,7 +783,7 @@ def test_input_weight_equalization_results(self): prepared(x) equalized_and_quantized = convert_fx(prepared) # Check if compile equalized_and_quantized_output = equalized_and_quantized(x) - self.assertTrue(torch.allclose(quantized_output, equalized_and_quantized_output, atol=0.1)) + self.assertEqual(quantized_output, equalized_and_quantized_output, rtol=1e-5, atol=0.1) @skipIfNoFBGEMM def test_selective_equalization(self): diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py index 61062fba781e5..3e627f5e14419 100644 --- a/test/quantization/fx/test_numeric_suite_fx.py +++ b/test/quantization/fx/test_numeric_suite_fx.py @@ -1834,8 +1834,8 @@ def test_loggers_preserve_qat_numerics(self): mp_ns, mc_ns = add_loggers('fp32', mp, 'int8', mc, OutputLogger) ref_fp32_ns = mp_ns(datum) ref_int8_ns = mc_ns(datum) - self.assertTrue(torch.allclose(ref_fp32, ref_fp32_ns)) - self.assertTrue(torch.allclose(ref_int8, ref_int8_ns)) + self.assertEqual(ref_fp32, ref_fp32_ns) + self.assertEqual(ref_int8, ref_int8_ns) @skipIfNoFBGEMM def test_shadow_loggers_preserve_qat_numerics(self): @@ -1852,7 +1852,7 @@ def test_shadow_loggers_preserve_qat_numerics(self): mc_shadows_mp = add_shadow_loggers('int8', mc, 'fp32', mp, OutputLogger) ref_shadow = mc_shadows_mp(datum) - self.assertTrue(torch.allclose(ref_fp32, ref_shadow)) + self.assertEqual(ref_fp32, ref_shadow) class TestFXNumericSuiteCoreAPIsModels(FXNumericSuiteQuantizationTestCase): """ diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 1bc6b610d1662..08474d2bc1d19 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -4668,7 +4668,7 @@ def _test_conv_transpose_impl( m2q = torch.quantization.convert(m2p) q_result2 = m2q(data) # verify results match - self.assertTrue(torch.allclose(q_result1, q_result2)) + self.assertEqual(q_result1, q_result2) @unittest.skipUnless('qnnpack' in supported_qengines, "This Pytorch Build has not been built with or does not support QNNPACK") diff --git a/test/test_autograd.py b/test/test_autograd.py index 126d9230fe687..4d416459c2af4 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -2801,11 +2801,11 @@ def test_var_mean_differentiable(self): r1 = var1 * var1 * mean1 * mean1 r2 = var2 * var2 * mean2 * mean2 - self.assertTrue(torch.allclose(r1, r2, rtol=0.01, atol=0.0)) + self.assertEqual(r1, r2, rtol=0.01, atol=0.0) torch.autograd.backward(r1, grad) torch.autograd.backward(r2, grad) - self.assertTrue(torch.allclose(input1.grad, input2.grad, rtol=0.01, atol=0.0)) + self.assertEqual(input1.grad, input2.grad, rtol=0.01, atol=0.0) @slowTest @skipIfNoLapack @@ -5159,7 +5159,7 @@ def test_autograd_inplace_views_cross_dtype(self): # TODO: this is a bug! # once this is fixed, it should have the transpose removed: - # self.assertTrue(torch.allclose(non_inplace_grad, inplace_grad)) + # self.assertEqual(non_inplace_grad, inplace_grad) self.assertEqual(non_inplace_grad.T, inplace_grad) def test_autograd_multiple_views_python(self): diff --git a/test/test_bundled_images.py b/test/test_bundled_images.py index 0c95ae39c582d..7efd40178a160 100644 --- a/test/test_bundled_images.py +++ b/test/test_bundled_images.py @@ -67,7 +67,7 @@ def forward(self, arg): self.assertEqual(len(inflated), 1) self.assertEqual(len(inflated[0]), 1) self.assertEqual(raw_data.shape, decoded_data.shape) - self.assertTrue(torch.allclose(raw_data, decoded_data, atol=0.1, rtol=1e-01)) + self.assertEqual(raw_data, decoded_data, atol=0.1, rtol=1e-01) # Check if fb::image_decode_to_NCHW works as expected with open("caffe2/test/test_img/p1.jpg", "rb") as fp: @@ -76,4 +76,4 @@ def forward(self, arg): byte_tensor = torch.tensor(list(fp.read())).byte() im2_tensor = torch.ops.fb.image_decode_to_NCHW(byte_tensor, weight, bias) self.assertEqual(raw_data.shape, im2_tensor.shape) - self.assertTrue(torch.allclose(raw_data, im2_tensor, atol=0.1, rtol=1e-01)) + self.assertEqual(raw_data, im2_tensor, atol=0.1, rtol=1e-01) diff --git a/test/test_cuda.py b/test/test_cuda.py index 55bab2ee4ebcd..e90cb1703c06e 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -2036,7 +2036,7 @@ def test_grad_scaling_unscale(self, dtype=torch.float): else: self.assertEqual(found_inf, 0.0) for grad in grads: - self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7)) + self.assertEqual(grad, torch.ones_like(grad), rtol=1e-5, atol=1e-7) # When passing lists with mismatched dtypes to a raw # _amp_foreach_non_finite_check_and_unscale_ call, @@ -2044,7 +2044,7 @@ def test_grad_scaling_unscale(self, dtype=torch.float): grads = [g.clone(), g.to(dtype=torch.float16)] torch._amp_foreach_non_finite_check_and_unscale_(grads, found_inf, inv_scale) for grad in grads: - self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7)) + self.assertEqual(grad, torch.ones_like(grad), rtol=1e-5, atol=1e-7) # Passing lists with mismatched devices to a raw # _amp_foreach_non_finite_check_and_unscale_ call should raise errors. @@ -2084,7 +2084,7 @@ def perfect_storm_grads(inject_inf): # No inf was injected, ensures unscaling worked normally. self.assertTrue(sum(v.item() for v in found_inf_per_device.values()) == 0) for grad in grads: - self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7)) + self.assertEqual(grad, torch.ones_like(grad), rtol=1e-5, atol=1e-7) else: # inf was injected, ensures inf was found. self.assertTrue(sum(v.item() for v in found_inf_per_device.values()) == 1) @@ -2136,7 +2136,7 @@ def test_grad_scaling_unscale_sparse(self, device="cuda", dtype=torch.float): found_inf.zero_() found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf, False)[cur] self.assertEqual(found_inf, 0.0) - self.assertTrue(torch.allclose(p.grad.to_dense(), (s / 4).to_dense())) + self.assertEqual(p.grad.to_dense(), (s / 4).to_dense()) v = torch.FloatTensor([16., 32., float('inf')]) p.grad = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cuda", dtype=dtype) @@ -2158,7 +2158,7 @@ def test_grad_scaling_unscale_sparse(self, device="cuda", dtype=torch.float): found_inf.zero_() found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf, True)[cur] self.assertEqual(found_inf, 0.0) - self.assertTrue(torch.allclose(p.grad.to_dense(), (s.half() / 4).to_dense())) + self.assertEqual(p.grad.to_dense(), (s.half() / 4).to_dense()) # Creates fp16 sparse tensor with duplicated indices (uncoalesced). The uncoalesced representation # does not overflow in fp16, but the coalesced representation would, because 64000 + 64000 > fp16 max. @@ -2465,7 +2465,7 @@ def run(model0, model1, optimizer0, optimizer1, try_scaling_api): for c, s in zip(chain(mod_control0.parameters(), mod_control1.parameters()), chain(mod_scaling0.parameters(), mod_scaling1.parameters())): - self.assertTrue(torch.allclose(c, s, atol=1e-7)) + self.assertEqual(c, s, rtol=1e-5, atol=1e-7) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") def test_grad_scaling_multigpu(self): @@ -2534,7 +2534,7 @@ def run(model0, model1, optimizer0, optimizer1, try_scaling_api): for c, s in zip(chain(mod_control0.parameters(), mod_control1.parameters()), chain(mod_scaling0.parameters(), mod_scaling1.parameters())): - self.assertTrue(torch.allclose(c, s, atol=1e-7)) + self.assertEqual(c, s, rtol=1e-5, atol=1e-7) def test_cublas_multiple_threads_same_device(self): # Note, these parameters should be very carefully tuned diff --git a/test/test_jit.py b/test/test_jit.py index 28de1722dde47..2595411c01848 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -14954,7 +14954,7 @@ def jit_multihead_attn_forward(query, # type: Tensor attn_mask=mask)[0] # print("rel. error: ") # print(jit_out / py_out - 1) - self.assertTrue(torch.allclose(jit_out, py_out, atol=5e-4, rtol=1e-4)) + self.assertEqual(jit_out, py_out, atol=5e-4, rtol=1e-4) @unittest.skipIf(not RUN_CUDA, "no CUDA") def test_scriptmodule_multi_head_attn_cuda(self): @@ -14990,7 +14990,7 @@ def forward(self, q, k, v): None, None, None, 0.0, model.mod.out_proj.weight, model.mod.out_proj.bias)[0] - self.assertTrue(torch.allclose(jit_out, py_out, atol=5e-4, rtol=1e-4)) + self.assertEqual(jit_out, py_out, atol=5e-4, rtol=1e-4) @unittest.skipIf(not RUN_CUDA, "no CUDA") def test_scriptmodule_transformer_cuda(self): @@ -15029,7 +15029,7 @@ def forward(self, q, k): # print(jit_out/py_out-1) # print(torch.allclose(jit_out, py_out, atol=5e-4, rtol=1e-4)) - self.assertTrue(torch.allclose(jit_out, py_out, atol=5e-4, rtol=1e-4)) + self.assertEqual(jit_out, py_out, atol=5e-4, rtol=1e-4) def test_list_python_op(self): def python_list_op(lst): diff --git a/test/test_nn.py b/test/test_nn.py index 8c3541aca0716..c6fe0b28b30ac 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -229,7 +229,7 @@ def test_doubletensor_avg_pool2d(self): actual = torch.nn.functional.avg_pool2d(input[0], (i, j)) actual = actual.view(1, actual.numel()) expected = self._avg_pool2d(input, (i, j)) - self.assertTrue(torch.allclose(actual, expected, rtol=0, atol=1e-5)) + self.assertEqual(actual, expected, rtol=0, atol=1e-5) def test_avg_pool2d_with_zero_divisor(self): self.assertRaisesRegex(RuntimeError, "divisor must be not zero", @@ -244,7 +244,7 @@ def test_doubletensor_avg_pool2d_with_divisor(self): actual = F.avg_pool2d(input[0], (i, j), divisor_override=divisor) actual = actual.view(1, actual.numel()) expected = self._sum_pool2d(input, (i, j)) / divisor - self.assertTrue(torch.allclose(actual, expected, rtol=0, atol=1e-5)) + self.assertEqual(actual, expected, rtol=0, atol=1e-5) def test_doubletensor_avg_pool3d(self): h, w, d = 5, 6, 7 @@ -255,7 +255,7 @@ def test_doubletensor_avg_pool3d(self): actual = torch.nn.functional.avg_pool3d(input.unsqueeze(0), (i, j, k)) actual = actual.view(1, actual.numel()) expected = self._avg_pool3d(input, (i, j, k)) - self.assertTrue(torch.allclose(actual, expected, rtol=0, atol=1e-5)) + self.assertEqual(actual, expected, rtol=0, atol=1e-5) def test_doubletensor_avg_pool3d_with_divisor(self): h, w, d = 6, 5, 7 @@ -267,7 +267,7 @@ def test_doubletensor_avg_pool3d_with_divisor(self): actual = torch.nn.functional.avg_pool3d(input.unsqueeze(0), (i, j, k), divisor_override=divisor) actual = actual.view(1, actual.numel()) expected = self._sum_pool3d(input, (i, j, k)) / divisor - self.assertTrue(torch.allclose(actual, expected, rtol=0, atol=1e-5)) + self.assertEqual(actual, expected, rtol=0, atol=1e-5) def test_avg_pool3d_with_zero_divisor(self): self.assertRaisesRegex(RuntimeError, "divisor must be not zero", @@ -2260,7 +2260,7 @@ def forward(self, x): self.assertNotIn("weight", model._parameters) # Result should be skew-symmetric A = model.weight - self.assertTrue(torch.allclose(A, -A.T)) + self.assertEqual(A, -A.T) # Remove and check consistency parametrize.remove_parametrizations(model, "weight", leave_parametrized=False) self.assertFalse(hasattr(model, "parametrizations")) @@ -2277,7 +2277,7 @@ def forward(self, x): self.assertNotIn("weight", model._parameters) # Result should be skew-symmetric A = model.weight - self.assertTrue(torch.allclose(A, -A.T)) + self.assertEqual(A, -A.T) # Remove and check consistency parametrize.remove_parametrizations(model, "weight", leave_parametrized=False) self.assertFalse(hasattr(model, "parametrizations")) @@ -2291,7 +2291,7 @@ def forward(self, x): # Result should be orthogonal X = model.weight Id = torch.eye(X.size(0), device=X.device) - self.assertTrue(torch.allclose(X.T @ X, Id)) + self.assertEqual(X.T @ X, Id) # Structure tests self.assertTrue(hasattr(model, "parametrizations")) self.assertTrue(parametrize.is_parametrized(model)) @@ -2810,10 +2810,10 @@ def right_inverse(self, w): init_weight = model.weight.clone() parametrize.register_parametrization(model, "weight", RankOne()) # Projecting a rank 1 matrix onto the matrices of rank one does not change the matrix - self.assertTrue(torch.allclose(init_weight, model.weight)) + self.assertEqual(init_weight, model.weight) parametrize.register_parametrization(model, "weight", Double()) # The matrix now is twice the initial matrix - self.assertTrue(torch.allclose(2.0 * init_weight, model.weight)) + self.assertEqual(2.0 * init_weight, model.weight) # Multiplying by a scalar does not change the rank self.assertEqual(torch.linalg.matrix_rank(model.weight).item(), 1) @@ -11276,7 +11276,7 @@ def test_layer_norm_grads_with_create_graph_flag(self): grads1 = torch.autograd.grad(layer_norm(x).sum(), x, create_graph=False)[0] grads2 = torch.autograd.grad(layer_norm(x).sum(), x, create_graph=True)[0] - self.assertTrue(torch.allclose(grads1, grads2, rtol, atol)) + self.assertEqual(grads1, grads2, rtol=rtol, atol=atol) if TEST_CUDA: x = x.to('cuda') @@ -11285,7 +11285,7 @@ def test_layer_norm_grads_with_create_graph_flag(self): grads1 = torch.autograd.grad(layer_norm(x).sum(), x, create_graph=False)[0] grads2 = torch.autograd.grad(layer_norm(x).sum(), x, create_graph=True)[0] - self.assertTrue(torch.allclose(grads1, grads2, rtol, atol)) + self.assertEqual(grads1, grads2, rtol=rtol, atol=atol) def test_padding_list(self): # Padding can be a list, or tuple (regression test for gh-54452) @@ -11793,7 +11793,7 @@ def test_add_relu(self): relu_res = torch.relu(add_res) add_relu_res = torch._VF._add_relu(a, b) - self.assertTrue(torch.allclose(add_relu_res, relu_res)) + self.assertEqual(add_relu_res, relu_res) def test_add_relu_broadcasting(self): a = torch.rand((1, 32)) @@ -11802,7 +11802,7 @@ def test_add_relu_broadcasting(self): res = torch._VF._add_relu(a, b) broadcasted_res = torch._VF._add_relu(a, b_scalar) - self.assertTrue(torch.allclose(broadcasted_res, res)) + self.assertEqual(broadcasted_res, res) def add_test(test, decorator=None): @@ -14070,8 +14070,8 @@ def helper(n, c, h, w, kernel_size, stride=None, self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) self.assertTrue(ref_out.is_contiguous()) - self.assertTrue(torch.allclose(out, ref_out)) - self.assertTrue(torch.allclose(input.grad, ref_input.grad)) + self.assertEqual(out, ref_out) + self.assertEqual(input.grad, ref_input.grad) helper(4, 8, 8, 8, 3) helper(4, 8, 8, 8, 3, count_include_pad=False, padding=1) @@ -14200,9 +14200,9 @@ def helper(n, c, h, w, kernel_size, stride=None): self.assertTrue(ref_out.is_contiguous()) self.assertTrue(ind.is_contiguous(memory_format=torch.channels_last)) self.assertTrue(ref_ind.is_contiguous()) - self.assertTrue(torch.allclose(out, ref_out)) - self.assertTrue(torch.allclose(ind, ref_ind)) - self.assertTrue(torch.allclose(input.grad, ref_input.grad)) + self.assertEqual(out, ref_out) + self.assertEqual(ind, ref_ind) + self.assertEqual(input.grad, ref_input.grad) helper(4, 8, 8, 8, 7) helper(200, 512, 28, 28, 2) @@ -17180,7 +17180,7 @@ def test_maxpool3d_non_square_backward(self, device): shape = tuple(32 if i != dim else 256 for i in range(4)) x = torch.randn(shape, device=device, requires_grad=True) F.max_pool3d(x, kernel_size=(1, 1, 1)).sum().backward() - self.assertTrue(torch.allclose(x.grad, torch.ones_like(x.grad))) + self.assertEqual(x.grad, torch.ones_like(x.grad)) # Check that clip_grad_norm_ raises an error if the total norm of the # parameters' gradients is non-finite @@ -17672,7 +17672,7 @@ def removable_hook_2(m, input): input = torch.randn(2, 2) output = module(input) - self.assertTrue(torch.allclose(torch.sigmoid(input), output)) + self.assertEqual(torch.sigmoid(input), output) # make sure hook removal is successful self.assertFalse(handle.id in handle.hooks_dict_ref()) @@ -17707,7 +17707,7 @@ def removable_hook_2(m, input, output): input = torch.randn(2, 2) output = module(input) - self.assertTrue(torch.allclose(torch.sigmoid(input), output)) + self.assertEqual(torch.sigmoid(input), output) # make sure hook removal is successful self.assertFalse(handle.id in handle.hooks_dict_ref()) @@ -18001,7 +18001,7 @@ def hook_function(module, input): module = TestModule() module.register_forward_pre_hook(hook_function) output = module(torch.zeros(2, 2)) - self.assertTrue(torch.allclose(output, torch.ones(2, 2))) + self.assertEqual(output, torch.ones(2, 2)) def test_lazy_forward_hook(self): """ @@ -18024,7 +18024,7 @@ def hook_function(module, input, output): module = TestModule() module.register_forward_hook(hook_function) output = module(torch.zeros(2, 2)) - self.assertTrue(torch.allclose(output, torch.ones(2, 2))) + self.assertEqual(output, torch.ones(2, 2)) @suppress_warnings def test_lazy_conv1d(self): diff --git a/test/test_overrides.py b/test/test_overrides.py index 41044376a40f7..a6252374364c2 100644 --- a/test/test_overrides.py +++ b/test/test_overrides.py @@ -762,6 +762,9 @@ def __bool__(self): def __int__(self): return self.__torch_function__(torch.Tensor.__int__, (Wrapper,), (self,)) + def __len__(self): + return len(self._data) + # unwrap inputs if necessary def unwrap(v): @@ -782,15 +785,15 @@ class TestEinsumOverride(TestCase): def test_wrapper(self): x = Wrapper(torch.randn(5)) y = Wrapper(torch.randn(4)) - self.assertTrue(torch.allclose(torch.einsum('i,j->ij', x, y), - torch.ger(x, y))) + self.assertEqual(torch.einsum('i,j->ij', x, y)._data, + torch.ger(x, y)._data) # in the old einsum interface, `operands` is a list a = Wrapper(torch.randn(2, 3)) b = Wrapper(torch.randn(5, 3, 7)) c = Wrapper(torch.randn(2, 7)) - self.assertTrue(torch.allclose(torch.einsum('ik,jkl,il->ij', [a, b, c]), - torch.nn.functional.bilinear(a, c, b))) + self.assertEqual(torch.einsum('ik,jkl,il->ij', [a, b, c])._data, + torch.nn.functional.bilinear(a, c, b)._data) class TestGradCheckOverride(TestCase): "Test that wrappers work with gradcheck." diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index e7e4832ad5631..fdc8c01417fd1 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -658,7 +658,7 @@ def test_fftshift_frequencies(self, device, dtype): # Test fftshift sorts the fftfreq output shifted = torch.fft.fftshift(x) - self.assertTrue(torch.allclose(shifted, shifted.sort().values)) + self.assertEqual(shifted, shifted.sort().values) self.assertEqual(sorted_fft_freqs, shifted) # And ifftshift is the inverse diff --git a/torch/testing/_internal/common_jit.py b/torch/testing/_internal/common_jit.py index 80cb4d0331889..3b62ced36f391 100644 --- a/torch/testing/_internal/common_jit.py +++ b/torch/testing/_internal/common_jit.py @@ -136,7 +136,7 @@ def get_recording_tensors(args): for g2, g2_test in zip(grads2, grads2_test): if g2 is None and g2_test is None: continue - self.assertTrue(torch.allclose(g2, g2_test, atol=5e-4, rtol=1e-4)) + self.assertEqual(g2, g2_test, atol=5e-4, rtol=1e-4) class JitCommonTestCase(TestCase): def createFunctionFromGraph(self, trace): diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py index 7f9fb976934d3..50d8dac23867b 100644 --- a/torch/testing/_internal/jit_utils.py +++ b/torch/testing/_internal/jit_utils.py @@ -594,7 +594,7 @@ def input_reduce(input, fn, acc): for g2, g2_ge in zip(grads2, grads2_ge): if g2 is None and g2_ge is None: continue - self.assertTrue(torch.allclose(g2, g2_ge, atol=8e-4, rtol=8e-4)) + self.assertEqual(g2, g2_ge, atol=8e-4, rtol=8e-4) return ge From 01b8162d00bfb0844a3f8a165d49907e51a16add Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 25 Aug 2021 17:50:48 -0700 Subject: [PATCH 234/530] Back out "Revert D30384746: [fx2trt] Add a test for quantized resnet18" (#63973) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63973 Original commit changeset: b93235323e22 Test Plan: buck run mode/opt -c python.package_style=inplace caffe2:fx2trt_quantized_resnet_test Reviewed By: 842974287 Differential Revision: D30546036 fbshipit-source-id: 2c8302456f072d04da00cf9ad97aa8304bc5e43e --- .../fx2trt/converters/acc_ops_converters.py | 15 +-- .../fx2trt/example/quantized_resnet_test.py | 117 ++++++++++++++++++ 2 files changed, 121 insertions(+), 11 deletions(-) create mode 100644 torch/fx/experimental/fx2trt/example/quantized_resnet_test.py diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index 566359bf2af0d..33a817d4ccdb5 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -1300,15 +1300,11 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name): if q_zero_point != 0: raise RuntimeError(f"Only support zero_point == 0, get {q_zero_point}") - # temporarily set q_scale to 1 to make sure the q_scale is different - # for quantize and dequantize to avoid the error - # TODO: follow up with nvidia TensorRT team to repro and fix the problem - q_scale = 1 scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([float(q_scale)], dtype=np.float32))) scale_layer.name = input_val.name + ".quant.scale" scale = scale_layer.get_output(0) - assert trt.__version__ > "8.0", "Explicit quantize op is only supported in " - "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ + # assert trt.__version__ > "8.0", "Explicit quantize op is only supported in " + # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ layer = network.add_quantize(input=input_val, scale=scale) layer.axis = 0 layer.name = input_val.name + ".quant" @@ -1316,9 +1312,6 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name): @tensorrt_converter(acc_ops.dequantize) def acc_ops_dequantize(network, target, args, kwargs, name): - """ - Currently just a no-op. - """ input_val = kwargs["input"] if not isinstance(input_val, trt.tensorrt.ITensor): @@ -1339,8 +1332,8 @@ def acc_ops_dequantize(network, target, args, kwargs, name): scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([q_scale], dtype=np.float32))) scale_layer.name = input_val.name + ".dequant.scale" scale = scale_layer.get_output(0) - assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in " - "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ + # assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in " + # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__ layer = network.add_dequantize(input=input_val, scale=scale) layer.name = input_val.name + ".dequant" layer.axis = 0 diff --git a/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py b/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py new file mode 100644 index 0000000000000..140f4fb50bd76 --- /dev/null +++ b/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py @@ -0,0 +1,117 @@ +import torch.fx +import torchvision.models as models +from torch.fx.experimental.fx2trt.fx2trt import TRTInterpreter, InputTensorSpec, TRTModule +from torch.quantization.quantize_fx import prepare_fx, convert_fx +import torch.fx.experimental.fx_acc.acc_tracer as acc_tracer +import copy +from torch.fx.passes import shape_prop +from torch.fx.experimental.normalize import NormalizeArgs + +rn18 = models.resnet18().eval() + +def build_fp16_trt(rn18): + rn18 = copy.deepcopy(rn18) + rn18 = acc_tracer.trace(rn18, [torch.randn(1, 3, 224, 224)]) # type: ignore[attr-defined] + interp = TRTInterpreter(rn18, [InputTensorSpec(torch.Size([3, 224, 224]), torch.float, has_batch_dim=False)]) + engine, input_names, output_names = interp.run(fp16_mode=True) + return TRTModule(engine, input_names, output_names) + +@torch.no_grad() +def build_int8_trt(rn18): + rn18 = copy.deepcopy(rn18) + data = torch.randn(1, 3, 224, 224) + # data = torch.randn(1, 64, 10, 10) + # TensorRT only supports symmetric quantization + qconfig = torch.quantization.QConfig( + activation=torch.quantization.observer.HistogramObserver.with_args( + qscheme=torch.per_tensor_symmetric, dtype=torch.qint8 + ), + weight=torch.quantization.default_weight_observer + ) + prepared = prepare_fx(rn18, {"": qconfig}) + for _ in range(10): + prepared(data) + quantized_rn18 = convert_fx(prepared, is_reference=True) + print("quantized model:", quantized_rn18) + + quantized_rn18 = acc_tracer.trace(quantized_rn18, [data]) # type: ignore[attr-defined] + interp = TRTInterpreter(quantized_rn18, [InputTensorSpec(data.shape[1:], torch.float, has_batch_dim=False)]) + engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True) + return TRTModule(engine, input_names, output_names) + +@torch.no_grad() +def build_int8_trt_implicit_quant(rn18): + rn18 = copy.deepcopy(rn18) + data = torch.randn(1, 3, 224, 224) + # Quantization + qconfig = torch.quantization.QConfig( + activation=torch.quantization.observer.HistogramObserver.with_args( + qscheme=torch.per_tensor_symmetric, reduce_range=True + ), + weight=torch.quantization.default_per_channel_weight_observer + ) + prepared = prepare_fx(rn18, {"": qconfig}) + for _ in range(10): + prepared(data) + quantized_rn18 = convert_fx(prepared, is_reference=True) + + # Build trt int8 model + traced_rn18 = torch.fx.symbolic_trace(quantized_rn18) + shape_prop.ShapeProp(traced_rn18).propagate(data) + traced_rn18 = NormalizeArgs(traced_rn18).transform() + interp = TRTInterpreter(traced_rn18, InputTensorSpec.from_tensors([data])) + engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True, strict_type_constraints=True) + trt_mod = TRTModule(engine, input_names, output_names) + return trt_mod + +class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 3, 3, padding=1) + + def forward(self, x): + out = self.conv(x) + # out = torch.nn.functional.relu(out) + out += x + out += out + out = torch.nn.functional.relu(out) + return out + +# rn18 = M().eval() +# rn18 = rn18.layer1 +int8_trt = build_int8_trt(rn18) +implicit_int8_trt = build_int8_trt_implicit_quant(rn18) +fp16_trt = build_fp16_trt(rn18) +x = torch.randn(5, 3, 224, 224, device="cuda") +rn18 = rn18.cuda() + +import time +NITER = 100 + +torch.cuda.synchronize() +s = time.time() +for _ in range(NITER): + fp16_trt(x) + torch.cuda.synchronize() +print('trt fp16 time (ms/iter)', (time.time() - s) / NITER * 1000) + +torch.cuda.synchronize() +s = time.time() +for _ in range(NITER): + int8_trt(x) + torch.cuda.synchronize() +print('trt int8 time (ms/iter)', (time.time() - s) / NITER * 1000) + +torch.cuda.synchronize() +s = time.time() +for _ in range(NITER): + implicit_int8_trt(x) + torch.cuda.synchronize() +print('trt implicit int8 time (ms/iter)', (time.time() - s) / NITER * 1000) + +torch.cuda.synchronize() +s = time.time() +for _ in range(NITER): + rn18(x) + torch.cuda.synchronize() +print('PyTorch time (ms/iter)', (time.time() - s) / NITER * 1000) From f4bc28990f6edcaf6bfc9e9737a70ea0be514198 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Wed, 25 Aug 2021 18:17:10 -0700 Subject: [PATCH 235/530] Compute cuda reduction buffer size in elements (#63969) Summary: Resubmit of https://github.com/pytorch/pytorch/issues/63885 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63969 Reviewed By: mruberry Differential Revision: D30549423 Pulled By: ngimel fbshipit-source-id: b16d25030d44ced789c125a333d72b02a8f45067 --- aten/src/ATen/native/cuda/Reduce.cuh | 3 ++- test/test_reductions.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 8c423061a79f6..b4600454f467d 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -919,10 +919,11 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter // when accumulation in output is not possible. if (!can_accumulate_in_output && !can_use_32bit_indexing) { - int64_t output_memory_size = 1; + int64_t output_memory_size = iter.element_size(0); for (int dim = 0; dim < iter.ndim(); dim++) { output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), sizeof(out_scalar_t), (char*) iter.data_ptr(0), diff --git a/test/test_reductions.py b/test/test_reductions.py index 1497ed6ad419d..c1da0f0816c5a 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -1788,7 +1788,7 @@ def run_test(input_): run_test(torch.zeros(64, 61, dtype=dtype, device=device)) run_test(torch.zeros(64, 1, dtype=dtype, device=device)) - @slowTest + @onlyCUDA def test_argminmax_large_axis(self, device): # Regression test for gh-32863 x = torch.zeros(2**31, device=device, dtype=torch.int8) From 2ea2711501fd00c108c4b7cd87bc952bc9204cbb Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Wed, 25 Aug 2021 20:09:12 -0700 Subject: [PATCH 236/530] Make frozen symbol name customizable in torch deploy. (#63817) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63817 ghstack-source-id: 136699671 Test Plan: eyes Reviewed By: wconstab Differential Revision: D29571559 fbshipit-source-id: 8e3caa4932ef8d7c8559f264f0e9bb5474ad2237 --- torch/csrc/deploy/interpreter/freeze.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/torch/csrc/deploy/interpreter/freeze.py b/torch/csrc/deploy/interpreter/freeze.py index 24fa709cb01ac..31531746ed1b2 100644 --- a/torch/csrc/deploy/interpreter/freeze.py +++ b/torch/csrc/deploy/interpreter/freeze.py @@ -35,17 +35,13 @@ """ -MAIN_PREFIX = """ +MAIN_PREFIX_TEMPLATE = """ // Compiled standard library modules. These should be appended to the existing // `PyImport_FrozenModules` that ships with CPython. -struct _frozen _PyImport_FrozenModules_torch[] = { +struct _frozen {}[] = {{ """ -FAKE_PREFIX = """ -// Compiled standard library modules. These should be appended to the existing -// `PyImport_FrozenModules` that ships with CPython. -struct _frozen _PyImport_FrozenModules[] = { -""" +FAKE_PREFIX = MAIN_PREFIX_TEMPLATE.format("_PyImport_FrozenModules") MAIN_SUFFIX = """\ {0, 0, 0} /* sentinel */ @@ -133,7 +129,7 @@ def write_bytecode(self, install_root): for f in bytecode_files: f.close() - def write_main(self, install_root, oss): + def write_main(self, install_root, oss, symbol_name): """ Write the `main.c` file containing a table enumerating all the frozen modules. @@ -143,7 +139,7 @@ def write_main(self, install_root, oss): for m in self.frozen_modules: outfp.write(f"extern unsigned char {m.c_name}[];\n") - outfp.write(MAIN_PREFIX) + outfp.write(MAIN_PREFIX_TEMPLATE.format(symbol_name)) for m in self.frozen_modules: outfp.write(f'\t{{"{m.module_name}", {m.c_name}, {m.size}}},\n') outfp.write(MAIN_SUFFIX) @@ -246,6 +242,11 @@ def compile_file(self, path: Path, top_package_path: Path): parser.add_argument("--verbose", action="store_true", help="Print debug logs") parser.add_argument("--install_dir", help="Root directory for all output files") parser.add_argument("--oss", action="store_true", help="If it's OSS build, add a fake _PyImport_FrozenModules") +parser.add_argument( + "--symbol_name", + help="The name of the frozen module array symbol to generate", + default="_PyImport_FrozenModules_torch", +) args = parser.parse_args() @@ -264,4 +265,4 @@ def compile_file(self, path: Path, top_package_path: Path): f.compile_path(path, path) f.write_bytecode(args.install_dir) -f.write_main(args.install_dir, args.oss) +f.write_main(args.install_dir, args.oss, args.symbol_name) From 124ae597fb7a371b39ff771779442017f7817d6a Mon Sep 17 00:00:00 2001 From: Zafar Takhirov Date: Wed, 25 Aug 2021 20:37:56 -0700 Subject: [PATCH 237/530] [quant] Fixing the conversion of the quantizable RNN (#63879) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63879 Quantizable RNN had a bug, where the `from_observed` was an instance method, instead of a class method. This caused the `tq.convert` to fail. This fixes the issue by making the `from_observed` a classmethod. The tests were passing before because the unittests were not using the custom module path, but a conventional `from_float`, which is also supported. Test Plan: `buck test mode/dev //caffe2/test:quantization -- test_custom_module_lstm` ``` buck test mode/dev //caffe2/test:quantization -- test_custom_module_lstm Parsing buck files: finished in 0.5 sec Downloaded 0/2 artifacts, 0.00 bytes, 100.0% cache miss (for updated rules) Building: finished in 9.2 sec (100%) 12622/12622 jobs, 2/12622 updated Total time: 9.7 sec More details at https://www.internalfb.com/intern/buck/build/0d87b987-649f-4d06-b0e2-97b5077 Tpx test run coordinator for Facebook. See https://fburl.com/tpx for details. Running with tpx session id: cb99305f-65c9-438b-a99f-a0a2a3089778 Trace available for this run at /tmp/tpx-20210824-115652.540356/trace.log Started reporting to test run: https://www.internalfb.com/intern/testinfra/testrun/5066549645030046 ✓ ListingSuccess: caffe2/test:quantization - main (12.550) ✓ Pass: caffe2/test:quantization - test_custom_module_lstm (quantization.core.test_quantized_op.TestQuantizedOps) (174.867) Summary Pass: 1 ListingSuccess: 1 If you need help understanding your runs, please follow the wiki: https://fburl.com/posting_in_tpx_users Finished test run: https://www.internalfb.com/intern/testinfra/testrun/5066549645030046 ``` Reviewed By: jerryzh168, mtl67 Differential Revision: D30520473 fbshipit-source-id: bc5d0b5bb079fd146e2614dd42526fc7d4d4f3c6 --- test/quantization/core/test_quantized_op.py | 6 +++++- torch/nn/quantizable/modules/rnn.py | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index 6c94586d3101e..18212671aabaa 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -2414,6 +2414,9 @@ def test_custom_module_lstm(self): custom_module_config = { 'float_to_observed_custom_module_class': { torch.nn.LSTM: torch.nn.quantizable.LSTM + }, + 'observed_to_quantized_custom_module_class': { + torch.nn.quantizable.LSTM: torch.nn.quantizable.LSTM } } @@ -2460,7 +2463,8 @@ def test_custom_module_lstm(self): self.assertEqual(y_ref, y) # Quantize - lstm_quantized = torch.quantization.convert(lstm_prepared) + lstm_quantized = torch.quantization.convert( + lstm_prepared, convert_custom_config_dict=custom_module_config) qy = lstm_quantized(qx) snr = _snr(y, qy) diff --git a/torch/nn/quantizable/modules/rnn.py b/torch/nn/quantizable/modules/rnn.py index 32e813ce94eae..bdfd7788533b5 100644 --- a/torch/nn/quantizable/modules/rnn.py +++ b/torch/nn/quantizable/modules/rnn.py @@ -407,6 +407,7 @@ def from_float(cls, other, qconfig=None): observed = torch.quantization.prepare(observed, inplace=True) return observed - def from_observed(self, other): - return torch.quantization.convert(self, inplace=False, + @classmethod + def from_observed(cls, other): + return torch.quantization.convert(other, inplace=False, remove_qconfig=True) From 80a61142e48f21e93e388359768e31aa687a9378 Mon Sep 17 00:00:00 2001 From: Zeina Migeed Date: Wed, 25 Aug 2021 20:42:14 -0700 Subject: [PATCH 238/530] inference for algebraic expressions (#63822) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63822 Infer algebraic expressions and add it to our symbolic inferencer. Works for conv2D and can be extended to other operations. Test Plan: Imported from OSS Reviewed By: jamesr66a Differential Revision: D30518469 Pulled By: migeed-z fbshipit-source-id: b92dfa40b2d834a535177da42b851701b8f7178c --- test/fx/test_gradual_type.py | 151 ++++++++++++------ .../experimental/graph_gradual_typechecker.py | 114 ++++++++++++- torch/fx/experimental/unify_refinements.py | 18 ++- 3 files changed, 226 insertions(+), 57 deletions(-) diff --git a/test/fx/test_gradual_type.py b/test/fx/test_gradual_type.py index 203cf6b7e306e..37e8db1e5cf4b 100644 --- a/test/fx/test_gradual_type.py +++ b/test/fx/test_gradual_type.py @@ -9,7 +9,14 @@ from torch.fx.experimental.rewriter import RewritingTracer from torch.fx import GraphModule from torch.fx.passes.shape_prop import ShapeProp -from torch.fx.experimental.unification import Var + +try: + import sympy + HAS_SYMPY = True +except ImportError: + HAS_SYMPY = False +skipIfNoSympy = unittest.skipIf(not HAS_SYMPY, "no sympy") + try: from torchvision.models import resnet50 @@ -19,13 +26,6 @@ HAS_TORCHVISION = False skipIfNoTorchVision = unittest.skipIf(not HAS_TORCHVISION, "no torchvision") -# try: -# from unification import Var -# HAS_UNIFICATION = True -# except ImportError: -# HAS_UNIFICATION = False -# skipIfNoUnification = unittest.skipIf(not HAS_UNIFICATION, "no unification") - def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): """3x3 convolution with padding""" return torch.nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, @@ -270,10 +270,9 @@ def forward(self, x: TensorType((1, 2, 3, 5))): def test_type_check_batch_norm_2D(self): class BasicBlock(torch.nn.Module): - def __init__(self, inplanes, planes, norm_layer=None): + def __init__(self, inplanes, planes): super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = torch.nn.BatchNorm2d + norm_layer = torch.nn.BatchNorm2d self.bn1 = norm_layer(planes) def forward(self, x: TensorType((2, 2, 5, 4))): @@ -302,10 +301,9 @@ def forward(self, x: TensorType((2, 2, 5, 4))): def test_type_check_batch_norm_2D_false(self): class BasicBlock(torch.nn.Module): - def __init__(self, inplanes, planes, norm_layer=None): + def __init__(self, inplanes, planes): super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = torch.nn.BatchNorm2d + norm_layer = torch.nn.BatchNorm2d self.bn1 = norm_layer(planes) def forward(self, x: TensorType((2, 2, 5))): @@ -325,10 +323,9 @@ def forward(self, x: TensorType((2, 2, 5))): def test_type_check_batch_norm_2D_broadcast(self): class BasicBlock(torch.nn.Module): - def __init__(self, inplanes, planes, norm_layer=None): + def __init__(self, inplanes, planes): super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = torch.nn.BatchNorm2d + norm_layer = torch.nn.BatchNorm2d self.bn1 = norm_layer(planes) def forward(self, x: Dyn): @@ -363,10 +360,9 @@ def forward(self, x: Dyn): def test_type_check_conv2D(self): class BasicBlock(torch.nn.Module): - def __init__(self, inplanes, planes, stride=1, norm_layer=None): + def __init__(self, inplanes, planes, stride=1): super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = torch.nn.BatchNorm2d + norm_layer = torch.nn.BatchNorm2d self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = norm_layer(planes) @@ -394,10 +390,9 @@ def forward(self, x: Dyn): def test_type_check_conv2D_2(self): class BasicBlock(torch.nn.Module): - def __init__(self, inplanes, planes, stride=1, norm_layer=None): + def __init__(self, inplanes, planes, stride=1): super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = torch.nn.BatchNorm2d + norm_layer = torch.nn.BatchNorm2d self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = norm_layer(planes) @@ -434,7 +429,6 @@ def forward(self, x: TensorType((5, 2, 3, 4))): with self.assertRaises(TypeError): tc.type_check() - def test_type_check_conv2D_2_fully_static(self): annotation_list = [(1, 2, 3, 5), (2, 5, 6, 9), (10, 15, 13, 14), (10, Dyn, 13, 14), (Dyn, Dyn, Dyn, 3)] @@ -522,16 +516,14 @@ def forward(self, x): assert n.type == TensorType(output_types[i]) assert is_consistent(n.type, TensorType(b.size())) - def test_typecheck_basicblock(self): class BasicBlock(torch.nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, - base_width=64, dilation=1, norm_layer=None): + base_width=64, dilation=1): super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = torch.nn.BatchNorm2d + norm_layer = torch.nn.BatchNorm2d if groups != 1 or base_width != 64: raise ValueError('BasicBlock only supports groups=1 and base_width=64') if dilation > 1: @@ -643,7 +635,6 @@ def forward(self, x: TensorType((1, Dyn, 3, 5, Dyn))): if n.op == 'output': assert n.type == TensorType((1, Dyn, 5, Dyn)) - def test_type_check_flatten3(self): class M(torch.nn.Module): def forward(self, x: TensorType((2, 3, 4, 5))): @@ -661,7 +652,6 @@ def forward(self, x: TensorType((2, 3, 4, 5))): c = r.constraints assert c == [Equality(2, 2)] - def test_type_typechecl_maxpool2d_3dinput(self): class BasicBlock(torch.nn.Module): @@ -770,7 +760,6 @@ def forward(self, x): assert n.type == TensorType(output_types[i]) assert is_consistent(n.type, TensorType(b.size())) - def test_flatten_fully_static(self): annotation_list = [Dyn, TensorType((2, 5, 6, 9)), TensorType((10, 15, 13, 14)), TensorType((10, Dyn, 13, 14)), TensorType((Dyn, Dyn, Dyn, 10))] @@ -816,6 +805,7 @@ def forward(self, x): if n.op == 'output': assert is_consistent(n.type, TensorType(b.size())) + @skipIfNoSympy @skipIfNoTorchVision def test_resnet50(self): gm_run = symbolic_trace(resnet50()) @@ -859,14 +849,13 @@ def test_resnet50(self): batch_sizes.add(n.type.__args__[0]) assert (len(batch_sizes) == 1) - + @skipIfNoSympy def test_type_check_batch_norm_symbolic(self): class BasicBlock(torch.nn.Module): - def __init__(self, inplanes, planes, norm_layer=None): + def __init__(self, inplanes, planes): super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = torch.nn.BatchNorm2d + norm_layer = torch.nn.BatchNorm2d self.bn1 = norm_layer(planes) def forward(self, x: Dyn): @@ -884,15 +873,15 @@ def forward(self, x: Dyn): infer_symbolic_types(traced) - - my_types = iter([TensorType[(2, 2, Var(7), 4)], - TensorType[(2, 2, Var(7), 4)], - TensorType[(2, 2, Var(7), 4)], - TensorType[(2, 2, Var(7), 4)]]) + my_types = iter([TensorType[(2, 2, sympy.symbols('~7'), 4)], + TensorType[(2, 2, sympy.symbols('~7'), 4)], + TensorType[(2, 2, sympy.symbols('~7'), 4)], + TensorType[(2, 2, sympy.symbols('~7'), 4)]]) for n in graph.nodes: assert n.type == next(my_types) + @skipIfNoSympy def test_symbolic_add_with_broadcast(self): class M(torch.nn.Module): def forward(self, x: TensorType((1, 2, 3, Dyn)), y: TensorType((2, 3, 4))): @@ -911,16 +900,17 @@ def forward(self, x: TensorType((1, 2, 3, Dyn)), y: TensorType((2, 3, 4))): infer_symbolic_types(symbolic_traced) - expected_ph_types = [TensorType((1, 2, 3, Var(0))), + expected_ph_types = [TensorType((1, 2, 3, sympy.symbols('~0'))), TensorType((2, 3, 4)), - TensorType((1, 2, 3, Var(1))), - TensorType((1, 2, 3, Var(1)))] + TensorType((1, 2, 3, sympy.symbols('~1'))), + TensorType((1, 2, 3, sympy.symbols('~1')))] expected_iter = iter(expected_ph_types) + for n in symbolic_traced.graph.nodes: assert n.type == next(expected_iter) - + @skipIfNoSympy def test_symbolic_add_with_broadcast_2(self): class M(torch.nn.Module): def forward(self, x: TensorType((1, 2)), y: TensorType((Dyn, 2))): @@ -934,13 +924,80 @@ def forward(self, x: TensorType((1, 2)), y: TensorType((Dyn, 2))): r.refine() expected_ph_types = [TensorType((1, 2)), - TensorType((Var(1), 2)), - TensorType((Var(1), 2)), - TensorType((Var(1), 2))] + TensorType((sympy.symbols('~1'), 2)), + TensorType((sympy.symbols('~1'), 2)), + TensorType((sympy.symbols('~1'), 2))] expected_iter = iter(expected_ph_types) for n in symbolic_traced.graph.nodes: assert n.type == next(expected_iter) + @skipIfNoSympy + def test_type_check_conv2D_types(self): + class BasicBlock(torch.nn.Module): + def __init__(self, inplanes, planes, stride=1): + super(BasicBlock, self).__init__() + norm_layer = torch.nn.BatchNorm2d + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + + def forward(self, x: Dyn): + identity = x + out: TensorType((2, 2, Dyn, 4)) = self.conv1(x) + out += identity + return out + + B = BasicBlock(2, 2) + ast_rewriter = RewritingTracer() + graph = ast_rewriter.trace(B) + traced = GraphModule(ast_rewriter.root, graph, "gm") + tc = GraphTypeChecker({}, traced) + tc.type_check() + infer_symbolic_types(traced) + + for n in traced.graph.nodes: + if n.op == 'call_module': + assert isinstance(n.type.__args__[2], sympy.floor) + assert isinstance(n.type.__args__[3], sympy.floor) + + @skipIfNoSympy + def test_type_check_symbolic_inferenceconv2D_maxpool2d_flatten(self): + + class BasicBlock(torch.nn.Module): + def __init__(self): + super(BasicBlock, self).__init__() + + self.conv1 = torch.nn.Conv2d(3, 6, 5) + self.pool = torch.nn.MaxPool2d(2, 2) + self.conv2 = torch.nn.Conv2d(6, 16, 5) + self.fc1 = torch.nn.Linear(5, 120) + self.pool2 = torch.nn.AdaptiveAvgPool2d((6, 7)) + + def forward(self, x : TensorType((4, 3, Dyn, Dyn))): + out = self.conv1(x) + out = self.pool(out) + out = self.conv2(out) + out = self.pool(out) + out = self.fc1(out) + out = self.pool2(out) + out = torch.flatten(out, 1) + return out + + B = BasicBlock() + ast_rewriter = RewritingTracer() + traced = symbolic_trace(B) + tc = GraphTypeChecker({}, traced) + tc.type_check() + infer_symbolic_types(traced) + + for n in traced.graph.nodes: + if n.target == 'conv1': + assert n.type == TensorType((4, 6, sympy.floor((sympy.symbols('~0') - 4)), + sympy.floor((sympy.symbols('~1') - 4)))) + + elif n.target == 'conv2': + assert n.type == TensorType((4, 16, sympy.floor((sympy.symbols('~4') - 4)), + sympy.floor((sympy.symbols('~5') - 4)))) + if __name__ == '__main__': unittest.main() diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py index 6e05f918e810e..a54e52151f858 100644 --- a/torch/fx/experimental/graph_gradual_typechecker.py +++ b/torch/fx/experimental/graph_gradual_typechecker.py @@ -9,12 +9,18 @@ from torch.fx.experimental.refinement_types import Equality import itertools - from torch.fx.experimental.unification import Var # type: ignore[attr-defined] +try: + import sympy # type: ignore[import] + HAS_SYMPY = True +except ImportError: + HAS_SYMPY = False + _INFERENCE_RULES: Dict[Target, Callable] = {} _REFINEMENT_RULES: Dict[Target, Callable] = {} +_RULES: Dict[Target, Callable] = {} def expand_to_tensor_dim(t, n): @@ -84,6 +90,13 @@ def register(fn): return fn return register +def register_algebraic_expressions_inference_rule(call_target): + def register(fn): + if call_target in _RULES: + raise RuntimeError('Rule already registered for {call_target}!') + _RULES[call_target] = fn + return fn + return register @register_inference_rule(torch.add) @register_inference_rule(operator.add) @@ -258,10 +271,12 @@ def calculate_out_dimension(d_in, module_instance, index): dilation = (module_instance.dilation, module_instance.dilation) \ if isinstance(module_instance.dilation, int) else module_instance.dilation + DIMENSION_TYPES = (int, sympy.Symbol) if HAS_SYMPY else (int,) + if d_in == Dyn: return Dyn - elif isinstance(d_in, int): + elif isinstance(d_in, DIMENSION_TYPES): n = d_in + 2 * padding[index] - \ dilation[index] * \ (kernel_size[index] - 1) - 1 @@ -269,7 +284,7 @@ def calculate_out_dimension(d_in, module_instance, index): return (n // stride[0]) + 1 else: - raise TypeError(f'{d_in} in {module_instance} must be a number or Dyn') + raise TypeError(f'{d_in} in {module_instance} must be a number or Dyn. Received {type(d_in)}') def get_greatest_upper_bound(type1, type2): @@ -552,8 +567,17 @@ def get_node_type(a): @register_refinement_rule(Conv2d) +def conv_refinement_rule(n: Node): + res = [] + assert isinstance(n.args[0], Node) + arg_type = n.args[0].type + if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType): + res = [Equality(arg_type.__args__[0], n.type.__args__[0])] + return res + + @register_refinement_rule(torch.nn.Linear) -def first_one(n: Node): +def linear_refinement_rule(n: Node): res = [] assert isinstance(n.args[0], Node) arg_type = n.args[0].type @@ -564,7 +588,6 @@ def first_one(n: Node): # todo needs review for addition. Is this constraint correct? @register_refinement_rule(BatchNorm2d) @register_refinement_rule(torch.nn.ReLU) -@register_refinement_rule(torch.nn.AdaptiveAvgPool2d) def all_eq(n: Node): res = [] assert isinstance(n.args[0], Node) @@ -575,6 +598,18 @@ def all_eq(n: Node): res = [Equality(args1[i], args2[i]) for i in range(len(args1))] return res + +@register_refinement_rule(torch.nn.AdaptiveAvgPool2d) +def first_two__eq(n: Node): + res = [] + assert isinstance(n.args[0], Node) + arg_type = n.args[0].type + if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType): + args1 = arg_type.__args__ + args2 = n.type.__args__ + res = [Equality(args1[0], args2[0]), Equality(args1[1], args2[1])] + return res + @register_refinement_rule(torch.add) @register_refinement_rule(operator.add) def add_eq(n: Node): @@ -636,6 +671,20 @@ def flatten_refinement_rule(n: Node): eq_const.append(Equality(t1, t2)) return eq_const + +@register_algebraic_expressions_inference_rule(Conv2d) +def conv_rule(n: Node, module_instance): + assert isinstance(n.args[0], Node) + arg_type = n.args[0].type + if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType): + w_in = arg_type.__args__[3] + h_in = arg_type.__args__[2] + h_out = calculate_out_dimension(h_in, module_instance, 0) + w_out = calculate_out_dimension(w_in, module_instance, 1) + new_type = TensorType((n.type.__args__[0], n.type.__args__[1], h_out, w_out)) + n.type = new_type + return new_type + class Refine: """ Symbolic shape inference. @@ -658,6 +707,15 @@ def refine(self): self.refine_node(n) return True + def symbolic_relations(self): + """ + Infers algebraic relations + """ + graph = self.traced.graph + for n in graph.nodes: + self.infer_symbolic_relations(n) + return True + def replace_dyn_with_fresh_var(self, typ): """ Replace all unknown types with fresh type variables. @@ -675,6 +733,26 @@ def replace_dyn_with_fresh_var(self, typ): else: return typ + + def convert_to_sympy_symbols(self, typ): + """ + Replace all unknown types with fresh type variables. + """ + if HAS_SYMPY: + if isinstance(typ, Var): + return sympy.symbols(str(typ)) + elif isinstance(typ, TensorType): + new_args = [self.convert_to_sympy_symbols(a) for a in typ.__args__] + return TensorType(tuple(new_args)) + elif isinstance(typ, list): + return [self.convert_to_sympy_symbols(t) for t in typ] + elif isinstance(typ, tuple): + return (self.convert_to_sympy_symbols(t) for t in typ) + else: + return typ + else: + return typ + def refine_node(self, n: Node): """ Returns a list of equality constraints for @@ -710,6 +788,32 @@ def get_node_type(a): else: pass + def infer_symbolic_relations(self, n: Node): + if HAS_SYMPY: + n.type = self.convert_to_sympy_symbols(n.type) + if n.op == 'call_function': + if n.target in _RULES: + return _RULES[n.target](n) + else: + pass + + if n.op == 'call_module': + module_instance = self.traced.get_submodule(n.target) + if type(module_instance) in _RULES: + return _RULES[type(module_instance)](n, module_instance) + else: + pass + + if n.op == 'output': + def get_node_type(a): + return a.type + n.type = torch.fx.node.map_arg(n.args[0], get_node_type) + return n.type + + else: + pass + else: + pass def get_parameter(traced, target: str): """ diff --git a/torch/fx/experimental/unify_refinements.py b/torch/fx/experimental/unify_refinements.py index 5074377ebf2dc..532d2784fb49a 100644 --- a/torch/fx/experimental/unify_refinements.py +++ b/torch/fx/experimental/unify_refinements.py @@ -2,11 +2,10 @@ from torch.fx.tensor_type import TensorType from torch.fx.experimental.unification import Var, unify # type: ignore[attr-defined] + def infer_symbolic_types_single_pass(traced): """ - Generate constraints over types, - solve constraints with unification, - apply solution back to the types + Calls our symbolic inferencer once. """ r = Refine(traced) r.refine() @@ -20,8 +19,17 @@ def infer_symbolic_types(traced): to infer all the information such as the case for braodcasting. """ - infer_symbolic_types_single_pass(traced) - infer_symbolic_types_single_pass(traced) + r = Refine(traced) + r.refine() + mgu = unify_eq(r.constraints) + substitute_all_types(traced.graph, mgu) + + r = Refine(traced) + r.refine() + mgu = unify_eq(r.constraints) + substitute_all_types(traced.graph, mgu) + + r.symbolic_relations() def convert_eq(list_of_eq): """ From 49c8fbc92f70d6d78e02e2b7944de59d9348db37 Mon Sep 17 00:00:00 2001 From: nikithamalgi Date: Wed, 25 Aug 2021 21:47:50 -0700 Subject: [PATCH 239/530] Clean up related to type refinements (#62444) Summary: Creates a helper function to refine the types into a torchScript compatible format in the monkeytype config for profile directed typing Pull Request resolved: https://github.com/pytorch/pytorch/pull/62444 Reviewed By: malfet Differential Revision: D30548159 Pulled By: nikithamalgifb fbshipit-source-id: 7c09ce5f5e043d069313b87112837d7e226ade1f --- test/jit/test_pdt.py | 41 ------------------ torch/jit/_monkeytype_config.py | 74 ++++++++++++++++----------------- torch/jit/frontend.py | 4 +- 3 files changed, 37 insertions(+), 82 deletions(-) diff --git a/test/jit/test_pdt.py b/test/jit/test_pdt.py index b04a66e5dfcd9..57cd74faf432b 100644 --- a/test/jit/test_pdt.py +++ b/test/jit/test_pdt.py @@ -454,44 +454,3 @@ def test_none(a) -> Any: scripted_fn = torch.jit._script_pdt(test_none, example_inputs=[(None, ), (torch.Tensor(1), )]) self.assertEqual(scripted_fn(torch.ones(1), ), test_none(torch.ones(1), )) - - class TestForwardWithNoneType(torch.nn.Module): - def forward(self, a): - count = 0 - for i, val in enumerate(a): - if val is None: - count += 1 - return count - - make_global(TestForwardWithNoneType) - pdt_model = TestForwardWithNoneType() - - # Test List[Optional[float]] as input - scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[([None, ], ), ([2.9, ], )]) - self.assertEqual(scripted_model([2.8, 6.7, 3.8, None, ]), pdt_model([2.8, 6.7, 3.8, None, ])) - - # Test Tuple[Optional[int]] as input - scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[((5.1, ), ), ((None, ), ), ]) - self.assertEqual(scripted_model((6.2, None, 10.6, 80.1, None, )), pdt_model((6.2, None, 10.6, 80.1, None, ))) - - # Test List[Optional[int]] as input - scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[([None, ], ), ([2, ], )]) - self.assertEqual(scripted_model([2, None, 6, 8, ]), pdt_model([2, None, 6, 8, ])) - - # Test Tuple[Optional[int]] as input - scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[((None, ), ), ((5, ), )]) - self.assertEqual(scripted_model((2, None, 6, 8)), pdt_model((2, None, 6, 8, ))) - - # Test Tuple[Optional[float]] as input - scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[((None, ), ), ((5, ), )]) - self.assertEqual(scripted_model((2, None, 6, 8)), pdt_model((2, None, 6, 8, ))) - - # Test Tuple[Optional[torch.Tensor]] as input - scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[(((torch.ones(1), ), (None, ), ), )]) - self.assertEqual(scripted_model((torch.ones(1), torch.ones(1), None)), - pdt_model((torch.ones(1), torch.ones(1), None))) - - # Test List[Optional[torch.Tensor]] as input - scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[([None, ], ), ([torch.ones(1), ], )]) - self.assertEqual(scripted_model([torch.ones(1), torch.ones(1), None]), - pdt_model([torch.ones(1), torch.ones(1), None])) diff --git a/torch/jit/_monkeytype_config.py b/torch/jit/_monkeytype_config.py index b5a698eca7006..f0e4613e82fd1 100644 --- a/torch/jit/_monkeytype_config.py +++ b/torch/jit/_monkeytype_config.py @@ -1,7 +1,6 @@ import inspect import typing import pathlib -import torch from typing import Optional, Iterable, List, Dict from collections import defaultdict from types import CodeType @@ -16,25 +15,38 @@ except ImportError: _IS_MONKEYTYPE_INSTALLED = False -def get_optional_of_element_type(types: str): +def get_type(type): + """ + Helper function which converts the given type to a torchScript acceptable format. + """ + if isinstance(type, str): + return type + elif inspect.getmodule(type) == typing: + # If the type is a type imported from typing + # like Tuple, List, Dict then replace `typing.` + # with a null string. This needs to be done since + # typing.List is not accepted by TorchScript. + type_to_string = str(type) + return type_to_string.replace(type.__module__ + '.', '') + elif type.__module__.startswith('torch'): + # If the type is a subtype of torch module, then TorchScript expects a fully qualified name + # for the type which is obtained by combining the module name and type name. + return type.__module__ + '.' + type.__name__ + else: + # For all other types use the name for the type. + return type.__name__ + +def get_optional_of_element_type(types): """ Helper function to extracts the type of the element to be annotated to Optional from the list of consolidated types and returns `Optional[element type]`. - TODO: To remove this check once Union support lands. """ - elements = types.split(",") - elem_type = elements[0] if 'NoneType' in elements[1] else elements[1] - - # If the type is from typing module, then extract the element type - start = elem_type.find("[") - end = elem_type.rfind("]") - if start != -1 and end != -1: - return elem_type[:start + 1] + 'Optional[' + elem_type[start + 1: end] + ']]' - - # Else return Optional[element type] - if elem_type == 'Tensor': - elem_type = 'torch.Tensor' + elem_type = types[1] if type(None) == types[0] else types[0] + elem_type = get_type(elem_type) + + # Optional type is internally converted to Union[type, NoneType], which + # is not supported yet in TorchScript. Hence, representing the optional type as string. return 'Optional[' + elem_type + ']' def get_qualified_name(func): @@ -88,30 +100,15 @@ def consolidate_types(self, qualified_name: str) -> Dict: # then consolidate the type to `Any` and replace the entry # by type `Any`. for arg, types in all_args.items(): - _all_type = " " - for _type in types: - # If the type is a type imported from typing - # like Tuple, List, Dict then replace "typing." - # with a null string. - if inspect.getmodule(_type) == typing: - _type_to_string = str(_type) - _all_type += _type_to_string.replace('typing.', '') + ',' - elif _type is torch.nn.parameter.Parameter: - # Check if the type is torch.nn.parameter.Parameter, - # use the entire quaalified name `torch.nn.parameter.Parameter` - # for type - _all_type += 'torch.nn.parameter.Parameter' + ',' - else: - _all_type += _type.__name__ + ',' - _all_type = _all_type.lstrip(" ") # Remove any trailing spaces - - if len(types) == 2 and 'NoneType' in _all_type: + types = list(types) + type_length = len(types) + if type_length == 2 and type(None) in types: # TODO: To remove this check once Union suppport in TorchScript lands. - all_args[arg] = {get_optional_of_element_type(_all_type)} - elif len(types) > 1: - all_args[arg] = {'Any'} - else: - all_args[arg] = {_all_type[:-1]} + all_args[arg] = get_optional_of_element_type(types) + elif type_length > 1: + all_args[arg] = 'Any' + elif type_length == 1: + all_args[arg] = get_type(types[0]) return all_args def get_args_types(self, qualified_name: str) -> Dict: @@ -157,7 +154,6 @@ def jit_code_filter(code: CodeType) -> bool: The custom CodeFilter is required while scripting a FX Traced forward calls. FX Traced forward calls have `code.co_filename` start with '<' which is used to exclude tracing of stdlib and site-packages in the default code filter. - Since we need all forward calls to be traced, this custom code filter checks for code.co_name to be 'forward' and enables tracing for all such calls. The code filter is similar to default code filter for monkeytype and diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index b0228b132980a..0928106f3ba49 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -337,9 +337,9 @@ def build_param_list(ctx, py_args, self_name, pdt_arg_types=None): raise NotSupportedError(ctx_range, _vararg_kwarg_err) # List of Tuple of args and type as inferred by profile directed typing - arg_and_types = [(arg, next(iter(pdt_arg_types[arg.arg])) if pdt_arg_types and bool(pdt_arg_types[arg.arg]) else None) + arg_and_types = [(arg, pdt_arg_types[arg.arg] if pdt_arg_types and bool(pdt_arg_types[arg.arg]) else None) for arg in py_args.args] - arg_and_types_kwonlyargs = [(arg, next(iter(pdt_arg_types[arg.arg])) if pdt_arg_types and bool(pdt_arg_types[arg.arg]) + arg_and_types_kwonlyargs = [(arg, pdt_arg_types[arg.arg] if pdt_arg_types and bool(pdt_arg_types[arg.arg]) else None) for arg in py_args.kwonlyargs] result = [build_param(ctx, arg, self_name, kwarg_only=False, pdt_arg_type=arg_type) From b1154cc7741fa7ad4f075272347ff587ebf168f7 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 25 Aug 2021 22:04:44 -0700 Subject: [PATCH 240/530] enable equal_nan for complex values in isclose (#63571) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63571 Test Plan: Imported from OSS Reviewed By: malfet, ngimel Differential Revision: D30560127 Pulled By: mruberry fbshipit-source-id: 8958121ca24e7c139d869607903aebbe87bc0740 --- aten/src/ATen/native/TensorCompare.cpp | 6 ++---- test/test_testing.py | 30 ++++++++++++++++++++------ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index 90a57d1d30c94..3f69cab48b090 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -108,8 +108,6 @@ bool allclose(const Tensor& self, const Tensor& other, double rtol, double atol, // https://github.com/numpy/numpy/issues/15959 is resolved Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol, bool equal_nan) { TORCH_CHECK(self.scalar_type() == other.scalar_type(), self.scalar_type(), " did not match ", other.scalar_type()); - TORCH_CHECK(!(self.is_complex() && equal_nan), - "isclose with equal_nan=True is not supported for complex inputs."); TORCH_CHECK(!(self.is_quantized() || other.is_quantized()), "isclose is not supported for quantized inputs."); @@ -121,8 +119,8 @@ Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol // Computes equality closeness Tensor close = self == other; - if (equal_nan && self.is_floating_point()) { - close.__ior__((self != self).__iand__(other != other)); + if (equal_nan && (self.is_floating_point() || self.is_complex())) { + close.__ior__(self.isnan().__iand__(other.isnan())); } // In case of zero tolerances the closeness inequality degenerates to an equality check. diff --git a/test/test_testing.py b/test/test_testing.py index d59290b36c27b..7e67569bb4799 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -335,8 +335,6 @@ def test_isclose_comparetensors_float(self, device, dtype): self._comparetensors_helper(tests, device, dtype, True) - # torch.close with equal_nan=True is not implemented for complex inputs - # see https://github.com/numpy/numpy/issues/15959 # Note: compareTensor will compare the real and imaginary parts of a # complex tensors separately, unlike isclose. @dtypes(torch.complex64, torch.complex128) @@ -416,13 +414,20 @@ def test_isclose_comparetensors_complex(self, device, dtype): # equal_nan = True tests tests = ( (complex(1, 1), complex(1, float('nan')), False), - (complex(float('nan'), 1), complex(1, float('nan')), False), + (complex(1, 1), complex(float('nan'), 1), False), (complex(float('nan'), 1), complex(float('nan'), 1), True), + (complex(float('nan'), 1), complex(1, float('nan')), True), + (complex(float('nan'), float('nan')), complex(float('nan'), float('nan')), True), ) + self._isclose_helper(tests, device, dtype, True) - with self.assertRaises(RuntimeError): - self._isclose_helper(tests, device, dtype, True) - + tests = ( + (complex(1, 1), complex(1, float('nan')), False), + (complex(1, 1), complex(float('nan'), 1), False), + (complex(float('nan'), 1), complex(float('nan'), 1), True), + (complex(float('nan'), 1), complex(1, float('nan')), False), + (complex(float('nan'), float('nan')), complex(float('nan'), float('nan')), True), + ) self._comparetensors_helper(tests, device, dtype, True) # Tests that isclose with rtol or atol values less than zero throws a @@ -449,6 +454,19 @@ def test_isclose_equality_shortcut(self): self.assertFalse(torch.isclose(a, b, rtol=0, atol=0)) + @dtypes(torch.float16, torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_isclose_nan_equality_shortcut(self, device, dtype): + if dtype.is_floating_point: + a = b = torch.nan + else: + a = complex(torch.nan, 0) + b = complex(0, torch.nan) + + expected = True + tests = [(a, b, expected)] + + self._isclose_helper(tests, device, dtype, equal_nan=True, rtol=0, atol=0) + @dtypes(torch.bool, torch.long, torch.float, torch.cfloat) def test_make_tensor(self, device, dtype): def check(size, low, high, requires_grad, noncontiguous): From b629ea4620c5707bfbf3640acb9c3c0f966c585d Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Wed, 25 Aug 2021 22:49:22 -0700 Subject: [PATCH 241/530] Update persons_of_interest.rst (#63907) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/63907 Reviewed By: jspisak Differential Revision: D30534972 Pulled By: dzhulgakov fbshipit-source-id: ba726fc53e292a362c387cc8b5f7776ca2a2544c --- docs/source/community/persons_of_interest.rst | 158 ++++++++++++------ 1 file changed, 103 insertions(+), 55 deletions(-) diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst index c220ae80806e8..b1d4954a65768 100644 --- a/docs/source/community/persons_of_interest.rst +++ b/docs/source/community/persons_of_interest.rst @@ -4,50 +4,47 @@ PyTorch Governance | Persons of Interest General Maintainers ------------------- -- Adam Paszke (`apaszke `__) - Soumith Chintala (`soumith `__) - Edward Yang (`ezyang `__) - Greg Chanan (`gchanan `__) - Dmytro Dzhulgakov (`dzhulgakov `__) -- (sunsetting) Sam Gross - (`colesbury `__) +- (emeritus) Sam Gross (`colesbury `__) +- (emeritus) Adam Paszke (`apaszke `__) Module-level maintainers ------------------------ -torch.* -~~~~~~~ - -- Greg Chanan (`gchanan `__) -- Soumith Chintala (`soumith `__) -- [linear algebra] Vishwak Srinivasan (`vishwakftw `__) - torch.nn ~~~~~~~~ -- Adam Paszke (`apaszke `__) - Greg Chanan (`gchanan `__) - Soumith Chintala (`soumith `__) -- Sam Gross (`colesbury `__) +- Joel Schlosser (`jbschlosser `__) +- (emeritus) Sam Gross (`colesbury `__) +- (emeritus) Adam Paszke (`apaszke `__) torch.optim ~~~~~~~~~~~ -- Vincent Quenneville-Belair (`vincentqb `__) - Soumith Chintala (`soumith `__) +- Ilqar Ramazanli (`iramazanli `__) +- (emeritus) Vincent Quenneville-Belair (`vincentqb `__) -Autograd Engine -~~~~~~~~~~~~~~~ +torch.autograd +~~~~~~~~~~~~~~ - Edward Yang (`ezyang `__) - Alban Desmaison (`alband `__) -- Adam Paszke (`apaszke `__) +- (emeritus) Adam Paszke (`apaszke `__) -JIT -~~~ +JIT / TorchScript / FX +~~~~~~~~~~~~~~~~~~~~~~ -- Zach Devito (`zdevito `__) - Michael Suo (`suo `__) +- Yanan Cao (`gmagogsfm `__) +- James Reed (`jamesr66a `__) +- (emeritus) Zach Devito (`zdevito `__) + Distributions & RNG ~~~~~~~~~~~~~~~~~~~ @@ -60,39 +57,55 @@ Distributions & RNG Distributed ~~~~~~~~~~~ -- Pieter Noordhuis (`pietern `__) - Shen Li (`mrshenli `__) -- (proposed) Pritam Damania - (`pritamdamania87 `__) +- Pritam Damania (`pritamdamania87 `__) +- (emeritus) Pieter Noordhuis (`pietern `__) Multiprocessing and DataLoaders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Vitaly Fedyunin (`VitalyFedyunin `__) - Simon Wang (`SsnL `__) -- Adam Paszke (`apaszke `__) +- (emeritus) Adam Paszke (`apaszke `__) + +torch.linalg / Linear Algebra +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Mike Ruberry (`mruberry `__) +- Vishwak Srinivasan (`vishwakftw `__) +- Ivan Yashchuk (`IvanYashchuk `__) + +torch.fft +~~~~~~~~~ + +- Mike Ruberry (`mruberry `__) +- Peter Bell (`peterbell10 `__) + CPU Performance / SIMD ~~~~~~~~~~~~~~~~~~~~~~ -- Xiaoqiang Zheng (`zheng-xq `__) - Vitaly Fedyunin (`VitalyFedyunin `__) -- Sam Gross (`colesbury `__) -- (sunsetting) Christian Puhrsch (`cpuhrsch `__) -- [threading] Ilia Cherniavskii (`ilia-cher `__) +- (emeritus) Xiaoqiang Zheng (`zheng-xq `__) +- (emeritus) Sam Gross (`colesbury `__) +- (emeritus) Christian Puhrsch (`cpuhrsch `__) +- (emeritus) Ilia Cherniavskii (`ilia-cher `__) CUDA ~~~~ - Natalia Gimelshein (`ngimel `__) - Edward Yang (`ezyang `__) -- Xiaoqiang Zheng (`zheng-xq `__) +- Piotr Bialecki (`ptrblck `__) +- (emeritus) Xiaoqiang Zheng (`zheng-xq `__) MKLDNN ~~~~~~ -- Junjie Bai (`bddppq `__) -- Yinghai Lu (`yinghai `__) +- Vitaly Fedyunin (`VitalyFedyunin `__) +- Jianhui Li (`Jianhui-Li `__) +- (emeritus) Junjie Bai (`bddppq `__) +- (emeritus) Yinghai Lu (`yinghai `__) AMD/ROCm/HIP ~~~~~~~~~~~~ @@ -100,39 +113,66 @@ AMD/ROCm/HIP - Peng Sun (`sunway513 `__) - Jithun Nair (`jithunnair-amd `__) - Jeff Daily (`jeffdaily `__) +- (emeritus) Junjie Bai (`bddppq `__) Build + CI ~~~~~~~~~~ -- Will Feng (`yf225 `__) -- Edward Yang (`ezyang `__) -- Soumith Chintala (`soumith `__) -- Karl Ostmo (`kostmo `__) -- Hong Xu (`xuhdev `__) +- Nikita Shulga (`malfet `__) +- Eli Uriegas (`seemethere `__) +- Zhuojie Zhou (`zhouzhuojie `__) +- (emeritus) Edward Yang (`ezyang `__) +- (emeritus) Karl Ostmo (`kostmo `__) -Benchmarks -~~~~~~~~~~ +Performance Tools +~~~~~~~~~~~~~~~~~ -- Mingzhe Li (`mingzhe09088 `__) +- Victor Bittorf (`bitfort `__) +- Gisle Dankel (`gdankel `__) +- Taylor Robie (`robieta `__) +- Xu Zhao (`xuzhao9 `__) +- Geeta Chauhan (`chauhang `__) +- (emeritus) Natalia Gimelshein (`ngimel `__) +- (emeritus) Mingzhe Li (`mingzhe09088 `__) C++ API ~~~~~~~ -- Will Feng (`yf225 `__) +- Joel Schlosser (`jbschlosser `__) +- (emeritus) Will Feng (`yf225 `__) C10 utils and operator dispatch ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- Sebastian Messmer (`smessmer `__) +- Brian Hirsh (`bdhirsh `__) +- Edward Yang (`ezyang `__) - Dmytro Dzhulgakov (`dzhulgakov `__) +- (emeritus) Sebastian Messmer (`smessmer `__) ONNX <-> PyTorch ~~~~~~~~~~~~~~~~ - -- Lu Fang (`houseroad `__) -- Lara Haidar (`lara-hdr `__) -- Spandan Tiwari (`spandantiwari `__) +- Negin Raoof (`neginraoof `__) +- Gary Miguel (`garymm `__) - Bowen Bao (`BowenBao `__) +- (emeritus) Lu Fang (`houseroad `__) +- (emeritus) Lara Haidar (`lara-hdr `__) +- (emeritus) Spandan Tiwari (`spandantiwari `__) + +Mobile / Edge +~~~~~~~~~~~~~ +- David Reiss (`dreiss `__) +- Raziel Guevara (`raziel `__) +- Linbin Yu (`linbinyu `__) +- Ivan Kobzarev (`IvanKobzarev `__) +- Tao Xu (`xta0 `__) + +Model Compression & Optimization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- Raghuraman Krishnamoorthi (`raghuramank100 `__) +- Jerry Zhang (`jerryzh168 `__) +- Zafar Takhirov (`z-a-f `__) +- Supriya Rao (`supriyar `__) + Windows ~~~~~~~ @@ -152,31 +192,39 @@ Library-level maintainers XLA ~~~ -- Ailing Zhang (`ailzhang `__) +- Jack Cao (`JackCaoG `__) +- Daniel Sohn (`jysohn23 `__) +- Zach Cain (`zcain117 `__) +- Brian Hirsch (`bdhirsh `__) - Gregory Chanan (`gchanan `__) -- Davide Libenzi (`dlibenzi `__) -- Alex Suhan (`asuhan `__) +- (emeritus) Ailing Zhang (`ailzhang `__) +- (emeritus) Davide Libenzi (`dlibenzi `__) +- (emeritus) Alex Suhan (`asuhan `__) TorchServe ~~~~~~~~~~ -- Geeta Chauhan (`chauhang `__) -- Manoj Rao (`mycpuorg `__) -- Vamshi Dantu (`vdantu `__) -- Dhanasekar Karuppasamy (`dhanainme `__) +- Geeta Chauhan (`chauhang `__) +- Manoj Rao (`mycpuorg `__) +- Vamshi Dantu (`vdantu `__) +- Dhanasekar Karuppasamy (`dhanainme `__) TorchVision ~~~~~~~~~~~ -- Francisco Massa (`fmassa `__) +- Francisco Massa (`fmassa `__) +- Vasilis Vryniotis (`datumbox `__) TorchText ~~~~~~~~~ -- Guanheng George Zhang (`zhangguanheng66 `__) -- Christian Puhrsch (`cpuhrsch `__) +- Parmeet Singh Bhatia (`parmeet `__) +- Steven Liu (`hudeven `__) +- (emeritus) Guanheng George Zhang (`zhangguanheng66 `__) +- (emeritus) Christian Puhrsch (`cpuhrsch `__) TorchAudio ~~~~~~~~~~ -- Vincent QB (`vincentqb `__) +- Moto Hira (`mthrok `__) +- (emeritus) Vincent QB (`vincentqb `__) From 9d95d485679392774532d4c79a73b9c11b665e1b Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Wed, 25 Aug 2021 22:56:33 -0700 Subject: [PATCH 242/530] (torch.distributed) Add torch.distributed.is_torchelastic_launched() util method + make init_method=tcp:// compatible with torchelastic (#63910) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63910 Addresses the current issue that `init_method=tcp://` is not compatible with `torch.distributed.run` and `torch.distributed.launch`. When running with a training script that initializes the process group with `init_method=tcp://localhost:$port` as such: ``` $ python -u -m torch.distributed.run --max_restarts 0 --nproc_per_node 1 --nnodes 1 --master_addr $(hostname) --master_port 6000 ~/tmp/test.py ``` An `Address in use` error is raised since the training script tries to create a TCPStore on port 6000, which is already taken since the elastic agent is already running a TCPStore on that port. For details see: https://github.com/pytorch/pytorch/issues/63874. This change does a couple of things: 1. Adds `is_torchelastic_launched()` check function that users can use in the training scripts to see whether the script is launched via torchelastic. 1. Update the `torch.distributed` docs page to include the new `is_torchelastic_launched()` function. 1. Makes `init_method=tcp://` torchelastic compatible by modifying `_tcp_rendezvous_handler` in `torch.distributed.rendezvous` (this is NOT the elastic rendezvous, it is the old rendezvous module which is slotted for deprecation in future releases) to check `is_torchelastic_launched()` AND `torchelastic_use_agent_store()` and if so, only create TCPStore clients (no daemons, not even for rank 0). 1. Adds a bunch of unittests to cover the different code paths NOTE: the issue mentions that we should fail-fast with an assertion on `init_method!=env://` when `is_torchelastic_launched()` is `True`. There are three registered init_methods in pytorch: env://, tcp://, file://. Since this diff makes tcp:// compatible with torchelastic and I've validated that file is compatible with torchelastic. There is no need to add assertions. I did update the docs to point out that env:// is the RECOMMENDED init_method. We should probably deprecate the other init_methods in the future but this is out of scope for this issue. Test Plan: Unittests. Reviewed By: cbalioglu Differential Revision: D30529984 fbshipit-source-id: 267aea6d4dad73eb14a2680ac921f210ff547cc5 --- docs/source/distributed.rst | 2 + .../launcher/bin/test_script_init_method.py | 76 ++++++++++++ .../test_script_is_torchelastic_launched.py | 42 +++++++ test/distributed/launcher/run_test.py | 117 ++++++++++++++++++ test/distributed/test_launcher.py | 6 +- torch/_C/_distributed_c10d.pyi | 3 +- torch/distributed/distributed_c10d.py | 46 ++++--- torch/distributed/launch.py | 10 +- torch/distributed/rendezvous.py | 93 +++++++++----- torch/distributed/run.py | 1 + 10 files changed, 342 insertions(+), 54 deletions(-) create mode 100755 test/distributed/launcher/bin/test_script_init_method.py create mode 100755 test/distributed/launcher/bin/test_script_is_torchelastic_launched.py diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index 0f4e051bbf4db..c5cd727fa7ea0 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -180,6 +180,8 @@ joined. .. autofunction:: is_nccl_available +.. autofunction:: is_torchelastic_launched + -------------------------------------------------------------------------------- Currently three initialization methods are supported: diff --git a/test/distributed/launcher/bin/test_script_init_method.py b/test/distributed/launcher/bin/test_script_init_method.py new file mode 100755 index 0000000000000..299839c40759b --- /dev/null +++ b/test/distributed/launcher/bin/test_script_init_method.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +import torch.distributed as dist +import torch.nn.functional as F + + +def parse_args(): + parser = argparse.ArgumentParser(description="test script") + + parser.add_argument( + "--init_method", + type=str, + required=True, + help="init_method to pass to `dist.init_process_group()` (e.g. env://)", + ) + parser.add_argument( + "--world_size", + type=int, + default=os.getenv("WORLD_SIZE", -1), + help="world_size to pass to `dist.init_process_group()`", + ) + parser.add_argument( + "--rank", + type=int, + default=os.getenv("RANK", -1), + help="rank to pass to `dist.init_process_group()`", + ) + + return parser.parse_args() + + +def main(): + args = parse_args() + + dist.init_process_group( + backend="gloo", + init_method=args.init_method, + world_size=args.world_size, + rank=args.rank, + ) + + rank = dist.get_rank() + world_size = dist.get_world_size() + + # one hot (by rank) tensor of size world_size + # example: + # rank 0, world_size 4 => [1, 0, 0, 0] + # rank 1, world_size 4 => [0, 1, 0, 0] + # ... + t = F.one_hot(torch.tensor(rank), num_classes=world_size) + + # after all_reduce t = tensor.ones(size=world_size) + dist.all_reduce(t) + + # adding all elements in t should equal world_size + derived_world_size = torch.sum(t).item() + if derived_world_size != world_size: + raise RuntimeError( + f"Wrong world size derived. Expected: {world_size}, Got: {derived_world_size}" + ) + + print("Done") + + +if __name__ == "__main__": + main() diff --git a/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py b/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py new file mode 100755 index 0000000000000..fa9729c757b64 --- /dev/null +++ b/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +This is a test script that launches as part of the test cases in +run_test.py, to validate the correctness of +the method ``torch.distributed.is_torchelastic_launched()``. To do so, +we run this script with and without torchelastic and validate that the +boolean value written to the out_file is indeed what we expect (e.g. +should be False when not launched with torchelastic, True when launched with) +The script itself is not a test case hence no assertions are made in this script. + +see: - test/distributed/launcher/run_test.py#test_is_torchelastic_launched() + - test/distributed/launcher/run_test.py#test_is_not_torchelastic_launched() +""" +import argparse + +import torch.distributed as dist + + +def parse_args(): + parser = argparse.ArgumentParser(description="test script") + parser.add_argument( + "--out_file", + help="file to write indicating whether this script was launched with torchelastic", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + with open(args.out_file, "w") as out: + out.write(f"{dist.is_torchelastic_launched()}") + + +if __name__ == "__main__": + main() diff --git a/test/distributed/launcher/run_test.py b/test/distributed/launcher/run_test.py index 079fea792ed02..4ed824c036390 100644 --- a/test/distributed/launcher/run_test.py +++ b/test/distributed/launcher/run_test.py @@ -7,8 +7,10 @@ # LICENSE file in the root directory of this source tree. import multiprocessing as mp import os +import runpy import shutil import subprocess +import sys import tempfile import unittest import uuid @@ -21,6 +23,7 @@ from torch.distributed.elastic.multiprocessing.errors import ChildFailedError from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer from torch.distributed.elastic.utils import get_socket_with_port +from torch.distributed.elastic.utils.distributed import get_free_port from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, sandcastle_skip_if, @@ -475,3 +478,117 @@ def test_launch_shutdown(self, agent_mock_cls): param_mock.return_value = rdzv_handler_mock launch.main(args) rdzv_handler_mock.shutdown.assert_called_once() + + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") + def test_is_torchelastic_launched(self): + # launch test script with torchelastic and validate that + # torch.distributed.is_torchelastic_launched() returns True + + out_file = f"{os.path.join(self.test_dir, 'out')}" + + launch.main( + [ + "--run_path", + "--nnodes=1", + "--nproc_per_node=1", + "--monitor_interval=1", + path("bin/test_script_is_torchelastic_launched.py"), + f"--out_file={out_file}", + ] + ) + + with open(out_file, "r") as fp: + is_torchelastic_launched = fp.readline() + self.assertEqual("True", is_torchelastic_launched) + + def test_is_not_torchelastic_launched(self): + # launch test script without torchelastic and validate that + # torch.distributed.is_torchelastic_launched() returns False + + out_file = f"{os.path.join(self.test_dir, 'out')}" + + # need to run the script with runpy in the same interpreter + # as the test because otherwise (depending on the environment) + # it will not find torch as a dependency + with patch.object( + sys, + "argv", + [ + path("bin/test_script_is_torchelastic_launched.py"), + f"--out_file={out_file}", + ], + ): + runpy.run_path(sys.argv[0], run_name="__main__") + with open(out_file, "r") as fp: + is_torchelastic_launched = fp.readline() + self.assertEqual("False", is_torchelastic_launched) + + def test_init_method_tcp(self): + port = get_free_port() + with patch.object( + sys, + "argv", + [ + path("bin/test_script_init_method.py"), + f"--init_method=tcp://localhost:{port}", + "--rank=0", + "--world_size=1", + ], + ): + runpy.run_path(sys.argv[0], run_name="__main__") + # nothing to validate, just make sure it runs + + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") + def test_init_method_tcp_with_torchelastic(self): + port = get_free_port() + launch.main( + [ + "--run_path", + "--nnodes=1", + "--nproc_per_node=4", + "--master_addr=localhost", + f"--master_port={port}", + "--monitor_interval=1", + path("bin/test_script_init_method.py"), + f"--init_method=tcp://localhost:{port}", + ] + ) + # nothing to validate, just make sure it runs + + def test_init_method_env(self): + port = get_free_port() + with patch.dict( + os.environ, + { + "RANK": "0", + "WORLD_SIZE": "1", + "MASTER_ADDR": "localhost", + "MASTER_PORT": str(port), + }, + ), patch.object( + sys, + "argv", + [ + path("bin/test_script_init_method.py"), + "--init_method=env://", + ], + ): + runpy.run_path(sys.argv[0], run_name="__main__") + # nothing to validate, just make sure it runs + + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") + def test_init_method_env_with_torchelastic(self): + port = get_free_port() + launch.main( + [ + "--run_path", + "--nnodes=1", + "--nproc_per_node=4", + "--master_addr=localhost", + f"--master_port={port}", + "--monitor_interval=1", + path("bin/test_script_init_method.py"), + "--init_method=env://", + ] + ) + # nothing to validate, just make sure it runs diff --git a/test/distributed/test_launcher.py b/test/distributed/test_launcher.py index 4565a266bc9ec..422c88b6bdee5 100644 --- a/test/distributed/test_launcher.py +++ b/test/distributed/test_launcher.py @@ -20,10 +20,14 @@ def path(script): return os.path.join(os.path.dirname(__file__), script) + if TEST_WITH_DEV_DBG_ASAN: - print("Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr) + print( + "Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr + ) sys.exit(0) + class TestDistributedLaunch(TestCase): def test_launch_user_script(self): nnodes = 1 diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index cfa9c7cc1a46c..50e7602bdd838 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -134,7 +134,8 @@ class TCPStore(Store): world_size: int = ..., is_master: bool = ..., timeout: timedelta = ..., - wait_for_workers: bool = ... + wait_for_workers: bool = ..., + multi_tenant: bool = ... ): ... class PrefixStore(Store): diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 1b1244d9e37d5..fac096e339e71 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1,6 +1,7 @@ import contextlib import io import logging +import os import pickle import time import warnings @@ -9,28 +10,31 @@ import torch from torch._C._distributed_c10d import ( - AllreduceOptions, AllreduceCoalescedOptions, + AllreduceOptions, AllToAllOptions, BarrierOptions, BroadcastOptions, GatherOptions, PrefixStore, ProcessGroup, - ReduceOptions, ReduceOp, + ReduceOptions, ReduceScatterOptions, ScatterOptions, Store, + _DistributedDebugLevel, + _get_debug_mode, ) -from torch._C._distributed_c10d import _get_debug_mode, _DistributedDebugLevel from torch._six import string_classes +from .constants import default_pg_timeout +from .rendezvous import register_rendezvous_handler, rendezvous # noqa: F401 + + # This module is wildcard imported from torch.distributed. # TODO: specify __all__ -from .constants import default_pg_timeout -from .rendezvous import rendezvous, register_rendezvous_handler # noqa: F401 _MPI_AVAILABLE = True _NCCL_AVAILABLE = True @@ -244,7 +248,9 @@ def _store_based_barrier(rank, store, timeout): ) ) - logger.info(f"Rank {rank}: Completed store-based barrier for key:{store_key} with {world_size} nodes.") + logger.info( + f"Rank {rank}: Completed store-based barrier for key:{store_key} with {world_size} nodes." + ) def _rank_not_in_group(group: ProcessGroup): @@ -384,6 +390,18 @@ def is_initialized(): return GroupMember.WORLD is not None +def is_torchelastic_launched(): + """ + Checks whether this process was launched with ``torch.distributed.elastic`` + (aka torchelastic). The existence of ``TORCHELASTIC_RUN_ID`` environment + variable is used as a proxy to determine whether the current process + was launched with torchelastic. This is a reasonable proxy since + ``TORCHELASTIC_RUN_ID`` maps to the rendezvous id which is always a + non-null value indicating the job id for peer discovery purposes.. + """ + return os.getenv("TORCHELASTIC_RUN_ID") is not None + + def _get_default_group(): """ Getting the default process group created by init_process_group @@ -1778,8 +1796,8 @@ def broadcast_object_list(object_list, src=0, group=None, device=None): is_nccl_backend = group_backend == Backend.NCCL current_device = None if device is not None: - if is_nccl_backend and device.type != 'cuda': - raise ValueError('device type must be cuda for nccl backend') + if is_nccl_backend and device.type != "cuda": + raise ValueError("device type must be cuda for nccl backend") current_device = device else: current_device = torch.device("cpu") @@ -2229,7 +2247,9 @@ def scatter(tensor, scatter_list=None, src=0, group=None, async_op=False): if _rank_not_in_group(group): return - scatter_list = [t if not t.is_complex() else torch.view_as_real(t) for t in scatter_list] + scatter_list = [ + t if not t.is_complex() else torch.view_as_real(t) for t in scatter_list + ] tensor = tensor if not tensor.is_complex() else torch.view_as_real(tensor) my_rank = get_rank() @@ -3026,9 +3046,7 @@ def new_subgroups( if rank in ranks_in_subgroup: cur_subgroup = subgroup logger.info( - "Rank {} is assigned to subgroup {}".format( - rank, ranks_in_subgroup - ) + "Rank {} is assigned to subgroup {}".format(rank, ranks_in_subgroup) ) return cur_subgroup, subgroups @@ -3139,8 +3157,6 @@ def new_subgroups_by_enumeration( rank_to_ranks_dict[rank] = ranks if my_rank == rank: cur_subgroup = subgroup - logging.info( - "Rank {} is assigned to subgroup {}".format(rank, ranks) - ) + logging.info("Rank {} is assigned to subgroup {}".format(rank, ranks)) return cur_subgroup, subgroups diff --git a/torch/distributed/launch.py b/torch/distributed/launch.py index 5fcb3eb44c126..4f29edd10d521 100644 --- a/torch/distributed/launch.py +++ b/torch/distributed/launch.py @@ -97,9 +97,9 @@ >>> # your code to run 3. In your training program, you are supposed to call the following function -at the beginning to start the distributed backend. You need to make sure that -the init_method uses ``env://``, which is the only supported ``init_method`` -by this module. +at the beginning to start the distributed backend. It is strongly recommended +that ``init_method=env://``. Other init methods (e.g. ``tcp://``) may work, +but ``env://`` is the one that is officially supported by this module. :: @@ -147,6 +147,7 @@ from torch.distributed.run import get_args_parser, run + logger = logging.getLogger(__name__) @@ -181,7 +182,8 @@ def main(args=None): "If your script expects `--local_rank` argument to be set, please\n" "change it to read from `os.environ['LOCAL_RANK']` instead. See \n" "https://pytorch.org/docs/stable/distributed.html#launch-utility for \n" - "further instructions\n", FutureWarning + "further instructions\n", + FutureWarning, ) args = parse_args(args) launch(args) diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py index 6a5b680e25011..6e430e273f951 100644 --- a/torch/distributed/rendezvous.py +++ b/torch/distributed/rendezvous.py @@ -1,17 +1,22 @@ try: from urllib.parse import urlparse, urlunparse except ImportError: - raise ImportError("urllib cannot be found, urlparse from python2 is no longer supported.") + raise ImportError( + "urllib cannot be found, urlparse from python2 is no longer supported." + ) -import torch._six as six import numbers import os import sys from datetime import timedelta -from typing import Optional, Dict, Union -from torch.distributed import FileStore, TCPStore, PrefixStore +from typing import Dict, Optional, Union + +import torch._six as six +from torch.distributed import FileStore, PrefixStore, Store, TCPStore + from .constants import default_pg_timeout + _rendezvous_handlers = {} @@ -73,7 +78,9 @@ def rendezvous(url: str, rank: int = -1, world_size: int = -1, **kwargs): query_dict["world_size"] = world_size result = result._replace( - query="{}".format("&".join(["{}={}".format(k, v) for k, v in query_dict.items()])) + query="{}".format( + "&".join(["{}={}".format(k, v) for k, v in query_dict.items()]) + ) ) url = urlunparse(result) @@ -92,8 +99,9 @@ def _error(msg): result = urlparse(url) path = result.path - if sys.platform == 'win32': + if sys.platform == "win32": import urllib.request + full_path = result.netloc + result.path path = urllib.request.url2pathname(full_path) if path: @@ -119,7 +127,41 @@ def _error(msg): raise RuntimeError("Unable to perform rerendezvous using file:// method") -def _tcp_rendezvous_handler(url: str, timeout: timedelta = default_pg_timeout, **kwargs): +def _torchelastic_use_agent_store() -> bool: + return os.environ.get("TORCHELASTIC_USE_AGENT_STORE", None) == str(True) + + +def _create_c10d_store(hostname, port, rank, world_size, timeout) -> Store: + """ + Smartly creates a c10d Store object on ``rank`` based on whether + we need to re-use agent store. The TCPStore server is assumed to be hosted + on ``hostname:port``. + + If ``torchelastic_use_agent_store()`` is ``True``, then it is assumed that + the agent leader (node rank 0) hosts the TCPStore server (for which the + endpoint is specified by the given ``hostname:port``). Hence + ALL ranks will create and return a TCPStore client (e.g. ``start_daemon=False``). + + If ``torchelastic_use_agent_store()`` is ``False``, then rank 0 will host + the TCPStore (with multi-tenancy) and it is assumed that rank 0's hostname + and port are correctly passed via ``hostname`` and ``port``. All + non-zero ranks will create and return a TCPStore client. + """ + + if _torchelastic_use_agent_store(): + attempt = os.environ["TORCHELASTIC_RESTART_COUNT"] + tcp_store = TCPStore(hostname, port, world_size, False, timeout) + return PrefixStore(f"/worker/attempt_{attempt}", tcp_store) + else: + start_daemon = rank == 0 + return TCPStore( + hostname, port, world_size, start_daemon, timeout, multi_tenant=True + ) + + +def _tcp_rendezvous_handler( + url: str, timeout: timedelta = default_pg_timeout, **kwargs +): def _error(msg): return _rendezvous_error("tcp:// rendezvous: " + msg) @@ -136,18 +178,19 @@ def _error(msg): rank = int(query["rank"]) world_size = int(query["world_size"]) - start_daemon = rank == 0 assert result.hostname is not None - store = TCPStore( # type: ignore[call-arg] - result.hostname, result.port, world_size, start_daemon, timeout, multi_tenant=True - ) + + store = _create_c10d_store(result.hostname, result.port, rank, world_size, timeout) + yield (store, rank, world_size) # If this configuration is invalidated, there is nothing we can do about it - raise RuntimeError("Unable to perform rerendezvous using tcp:// method") + raise RuntimeError("Unable to perform re-rendezvous using tcp:// method") -def _env_rendezvous_handler(url: str, timeout: timedelta = default_pg_timeout, **kwargs): +def _env_rendezvous_handler( + url: str, timeout: timedelta = default_pg_timeout, **kwargs +): def _error(msg): return _rendezvous_error("env:// rendezvous: " + msg) @@ -183,29 +226,13 @@ def _get_env_or_raise(env_var: str) -> str: master_addr = _get_env_or_raise("MASTER_ADDR") master_port = int(_get_env_or_raise("MASTER_PORT")) + store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout) - use_torchelastic_store = os.environ.get("TORCHELASTIC_USE_AGENT_STORE", None) - - if use_torchelastic_store == str(True): - attempt = os.environ["TORCHELASTIC_RESTART_COUNT"] - worker_process_prefix = f"/worker/attempt_{attempt}" - # When TORCHELASTIC_USE_AGENT_STORE is set up, the worker process is assumed - # to be invoked by the torchelastic agent. Torchelastic agent creates a tcp daemon thread - # on the GROUP_RANK=0, as a result all user worker processes should create store with: daemon=False - tcp_store = TCPStore(master_addr, master_port, world_size, False, timeout) - # Each if-else condition returns due to: https://github.com/python/mypy/issues/1191 - yield (PrefixStore(worker_process_prefix, tcp_store), rank, world_size) - else: - # Start the TCP store daemon on the rank 0 - start_daemon = rank == 0 - store = TCPStore( # type: ignore[call-arg] - master_addr, master_port, world_size, start_daemon, timeout, multi_tenant=True - ) - # Each if-else condition returns due to: https://github.com/python/mypy/issues/1191 - yield (store, rank, world_size) + yield (store, rank, world_size) # If this configuration is invalidated, there is nothing we can do about it - raise RuntimeError("Unable to perform rerendezvous using env:// method") + raise RuntimeError("Unable to perform re-rendezvous using env:// method") + register_rendezvous_handler("tcp", _tcp_rendezvous_handler) register_rendezvous_handler("env", _env_rendezvous_handler) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index f21fc4e68808f..d4428a0cde3c1 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -321,6 +321,7 @@ def train(): from torch.distributed.elastic.utils.logging import get_logger from torch.distributed.launcher.api import LaunchConfig, elastic_launch + log = get_logger() From 3b284ab0243d22bb831a1685f47061b9612e1cb3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 25 Aug 2021 23:40:09 -0700 Subject: [PATCH 243/530] Adding BFP16 quantization/dequantization support to OSS (#63059) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63059 Supporting BFP16 quantization method to OSS. Currently only support CPU ghstack-source-id: 136639528 Test Plan: Imported from OSS Reviewed By: wanchaol Differential Revision: D30194538 fbshipit-source-id: ac248567ad8028457c2a91b77ef2ce81709fce53 --- .../quantization/test_quantization.py | 65 +++++--- tools/build_variables.bzl | 2 + torch/csrc/distributed/c10d/init.cpp | 24 +++ .../c10d/quantization/quantization.cpp | 93 +++++++++++ .../c10d/quantization/quantization.h | 20 +++ .../c10d/quantization/quantization_gpu.cu | 148 ++++++++++++++++++ .../c10d/quantization/quantization_gpu.h | 20 +++ .../c10d/quantization/quantization_utils.h | 31 ++++ .../algorithms/quantization/quantization.py | 41 +++-- 9 files changed, 409 insertions(+), 35 deletions(-) create mode 100644 torch/csrc/distributed/c10d/quantization/quantization.cpp create mode 100644 torch/csrc/distributed/c10d/quantization/quantization.h create mode 100644 torch/csrc/distributed/c10d/quantization/quantization_gpu.cu create mode 100644 torch/csrc/distributed/c10d/quantization/quantization_gpu.h create mode 100644 torch/csrc/distributed/c10d/quantization/quantization_utils.h diff --git a/test/distributed/algorithms/quantization/test_quantization.py b/test/distributed/algorithms/quantization/test_quantization.py index 7872920f21141..505f805b2cc10 100644 --- a/test/distributed/algorithms/quantization/test_quantization.py +++ b/test/distributed/algorithms/quantization/test_quantization.py @@ -8,6 +8,7 @@ from torch.testing._internal.common_distributed import ( MultiProcessTestCase, requires_gloo, + skip_if_rocm, skip_if_lt_x_gpu, requires_nccl, ) @@ -26,9 +27,9 @@ def _build_tensor(size, value=None, dtype=torch.float, device_id=None): if value is None: value = size if device_id is None: - return torch.empty(size, size, size, dtype=dtype).fill_(value) + return torch.empty(size, dtype=dtype).fill_(value) else: - return torch.empty(size, size, size, dtype=dtype).fill_(value).cuda(device_id) + return torch.empty(size, dtype=dtype).fill_(value).cuda(device_id) if TEST_WITH_DEV_DBG_ASAN: print("Skip dev-asan as torch + multiprocessing spawn have known issues", file=sys.stderr) sys.exit(0) @@ -38,7 +39,6 @@ def _build_tensor(size, value=None, dtype=torch.float, device_id=None): sys.exit(0) BACKEND = os.environ["BACKEND"] - if BACKEND == "gloo" or BACKEND == "nccl": class DistQuantizationTests(MultiProcessTestCase): @@ -60,7 +60,7 @@ def op_timeout_sec(self): @property def world_size(self): - return 2 + return int(os.environ["WORLD_SIZE"]) def _init_multigpu_helper(self): """Multigpu tests are designed to simulate the multi nodes with multi @@ -69,7 +69,7 @@ def _init_multigpu_helper(self): divided to subsets, each process only uses a subset. """ nGPUs = torch.cuda.device_count() - world_size = dist.get_world_size() + world_size = self.world_size visible_devices = range(nGPUs) if BACKEND == "nccl": @@ -91,18 +91,29 @@ def _init_multigpu_helper(self): @requires_gloo() @sandcastle_skip_if(BACKEND != "gloo", "Only gloo backend supports all_gather_fp16") def test_all_gather_fp16(self): - store = dist.FileStore(self.file_name, int(self.world_size)) + store = dist.FileStore(self.file_name, self.world_size) dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo') device = torch.device(f"cuda:{self.rank}") group = list(range(0, self.world_size)) group_id = dist.group.WORLD self._test_all_gather(group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.FP16) + @requires_gloo() + @sandcastle_skip_if(BACKEND != "gloo", "Only gloo backend supports all_gather_fp16") + def test_all_gather_bfp16(self): + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo') + device = torch.device(f"cuda:{self.rank}") + group = list(range(0, self.world_size)) + group_id = dist.group.WORLD + self._test_all_gather(group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.BFP16) + @requires_nccl() @sandcastle_skip_if(BACKEND != "nccl", "Only nccl backend supports all_to_all_fp16") @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"])) + @skip_if_rocm def test_all_to_all_fp16(self): - store = dist.FileStore(self.file_name, int(self.world_size)) + store = dist.FileStore(self.file_name, self.world_size) dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='nccl') device = torch.device(f"cuda:{self.rank}") group = list(range(0, self.world_size)) @@ -117,16 +128,34 @@ def test_all_to_all_fp16(self): dtype=torch.float32, qtype=DQuantType.FP16) + @requires_nccl() + @sandcastle_skip_if(BACKEND != "nccl", "Only nccl backend supports all_to_all_fp16") + @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"])) + @skip_if_rocm + def test_all_to_all_bfp16(self): + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='nccl') + device = torch.device(f"cuda:{self.rank}") + group = list(range(0, self.world_size)) + group_id = dist.new_group(range(self.world_size)) + rank_to_GPU = self._init_multigpu_helper() + self._test_all_to_all( + group, + group_id, + self.rank, + cuda=True, + rank_to_GPU=rank_to_GPU, + dtype=torch.float32, + qtype=DQuantType.BFP16) + def _test_all_gather( self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float, qtype=None): for dest in group: - tensor = _build_tensor(dest + 1, rank, dtype=dtype) - tensors = [_build_tensor(dest + 1, -1, dtype=dtype) for i in group] - expected_tensors = [_build_tensor(dest + 1, i, dtype=dtype) for i in group] - if (qtype is not None): - allgather = quant.auto_quantize(dist.all_gather, qtype, quant_loss=None) - else: - allgather = dist.all_gather + tensor = _build_tensor([dest + 1, dest + 1], rank, dtype=dtype) + tensors = [_build_tensor([dest + 1, dest + 1], -1, dtype=dtype) for i in group] + expected_tensors = [ + _build_tensor([dest + 1, dest + 1], i, dtype=dtype) for i in group + ] if cuda: tensor = tensor.cuda(rank_to_GPU[rank][0]) tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors] @@ -134,6 +163,7 @@ def _test_all_gather( tensor_shapes = [torch.view_as_real(tensors[0]).shape] else: tensor_shapes = [tensors[0].shape] + allgather = quant.auto_quantize(dist.all_gather, qtype, quant_loss=None) allgather(tensors, tensor, group=group_id, async_op=False) for t1, t2 in zip(tensors, expected_tensors): @@ -168,11 +198,8 @@ def _test_all_to_all( t.cuda(rank_to_GPU[rank][0]) for t in expected_tensors ] out_tensors = [t.cuda(rank_to_GPU[rank][0]) for t in out_tensors] - if(qtype is not None): - quantize_alltoall = quant.auto_quantize(dist.all_to_all, qtype, quant_loss=None) - quantize_alltoall(out_tensors, in_tensors, group=group_id) - else: - dist.all_to_all(out_tensors, in_tensors, group=group_id) + quantize_alltoall = quant.auto_quantize(dist.all_to_all, qtype, quant_loss=None) + quantize_alltoall(out_tensors, in_tensors, group=group_id) for t1, t2 in zip(out_tensors, expected_tensors): self.assertEqual(t1, t2) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 5f4cc0df522f5..3f6225358ac97 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -551,6 +551,7 @@ libtorch_cuda_distributed_extra_sources = [ "torch/csrc/distributed/c10d/NCCLUtils.cpp", "torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp", "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp", + "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", ] libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources @@ -737,6 +738,7 @@ libtorch_python_distributed_core_sources = [ "torch/csrc/distributed/c10d/frontend.cpp", "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/python_comm_hook.cpp", + "torch/csrc/distributed/c10d/quantization/quantization.cpp", ] libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 201f0c2dd64f4..6b52d3c058384 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -17,6 +17,7 @@ #ifdef USE_C10D_NCCL #include +#include #endif #ifdef USE_C10D_MPI @@ -31,8 +32,10 @@ #include #include #include + #include #include +#include #include #include #include @@ -1644,6 +1647,27 @@ PyMethodDef* python_functions() { return methods; } +namespace quantization { +TORCH_LIBRARY(q, m) { + m.def("_Bfloat16QuantizedToFloat(Tensor input) -> Tensor"); + m.def("_FloatToBfloat16Quantized(Tensor input) -> Tensor"); +} + TORCH_LIBRARY_IMPL(q, CPU, m) { + m.impl("_Bfloat16QuantizedToFloat", _bfloat16_to_float_cpu); + m.impl("_FloatToBfloat16Quantized", _float_to_bfloat16_cpu); + } + +#ifdef USE_C10D_NCCL + #define DISPATCH_TO_CUDA(name, function) \ + m.impl(name, torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(function))) + TORCH_LIBRARY_IMPL(q, CUDA, m) { + DISPATCH_TO_CUDA("_Bfloat16QuantizedToFloat", _bfloat16_to_float_cuda); + DISPATCH_TO_CUDA("_FloatToBfloat16Quantized", _float_to_bfloat16_cuda); + } +#endif + +} // namespace quantization + } // namespace c10d } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/c10d/quantization/quantization.cpp b/torch/csrc/distributed/c10d/quantization/quantization.cpp new file mode 100644 index 0000000000000..b9682d73ed139 --- /dev/null +++ b/torch/csrc/distributed/c10d/quantization/quantization.cpp @@ -0,0 +1,93 @@ +#include +#include + +namespace torch { +namespace distributed { +namespace c10d { +namespace quantization { + +void FloatToBFloat16Quantized_ref( + const float* const input, + const size_t nrows, + const size_t ncols, + uint16_t* const output){ + for (const auto row : c10::irange(nrows)) { + const float* input_row = input + row * ncols; + uint16_t* output_row = output + row * ncols; + + for (const auto col : c10::irange(ncols)) { + output_row[col] = + (*reinterpret_cast(input_row + col) + (1 << 15)) >> + 16; + } + } +} + +void BFloat16QuantizedToFloat_ref( + const at::BFloat16* const input, + const size_t nrows, + const size_t ncols, + float* const output){ + const int32_t output_columns = ncols; + + for (const auto row : c10::irange(nrows)) { + const at::BFloat16* input_row = input + row * ncols; + float* output_row = output + row * output_columns; + + for (const auto col : c10::irange(ncols)) { + uint32_t val_fp32 = static_cast( + reinterpret_cast(input_row)[col]) + << 16; + reinterpret_cast(output_row)[col] = val_fp32; + } + } +} + +at::Tensor _float_to_bfloat16_cpu(const at::Tensor& input) { + TENSOR_ON_CPU(input); + // Currently it supports 2D inputs + TENSOR_NDIM_EQUALS(input, 2); + + const auto input_sizes = input.sizes(); + const int32_t nrows = input_sizes[0]; + const int32_t ncols = input_sizes[1]; + const int32_t output_columns = ncols; + auto output = at::empty( + {nrows, output_columns}, + input.options().dtype(at::kHalf)); + + FloatToBFloat16Quantized_ref( + input.data_ptr(), + nrows, + ncols, + reinterpret_cast(output.data_ptr())); + + return output; +} + +at::Tensor _bfloat16_to_float_cpu(const at::Tensor& input) { + TENSOR_ON_CPU(input); + // Currently it supports 2D inputs + TENSOR_NDIM_EQUALS(input, 2); + + const auto input_sizes = input.sizes(); + const int32_t nrows = input_sizes[0]; + const int32_t ncols = input_sizes[1]; + const int32_t output_columns = ncols; + + auto output = at::empty( + {nrows, output_columns}, // 4 = sizeof(float) + input.options().dtype(at::kFloat)); // + BFloat16QuantizedToFloat_ref( + reinterpret_cast(input.data_ptr()), + nrows, + ncols, + output.data_ptr()); + + return output; +} + +} // namespace quantization +} // namespace c10d +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/c10d/quantization/quantization.h b/torch/csrc/distributed/c10d/quantization/quantization.h new file mode 100644 index 0000000000000..658fa754488d1 --- /dev/null +++ b/torch/csrc/distributed/c10d/quantization/quantization.h @@ -0,0 +1,20 @@ +// (c) Facebook, Inc. and its affiliates. Confidential and proprietary. + +#pragma once + + +#include +#include + +namespace torch { +namespace distributed { +namespace c10d { +namespace quantization { + +at::Tensor _float_to_bfloat16_cpu(const at::Tensor& input); +at::Tensor _bfloat16_to_float_cpu(const at::Tensor& input); + +} // namespace quantization +} // namespace c10d +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/c10d/quantization/quantization_gpu.cu b/torch/csrc/distributed/c10d/quantization/quantization_gpu.cu new file mode 100644 index 0000000000000..5590e035b0683 --- /dev/null +++ b/torch/csrc/distributed/c10d/quantization/quantization_gpu.cu @@ -0,0 +1,148 @@ +#include +#include +#include +#include + +// FP32 -> BF16 kernel +__global__ inline void _float_to_bfloat16_cuda_kernel( + const float* __restrict__ input, + const int nrows, + const int ncols, + uint16_t* __restrict__ output) { + const int row_incre = blockDim.y * gridDim.y; + const int col_incre = blockDim.x * gridDim.x; + for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < nrows; + row += row_incre) { + const float* input_row = input + row * ncols; + uint16_t* output_row = output + row * ncols; + for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < ncols; + col += col_incre) { + // Add 2^15 and right shift 16 to do round-nearest + output_row[col] = + (*reinterpret_cast(input_row + col) + (1 << 15)) >> + 16; + } + } +} + +// BF16 -> FP32 kernel +__global__ inline void _bfloat16_to_float_cuda_kernel( + const uint16_t* __restrict__ input, + const int nrows, + const int ncols, + float* __restrict__ output) { + const int row_incre = blockDim.y * gridDim.y; + const int col_incre = blockDim.x * gridDim.x; + for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < nrows; + row += row_incre) { + for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < ncols; + col += col_incre) { + const uint16_t* input_row = input + row * ncols; + float* output_row = output + row * ncols; + uint32_t val_fp32 = static_cast( + reinterpret_cast(input_row)[col]) + << 16; + reinterpret_cast(output_row)[col] = val_fp32; + } + } +} + +namespace torch { +namespace distributed { +namespace c10d { +namespace quantization { + +at::Tensor _float_to_bfloat16_cuda(const at::Tensor& input) { + TENSOR_ON_CUDA_GPU(input); + // Currently it supports 2D inputs + TENSOR_NDIM_EQUALS(input, 2); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(input.get_device()); + + const int nrows = input.size(0); + const int ncols = input.size(1); + const int output_columns = ncols; + + auto output = at::empty( + {nrows, output_columns}, + input.options().dtype(at::kHalf)); // at::kHalf + + if (nrows == 0 || output_columns == 0) { + return output; + } + + // TODO: replace Half by BFloat16, after BFloat16 is supported by Nvidia + // NCCL input.options().dtype(at::kBFloat16)); // at::kBFloat16 + + constexpr int threads_per_block = 256; + const int blockDim_x = std::min(output_columns, threads_per_block); + dim3 blockDim(blockDim_x, threads_per_block / blockDim_x); + const int gridDim_x = (output_columns + blockDim.x - 1) / blockDim.x; + const int gridDim_y = std::min((nrows + blockDim.y - 1) / blockDim.y, 65535u); + dim3 gridDim(gridDim_x, gridDim_y); + + _float_to_bfloat16_cuda_kernel<<< + gridDim, + blockDim, + 0, + at::cuda::getCurrentCUDAStream()>>>( + input.data_ptr(), + nrows, + ncols, + // TODO: replace Half by BFloat16, after BFloat16 is supported by Nvidia + // NCCL + reinterpret_cast(output.data_ptr())); + //C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return output; +} + +at::Tensor _bfloat16_to_float_cuda(const at::Tensor& input) { + TENSOR_ON_CUDA_GPU(input); + // Currently it supports 2D inputs + TENSOR_NDIM_EQUALS(input, 2); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(input.get_device()); + + const int nrows = input.size(0); + const int ncols = input.size(1); + const int output_columns = ncols; + + auto output = at::empty( + {nrows, output_columns}, // 4 = sizeof(float) + input.options().dtype(at::kFloat)); // at::kBytes for uint8_t + + if (nrows == 0 || output_columns == 0) { + return output; + } + + constexpr int threads_per_block = 256; + + const int blockDim_x = std::min(output_columns, threads_per_block); + dim3 blockDim(blockDim_x, threads_per_block / blockDim_x); + const int gridDim_x = (output_columns + blockDim.x - 1) / blockDim.x; + const int gridDim_y = std::min((nrows + blockDim.y - 1) / blockDim.y, 65535u); + dim3 gridDim(gridDim_x, gridDim_y); + + _bfloat16_to_float_cuda_kernel<<< + gridDim, + blockDim, + 0, + at::cuda::getCurrentCUDAStream()>>>( + // TODO: replace Half by BFloat16, after BFloat16 is supported by Nvidia + // NCCL + reinterpret_cast(input.data_ptr()), + nrows, + ncols, + output.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return output; +} + +} // namespace quantization +} // namespace c10d +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/c10d/quantization/quantization_gpu.h b/torch/csrc/distributed/c10d/quantization/quantization_gpu.h new file mode 100644 index 0000000000000..2a0c8f8f8d39c --- /dev/null +++ b/torch/csrc/distributed/c10d/quantization/quantization_gpu.h @@ -0,0 +1,20 @@ +// (c) Facebook, Inc. and its affiliates. Confidential and proprietary. + +#pragma once + + +#include +#include + +namespace torch { +namespace distributed { +namespace c10d { +namespace quantization { + +at::Tensor _float_to_bfloat16_cuda(const at::Tensor& input); +at::Tensor _bfloat16_to_float_cuda(const at::Tensor& input); + +} // namespace quantization +} // namespace c10d +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/c10d/quantization/quantization_utils.h b/torch/csrc/distributed/c10d/quantization/quantization_utils.h new file mode 100644 index 0000000000000..0467ba2769f5b --- /dev/null +++ b/torch/csrc/distributed/c10d/quantization/quantization_utils.h @@ -0,0 +1,31 @@ +// (c) Facebook, Inc. and its affiliates. Confidential and proprietary. + +#pragma once + +#include + +#include + +inline std::string torch_tensor_device_name(const at::Tensor& ten) { + return c10::DeviceTypeName(ten.device().type()); +} + +#define TENSOR_NDIM_EQUALS(ten, dims) \ + TORCH_CHECK( \ + (ten).ndimension() == (dims), \ + "Tensor '" #ten "' must have " #dims \ + " dimension(s). " \ + "Found ", \ + (ten).ndimension()) + +#define TENSOR_ON_CPU(x) \ + TORCH_CHECK( \ + !x.is_cuda(), \ + #x " must be a CPU tensor; it is currently on device ", \ + torch_tensor_device_name(x)) + +#define TENSOR_ON_CUDA_GPU(x) \ + TORCH_CHECK( \ + x.is_cuda(), \ + #x " must be a CUDA tensor; it is currently on device ", \ + torch_tensor_device_name(x)) diff --git a/torch/distributed/algorithms/quantization/quantization.py b/torch/distributed/algorithms/quantization/quantization.py index 724d6aa362487..d58c58cad09e2 100644 --- a/torch/distributed/algorithms/quantization/quantization.py +++ b/torch/distributed/algorithms/quantization/quantization.py @@ -10,7 +10,12 @@ TORCH_HALF_MAX = torch.finfo(torch.float16).max class DQuantType(Enum): - FP16 = "fp16" + """ + Different quantization methods for auto_quantize API are identified here. + auto_quantize API currently supports fp16 and bfp16 methods. + """ + FP16 = "fp16", + BFP16 = "bfp16" def __str__(self) -> str: return self.value @@ -26,6 +31,8 @@ def _quantize_tensor(tensor, qtype): ) if (qtype == DQuantType.FP16): return _fp32_to_fp16_with_clamp(tensor) + elif (qtype == DQuantType.BFP16): + return torch.ops.q._FloatToBfloat16Quantized(tensor) else: raise RuntimeError( f'Quantization type {qtype} is not supported' @@ -38,13 +45,8 @@ def _quantize_tensor_list(tensor_list, qtype): raise RuntimeError( f"_quantize_tensor_list expecting list of torch.Tensor as input but found {type(tensor_list)}" ) - if (qtype == DQuantType.FP16): - quantized_tensor_list = [_quantize_tensor(t, qtype) for t in tensor_list] - return quantized_tensor_list - else: - raise RuntimeError( - f'Quantization type {qtype} is not supported' - ) + quantized_tensor_list = [_quantize_tensor(t, qtype) for t in tensor_list] + return quantized_tensor_list def _dequantize_tensor(tensor, qtype, quant_loss=None): if not isinstance(tensor, torch.Tensor): @@ -60,6 +62,13 @@ def _dequantize_tensor(tensor, qtype, quant_loss=None): return tensor.float() else: return tensor.float() / quant_loss + elif (qtype == DQuantType.BFP16): + if tensor.dtype != torch.float16: + raise RuntimeError( + f"tensor dtype is {tensor.dtype} while expected to be FP16." + ) + else: + return torch.ops.q._Bfloat16QuantizedToFloat(tensor) else: raise RuntimeError( f'Quantization type {qtype} is not supported' @@ -73,26 +82,26 @@ def _dequantize_tensor_list(tensor_list, qtype, quant_loss=None): raise RuntimeError( f"_dequantize_tensor_list expecting list of torch.Tensor as input but found {type(tensor_list)}" ) - elif (qtype == DQuantType.FP16): - dequantized_tensor_list = [_dequantize_tensor(t, qtype) for t in tensor_list] - return dequantized_tensor_list - else: - raise RuntimeError( - f'Quantization type {qtype} is not supported' - ) + dequantized_tensor_list = [_dequantize_tensor(t, qtype) for t in tensor_list] + return dequantized_tensor_list def auto_quantize(func, qtype, quant_loss=None): """ This is a prototype API that automatically quantize the input tensors, choose the precision types, and pass other necessary arguments and then dequantizes the output. + Currently it only supports: - . FP16 quantization method + . FP16 and BFP16 quantization method supported for gloo and nccl backends . all_gather, all_to_all collective ops + + Note: BFP16 only supports 2D tensors. + Args: func (callable): A function representing collective operations. qtype (QuantType): Quantization method quant_loss (float, optional): This can be used to improve accuracy in the dequantization. + Returns: (callable): the same collective as func but enables automatic quantization/dequantization. """ From a6f767ed3d66b4a01e5b2edead8491dfbca517e6 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 25 Aug 2021 23:48:58 -0700 Subject: [PATCH 244/530] Fix issue re: DDP and create_graph=True (#63831) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63831 Closes https://github.com/pytorch/pytorch/issues/63812 `at::mul_out` is not supported when `grad` itself requires grad, which is useful for computing higher order derivatives. In this case, fall back to a mul + copy instead of mul_out. ghstack-source-id: 136614644 Test Plan: UT Reviewed By: SciPioneer Differential Revision: D30505573 fbshipit-source-id: 83532b6207b3d80116fcc4dff0e5520d73b3454f --- torch/csrc/distributed/c10d/reducer.cpp | 22 ++++++++++++++++--- .../_internal/distributed/distributed_test.py | 22 +++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index d91f191602888..eafc70cc5e30f 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -377,9 +377,25 @@ void Reducer::mark_variable_ready_dense(size_t variable_index) { if (comm_hook_ == nullptr) { auto wrapped = at::native::wrapped_scalar_tensor(double(1.) / div_factor_); - // Divides while copying into the bucket view to save one scan over - // all the input parameters. - at::mul_out(bucket_view, grad, wrapped); + if (!grad.requires_grad()) { + // Divides while copying into the bucket view to save one scan over + // all the input parameters. + at::mul_out(bucket_view, grad, wrapped); + } else { + // If DDP is running with create_graph=True, gradients require_grad + // themselves in order to compute higher order derivatives. However, + // DDP will not sync up these gradients currently (see + // https://github.com/pytorch/pytorch/issues/63812). + LOG(WARNING) + << "Using DistributedDataParallel with create_graph=True " + << " is not well-supported. The higher-order gradient will " + << " not be synchronized across ranks, and backpropagation " + << " through all_reduce operations will not occur. If you require " + << " DDP to work with higher-order gradients for your use case, " + << " please ping https://github.com/pytorch/pytorch/issues/63929"; + auto div_result = at::mul(grad, wrapped); + bucket_view.copy_(div_result); + } } else { bucket_view.copy_(grad); } diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index f4bc073a4317e..333458c5f8308 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -3760,6 +3760,28 @@ def test_DistributedDataParallel_requires_grad(self): ) self._barrier() + @sandcastle_skip_if( + BACKEND != "nccl" and BACKEND != "gloo", + "Only NCCL and GLOO backend support DistributedDataParallel", + ) + @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"])) + def test_ddp_create_graph(self): + rank = self.rank + torch.cuda.set_device(rank) + net = torch.nn.parallel.DistributedDataParallel( + torch.nn.Linear(1, 1, bias=False).cuda(rank), + device_ids=[rank] + ) + inp = torch.randn((2, 1), device=rank) + for _ in range(6): + loss = net(inp).sum() + # Verify DDP works with create_graph=True + loss.backward(create_graph=True) + # grad tensors should require grad. + self.assertTrue( + all([param.requires_grad for param in net.parameters()]) + ) + @sandcastle_skip_if( BACKEND != "nccl" and BACKEND != "gloo", "Only NCCL and GLOO backend support DistributedDataParallel", From 5757d03145ac4d7a81822d35fc76af56ba7d39ab Mon Sep 17 00:00:00 2001 From: Kefei Lu Date: Thu, 26 Aug 2021 00:51:53 -0700 Subject: [PATCH 245/530] Add logging for _MinimizerBase Summary: Add logging so we know which nodes are currently being visited Test Plan: lint & SC tests Reviewed By: 842974287 Differential Revision: D30509865 fbshipit-source-id: 09e77e44c97c825242e0b24f90463b50f3ca19c6 --- torch/fx/passes/net_min_base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torch/fx/passes/net_min_base.py b/torch/fx/passes/net_min_base.py index 2a093bea49a4c..b7a911e4bf3db 100644 --- a/torch/fx/passes/net_min_base.py +++ b/torch/fx/passes/net_min_base.py @@ -1,5 +1,6 @@ import argparse from typing import Any, Callable, Tuple, Dict, Optional +import logging import torch import torch.fx @@ -17,6 +18,8 @@ Names ) +_LOGGER = logging.getLogger(__name__) + class FxNetMinimizerBadModuleError(Exception): """ @@ -403,6 +406,7 @@ def _sequential_traverse(self, nodes: NodeList) -> NodeSet: culprits: NodeSet = set() for node in nodes: + _LOGGER.info(f"Visit node: {node.name}") cur_nodes: NodeSet = {node} if node in self.fusions: From 61d88cdd1c5fe7cf91b6ee0a71a250e3a6f61878 Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Thu, 26 Aug 2021 04:42:36 -0700 Subject: [PATCH 246/530] use `const auto&` as type for grad alias (#63949) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63949 This is an extension of the discussion in https://github.com/pytorch/pytorch/pull/63040#discussion_r687793027. Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D30546789 Pulled By: dagitses fbshipit-source-id: 3046aff4f129d5492d73dfb67717a824e16ffee8 --- tools/autograd/gen_autograd_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/autograd/gen_autograd_functions.py b/tools/autograd/gen_autograd_functions.py index 7d852aded47a9..08136ab54bfcc 100644 --- a/tools/autograd/gen_autograd_functions.py +++ b/tools/autograd/gen_autograd_functions.py @@ -479,7 +479,7 @@ def save_var(var: SavedAttribute, is_output: bool) -> None: body: List[str] = [] if uses_single_grad(info): - body.append('auto& grad = grads[0];') + body.append('const auto& grad = grads[0];') def emit_derivative( derivative: Derivative, From c02eda8166068400a9e5d82343108cd8a524095c Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 26 Aug 2021 05:43:05 -0700 Subject: [PATCH 247/530] Update TensorPipe submodule Summary: The bot failed to do it. Test Plan: D30542677 Reviewed By: beauby Differential Revision: D30573500 fbshipit-source-id: 50abd6fc415cead0a6b6d9290fa0e5f97d0e4989 --- third_party/tensorpipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/tensorpipe b/third_party/tensorpipe index e45b2338d0a31..1cd0ac3e4ce51 160000 --- a/third_party/tensorpipe +++ b/third_party/tensorpipe @@ -1 +1 @@ -Subproject commit e45b2338d0a31192a7e413f3fbbfa7fd90504a37 +Subproject commit 1cd0ac3e4ce5144ee4ea2545741182c76fba6cf2 From 774ae0851d98829b412e46dde85e716dad065a06 Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Thu, 26 Aug 2021 06:05:28 -0700 Subject: [PATCH 248/530] [OpInfo] Added ReductionOpInfo subclass of OpInfo and ported sum test (#62737) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62737 ReductionOpInfo is a specialization of OpInfo for reduction operators. For now, it is designed to work with reductions that return a single tensor and that reduce all elements along one or more dimensions to a single value. In particular this excludes operators such as `max` and `min` that return multiple tensors and `quantile` that can return multiple values. fixes https://github.com/pytorch/pytorch/issues/49746 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30406568 Pulled By: heitorschueroff fbshipit-source-id: 218b1da1902f67bcf4c3681e2a0f0029a25d51f1 --- test/test_ops.py | 6 +- test/test_reductions.py | 19 +- .../_internal/common_methods_invocations.py | 255 +++++++++++++----- 3 files changed, 208 insertions(+), 72 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 76a7b6a1485ca..a6baf8dbe699a 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -10,7 +10,7 @@ (TestCase, is_iterable_of_tensors, run_tests, IS_SANDCASTLE, clone_input_helper, make_tensor, gradcheck, gradgradcheck, IS_IN_CI, suppress_warnings) from torch.testing._internal.common_methods_invocations import \ - (op_db, _NOTHING, UnaryUfuncInfo, SpectralFuncInfo) + (op_db, _NOTHING, UnaryUfuncInfo, ReductionOpInfo, SpectralFuncInfo) from torch.testing._internal.common_device_type import \ (deviceCountAtLeast, instantiate_device_type_tests, ops, onlyCUDA, onlyOnCPUAndCUDA, skipCUDAIfRocm, OpDTypes) from torch.testing._internal.common_jit import JitCommonTestCase, check_against_reference @@ -27,8 +27,8 @@ # Get names of all the operators which have ref in their entry in OpInfo (testing infra) # except for Unary Ufuncs (separately implemented in test/test_unary_ufuncs.py) # and Spectral Functions (separately implemented for only 1D as of now, in test/test_spectral_ops.py) -_ref_test_ops = list(filter(lambda op: not isinstance(op, (UnaryUfuncInfo, SpectralFuncInfo)) and - op.ref is not None and op.ref is not _NOTHING, op_db)) +_ref_test_ops = list(filter(lambda op: not isinstance(op, (UnaryUfuncInfo, ReductionOpInfo, + SpectralFuncInfo)) and op.ref is not None and op.ref is not _NOTHING, op_db)) # Tests that apply to all operators and aren't related to any particular diff --git a/test/test_reductions.py b/test/test_reductions.py index c1da0f0816c5a..e224eaec93648 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -14,7 +14,9 @@ IS_WINDOWS, make_tensor) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, - onlyOnCPUAndCUDA, onlyCUDA, largeTensorTest, precisionOverride) + onlyOnCPUAndCUDA, onlyCUDA, largeTensorTest, ops, precisionOverride) +from torch.testing._internal.common_methods_invocations import ( + ReductionOpInfo, reduction_ops) # TODO: replace with make_tensor def _generate_input(shape, dtype, device, with_extremal): @@ -55,6 +57,21 @@ def _rand_shape(dim, min_size, max_size): class TestReductions(TestCase): + ########################################################################### + # ReductionOpInfo unit tests + ########################################################################### + + @ops(reduction_ops, allowed_dtypes=[torch.float]) + def test_dim_default(self, device, dtype, op: ReductionOpInfo): + """Tests that the default behavior is to reduce all dimensions.""" + t = make_tensor((2, 3), device, dtype) + args, kwargs = next(op.generate_args_kwargs(t)) + self.assertEqual(op(t, *args, **kwargs).ndim, 0) + + ########################################################################### + # TODO: Legacy tests - port to ReductionOpInfo + ########################################################################### + def test_var_unbiased(self, device): tensor = torch.randn(100, device=device) self.assertEqual(tensor.var(0), tensor.var(0, unbiased=True)) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index b725c4831d25f..3839b2ef82c17 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -13,7 +13,7 @@ from torch._six import inf import collections.abc -from typing import List, Sequence, Tuple, Union +from typing import Any, Callable, List, Optional, Sequence, Tuple, Union from torch.testing import \ (make_non_contiguous, floating_types, floating_types_and, complex_types, @@ -43,6 +43,15 @@ import scipy.special +# Reasonable testing sizes for dimensions +L = 20 +M = 10 +S = 5 + +# Unique value to distinguish default from anything else +_NOTHING = object() + + class DecorateInfo(object): """Describes which test, or type of tests, should be wrapped in the given decorators when testing an operator. Any test that matches all provided @@ -92,6 +101,7 @@ def __init__( device_type=device_type, dtypes=dtypes, active_if=active_if) + class SampleInput(object): """Represents sample inputs to a function.""" @@ -185,6 +195,7 @@ def _np(t): sample_np_input, np_args, np_kwargs = to_numpy(self.input), to_numpy(self.args), to_numpy(self.kwargs) return (sample_np_input, np_args, np_kwargs) + class AliasInfo(object): """Class holds alias information. For example, torch.abs -> torch.absolute, torch.Tensor.absolute, torch.Tensor.absolute_ @@ -200,9 +211,6 @@ def __call__(self, *args, **kwargs): return self.op(*args, **kwargs) -_NOTHING = object() # Unique value to distinguish default from anything else - - # Extension of getattr to support qualified names # e.g. _getattr_qual(torch, 'linalg.norm') -> torch.linalg.norm def _getattr_qual(obj, name, default=_NOTHING): @@ -770,9 +778,164 @@ def default_test_dtypes(self, device_type): else supported.intersection(self._default_test_dtypes)) -L = 20 -M = 10 -S = 5 +def _generate_reduction_inputs(device, dtype, requires_grad): + """Generates input tensors for testing reduction operators""" + yield make_tensor([], device, dtype, requires_grad=requires_grad) + yield make_tensor([2], device, dtype, requires_grad=requires_grad) + yield make_tensor([2, 3], device, dtype, requires_grad=requires_grad, noncontiguous=True) + yield make_tensor([3, 2, 1, 5], device, dtype, requires_grad=requires_grad) + + +def _generate_reduction_kwargs(ndim, supports_multiple_dims=True): + """Generates a subset of all valid dim and keepdim kwargs given ndim that + is appropriate for testing reduction operators. + """ + + # Test default dim and keepdim + yield {} + + # Test reducing inner and outer most dimensions + yield {'dim': 0, 'keepdim': True} + yield {'dim': -1, 'keepdim': False} + + # Test reducing middle dimension + if ndim > 2: + yield {'dim': ndim // 2, 'keepdim': True} + + if supports_multiple_dims: + # Test reducing all dimensions + yield {'dim': tuple(range(ndim)), 'keepdim': False} + + # Test reducing both first and last dimensions + if ndim > 1: + yield {'dim': (0, -1), 'keepdim': True} + + # Test reducing every other dimension starting with the second + if ndim > 3: + yield {'dim': tuple(range(1, ndim, 2)), 'keepdim': False} + + +def sample_inputs_reduction(op_info, device, dtype, requires_grad, **kwargs): + """Sample inputs for reduction operators.""" + + # TODO(@heitorschueroff) Once all reduction operators are using + # ReductionOpInfo use op_info.supports_multiple_dims directly. + supports_multiple_dims: bool = kwargs.get('supports_multiple_dims', True) + + # TODO(@heitorschueroff) Once all reduction operators are using ReductionOpInfo + # use op_info.genearte_args_kwargs directly. + generate_args_kwargs = kwargs.get('generate_args_kwargs', lambda *args, **kwargs: (yield tuple(), {})) + + inputs: List[SampleInput] = [] + for t in _generate_reduction_inputs(device, dtype, requires_grad): + for reduction_kwargs in _generate_reduction_kwargs(t.ndim, supports_multiple_dims): + for args, kwargs in generate_args_kwargs(t, **reduction_kwargs): + kwargs.update(reduction_kwargs) + inputs.append(SampleInput(t, args=args, kwargs=kwargs)) + + return inputs + + +# NOTE [Reductions]: +# +# For testing purposes, we relax the definition of a reduction operator +# as defined in the docstring below. We do this to capture operators with +# a similar API so they can be tested automatically. However... +# +# Strictly speaking a reduction operator is an operator that can reduce an +# array to a single scalar value and that can be computed from the partial +# result of reducing subarrays. This usually means that the reduction operation +# should be commutative and associative. This definition is important when it +# comes to implementation as it determines how a reduction can be parallelized. +# +# For example, many summary statistics such as median, mode and quantile cannot +# be computed from partial results because these are sorting and counting based +# algorithms that need information that would be lost in the reduced value. +class ReductionOpInfo(OpInfo): + """Reduction operator information. + + An operator is a reduction operator if it reduces one or more dimensions of + the input tensor to a single value. Reduction operators must implement the + following signature: + + - `op(input, *args, *, dim=None, keepdim=False, **kwargs) -> Tensor` + + ReductionOpInfo tests that reduction operators implement a consistent API. + Optional features such as reducing over multiple dimensions are captured in + the optional keyword parameters of the ReductionOpInfo constructor. + + If a reduction operator does not yet implement the full required API of + reduction operators, this should be documented by skipping the failing + tests rather than adding optional parameters to ReductionOpInfo. + + NOTE + The API for reduction operators has not yet been finalized and some + requirements may change. + + See tests in test/test_reductions.py + """ + + def __init__( + self, name, *, + + # The identity value for the operator if it has one. + identity: Optional[Any] = None, + + # The nan policy for the operator if it implements one. + # - propagate: NaN values are propagated to the output + # - omit: NaN values are discarded during the reduction + nan_policy: Optional[str] = None, + + # Whether the operator supports reducing multiple dimensions. + supports_multiple_dims: bool = True, + + # Whether the operator promotes integral to floating point dtypes. + promotes_int_to_float: bool = False, + + # Whether the operator promotes all integral dtypes to int64. + promotes_int_to_int64: bool = False, + + # If a specific dtype is given, then the operator always returns that + # dtype irrespective of the input dtype. If None, the operator returns + # the dtype according to the type promotion rules above. + result_dtype: Optional[torch.dtype] = None, + + # ReductionOpInfo tests generate their own input, dim and keepdim + # arguments and call this function to generate tuples of extra args and + # kwargs to use when calling the op. This is required for operators that + # have other required parameters besides the input tensor. + generate_args_kwargs: Callable = lambda t, dim=None, keepdim=False: (yield tuple(), {}), + + # Options from the OpInfo base class + **kwargs, + ): + assert nan_policy in (None, 'propagate', 'omit') + + # These are mutually exclusive options + assert not (result_dtype and promotes_int_to_float) + assert not (result_dtype and promotes_int_to_int64) + assert not (promotes_int_to_float and promotes_int_to_int64) + + # Default sample_inputs_func for ReductionOpInfo which augments sample + # inputs from sample_inputs_reduction with the args and kwargs from + # generate_args_kwargs. This is only used if sample_inputs_func is None. + def sample_inputs_func(*args, **kwargs): + kwargs['supports_multiple_dims'] = supports_multiple_dims + kwargs['generate_args_kwargs'] = generate_args_kwargs + return sample_inputs_reduction(*args, **kwargs) + + # Override OpInfo defaults and call base class __init__ + kwargs.setdefault('inplace_variant', None) + kwargs.setdefault('sample_inputs_func', sample_inputs_func) + super(ReductionOpInfo, self).__init__(name, **kwargs) + + self.identity = identity + self.nan_policy = nan_policy + self.supports_multiple_dims = supports_multiple_dims + self.promotes_int_to_float = promotes_int_to_float + self.promotes_int_to_int64 = promotes_int_to_int64 + self.result_dtype = result_dtype + self.generate_args_kwargs = generate_args_kwargs def sample_inputs_unary(op_info, device, dtype, requires_grad, **kwargs): @@ -2452,56 +2615,6 @@ def sample_inputs_max_min_reduction_no_dim(op_info, device, dtype, requires_grad requires_grad=requires_grad),)) return inputs -# Generates input tensors for testing reduction ops -def _generate_reduction_inputs(device, dtype, requires_grad): - yield make_tensor((), device, dtype, requires_grad=requires_grad) - yield make_tensor((2,), device, dtype, requires_grad=requires_grad) - yield make_tensor((2, 3), device, dtype, requires_grad=requires_grad, noncontiguous=True) - yield make_tensor((3, 2, 1, 2, 2), device, dtype, requires_grad=requires_grad) - -# Generates a subset of possible dim and keepdim kwargs for a tensor -# with ndim dims appropriate for testing. If supports_multiple_dims -# is True (default) then dim kwarg can be a list of dims. -def _generate_reduction_kwargs(ndim, supports_multiple_dims=True): - for keepdim in [True, False]: - # Always test reducing inner and outer most dimensions - yield {'dim': 0, 'keepdim': keepdim} - yield {'dim': -1, 'keepdim': keepdim} - - # Also reduce middle dimension - if ndim > 2: - yield {'dim': ndim // 2, 'keepdim': keepdim} - - if supports_multiple_dims: - # Always test reducing all dims - yield {'dim': tuple(range(ndim)), 'keepdim': keepdim} - - # Test reducing both first and last dimensions - if ndim > 1: - yield {'dim': (0, ndim - 1), 'keepdim': keepdim} - - # Test reducing every other dimension starting with the second - if ndim > 3: - yield {'dim': tuple(range(1, ndim, 2)), 'keepdim': keepdim} - -# Wraps sample_inputs_reduction function to provide the additional supports_multiple_dims args -def sample_inputs_reduction_wrapper(supports_multiple_dims): - # Generates sample inputs for reduction ops that contain the input tensor - # and dim and keepdim kwargs. If a reduction op needs to test additional - # args/kwargs then create a separate sample_inputs function - def fn(op_info, device, dtype, requires_grad): - inputs = [] - - for t in _generate_reduction_inputs(device, dtype, requires_grad): - # Add case without dim and keepdim kwargs - inputs.append(SampleInput(t)) - for kwargs in _generate_reduction_kwargs(t.ndim, supports_multiple_dims): - inputs.append(SampleInput(t, kwargs=kwargs)) - - return inputs - - return fn - def sample_inputs_reduction_quantile(op_info, device, dtype, requires_grad): test_quantiles = (0.5, make_tensor((2,), device, dtype, low=0, high=1)) test_interpolations = ['linear', 'midpoint'] @@ -2513,6 +2626,8 @@ def sample_inputs_reduction_quantile(op_info, device, dtype, requires_grad): inputs.append(SampleInput(t, args=(quantiles,))) for kwargs in _generate_reduction_kwargs(t.ndim, supports_multiple_dims=False): # Interpolation kwarg for now is only supported when providing both dim and keepdim + kwargs.setdefault('dim', 0) + kwargs.setdefault('keepdim', False) for interpolation in test_interpolations: kwargs['interpolation'] = interpolation inputs.append(SampleInput(t, args=(quantiles,), kwargs=kwargs)) @@ -6875,19 +6990,19 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypesIfCUDA=all_types_and(torch.float16), # TODO: some signatures of median do support out supports_out=False, - sample_inputs_func=sample_inputs_reduction_wrapper(False)), + sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False)), OpInfo('nanmedian', dtypes=all_types(), dtypesIfCPU=all_types_and(torch.bfloat16), dtypesIfCUDA=all_types_and(torch.float16), # TODO: some signatures of nanmedian do support out supports_out=False, - sample_inputs_func=sample_inputs_reduction_wrapper(False)), + sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False)), OpInfo('var_mean', dtypes=floating_and_complex_types_and(torch.half), dtypesIfCPU=floating_and_complex_types_and(torch.half, torch.bfloat16), dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), - sample_inputs_func=sample_inputs_reduction_wrapper(False), + sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False), backward_dtypes=floating_types_and(torch.half), backward_dtypesIfCPU=floating_types_and(torch.half, torch.bfloat16), backward_dtypesIfCUDA=floating_types_and(torch.half), @@ -6906,7 +7021,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypes=floating_and_complex_types_and(torch.half), dtypesIfCPU=floating_and_complex_types_and(torch.half, torch.bfloat16), dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), - sample_inputs_func=sample_inputs_reduction_wrapper(False), + sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False), backward_dtypes=floating_types_and(torch.half), backward_dtypesIfCPU=floating_types_and(torch.half, torch.bfloat16), backward_dtypesIfCUDA=floating_types_and(torch.half), @@ -6981,21 +7096,16 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_out=False, supports_forward_ad=True, sample_inputs_func=sample_inputs_max_min_reduction_no_dim,), - OpInfo('sum', - dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16, torch.bool), - supports_out=False, - supports_forward_ad=True, - sample_inputs_func=sample_inputs_reduction_wrapper(supports_multiple_dims=True)), OpInfo('nansum', dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), supports_out=False, - sample_inputs_func=sample_inputs_reduction_wrapper(supports_multiple_dims=True)), + sample_inputs_func=sample_inputs_reduction), # TODO(@heitorschueroff) Add test for dtype kwarg OpInfo('mean', dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, - sample_inputs_func=sample_inputs_reduction_wrapper(supports_multiple_dims=True), + sample_inputs_func=sample_inputs_reduction, # Need to skip out test because one of the overload for mean does not support it # TODO(@heitorschueroff) fix this when implementing ReductionInfo skips=(SkipInfo('TestCommon', 'test_out'),)), @@ -8843,6 +8953,14 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): ), ), ), + ReductionOpInfo( + 'sum', + identity=0, + supports_out=False, + supports_forward_ad=True, + promotes_int_to_int64=True, + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + ), ] # Common operator groupings @@ -8851,6 +8969,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): spectral_funcs = [op for op in op_db if isinstance(op, SpectralFuncInfo)] sparse_unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo) and op.supports_sparse is True] shape_funcs = [op for op in op_db if isinstance(op, ShapeFuncInfo)] +reduction_ops = [op for op in op_db if isinstance(op, ReductionOpInfo)] # TODO: review porting these to make_tensor def index_variable(shape, max_indices, device=torch.device('cpu')): From 10da1fc3f869075d698fbcda6e0b3ece739973d2 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 26 Aug 2021 06:58:12 -0700 Subject: [PATCH 249/530] Deify opmath_t into its own header, align with accscalar_t (#63986) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63986 Fixes #63985 Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D30555996 Pulled By: ezyang fbshipit-source-id: b6e4d56a5658ed028ffc105cc4b479faa6882b65 --- aten/src/ATen/OpMathType.h | 16 +++++++++++++++ aten/src/ATen/native/cuda/AmpKernels.cu | 4 ++-- .../ATen/native/cuda/ForeachBinaryOpList.cu | 4 ++-- .../ATen/native/cuda/ForeachBinaryOpScalar.cu | 4 ++-- .../native/cuda/ForeachBinaryOpScalarList.cu | 4 ++-- aten/src/ATen/native/cuda/ForeachFunctors.cuh | 20 ++++++++----------- .../ATen/native/cuda/ForeachPointwiseOp.cu | 10 +++++----- aten/src/ATen/native/cuda/ForeachUnaryOp.cu | 4 ++-- 8 files changed, 39 insertions(+), 27 deletions(-) create mode 100644 aten/src/ATen/OpMathType.h diff --git a/aten/src/ATen/OpMathType.h b/aten/src/ATen/OpMathType.h new file mode 100644 index 0000000000000..b58d4779ac7a4 --- /dev/null +++ b/aten/src/ATen/OpMathType.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace at { + +// For FP16 or BFloat16 inputs, ops should perform internal math in FP32. +template struct OpMathType { using type = scalar_t; }; +template<> struct OpMathType { using type = float; }; +template<> struct OpMathType { using type = float; }; + +template +using opmath_type = typename OpMathType::type; + +} // namespace at diff --git a/aten/src/ATen/native/cuda/AmpKernels.cu b/aten/src/ATen/native/cuda/AmpKernels.cu index a5d8a643648e7..c89d8a09e8d1d 100644 --- a/aten/src/ATen/native/cuda/AmpKernels.cu +++ b/aten/src/ATen/native/cuda/AmpKernels.cu @@ -59,7 +59,7 @@ void _amp_non_finite_check_and_unscale_cuda_(Tensor& scaled_grad, auto* found_inf_ptr = found_inf.data_ptr(); auto* inv_scale_ptr = inv_scale.data_ptr(); - using opmath_t = get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; gpu_kernel(iter, [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (scalar_t val_in) -> scalar_t { @@ -154,7 +154,7 @@ void _amp_foreach_non_finite_check_and_unscale_cuda_(TensorList scaled_grads, auto* found_inf_ptr = found_inf.data_ptr(); auto* inv_scale_ptr = inv_scale.data_ptr(); - using opmath_t = get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; // multi_tensor_apply guards onto tensor_lists[0][0], no need to guard explicitly. multi_tensor_apply<1>(tensor_lists, diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu index 0277aee6f02b1..67a27ce116feb 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu @@ -18,7 +18,7 @@ std::vector foreach_tensor_list_op(TensorList tensors1, TensorList tenso tensor_lists.emplace_back(std::move(vec_res)); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda", [&]() { - using opmath_t = get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; multi_tensor_apply<3>(tensor_lists, BinaryOpListAlphaFunctor::opmath_t; + using opmath_t = at::opmath_type; multi_tensor_apply<2>(tensor_lists, BinaryOpListAlphaFunctor foreach_binary_op(TensorList tensors, const Scalar& scalar) tensor_lists.emplace_back(std::move(vec_res)); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalar_cuda", [&]() { - using opmath_t = get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; multi_tensor_apply<2>(tensor_lists, BinaryOpScalarFunctor::opmath_t; + using opmath_t = at::opmath_type; multi_tensor_apply<1>(tensor_lists, BinaryOpScalarFunctor foreach_binary_op(TensorList tensors, at::ArrayRef s tensor_lists.emplace_back(vec_res); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBFloat16, kHalf, kBool, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() { - using opmath_t = get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; multi_tensor_apply<2, opmath_t>(tensor_lists, scalars, BinaryOpScalarListFunctor scalars) { tensor_lists.emplace_back(tensors.vec()); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBFloat16, kHalf, kBool, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() { - using opmath_t = get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; multi_tensor_apply<1, opmath_t>(tensor_lists, scalars, BinaryOpScalarListFunctor #include +#include namespace at { namespace native { namespace { -// For FP16 or BFloat16 inputs, ops should perform internal math in FP32. -template struct get_opmath_t { using opmath_t = scalar_t; }; -template<> struct get_opmath_t { using opmath_t = float; }; -template<> struct get_opmath_t { using opmath_t = float; }; - // Initializes args and checks if all args are aligned template __device__ bool init_args( @@ -158,7 +154,7 @@ __device__ __forceinline__ void pointwise_op_scalar( // template struct BinaryOpScalarFunctor { - using opmath_t = typename get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata& tl, @@ -179,7 +175,7 @@ struct BinaryOpScalarFunctor { template struct BinaryOpScalarListFunctor { - using opmath_t = typename get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator() ( int chunk_size, TensorListScalarListMetadata& tl, @@ -200,7 +196,7 @@ struct BinaryOpScalarListFunctor { template struct BinaryOpListAlphaFunctor { - using opmath_t = typename get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata& tl, @@ -287,7 +283,7 @@ struct ZeroFunctor { template struct UnaryOpFunctor { - using opmath_t = typename get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata& tl, @@ -333,7 +329,7 @@ struct UnaryOpFunctor { template struct PointwiseOpScalarFunctor { - using opmath_t = typename get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata& tl, @@ -354,7 +350,7 @@ struct PointwiseOpScalarFunctor { template struct PointwiseOpScalarListFunctor { - using opmath_t = typename get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator() ( int chunk_size, TensorListScalarListMetadata& tl, @@ -375,7 +371,7 @@ struct PointwiseOpScalarListFunctor { template struct PointwiseOpListFunctor { - using opmath_t = typename get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata& tl, diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu index 977425984e99e..9440b87caedac 100644 --- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu +++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu @@ -20,7 +20,7 @@ std::vector foreach_pointwise_op(TensorList input, TensorList tensors1, tensor_lists.emplace_back(std::move(vec_res)); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, input[0].scalar_type(), "foreach_pointwise_op_cuda", [&]() { - using opmath_t = get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; multi_tensor_apply<4>(tensor_lists, PointwiseOpScalarFunctor::opmath_t; + using opmath_t = at::opmath_type; multi_tensor_apply<3>(tensor_lists, PointwiseOpScalarFunctor::opmath_t; + using opmath_t = at::opmath_type; multi_tensor_apply<3, opmath_t>(tensor_lists, scalars, PointwiseOpScalarListFunctor foreach_pointwise_op(TensorList input, TensorList tensors1, tensor_lists.emplace_back(std::move(vec_res)); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, input[0].scalar_type(), "foreach_pointwise_op_cuda", [&]() { - using opmath_t = get_opmath_t::opmath_t; + using opmath_t = at::opmath_type; multi_tensor_apply<4, opmath_t>(tensor_lists, scalars, PointwiseOpScalarListFunctor foreach_tensor_##NAME##_cuda(TensorList tensors1, TensorList tensor_lists.emplace_back(std::move(vec_res)); \ \ AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, tensors1[0].scalar_type(), "foreach_maximum_minimum_op_cuda", [&]() { \ - using opmath_t = get_opmath_t::opmath_t; \ + using opmath_t = at::opmath_type; \ auto op = [] GPU_LAMBDA (opmath_t a, opmath_t b) -> opmath_t { \ opmath_t c = a OP b ? a : b; \ if (_isnan(a)) { \ diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index 8d606824d2cc6..fd7a12b9dfac6 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -15,7 +15,7 @@ template class Op> std::vector forea tensor_lists.emplace_back(tensors.vec()); tensor_lists.emplace_back(std::move(vec_res)); - using opmath_t = typename get_opmath_t::opmath_t; + using opmath_t = typename at::opmath_type; multi_tensor_apply<2>(tensor_lists, UnaryOpFunctor class Op> std::vector forea template class Op> void foreach_unary_op_(TensorList tensors) { std::vector> tensor_lists; tensor_lists.emplace_back(tensors.vec()); - using opmath_t = typename get_opmath_t::opmath_t; + using opmath_t = typename at::opmath_type; multi_tensor_apply<1>(tensor_lists, UnaryOpFunctor Date: Thu, 26 Aug 2021 07:17:24 -0700 Subject: [PATCH 250/530] Added API tests to ReductionOpInfo and ported amax/amin/nansum tests (#62899) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62899 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D30408816 Pulled By: heitorschueroff fbshipit-source-id: 6cb0aa7fa7edba93549ef873baa2fb8a003bd91d --- test/test_reductions.py | 242 +++++++++++++++++- .../_internal/common_methods_invocations.py | 241 +++++++++++------ 2 files changed, 397 insertions(+), 86 deletions(-) diff --git a/test/test_reductions.py b/test/test_reductions.py index e224eaec93648..e716336e4afe7 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -2,18 +2,20 @@ import numpy as np import math -from typing import Dict, List +from typing import Dict, List, Sequence import random from functools import partial from itertools import product, combinations, permutations import warnings from torch._six import inf, nan +from torch.testing import ( + integral_types_and, floating_and_complex_types_and) from torch.testing._internal.common_utils import ( TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict, IS_WINDOWS, make_tensor) from torch.testing._internal.common_device_type import ( - instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, + OpDTypes, instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, onlyOnCPUAndCUDA, onlyCUDA, largeTensorTest, ops, precisionOverride) from torch.testing._internal.common_methods_invocations import ( ReductionOpInfo, reduction_ops) @@ -55,18 +57,244 @@ def _rand_shape(dim, min_size, max_size): shape.append(random.randint(min_size, max_size)) return tuple(shape) +def _reduced_shape(shape, dim=None, keepdim=False): + """Computes the expected reduced shape given dim and keepdim + + Args: + shape: The shape to reduce + dim : The dimensions to reduce + keepdim: If true, reduced dimensions have size 1 in the reduced shape, + otherwise they are removed from the reduced shape. + + Returns: + The reduced shape + """ + if dim is None: + return [1] * len(shape) if keepdim else [] + + # Wrap negative dims + dim = dim if isinstance(dim, Sequence) else [dim] + dim = set(i if i >= 0 else len(shape) + i for i in dim) + + result = [] + for i, size in enumerate(shape): + if i not in dim: + result.append(size) + elif keepdim: + result.append(1) + + return result + class TestReductions(TestCase): ########################################################################### # ReductionOpInfo unit tests ########################################################################### - @ops(reduction_ops, allowed_dtypes=[torch.float]) - def test_dim_default(self, device, dtype, op: ReductionOpInfo): - """Tests that the default behavior is to reduce all dimensions.""" - t = make_tensor((2, 3), device, dtype) + def _test_dim_keepdim(self, op: ReductionOpInfo, device, *, ndim, **dim_keepdim): + """Tests output shape for input with ndim and dim and keepdim kwargs""" + shape = torch.randint(2, 5, (ndim,)).tolist() + t = make_tensor(shape, device, torch.float) + args, kwargs = next(op.generate_args_kwargs(t, **dim_keepdim)) + result = op(t, *args, **dim_keepdim, **kwargs) + expected_shape = _reduced_shape(shape, **dim_keepdim) + self.assertEqual(result.shape, expected_shape, f""" + expected output shape to be {expected_shape} but got {list(result.shape)} + for input shape {shape} and {dim_keepdim} + """) + + # TODO(@heitorschueroff) combine cases with and without keepdim once + # there's support for a @parametrize decorator. + + @ops(reduction_ops, dtypes=OpDTypes.none) + def test_dim_default(self, device, op: ReductionOpInfo): + """Tests that the default dim reduces all dimensions.""" + for ndim in range(3): + self._test_dim_keepdim(op, device, ndim=ndim) + + @ops(reduction_ops, dtypes=OpDTypes.none) + def test_dim_default_keepdim(self, device, op: ReductionOpInfo): + """Tests that the default dim, when keepdim=True, reduces all dimensions to size 1.""" + for ndim in range(3): + self._test_dim_keepdim(op, device, ndim=ndim, keepdim=True) + + @ops(reduction_ops, dtypes=OpDTypes.none) + def test_dim_none(self, device, op: ReductionOpInfo): + """Tests that dim=None reduces all dimensions.""" + for ndim in range(3): + self._test_dim_keepdim(op, device, ndim=ndim, dim=None) + + @ops(reduction_ops, dtypes=OpDTypes.none) + def test_dim_none_keepdim(self, device, op: ReductionOpInfo): + """Tests that dim=None, when keepdim=True, reduces all dimensions to size 1.""" + for ndim in range(3): + self._test_dim_keepdim(op, device, ndim=ndim, dim=None, keepdim=True) + + @ops(reduction_ops, dtypes=OpDTypes.none) + def test_dim_single(self, device, op: ReductionOpInfo): + """Tests that dim=i reduces dimension i.""" + self._test_dim_keepdim(op, device, ndim=0, dim=0) + self._test_dim_keepdim(op, device, ndim=1, dim=0) + self._test_dim_keepdim(op, device, ndim=2, dim=-1) + self._test_dim_keepdim(op, device, ndim=3, dim=1) + + @ops(reduction_ops, dtypes=OpDTypes.none) + def test_dim_single_keepdim(self, device, op: ReductionOpInfo): + """Tests that dim=i, when keepdim=True, reduces dimension i to size 1.""" + self._test_dim_keepdim(op, device, ndim=0, dim=0, keepdim=True) + self._test_dim_keepdim(op, device, ndim=1, dim=0, keepdim=True) + self._test_dim_keepdim(op, device, ndim=2, dim=-1, keepdim=True) + self._test_dim_keepdim(op, device, ndim=3, dim=1, keepdim=True) + + @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none) + def test_dim_empty(self, device, op: ReductionOpInfo): + """Tests that dim=[] is a no-op""" + self._test_dim_keepdim(op, device, ndim=0, dim=[]) + self._test_dim_keepdim(op, device, ndim=2, dim=[]) + + @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none) + def test_dim_empty_keepdim(self, device, op: ReductionOpInfo): + """Tests that dim=[], when keepdim=True, is a no-op""" + self._test_dim_keepdim(op, device, ndim=0, dim=[], keepdim=True) + self._test_dim_keepdim(op, device, ndim=2, dim=[], keepdim=True) + + @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none) + def test_dim_multi(self, device, op: ReductionOpInfo): + """Tests that dim=[i, j, ...] reduces dimensions i, j, ....""" + self._test_dim_keepdim(op, device, ndim=1, dim=[0]) + self._test_dim_keepdim(op, device, ndim=3, dim=[0, 2]) + + @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none) + def test_dim_multi_keepdim(self, device, op: ReductionOpInfo): + """Tests that dim=[i, j, ...], when keepdim=True, reduces dimensions i, j, .... to size 1.""" + self._test_dim_keepdim(op, device, ndim=1, dim=[0], keepdim=True) + self._test_dim_keepdim(op, device, ndim=3, dim=[0, 2], keepdim=True) + + @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none) + def test_dim_multi_unsorted(self, device, op: ReductionOpInfo): + """Tests that operator correctly handles unsorted dim list.""" + self._test_dim_keepdim(op, device, ndim=4, dim=[3, 0, 2]) + + @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none) + def test_dim_multi_unsorted_keepdim(self, device, op: ReductionOpInfo): + """Tests that operator correctly handles unsorted dim list when keepdim=True.""" + self._test_dim_keepdim(op, device, ndim=4, dim=[3, 0, 2], keepdim=True) + + @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none) + def test_dim_multi_duplicate(self, device, op: ReductionOpInfo): + """Tests that an error is raised if dim has duplicate entries.""" + with self.assertRaises(RuntimeError): + self._test_dim_keepdim(op, device, ndim=3, dim=[0, 1, 1, 2]) + + @ops(filter(lambda op: not op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none) + def test_dim_multi_unsupported(self, device, op: ReductionOpInfo): + """Tests that ops claiming to not support multi dim actually don't.""" + with self.assertRaises(TypeError): + self._test_dim_keepdim(op, device, ndim=3, dim=[0, 2]) + + @ops(reduction_ops, dtypes=OpDTypes.none) + def test_dim_offbounds(self, device, op: ReductionOpInfo): + """Tests that passing an off-bounds dim throws""" + with self.assertRaises(IndexError): + self._test_dim_keepdim(op, device, ndim=2, dim=2) + + @ops(reduction_ops, dtypes=OpDTypes.none) + def test_dim_ndim_limit(self, device, op: ReductionOpInfo): + """Tests that an exception is raised when reducing a tensor with more + than 64 dims along some specific dimensions. dim=None is ok""" + t = make_tensor([1] * 65, device, torch.float) + with self.assertRaisesRegex(RuntimeError, "only tensors with up to 64 dims are supported"): + op(t, dim=0) + + @ops(filter(lambda op: op.identity is not None, reduction_ops), dtypes=OpDTypes.supported) + def test_identity(self, device, dtype, op: ReductionOpInfo): + """Tests that the identity value is an identity for the operator""" + t = make_tensor((10,), device, dtype) + t[1::2] = op.identity + args, kwargs = next(op.generate_args_kwargs(t)) + result = op(t[::2], *args, **kwargs) + result_with_identity = op(t, *args, **kwargs) + self.assertEqual(result, result_with_identity, """ + Adding identity value to the input tensor should not change the result. + """) + + # TODO(@heitorschueroff) Update these to use the nan_policy kwarg once + # it is added to reduction operators. + + @ops(filter(lambda op: op.nan_policy == 'propagate', reduction_ops), dtypes=OpDTypes.supported, + allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16)) + def test_nan_policy_propagate(self, device, dtype, op: ReductionOpInfo): + """Tests that nan is propagated to the output by default""" + t = make_tensor((5,), device, dtype) + t[2] = torch.nan args, kwargs = next(op.generate_args_kwargs(t)) - self.assertEqual(op(t, *args, **kwargs).ndim, 0) + result = op(t, *args, **kwargs) + self.assertTrue(result.isnan()) + + @ops(filter(lambda op: op.nan_policy == 'omit', reduction_ops), dtypes=OpDTypes.supported, + allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16)) + def test_nan_policy_omit(self, device, dtype, op: ReductionOpInfo): + """Tests that NaN values do not affect the result.""" + t = make_tensor((10,), device, dtype) + t[1::2] = torch.nan + args, kwargs = next(op.generate_args_kwargs(t)) + result = op(t[::2], *args, **kwargs) + result_with_nan = op(t, *args, **kwargs) + self.assertEqual(result, result_with_nan) + + @ops(reduction_ops, dtypes=OpDTypes.supported) + def test_result_dtype(self, device, dtype, op: ReductionOpInfo): + """Tests that the result has the correct dtype""" + t = make_tensor((5,), device, dtype) + args, kwargs = next(op.generate_args_kwargs(t)) + result: torch.Tensor = op(t, *args, **kwargs) + is_integral = dtype in integral_types_and(torch.bool) + if op.promotes_int_to_float and is_integral: + self.assertTrue(torch.is_floating_point(result.dtype)) + elif op.promotes_int_to_int64 and is_integral: + self.assertEqual(result.dtype, torch.int64) + elif op.result_dtype is not None: + self.assertEqual(result.dtype, op.result_dtype) + else: + self.assertEqual(result.dtype, dtype) + + @ops(reduction_ops, dtypes=OpDTypes.none) + def test_empty_tensor_empty_slice(self, device, op: ReductionOpInfo): + """Tests for consistent behavior when reducing over an empty slice. + + The rules for reducing over an empty slice are as follows: + - Return the identity value if the operator has one + - Otherwise, return NaN if the operator promotes integral dtype to + floating point dtypes. + - Otherwise, raise an error + + See discussion here https://github.com/pytorch/pytorch/issues/61901 + """ + t = make_tensor((0, 2, 3), device, torch.float) + for dim in [0] + [[0, 2]] if op.supports_multiple_dims else []: + args, kwargs = next(op.generate_args_kwargs(t, dim=dim)) + if op.identity is not None: + # Reducing along empty slice should return identity + result = op(t, *args, dim=dim, **kwargs) + self.assertEqual(result, torch.full_like(result, op.identity)) + elif op.promotes_int_to_float: + # Reducing along empty slice should return NaN + result = op(t, *args, dim=dim, **kwargs) + self.assertEqual(result, torch.full_like(result, torch.nan)) + else: + # Reducing along empty slice should raise an error + with self.assertRaises(IndexError): + op(t, *args, dim=dim, **kwargs) + + @ops(reduction_ops, dtypes=OpDTypes.none) + def test_empty_tensor_nonempty_slice(self, device, op: ReductionOpInfo): + """Tests that reducing a nonempty slice of an empty tensor returns an + empty tensor with the dimensions reduced.""" + t = make_tensor((0, 2, 3), device, torch.float) + for dim in [1] + [[1, 2]] if op.supports_multiple_dims else []: + args, kwargs = next(op.generate_args_kwargs(t, dim=dim)) + result = op(t, *args, dim=dim, **kwargs) + self.assertEqual(result.shape, _reduced_shape(t.shape, dim)) ########################################################################### # TODO: Legacy tests - port to ReductionOpInfo diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 3839b2ef82c17..4331c92d56599 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -2170,28 +2170,6 @@ def sample_inputs_take_along_dim(op_info, device, dtype, requires_grad, **kwargs ) -def sample_inputs_amax_amin(op_info, device, dtype, requires_grad, **kwargs): - # Ordered as (input shape, kwargs) - test_cases: Tuple[tuple, dict] = ( # type: ignore[assignment] - ((S, S, S), {}), - ((S, S, S), {'dim': 1}), - ((S, S, S), {'dim': (1, 2,)}), - ((S, S, S), {'dim': 1, 'keepdim': True}), - ((), {'dim': 0}), - ((), {}), - ((), {'dim': 0, 'keepdim': True}), - ) - - samples: List[SampleInput] = [] - for shape, kwargs in test_cases: - samples.append(SampleInput( - make_tensor(shape, device, dtype, requires_grad=requires_grad), - kwargs=kwargs)) - - return samples - -# TODO (@heitorschueroff) Once aminmax supports multiple dims this should -# be combined with the above test. def sample_inputs_aminmax(op_info, device, dtype, requires_grad, **kwargs): test_cases: Tuple[tuple, dict] = ( # type: ignore[assignment] ((S, S, S), {}), @@ -2210,33 +2188,6 @@ def sample_inputs_aminmax(op_info, device, dtype, requires_grad, **kwargs): return samples -def sample_inputs_argmax_argmin(op_info, device, dtype, requires_grad, **kwargs): - test_cases = ( - ((2, 2, 2), ()), - ((2, 2, 2), (0,)), - ((2, 2, 2), (1,)), - ((2, 2, 2), (2,)), - ((2, 2, 2), (2, True,)), - ((2, 2, 2), (None,)), - ((), (0,)), - ((), ()), - ((), (None, True,)), - ((1,), ()), - ((1,), (0,)), - ((1,), (0, True)), - ((2,), ()), - ((2,), (0,)), - ((2,), (0, True)), - ((2, 2, 3), ()), - ((2, 2, 3), (0,)), - ((2, 2, 3), (1,)), - ((2, 2, 3), (None, True)), - ) - return tuple(SampleInput((make_tensor(size, device, dtype, - requires_grad=requires_grad)), - args=args) - for size, args in test_cases) - def sample_inputs_diff(op_info, device, dtype, requires_grad, **kwargs): test_cases = ( ((1,), 0, None, None), @@ -2634,6 +2585,14 @@ def sample_inputs_reduction_quantile(op_info, device, dtype, requires_grad): return inputs +def sample_inputs_reduction_count_nonzero(*args, **kwargs): + """Sample inputs for count_nonzero""" + samples: List[SampleInput] = sample_inputs_reduction(*args, **kwargs) + # count_nonzero does not support keepdim yet + for sample in samples: + sample.kwargs.pop('keepdim', None) + return samples + def sample_inputs_leaky_relu(op_info, device, dtype, requires_grad): N = 10 tensors = [SampleInput(make_tensor((N, N), device=device, dtype=dtype, @@ -5823,22 +5782,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): # TODO: update sample inputs with for_inplace_variant kwarg to support this test SkipInfo('TestCommon', 'test_variant_consistency_eager'),), sample_inputs_func=sample_inputs_addcmul_addcdiv), - OpInfo('amax', - ref=lambda a, dim=None, keepdim=False, **kwargs: np.amax(a, axis=dim, keepdims=keepdim, **kwargs), - dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), - sample_inputs_func=sample_inputs_amax_amin,), - OpInfo('amin', - ref=lambda a, dim=None, keepdim=False, **kwargs: np.amin(a, axis=dim, keepdims=keepdim, **kwargs), - dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), - sample_inputs_func=sample_inputs_amax_amin), - OpInfo('argmax', - dtypes=all_types_and(torch.float16, torch.bfloat16), - supports_autograd=False, - sample_inputs_func=sample_inputs_argmax_argmin,), - OpInfo('argmin', - dtypes=all_types_and(torch.float16, torch.bfloat16), - supports_autograd=False, - sample_inputs_func=sample_inputs_argmax_argmin,), UnaryUfuncInfo('asin', aliases=('arcsin', ), ref=np.arcsin, @@ -7096,10 +7039,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_out=False, supports_forward_ad=True, sample_inputs_func=sample_inputs_max_min_reduction_no_dim,), - OpInfo('nansum', - dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), - supports_out=False, - sample_inputs_func=sample_inputs_reduction), # TODO(@heitorschueroff) Add test for dtype kwarg OpInfo('mean', dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16), @@ -7458,16 +7397,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_forward_ad=True, skips=( SkipInfo('TestMathBits', 'test_conj_view', device_type='cuda'),),), - OpInfo('prod', - dtypes=all_types_and_complex_and(torch.bool), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), - skips=( - # prod does not support the (Tensor, *, out) overload - SkipInfo('TestCommon', 'test_out', - dtypes=[torch.float32]), - ), - sample_inputs_func=sample_inputs_prod, - gradcheck_nondet_tol=GRADCHECK_NONDET_TOL), OpInfo('qr', op=torch.qr, dtypes=floating_and_complex_types(), @@ -8953,13 +8882,167 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): ), ), ), + ReductionOpInfo( + 'all', + identity=True, + supports_multiple_dims=False, + supports_out=False, + supports_autograd=False, + result_dtype=torch.bool, + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + skips=( + # FIXME: does not support passing keepdim without dim + SkipInfo('TestReductions', 'test_dim_default_keepdim'), + # FIXME: does not support dim=None + SkipInfo('TestReductions', 'test_dim_none'), + SkipInfo('TestReductions', 'test_dim_none_keepdim'), + # FIXME: uint8 input returns uint8 instead of bool + SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]), + ), + ), + ReductionOpInfo( + 'any', + identity=False, + supports_multiple_dims=False, + supports_out=False, + supports_autograd=False, + result_dtype=torch.bool, + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + skips=( + # FIXME: does not support passing keepdim without dim + SkipInfo('TestReductions', 'test_dim_default_keepdim'), + # FIXME: does not support dim=None + SkipInfo('TestReductions', 'test_dim_none'), + SkipInfo('TestReductions', 'test_dim_none_keepdim'), + # FIXME: uint8 input returns uint8 instead of bool + SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]), + ), + ), + ReductionOpInfo( + 'amax', + nan_policy='propagate', + dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), + ref=lambda a, dim=None, keepdim=False, **kwargs: np.amax(a, axis=dim, keepdims=keepdim, **kwargs), + skips=( + # FIXME: sum reduces all dimensions when dim=[] + SkipInfo('TestReductions', 'test_dim_empty'), + SkipInfo('TestReductions', 'test_dim_empty_keepdim'), + ), + ), + ReductionOpInfo( + 'amin', + nan_policy='propagate', + dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), + ref=lambda a, dim=None, keepdim=False, **kwargs: np.amin(a, axis=dim, keepdims=keepdim, **kwargs), + skips=( + # FIXME: sum reduces all dimensions when dim=[] + SkipInfo('TestReductions', 'test_dim_empty'), + SkipInfo('TestReductions', 'test_dim_empty_keepdim'), + ), + ), + ReductionOpInfo( + 'argmax', + supports_multiple_dims=False, + supports_autograd=False, + result_dtype=torch.int64, + dtypes=all_types_and(torch.float16, torch.bfloat16), + skips=( + # FIXME: keepdim parameter is ignored when dim=None + SkipInfo('TestReductions', 'test_dim_default_keepdim'), + SkipInfo('TestReductions', 'test_dim_none_keepdim'), + ), + ), + ReductionOpInfo( + 'argmin', + supports_multiple_dims=False, + supports_autograd=False, + result_dtype=torch.int64, + dtypes=all_types_and(torch.float16, torch.bfloat16), + skips=( + # FIXME: keepdim parameter is ignored when dim=None + SkipInfo('TestReductions', 'test_dim_default_keepdim'), + SkipInfo('TestReductions', 'test_dim_none_keepdim'), + ), + ), + ReductionOpInfo( + 'count_nonzero', + identity=0, + supports_out=False, + supports_autograd=False, + result_dtype=torch.int64, + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + sample_inputs_func=sample_inputs_reduction_count_nonzero, + skips=( + # FIXME: count_nonzero does not accept keepdim kwarg + SkipInfo('TestReductions', 'test_dim_default_keepdim'), + SkipInfo('TestReductions', 'test_dim_none_keepdim'), + SkipInfo('TestReductions', 'test_dim_single_keepdim'), + SkipInfo('TestReductions', 'test_dim_empty_keepdim'), + SkipInfo('TestReductions', 'test_dim_multi_keepdim'), + SkipInfo('TestReductions', 'test_dim_multi_unsorted_keepdim'), + SkipInfo('TestReductions', 'test_dim_offbounds_keepdim'), + # FIXME: dim=[] reduces all dimensions + SkipInfo('TestReductions', 'test_dim_empty'), + ), + ), + ReductionOpInfo( + 'prod', + identity=1, + nan_policy='propagate', + supports_multiple_dims=False, + supports_out=False, + promotes_int_to_int64=True, + gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, + dtypes=all_types_and_complex_and(torch.bool), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + sample_inputs_func=sample_inputs_prod, + skips=( + # FIXME: prod does not support passing keepdim without passing dim + SkipInfo('TestReductions', 'test_dim_default_keepdim'), + # FIXME: prod reduces all dimensions when dim=[] + SkipInfo('TestReductions', 'test_dim_empty'), + SkipInfo('TestReductions', 'test_dim_empty_keepdim'), + # FIXME: prod does not support passing None to dim + SkipInfo('TestReductions', 'test_dim_none'), + SkipInfo('TestReductions', 'test_dim_none_keepdim'), + ), + ), ReductionOpInfo( 'sum', identity=0, + nan_policy='propagate', supports_out=False, supports_forward_ad=True, promotes_int_to_int64=True, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + skips=( + # FIXME: sum does not support passing keepdim without passing dim + SkipInfo('TestReductions', 'test_dim_default_keepdim'), + # FIXME: sum reduces all dimensions when dim=[] + SkipInfo('TestReductions', 'test_dim_empty'), + SkipInfo('TestReductions', 'test_dim_empty_keepdim'), + # FIXME: sum does not support passing None to dim + SkipInfo('TestReductions', 'test_dim_none'), + SkipInfo('TestReductions', 'test_dim_none_keepdim'), + ), + ), + ReductionOpInfo( + 'nansum', + identity=0, + nan_policy='omit', + supports_out=False, + promotes_int_to_int64=True, + dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), + skips=( + # FIXME: nansum does not support passing keepdim without passing dim + SkipInfo('TestReductions', 'test_dim_default_keepdim'), + # FIXME: nansum reduces all dimensions when dim=[] + SkipInfo('TestReductions', 'test_dim_empty'), + SkipInfo('TestReductions', 'test_dim_empty_keepdim'), + # FIXME: nansum does not support passing None to dim + SkipInfo('TestReductions', 'test_dim_none'), + SkipInfo('TestReductions', 'test_dim_none_keepdim'), + ), ), ] From 733755f72ca15feef8deeb512925639ef15f92d7 Mon Sep 17 00:00:00 2001 From: albanD Date: Thu, 26 Aug 2021 07:48:20 -0700 Subject: [PATCH 251/530] remove special grad_mode tls handling (#63116) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63116 This PR removes the special flag to disable grad mode tracking on the ThreadLocalState and replaces it with an explicit setter that users can use. This allows to reduce complexity of ThreadLocalState. Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D30388098 Pulled By: albanD fbshipit-source-id: 85641b3d711179fb78ff6a41ed077548dc821a2f --- aten/src/ATen/ThreadLocalState.cpp | 26 +++++-------------- aten/src/ATen/ThreadLocalState.h | 14 +++++----- torch/csrc/autograd/engine.cpp | 2 -- torch/csrc/autograd/engine.h | 13 +++++----- .../autograd/engine/dist_engine.cpp | 1 - 5 files changed, 20 insertions(+), 36 deletions(-) diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index fc4b8fa9c27ec..98c2519e045ce 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -9,40 +9,26 @@ namespace at { -ThreadLocalState::ThreadLocalState(bool keep_grad_mode) +ThreadLocalState::ThreadLocalState() : dispatch_key_(c10::impl::tls_local_dispatch_key_set()), debug_info_(c10::ThreadLocalDebugInfo::current()), autograd_tls_(c10::AutogradState::get_tls_state()) { rf_tls_ = at::get_record_function_tls_(); saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks(); -#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) - keep_grad_mode_ = keep_grad_mode; -#endif bumped_record_all_functions_ = at::checkRecordAllFunctions(); } +void ThreadLocalState::set_grad_mode(bool enabled) { + autograd_tls_.set_grad_mode(enabled); +} + /* static */ void ThreadLocalState::setThreadLocalState( const ThreadLocalState& state) { // Note that setting the InferenceMode TLS in this function is ONLY ok because we always // restore the dispatch key set TLS at the same time. -#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) - if (state.keep_grad_mode_) { - c10::AutogradState::set_tls_state(state.autograd_tls_); - } else { - auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(), - /* inference_mode */ state.autograd_tls_.get_inference_mode()); - c10::AutogradState::set_tls_state(new_state); - } -#else - // The mobile build explicitly ignore grad_mode but fails if we propagate - // its value across threads or set it to a fixed value. - // So we have to make sure the grad_mode value is not changed here. - auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(), - /* inference_mode */ state.autograd_tls_.get_inference_mode()); - c10::AutogradState::set_tls_state(new_state); -#endif + c10::AutogradState::set_tls_state(state.autograd_tls_); at::set_record_function_tls_(state.rf_tls_); diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index 4942399cbd6d7..41146912819b4 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -16,10 +16,12 @@ class TORCH_API ThreadLocalState { public: // Saves the thread local variables' values and // returns them as a ThreadLocalState - // keep_grad_mode - whether grad mode has to be preserved - // (e.g. not preserved when passing from forward pass into - // the autograd engine, autograd engine takes care of grad mode) - ThreadLocalState(bool keep_grad_mode = true); + ThreadLocalState(); + + // set_grad_mode - force the value of the grad mode TLS in + // the current state object. This is used for example in the + // autograd engine. + void set_grad_mode(bool enabled); // Sets thread local variables in the current thread, // according to the thread boundary specified @@ -35,10 +37,8 @@ class TORCH_API ThreadLocalState { // RecordFunction TLS RecordFunctionTLS rf_tls_; + // TLS for AutogradModes AutogradState autograd_tls_; -#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) - bool keep_grad_mode_ = true; -#endif // TLS for saved tensors default hooks std::pair saved_tensors_default_hooks_; diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index de2078d2d6432..acd7971aad6a7 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -411,7 +411,6 @@ auto Engine::thread_main(const std::shared_ptr& graph_task) -> void { // NB: The ThreadLocalStateGuard doesn't set the grad_mode because GraphTask // always saves ThreadLocalState without grad_mode. at::ThreadLocalStateGuard tls_guard(local_graph_task->thread_locals_); - AutoGradMode grad_mode(local_graph_task->grad_mode_); try { // The guard sets the thread_local current_graph_task on construction @@ -580,7 +579,6 @@ void GraphTask::exec_post_processing() { // NB: The ThreadLocalStateGuard doesn't set the grad_mode because GraphTask // always saves ThreadLocalState without grad_mode. at::ThreadLocalStateGuard tls_guard(this->thread_locals_); - AutoGradMode grad_mode(this->grad_mode_); // WARNING: Don't use a range-for loop here because more callbacks may be // added in between callback calls, so iterators may become invalidated. diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index 17318473bcfcd..dd465f96c350e 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -53,9 +53,8 @@ struct GraphTask: std::enable_shared_from_this { // true, it signals all threads to stop executing. std::atomic_bool has_error_{false}; std::atomic_bool future_completed_{false}; - // It is safe to read grad_mode_ and keep_graph_ without synchronization + // It is safe to read keep_graph_ without synchronization bool keep_graph_; - bool grad_mode_; // To protect reads/writes to not_ready_, dependencies_, captured_vars_, // has_error_, future_result_, cpu_ready_queue_, and leaf_streams. @@ -110,8 +109,9 @@ struct GraphTask: std::enable_shared_from_this { // out of the GraphTask and are no longer valid. std::vector captured_vars_; - at::ThreadLocalState thread_locals_ = - at::ThreadLocalState(/* keep_grad_mode */ false); + // Note: this field is not ready to be used until the proper `thread_locals_.set_grad_mode()` + // call in the constructor. + at::ThreadLocalState thread_locals_ = at::ThreadLocalState(); std::unordered_set leaf_streams; @@ -180,12 +180,13 @@ struct GraphTask: std::enable_shared_from_this { std::shared_ptr cpu_ready_queue, bool exit_on_error = false) : keep_graph_(keep_graph), - grad_mode_(grad_mode), owner_(NO_DEVICE), reentrant_depth_(reentrant_depth), exit_on_error_(exit_on_error), cpu_ready_queue_(std::move(cpu_ready_queue)), - future_result_(c10::make_intrusive(c10::ListType::create(c10::TensorType::get()))) {} + future_result_(c10::make_intrusive(c10::ListType::create(c10::TensorType::get()))) { + thread_locals_.set_grad_mode(grad_mode); + } private: // run GraphTask post processing void exec_post_processing(); diff --git a/torch/csrc/distributed/autograd/engine/dist_engine.cpp b/torch/csrc/distributed/autograd/engine/dist_engine.cpp index 4a3b3fff2e20b..e6522c33280a9 100644 --- a/torch/csrc/distributed/autograd/engine/dist_engine.cpp +++ b/torch/csrc/distributed/autograd/engine/dist_engine.cpp @@ -360,7 +360,6 @@ void DistEngine::execute_graph_task_until_ready_queue_empty( } if (task.fn_ && !local_graph_task->has_error_.load()) { at::ThreadLocalStateGuard tls_guard(local_graph_task->thread_locals_); - AutoGradMode grad_mode(local_graph_task->grad_mode_); try { GraphTaskGuard guard(local_graph_task); engine_.evaluate_function( From 7ccc4b5cc8c9a256bca151776444bf94bc28e5f6 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Thu, 26 Aug 2021 08:00:48 -0700 Subject: [PATCH 252/530] [CI] move distributed test into its own CI job (#62896) Summary: Moving distributed to its own job. - [x] ensure there should be a distributed test job for every default test job matrix (on GHA) - [x] ensure that circleci jobs works for distributed as well - [x] waiting for test distributed to have its own run_test.py launch options, see https://github.com/pytorch/pytorch/issues/63147 Pull Request resolved: https://github.com/pytorch/pytorch/pull/62896 Reviewed By: seemethere Differential Revision: D30230856 Pulled By: walterddr fbshipit-source-id: 0cad620f6cd9e56c727c105458d76539a5ae976f --- .../cimodel/data/pytorch_build_definitions.py | 21 ++++++++ .circleci/config.yml | 49 +++++++++++++++++++ .github/scripts/generate_ci_workflows.py | 5 ++ .../scripts/generate_pytorch_test_matrix.py | 2 + .github/templates/linux_ci_workflow.yml.j2 | 1 + ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 1 + ...rated-linux-bionic-py3.8-gcc9-coverage.yml | 1 + ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 1 + ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 1 + .../generated-linux-xenial-py3.6-gcc5.4.yml | 1 + ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 1 + .jenkins/pytorch/test.sh | 10 +++- 12 files changed, 92 insertions(+), 2 deletions(-) diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py index bdc977270c22e..d7b20158759d0 100644 --- a/.circleci/cimodel/data/pytorch_build_definitions.py +++ b/.circleci/cimodel/data/pytorch_build_definitions.py @@ -415,6 +415,27 @@ def instantiate_configs(only_slow_gradcheck): ) c.dependent_tests.append(bc_breaking_check) + if ( + compiler_name != "clang" + and not rocm_version + and not is_libtorch + and not is_vulkan + and not is_pure_torch + and not is_noarch + and not is_slow_gradcheck + and not only_slow_gradcheck + ): + distributed_test = Conf( + c.gen_build_name("") + "distributed", + [], + is_xla=False, + restrict_phases=["test"], + is_libtorch=False, + is_important=True, + parent_build=c, + ) + c.dependent_tests.append(distributed_test) + config_list.append(c) return config_list diff --git a/.circleci/config.yml b/.circleci/config.yml index cb3e148e2e162..1bb32b5cc0a3d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7159,6 +7159,13 @@ workflows: build_environment: "pytorch-linux-backward-compatibility-check-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" resource_class: large + - pytorch_linux_test: + name: pytorch_linux_pytorch_linux_xenial_py3_6_gcc5_4_distributed_test + requires: + - pytorch_linux_xenial_py3_6_gcc5_4_build + build_environment: "pytorch-linux-pytorch_linux_xenial_py3_6_gcc5_4_distributed-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + resource_class: large - pytorch_linux_build: name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build requires: @@ -7184,6 +7191,13 @@ workflows: build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" resource_class: large + - pytorch_linux_test: + name: pytorch_linux_pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed_test + requires: + - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build + build_environment: "pytorch-linux-pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + resource_class: large - pytorch_linux_build: name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build requires: @@ -7209,6 +7223,13 @@ workflows: build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" resource_class: large + - pytorch_linux_test: + name: pytorch_linux_pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed_test + requires: + - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build + build_environment: "pytorch-linux-pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + resource_class: large - pytorch_linux_build: name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build requires: @@ -7246,6 +7267,13 @@ workflows: build_environment: "pytorch-linux-xenial-py3.6-gcc7-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7" resource_class: large + - pytorch_linux_test: + name: pytorch_linux_pytorch_linux_xenial_py3_6_gcc7_distributed_test + requires: + - pytorch_linux_xenial_py3_6_gcc7_build + build_environment: "pytorch-linux-pytorch_linux_xenial_py3_6_gcc7_distributed-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7" + resource_class: large - pytorch_linux_build: name: pytorch_linux_xenial_py3_clang7_asan_build requires: @@ -7380,6 +7408,13 @@ workflows: docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - pytorch_linux_test: + name: pytorch_linux_pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_distributed_test + requires: + - pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build + build_environment: "pytorch-linux-pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_distributed-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7" + resource_class: large - pytorch_linux_build: name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build requires: @@ -7402,6 +7437,13 @@ workflows: docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - pytorch_linux_test: + name: pytorch_linux_pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_distributed_test + requires: + - pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build + build_environment: "pytorch-linux-pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_distributed-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" + resource_class: large - pytorch_linux_build: name: pytorch_linux_bionic_py3_6_clang9_noarch_build requires: @@ -7463,6 +7505,13 @@ workflows: docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - pytorch_linux_test: + name: pytorch_linux_pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_distributed_test + requires: + - pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_build + build_environment: "pytorch-linux-pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_distributed-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7" + resource_class: large - pytorch_linux_build: name: pytorch_linux_bionic_rocm3_9_py3_6_build requires: diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index f1b962521b18d..cd7065dbfaa47 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -138,10 +138,12 @@ class CIWorkflow: only_build_on_pull_request: bool = False only_run_smoke_tests_on_pull_request: bool = False num_test_shards_on_pull_request: int = -1 + distributed_test: bool = True # The following variables will be set as environment variables, # so it's easier for both shell and Python scripts to consume it if false is represented as the empty string. enable_jit_legacy_test: YamlShellBool = "''" + enable_distributed_test: YamlShellBool = "''" enable_multigpu_test: YamlShellBool = "''" enable_nogpu_no_avx_test: YamlShellBool = "''" enable_nogpu_no_avx2_test: YamlShellBool = "''" @@ -154,6 +156,9 @@ def __post_init__(self) -> None: if not self.on_pull_request: self.only_build_on_pull_request = False + if self.distributed_test: + self.enable_distributed_test = 1 + # If num_test_shards_on_pull_request is not user-defined, default to num_test_shards unless we are # only running smoke tests on the pull request. if self.num_test_shards_on_pull_request == -1: diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py index d8860a02a5c37..75df57cfa2f89 100755 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ b/.github/scripts/generate_pytorch_test_matrix.py @@ -51,6 +51,8 @@ def main() -> None: configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} if NOGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'): configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} + if os.getenv('ENABLE_DISTRIBUTED_TEST'): + configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} if os.getenv('ENABLE_SLOW_TEST'): configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} matrix = { diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index f63685295bbce..d9af899b04b66 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -248,6 +248,7 @@ jobs: {%- endif %} env: TEST_RUNNER_TYPE: !{{ test_runner_type }} + ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }} ENABLE_JIT_LEGACY_TEST: !{{ enable_jit_legacy_test }} ENABLE_MULTIGPU_TEST: !{{ enable_multigpu_test }} ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }} diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 0b3dddd3930e5..769efcaa80a3b 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -224,6 +224,7 @@ jobs: needs: [ciflow_should_run] env: TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu + ENABLE_DISTRIBUTED_TEST: 1 ENABLE_JIT_LEGACY_TEST: '' ENABLE_MULTIGPU_TEST: '' ENABLE_NOGPU_NO_AVX_TEST: '' diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index 624e9d0d92c5b..ddd81c079df7c 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -224,6 +224,7 @@ jobs: needs: [ciflow_should_run] env: TEST_RUNNER_TYPE: linux.2xlarge + ENABLE_DISTRIBUTED_TEST: 1 ENABLE_JIT_LEGACY_TEST: '' ENABLE_MULTIGPU_TEST: '' ENABLE_NOGPU_NO_AVX_TEST: '' diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index 99a9f1f778f9f..5a888d0104174 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -224,6 +224,7 @@ jobs: needs: [ciflow_should_run] env: TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu + ENABLE_DISTRIBUTED_TEST: 1 ENABLE_JIT_LEGACY_TEST: 1 ENABLE_MULTIGPU_TEST: 1 ENABLE_NOGPU_NO_AVX_TEST: 1 diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index be56b56ee715b..25d74de36dc66 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -224,6 +224,7 @@ jobs: needs: [ciflow_should_run] env: TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu + ENABLE_DISTRIBUTED_TEST: 1 ENABLE_JIT_LEGACY_TEST: '' ENABLE_MULTIGPU_TEST: '' ENABLE_NOGPU_NO_AVX_TEST: '' diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index c1b877c7f9c10..341f9e6da6e8c 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -224,6 +224,7 @@ jobs: needs: [ciflow_should_run] env: TEST_RUNNER_TYPE: linux.2xlarge + ENABLE_DISTRIBUTED_TEST: 1 ENABLE_JIT_LEGACY_TEST: '' ENABLE_MULTIGPU_TEST: '' ENABLE_NOGPU_NO_AVX_TEST: '' diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 375c4b65aaf89..470fdaaad4230 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -222,6 +222,7 @@ jobs: needs: [ciflow_should_run] env: TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu + ENABLE_DISTRIBUTED_TEST: 1 ENABLE_JIT_LEGACY_TEST: '' ENABLE_MULTIGPU_TEST: '' ENABLE_NOGPU_NO_AVX_TEST: '' diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index e27ba3e0cd838..daa0da7eeca26 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -19,6 +19,11 @@ BUILD_DIR="build" BUILD_RENAMED_DIR="build_renamed" BUILD_BIN_DIR="$BUILD_DIR"/bin +# GHA has test config defined for the test job, so we need to add them. +if [[ -n "${TEST_CONFIG}" ]]; then + BUILD_ENVIRONMENT="${BUILD_ENVIRONMENT}-${TEST_CONFIG}" +fi + # shellcheck source=./common.sh source "$(dirname "${BASH_SOURCE[0]}")/common.sh" @@ -522,6 +527,9 @@ elif [[ "${BUILD_ENVIRONMENT}" == *vulkan-linux* ]]; then test_vulkan elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then test_bazel +elif [[ "${BUILD_ENVIRONMENT}" == *distributed* ]]; then + test_distributed + test_rpc else install_torchvision install_monkeytype @@ -532,9 +540,7 @@ else test_custom_script_ops test_custom_backend test_torch_function_benchmark - test_distributed test_benchmarks - test_rpc if [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-py3.6-gcc7-test* || "${BUILD_ENVIRONMENT}" == *linux-xenial-py3.6-gcc5.4-test* ]]; then test_python_gloo_with_tls fi From fbe7133b5842a55589e097c8b045ceb08cb346b2 Mon Sep 17 00:00:00 2001 From: Don Jang Date: Thu, 26 Aug 2021 08:08:53 -0700 Subject: [PATCH 253/530] [Static Runtime] Disable out variant of aten::clone (#63980) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63980 The out variant implementation of `aten::clone` causes a crash, which needs further investigation. This change disables it until the problem gets fixed. Note that `inline_cvr` doesn't use `aten::clone` as of now, so no perf implication: https://www.internalfb.com/phabricator/paste/view/P446858755?lines=121 Test Plan: N/A Reviewed By: hlu1 Differential Revision: D30544149 fbshipit-source-id: facb334d67473f622b36862fbdb2633358556fdf --- torch/csrc/jit/runtime/static/ops.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 140fdf188a951..4d34ed9388364 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -601,11 +601,12 @@ REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator { }; }); +// TODO(T98923825): Uncomment this once the bug in this gets fixed. +/* REGISTER_OPERATOR_FUNCTOR(aten::clone, aten_clone, [](Node* n) -> SROperator { if (!n->matches(torch::schema( - "aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor"))) { - LogAndDumpSchema(n); - return nullptr; + "aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> +Tensor"))) { LogAndDumpSchema(n); return nullptr; } return [](ProcessedNode* p_node) { const auto& src = p_node->Input(0).toTensor(); @@ -631,6 +632,8 @@ REGISTER_OPERATOR_FUNCTOR(aten::clone, aten_clone, [](Node* n) -> SROperator { at::native::copy_(out_t, src, false); }; }); +*/ + REGISTER_OPERATOR_FUNCTOR( quantized::embedding_bag_byte_rowwise_offsets, quantized_embedding_bag_byte_rowwise_offsets, From 1354ee417a4abfaacd8333fb6316c6f58494c0d7 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 26 Aug 2021 09:27:47 -0700 Subject: [PATCH 254/530] run_test.py: add option to run only core tests (#63976) Summary: This is in response to a feature request from some folks in the core team to have a local command that would only run relevant "core" tests. The idea is to have a local smoke test option for developers to run locally before making a PR in order to verify their changes did not break core functionality. These smoke tests are not targeted to be short but rather relevant. This PR enables that by allowing developers to run `python test/run_test.py --core` or `python test/run_test.py -core` in order to run the CORE_TEST_LIST, which is currently test_nn.py, test_torch.py, and test_ops.py. I am not the best person to judge what should be considered "core", so please comment which tests should be included and/or excluded from the CORE_TEST_LIST! Pull Request resolved: https://github.com/pytorch/pytorch/pull/63976 Test Plan: ``` (pytorch) janeyx@janeyx-mbp test % python run_test.py --core -v Selected tests: test_nn, test_ops, test_torch Running test_nn ... [2021-08-25 14:48:28.865078] Executing ['/Users/janeyx/miniconda3/envs/pytorch/bin/python', 'test_nn.py', '-v'] ... [2021-08-25 14:48:28.865123] test_to (__main__.PackedSequenceTest) ... ok test_to_memory_format (__main__.PackedSequenceTest) ... ok ``` Reviewed By: walterddr Differential Revision: D30575560 Pulled By: janeyx99 fbshipit-source-id: 3f151982c1e315e50e60cb0d818adaea34556a04 --- test/run_test.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/run_test.py b/test/run_test.py index d3c661093a6e8..dd95e13de8e36 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -281,6 +281,14 @@ WINDOWS_COVERAGE_BLOCKLIST = [] +# A subset of our TEST list that validates PyTorch's ops, modules, and autograd function as expected +CORE_TEST_LIST = [ + "test_autograd", + "test_modules", + "test_nn", + "test_ops", + "test_torch" +] # the JSON file to store the S3 test stats TEST_TIMES_FILE = ".pytorch-test-times.json" @@ -629,6 +637,13 @@ def parse_args(): action="store_true", help="run all distributed tests", ) + parser.add_argument( + "-core", + "--core", + action="store_true", + help="Only run core tests, or tests that validate PyTorch's ops, modules," + "and autograd. They are defined by CORE_TEST_LIST." + ) parser.add_argument( "-pt", "--pytest", @@ -830,6 +845,12 @@ def get_selected_tests(options): filter(lambda test_name: test_name in DISTRIBUTED_TESTS, selected_tests) ) + # Filter to only run core tests when --core option is specified + if options.core: + selected_tests = list( + filter(lambda test_name: test_name in CORE_TEST_LIST, selected_tests) + ) + # process reordering if options.bring_to_front: to_front = set(options.bring_to_front) From ba5f1b1076ed6fae4a46d7317204963a4cd53701 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Thu, 26 Aug 2021 09:41:58 -0700 Subject: [PATCH 255/530] [nnc] Fix dtype promotion involving scalars (#64002) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64002 Fixes https://github.com/pytorch/vision/issues/4315 Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D30566979 Pulled By: bertmaher fbshipit-source-id: eaa98b9534a926be7fcd337d46c5a0acb3243179 --- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 2 +- torch/csrc/jit/tensorexpr/kernel.cpp | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 085291afbdcf8..833c338578616 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -135,7 +135,7 @@ const OperatorSet& supported_eltwise_set() { "aten::threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor", // "aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor", // "aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor", TODO: requires 0-dim Tensor - "aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor", + // "aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor", "aten::remainder.Tensor(Tensor self, Tensor other) -> Tensor", "aten::sigmoid(Tensor self) -> Tensor", "aten::relu(Tensor self) -> Tensor", diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index d53e857d75a48..c2726a0fafb01 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -454,6 +454,11 @@ std::vector computeIndicesToBroadcast( return bcast; } +bool isScalar(ExprHandle e) { + auto n = e.node(); + return n->isConstant() || to(n); +} + void promoteInputs(std::vector& inputs, const int typeConstraints) { if (inputs.empty()) { return; @@ -462,7 +467,16 @@ void promoteInputs(std::vector& inputs, const int typeConstraints) { // Find the highest type among the inputs. ScalarType highType = inputs[0].dtype().scalar_type(); for (auto input : inputs) { - highType = promoteTypes(highType, input.dtype().scalar_type()); + auto inputType = input.dtype().scalar_type(); + if (isScalar(input)) { + if (isIntegralType(highType, false) && isFloatingType(inputType)) { + highType = c10::get_default_dtype_as_scalartype(); + } else if (highType == c10::kBool) { + highType = inputType; + } + } else { + highType = promoteTypes(highType, inputType); + } } if (!checkTypes(highType, typeConstraints)) { From 6d31ba6ddcf5f839bd2d3f20a19712846d030d8b Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Thu, 26 Aug 2021 09:49:44 -0700 Subject: [PATCH 256/530] [nnc] Sanitized the names of constants in the input graph. (#63990) Summary: Fixes https://github.com/pytorch/pytorch/issues/63923 The input graph can contain constants whose names contain special characters. So, all names of constants in the input graph need to be sanitized. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63990 Reviewed By: ZolotukhinM Differential Revision: D30558432 Pulled By: navahgar fbshipit-source-id: de5b0c23d50ee8997f40f2c0fc605dda3719186f --- test/cpp/tensorexpr/test_kernel.cpp | 37 ++++++++++++++++++++++++++++ torch/csrc/jit/tensorexpr/kernel.cpp | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 8cdf2ef90df11..625fadb811710 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -1195,6 +1195,43 @@ TEST_F(Kernel, SanitizeNames_CUDA) { ASSERT_TRUE(at::allclose(o, ref)); } +TEST_F(Kernel, SanitizeConstants_CUDA) { + const auto graph_string = R"IR( + graph(%x : Float(16, 16, strides=[16, 1], device=cuda:0)): + %none : NoneType = prim::Constant() + %size : int = prim::Constant[value=16]() + %sizes : int[] = prim::ListConstruct(%size, %size) + %30 : Device = prim::Constant[value="cuda"]() + %y : Float(16, 16, strides=[16, 1], device=cuda:0) = aten::ones(%sizes, %none, %none, %30, %none) + %z : Float(16, 16, strides=[16, 1], device=cuda:0) = aten::mul(%x, %y) + return (%z))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + // IRParser doesn't support tensor constants, so we insert a call to + // aten::ones and then const-prop it + ConstantPropagation(graph); + + // We set the name of the constant to include special characters that are + // not allowed. This should be fixed by the sanitizer in TensorExprKernel. + graph->nodes().front()->output()->setDebugName("illegal.name"); + + // Check if we have a constant node with illegal name in the graph. + auto const_node = graph->nodes().front(); + ASSERT_EQ(const_node->kind(), prim::Constant); + ASSERT_NE(const_node->output()->debugName().find('.'), std::string::npos); + + TensorExprKernel k(graph); + + auto x = at::rand({16, 16}, TensorOptions(kCUDA).dtype(at::kFloat)); + std::vector inputs = {x}; + std::vector stack = fmap(inputs); + k.run(stack); + auto o = stack[0].toTensor(); + auto y = at::ones({16, 16}, TensorOptions(kCUDA).dtype(at::kFloat)); + auto ref = x * y; + ASSERT_TRUE(at::allclose(o, ref)); +} + TEST_F(Kernel, ConstantTensors) { const auto graph_string = R"IR( graph(%x : Float(16, 16, strides=[16, 1], device=cpu)): diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index c2726a0fafb01..f72fbf7c18c37 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -2996,7 +2996,7 @@ void TensorExprKernel::bindConstant(const torch::jit::Value* v) { } BufPtr buf = alloc( - "const_" + v->debugName(), + "const_" + sanitizeName(v->debugName()), ExprHandleVectorToExprVector(te_sizes), ToDtype(static_cast(*tt->scalarType()))); From 0f6b524665378b18b8682f473267c80c6d5ca3df Mon Sep 17 00:00:00 2001 From: Cheng Chang Date: Thu, 26 Aug 2021 09:52:42 -0700 Subject: [PATCH 257/530] [NNC] Add C++ codegen backend to NNC (#62869) Summary: Adds a C++ codegen backend to NNC to generate C++ for CPU instead of generating LLVM IR. Tensors are represented as blobs of float. Vector operations are devectorized/unrolled. Pull Request resolved: https://github.com/pytorch/pytorch/pull/62869 Test Plan: https://github.com/pytorch/pytorch/tree/mvz-nnc-aot-prototype makes it able to AOT compile the whole MobileNetV3 model into binary code through LLVM codegen in NNC. I forked that branch to https://github.com/cheng-chang/pytorch/tree/cc-aot-cpp, merged this PR into it, and modified `fancy_compile` to compile MobileNetV3 into C++ through ``` import torch m = torch.jit.load('mobnet.pt') m.eval() f = torch.jit.freeze(m) torch._C._fancy_compile(f.graph, [1, 3, 224, 224]) ``` The generated C++ file `mobnet.cc` can be found at https://gist.github.com/cheng-chang/e2830cc6920b39204ebf368035b2bcec. I manually compiled the generated C++ through `g++ -o mobnet -std=c++14 -L./build/lib -ltorch_cpu -ltorch mobnet.cc`, and it succeeded. Reviewed By: ZolotukhinM Differential Revision: D30149482 Pulled By: cheng-chang fbshipit-source-id: e77b189f0353e37cd309423a48a513e668d07675 --- test/cpp/tensorexpr/test_cpp_codegen.cpp | 275 ++++++++++++-- torch/csrc/jit/tensorexpr/codegen.h | 4 + torch/csrc/jit/tensorexpr/cpp_codegen.cpp | 404 +++++++++++++++++++-- torch/csrc/jit/tensorexpr/cpp_codegen.h | 84 ++++- torch/csrc/jit/tensorexpr/cpp_intrinsics.h | 36 ++ torch/csrc/jit/tensorexpr/expr.h | 8 +- torch/csrc/jit/tensorexpr/ir.h | 4 +- torch/csrc/jit/tensorexpr/ir_printer.cpp | 51 +-- torch/csrc/jit/tensorexpr/ir_printer.h | 3 + 9 files changed, 782 insertions(+), 87 deletions(-) create mode 100644 torch/csrc/jit/tensorexpr/cpp_intrinsics.h diff --git a/test/cpp/tensorexpr/test_cpp_codegen.cpp b/test/cpp/tensorexpr/test_cpp_codegen.cpp index df9166b675859..d40caa126e572 100644 --- a/test/cpp/tensorexpr/test_cpp_codegen.cpp +++ b/test/cpp/tensorexpr/test_cpp_codegen.cpp @@ -1,9 +1,11 @@ #include -#include +#include "test/cpp/tensorexpr/test_base.h" #include +#include #include +#include #include namespace torch { @@ -11,44 +13,245 @@ namespace jit { using namespace torch::jit::tensorexpr; -TEST(CppPrinter, AllocateOnStackThenFree) { - std::vector dims = {alloc(2), alloc(3)}; - BufPtr buf = alloc("x", dims, kInt); - AllocatePtr alloc_ = alloc(buf); - FreePtr free_ = alloc(buf); - BlockPtr block = Block::make({alloc_, free_}); - - std::stringstream ss; - CppPrinter printer(&ss); - printer.visit(block); - const std::string expected = R"( - # CHECK: { - # CHECK: int x[6]; - # CHECK: } +#define STR_CHECK(node, expected) \ + std::stringstream ss; \ + CppPrinter printer(&ss); \ + printer.visit(node); \ + ASSERT_EQ(ss.str(), expected) + +#define FILE_CHECK(node, pattern) \ + std::stringstream ss; \ + CppPrinter printer(&ss); \ + printer.visit(node); \ + torch::jit::testing::FileCheck().run(pattern, ss.str()) + +TEST(CppPrinter, IntImm) { + auto i = alloc(10); + STR_CHECK(i, "10"); +} + +TEST(CppPrinter, FloatImm) { + auto f = alloc(10); + STR_CHECK(f, "10.f"); +} + +TEST(CppPrinter, FloatImm1) { + auto f = alloc(10); + STR_CHECK(f, "10.f"); +} + +TEST(CppPrinter, DoubleImm) { + auto d = alloc(10); + STR_CHECK(d, "10.0"); +} + +TEST(CppPrinter, DoubleImm1) { + auto d = alloc(10.1); + STR_CHECK(d, "10.1"); +} + +TEST(CppPrinter, HalfImm) { + auto h = alloc(10); + STR_CHECK(h, "10"); +} + +TEST(CppPrinter, Add) { + auto add = alloc(alloc(1), alloc(2)); + STR_CHECK(add, "1 + 2"); +} + +TEST(CppPrinter, AddExpr1) { + auto add = alloc( + alloc(alloc(0), alloc(1)), + alloc(alloc(2), alloc(3))); + STR_CHECK(add, "(0 + 1) + (2 - 3)"); +} + +TEST(CppPrinter, AddExpr2) { + auto add = alloc( + alloc(alloc(0), alloc(1)), + alloc(alloc(2), alloc(3))); + STR_CHECK(add, "0 * 1 + (2 - 3)"); +} + +TEST(CppPrinter, AddExpr3) { + auto add = alloc( + alloc(alloc(0), alloc(1)), + alloc
(alloc(2), alloc(3))); + STR_CHECK(add, "(0 + 1) + 2 / 3"); +} + +TEST(CppPrinter, Mod) { + auto mod = alloc(alloc(1), alloc(2)); + STR_CHECK(mod, "1 % 2"); +} + +TEST(CppPrinter, ModFloat) { + auto mod = alloc(alloc(1), alloc(2)); + STR_CHECK(mod, "std::fmod(1.f, 2.f)"); +} + +TEST(CppPrinter, Max) { + auto max = alloc(alloc(1), alloc(2), false); + STR_CHECK(max, "std::max(1, 2)"); +} + +TEST(CppPrinter, MaxFloat) { + auto max = alloc(alloc(1), alloc(2), false); + STR_CHECK(max, "std::max(1.f, 2.f)"); +} + +TEST(CppPrinter, MaxHalf) { + auto max = alloc(alloc(1), alloc(2), false); + STR_CHECK(max, "(1 < 2) ? 2 : 1"); +} + +TEST(CppPrinter, And) { + auto v = alloc(alloc(1), alloc(2)); + STR_CHECK(v, "1 & 2"); +} + +TEST(CppPrinter, CompareSelect) { + auto cs = alloc( + alloc(1), + alloc(2), + alloc(1), + alloc(2), + CompareSelectOperation::kLE); + STR_CHECK(cs, "((1 <= 2) ? 1.f : 2.f)"); +} + +TEST(CppPrinter, IfThenElse) { + auto cond = alloc(alloc(1), alloc(2)); + auto true_value = alloc(alloc(0), alloc(1)); + auto false_value = alloc(alloc(2), alloc(3)); + auto v = alloc(cond, true_value, false_value); + STR_CHECK(v, "((1 + 2) ? 0 - 1 : 2 * 3)"); +} + +TEST(CppPrinter, AllocateFree) { + BufHandle buf("x", {2, 3}, kInt); + AllocatePtr alloc = Allocate::make(buf); + FreePtr free = Free::make(buf); + BlockPtr block = Block::make({alloc, free}); + + const std::string pattern = R"( + # CHECK: { + # CHECK: int* x = static_cast(malloc(24)); + # CHECK: free(x); + # CHECK: } + )"; + FILE_CHECK(block, pattern); +} + +TEST(CppPrinter, LoadStore) { + Placeholder a(BufHandle("A", {2, 3}, kInt)); + Placeholder b(BufHandle("B", {3, 4}, kInt)); + auto store = b.store({2, 2}, a.load(1, 1)); + STR_CHECK( + store, "B[(0 + 2 * (1 * 4)) + 2 * 1] = A[(0 + 1 * (1 * 3)) + 1 * 1];\n"); +} + +TEST(CppPrinter, Var) { + auto var = alloc("x", kInt); + STR_CHECK(var, "x"); +} + +TEST(CppPrinter, Cast) { + auto cast = alloc(kFloat, alloc(1)); + STR_CHECK(cast, "static_cast(1)"); +} + +TEST(CppPrinter, BitCast) { + auto cast = alloc(kInt, alloc(20)); + STR_CHECK(cast, "std::bitcast(20.f)"); +} + +TEST(CppPrinter, Let) { + auto var = alloc("x", kFloat); + auto val = alloc(2); + auto let = alloc(var, val); + STR_CHECK(let, "float x = 2.f;\n"); +} + +TEST(CppPrinter, For) { + constexpr int N = 1024; + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); + VarHandle i("i", kInt); + auto f = For::make(i, 0, N, c.store({i}, Add::make(a.load(i), b.load(i)))); + const std::string pattern = R"( + # CHECK: for (int i = 0; i < 1024; i++) { + # CHECK: C[i] = (A[i]) + (B[i]); + # CHECK: } )"; - torch::jit::testing::FileCheck().run(expected, ss.str()); -} - -TEST(CppPrinter, AllocateOnHeapThenFree) { - std::vector dims = { - alloc(20), alloc(50), alloc(3)}; - BufPtr buf = alloc("y", dims, kLong); - AllocatePtr alloc_ = alloc(buf); - FreePtr free_ = alloc(buf); - BlockPtr block = Block::make({alloc_, free_}); - - std::stringstream ss; - CppPrinter printer(&ss); - printer.visit(block); - // size(long) = 8; - // dim0 * dim1 * dim2 * size(long) = 24000. - const std::string expected = R"( - # CHECK: { - # CHECK: int64_t* y = static_cast(malloc(24000)); - # CHECK: free(y); + FILE_CHECK(f, pattern); +} + +TEST(CppPrinter, Cond) { + Placeholder x(BufHandle("X", {1}, kInt)); + auto cmp = CompareSelect::make(x.load(0), 10, CompareSelectOperation::kLT); + auto cond = + Cond::make(cmp, x.store({0}, x.load(0) + 1), x.store({0}, x.load(0) - 1)); + const std::string pattern = R"( + # CHECK: if (((X[0] < 10) ? 1 : 0)) { + # CHECK: X[0] = (X[0]) + 1; + # CHECK: } else { + # CHECK: X[0] = (X[0]) - 1; # CHECK: } )"; - torch::jit::testing::FileCheck().run(expected, ss.str()); + FILE_CHECK(cond, pattern); +} + +TEST(CppPrinter, Intrinsics) { + const std::unordered_set> unsupported_ops{ + kRand, kSigmoid}; + for (int i = 0; i < kMaxIntrinsicsOp; i++) { + IntrinsicsOp op = static_cast(i); + if (unsupported_ops.count(op)) { + continue; + } + + if (Intrinsics::OpArgCount(op) == 1) { + auto v = alloc(op, alloc(2.0f)); + STR_CHECK(v, "std::" + v->func_name() + "(2.f)"); + } else { + auto v = + alloc(op, alloc(1.0f), alloc(2.0f)); + STR_CHECK(v, "std::" + v->func_name() + "(1.f, 2.f)"); + } + } +} + +TEST(CppPrinter, ExternalCall) { + std::vector dims{alloc(2), alloc(2)}; + auto output = alloc("out", dims, kFloat); + auto buf_arg1 = alloc("a", dims, kFloat); + auto buf_arg2 = alloc("b", dims, kFloat); + auto scalar_arg = alloc(alloc(1), alloc(2)); + std::vector buf_args{buf_arg1, buf_arg2}; + std::vector scalar_args{scalar_arg}; + auto call = + alloc(output, "nnc_aten_matmul", buf_args, scalar_args); + const std::string pattern = R"( + # CHECK: { + # CHECK: void* buf_ptrs[]{out, a, b}; + # CHECK: int64_t buf_ranks[]{2, 2, 2}; + # CHECK: int64_t buf_dims[]{2, 2, 2, 2, 2, 2}; + # CHECK: int8_t buf_dtypes[]{6, 6, 6}; + # CHECK: int64_t extra_args[]{1 + 2}; + # CHECK: nnc_aten_matmul( + # CHECK: 3, + # CHECK: buf_ptrs, + # CHECK: buf_ranks, + # CHECK: buf_dims, + # CHECK: buf_dtypes, + # CHECK: 1, + # CHECK: extra_args); + # CHECK: } + )"; + FILE_CHECK(call, pattern); } } // namespace jit diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h index d7cfe783fab8f..29255aac07df2 100644 --- a/torch/csrc/jit/tensorexpr/codegen.h +++ b/torch/csrc/jit/tensorexpr/codegen.h @@ -46,6 +46,10 @@ class TORCH_API CodeGen { stmt_ = stmt_->accept_mutator(mutator); } + void apply_visitor(IRVisitor* visitor) { + stmt_->accept(visitor); + } + std::vector& buffer_args() { return buffer_args_; } diff --git a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp index 39a5615a97545..20795e43dd57b 100644 --- a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp @@ -1,44 +1,406 @@ +#include +#include +#include + #include +#include +#include +#include namespace torch { namespace jit { namespace tensorexpr { -void CppPrinter::visit(AllocatePtr alloc) { - constexpr size_t kAllocOnStackThresholdSize = 512; +// Rewrites the variables' name according to valid C++ naming convention. +// E.g. in Graph IR, variable name may contain '.', in C++, they are replaced +// with '_'. +class CppVarNameRewriter : public IRVisitor { + public: + void visit(VarPtr v) override { + constexpr char kDot = '.'; + constexpr char kUnderscore = '_'; + if (v->name_hint().find(kDot) == std::string::npos) { + return; + } + std::string name = v->name_hint(); + std::replace(name.begin(), name.end(), kDot, kUnderscore); + v->set_name_hint(std::move(name)); + } + + void visit(BufPtr v) override { + v->base_handle()->accept(this); + } +}; + +static std::string declareExternalFunction(const std::string& func_name) { + return "void " + func_name + + "(" + "int64_t bufs_num, " + "void** buf_data, " + "int64_t* buf_ranks, " + "int64_t* buf_dims, " + "int8_t* buf_dtypes, " + "int64_t args_num, " + "int64_t* extra_args);"; +} + +CppPrinter::CppPrinter(std::ostream* os) : IRPrinter(*os), lane_(0) {} + +CppPrinter::~CppPrinter() = default; + +void CppPrinter::printPrologue() { + os() << "#include " << std::endl; + os() << "#include " << std::endl; + os() << "#include " << std::endl; + os() << "#include " << std::endl; + os() << std::endl; + + os() << "#define POS_INFINITY INFINITY" << std::endl; + os() << "#define NEG_INFINITY -INFINITY" << std::endl; + os() << std::endl; + + os() << cpp_intrinsics_definition << std::endl; + os() << std::endl; + + os() << "namespace torch {" << std::endl; + os() << "namespace jit {" << std::endl; + os() << "namespace tensorexpr {" << std::endl; + for (auto const& it : getNNCFunctionRegistry()) { + os() << declareExternalFunction(it.first) << std::endl; + } + os() << "} // namespace tensorexpr" << std::endl; + os() << "} // namespace jit" << std::endl; + os() << "} // namespace torch" << std::endl; + os() << std::endl; + + os() << "using namespace torch::jit::tensorexpr;" << std::endl; + os() << std::endl; +} + +template +inline typename std::enable_if::value, void>::type +visit_mod(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) { + os << *lhs << " % " << *rhs; +} + +template +inline typename std::enable_if::value, void>::type +visit_mod(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) { + os << "std::fmod(" << *lhs << ", " << *rhs << ")"; +} + +template +inline typename std::enable_if< + std::is_floating_point::value || std::is_integral::value, + void>::type +visit_max(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) { + os << "std::max(" << *lhs << ", " << *rhs << ")"; +} - size_t size = 1; - for (auto dim : alloc->dims()) { - IntImmPtr v = to(dim); - if (v) { - size *= v->value(); +template +inline typename std::enable_if< + !std::is_floating_point::value && !std::is_integral::value, + void>::type +visit_max(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) { + os << "(" << *lhs << " < " << *rhs << ") ? " << *rhs << " : " << *lhs; +} + +template +inline typename std::enable_if< + std::is_floating_point::value || std::is_integral::value, + void>::type +visit_min(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) { + os << "std::min(" << *lhs << ", " << *rhs << ")"; +} + +template +inline typename std::enable_if< + !std::is_floating_point::value && !std::is_integral::value, + void>::type +visit_min(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) { + os << *lhs << " < " << *rhs << " ? " << *lhs << " : " << *rhs; +} + +template +void visit_binary_op( + std::ostream& os, + const ExprPtr lhs, + const ExprPtr rhs, + IRNodeType op_type) { + switch (op_type) { + case IRNodeType::kMod: + visit_mod(os, lhs, rhs); + break; + case IRNodeType::kMax: + visit_max(os, lhs, rhs); + break; + case IRNodeType::kMin: + visit_min(os, lhs, rhs); + break; + default: + throw std::runtime_error("invalid op type"); + } +} + +template +void dispatch_binary_op(std::ostream& os, const BinaryOpNode* v) { + switch (v->lhs()->dtype().scalar_type()) { +#define TYPE_CASE(Type, Name) \ + case ScalarType::Name: \ + visit_binary_op(os, v->lhs(), v->rhs(), v->expr_type()); \ + break; + AT_FORALL_SCALAR_TYPES_AND2(Half, Bool, TYPE_CASE); +#undef TYPE_CASE + default: + throw unsupported_dtype(); + } +} + +void CppPrinter::visit(RampPtr v) { + visit(alloc(v->base(), alloc(alloc(lane_), v->stride()))); +} + +void CppPrinter::visit(BroadcastPtr v) { + v->value()->accept(this); +} + +void CppPrinter::visit(ModPtr v) { + dispatch_binary_op(os(), v.get()); +} + +void CppPrinter::visit(MaxPtr v) { + dispatch_binary_op(os(), v.get()); +} + +void CppPrinter::visit(MinPtr v) { + dispatch_binary_op(os(), v.get()); +} + +void CppPrinter::visit(CompareSelectPtr v) { + os() << "((" << *v->lhs() << " " + << IRPrinter::to_string(v->compare_select_op()) << " " << *v->rhs() + << ") ? " << *v->ret_val1() << " : " << *v->ret_val2() << ")"; +} + +void CppPrinter::visit(IfThenElsePtr v) { + os() << "((" << *v->condition() << ") ? " << *v->true_value() << " : " + << *v->false_value() << ")"; +} + +void CppPrinter::visit(AllocatePtr v) { + size_t size = v->dtype().byte_size(); + for (const auto& dim : v->dims()) { + IntImmPtr d = to(dim); + if (d) { + size *= d->value(); } else { throw std::runtime_error("Only IntImm dimensions are supported for now"); } } emitIndent(); - if (size <= kAllocOnStackThresholdSize) { - os() << alloc->dtype().ToCppString() << " " << (*alloc->buffer_var()) << "[" - << size << "];" << std::endl; - } else { - size *= alloc->dtype().byte_size(); - os() << alloc->dtype().ToCppString() << "* " << (*alloc->buffer_var()) - << " = static_cast<" << alloc->dtype().ToCppString() << "*>(malloc(" - << size << "));" << std::endl; - allocated_on_heap_.insert(alloc->buffer_var()); + os() << v->dtype().ToCppString() << "* " << (*v->buffer_var()) + << " = static_cast<" << v->dtype().ToCppString() << "*>(malloc(" << size + << "));" << std::endl; +} + +void CppPrinter::visit(FreePtr v) { + emitIndent(); + os() << "free(" << *v->buffer_var() << ");" << std::endl; +} + +void CppPrinter::visit(LoadPtr v) { + auto flat_idx = flatten_index(v->buf()->dims(), v->indices()); + os() << *v->base_handle() << "[" << *flat_idx << "]"; +} + +void CppPrinter::visit(StorePtr v) { + auto flat_idx = flatten_index(v->buf()->dims(), v->indices()); + const int lanes = v->value()->dtype().lanes(); + for (int lane = 0; lane < lanes; lane++) { + lane_ = lane; + emitIndent(); + os() << *v->base_handle() << "[" << *flat_idx << "] = " << *v->value() + << ";" << std::endl; + } +} + +void CppPrinter::visit(CastPtr v) { + os() << "static_cast<" << v->dtype().ToCppString() << ">(" << *v->src_value() + << ")"; +} + +void CppPrinter::visit(BitCastPtr v) { + os() << "std::bitcast<" << v->src_value()->dtype().ToCppString() << ", " + << v->dtype().ToCppString() << ">(" << *v->src_value() << ")"; +} + +void CppPrinter::visit(IntrinsicsPtr v) { + if (v->op_type() == kRand || v->op_type() == kSigmoid) { + throw std::runtime_error("kRand and kSigmoid are not supported"); + } + + os() << "std::" << v->func_name() << "("; + for (int i = 0; i < v->nparams(); i++) { + if (i > 0) { + os() << ", "; + } + os() << *v->param(i); } + os() << ")"; } -void CppPrinter::visit(FreePtr free) { - VarPtr var = free->buffer_var(); - if (allocated_on_heap_.count(var)) { +void CppPrinter::visit(ExternalCallPtr v) { + // The generated code needs to link against functions defined + // in external_functions.cpp. + + auto& func_registry = getNNCFunctionRegistry(); + if (!func_registry.count(v->func_name())) { + throw unimplemented_lowering(v); + } + + std::vector bufs(v->buf_args()); + bufs.insert(bufs.begin(), v->buf()); + auto for_buf = [&](const std::function& print_buf) { + for (size_t i = 0; i < bufs.size(); i++) { + if (i > 0) { + os() << ", "; + } + print_buf(bufs[i]); + } + }; + + emitIndent(); + os() << "{" << std::endl; + indent_++; + + emitIndent(); + os() << "void* buf_ptrs[]{"; + for_buf([&](const BufPtr b) { os() << *b->base_handle(); }); + os() << "};" << std::endl; + + emitIndent(); + os() << "int64_t buf_ranks[]{"; + for_buf([&](const BufPtr b) { os() << b->ndim(); }); + os() << "};" << std::endl; + + emitIndent(); + os() << "int64_t buf_dims[]{"; + for_buf([&](const BufPtr buf) { + for (size_t i = 0; i < buf->ndim(); i++) { + if (i > 0) { + os() << ", "; + } + os() << *buf->dim(i); + } + }); + os() << "};" << std::endl; + + emitIndent(); + os() << "int8_t buf_dtypes[]{"; + for_buf([&](const BufPtr buf) { + os() << static_cast(buf->dtype().scalar_type()); + }); + os() << "};" << std::endl; + + emitIndent(); + os() << "int64_t extra_args[]{"; + for (size_t i = 0; i < v->args().size(); i++) { + if (i > 0) { + os() << ", "; + } + os() << *v->args()[i]; + } + os() << "};" << std::endl; + + emitIndent(); + os() << v->func_name() << "(" << std::endl; + emitIndent(); + os() << " " << bufs.size() << "," << std::endl; + emitIndent(); + os() << " buf_ptrs," << std::endl; + emitIndent(); + os() << " buf_ranks," << std::endl; + emitIndent(); + os() << " buf_dims," << std::endl; + emitIndent(); + os() << " buf_dtypes," << std::endl; + emitIndent(); + os() << " " << v->args().size() << "," << std::endl; + emitIndent(); + os() << " extra_args);" << std::endl; + + indent_--; + emitIndent(); + os() << "}" << std::endl; +} + +void CppPrinter::visit(LetPtr v) { + if (v->dtype().lanes() == 1) { emitIndent(); - os() << "free(" << name_manager()->get_unique_name(var) << ");" - << std::endl; + os() << v->dtype().ToCppString() << " " << *v->var() << " = " << *v->value() + << ";" << std::endl; + } else { + vector_vars_[v->var()] = v->value(); + } +} + +void CppPrinter::visit(VarPtr v) { + if (v->dtype().lanes() == 1) { + os() << name_manager()->get_unique_name(v); + } else { + os() << *vector_vars_.at(v); } } +CppCodeGen::CppCodeGen( + StmtPtr stmt, + const std::vector& buffer_args, + at::Device device, + const std::string& kernel_func_name) + : CodeGen(stmt, buffer_args, device, kernel_func_name) { + init(); +} + +void CppCodeGen::init() { + printer_ = std::make_unique(&oss_); + var_name_rewriter_ = std::make_unique(); + + apply_visitor(var_name_rewriter_.get()); + + printer_->printPrologue(); + os() << "void " << kernel_func_name() << "("; + const std::vector buffer_args = this->buffer_args(); + for (size_t i = 0; i < buffer_args.size(); i++) { + if (i > 0) { + os() << ", "; + } + const BufferArg& buffer_arg = buffer_args[i]; + const VarPtr var = buffer_arg.var(); + Dtype dtype = buffer_arg.dtype(); + os() << dtype.ToCppString() << (buffer_arg.isVar() ? " " : "* ") << *var; + } + os() << ")"; + stmt()->accept(printer_.get()); + os() << std::endl; +} + +CppCodeGen::~CppCodeGen() = default; + +void CppCodeGen::call(const std::vector& args) { + // TODO: compile the generated C++ kernel into a library, + // and call the library here. + os() << "int main() {}" << std::endl; +} + +void CppCodeGen::call_raw(const std::vector& args) { + // TODO: compile the generated C++ kernel into a library, + // and call the library here. + os() << "int main() {}" << std::endl; +} + +RegisterCodeGen cpp_codegen_reg("cpp_codegen"); + } // namespace tensorexpr } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/tensorexpr/cpp_codegen.h b/torch/csrc/jit/tensorexpr/cpp_codegen.h index 1cf15658716e6..a6d583ed4efb7 100644 --- a/torch/csrc/jit/tensorexpr/cpp_codegen.h +++ b/torch/csrc/jit/tensorexpr/cpp_codegen.h @@ -1,24 +1,100 @@ #pragma once +#include #include -#include - namespace torch { namespace jit { namespace tensorexpr { +class CppVarNameRewriter; + // Generates C++ code from the IR. +// +// Vector operations are unrolled. +// For example: +// C[Ramp(0, 1, 3)] = A[Ramp(0, 2, 3)] + B[Ramp(0, 3, 3)]; +// is unrolled into: +// C[0] = A[0] + B[0]; +// C[1] = A[2] + B[3]; +// C[2] = A[4] + B[6]; class TORCH_API CppPrinter : public IRPrinter { public: - explicit CppPrinter(std::ostream* os) : IRPrinter(*os) {} + explicit CppPrinter(std::ostream* os); + ~CppPrinter() override; + + void printPrologue(); using IRPrinter::visit; + + // Binary expressions. + void visit(ModPtr) override; + void visit(MaxPtr) override; + void visit(MinPtr) override; + + // Conditional expressions. + void visit(CompareSelectPtr) override; + void visit(IfThenElsePtr) override; + + // Tensor operations. void visit(AllocatePtr) override; void visit(FreePtr) override; + void visit(LoadPtr) override; + void visit(StorePtr) override; + + // Casts. + void visit(CastPtr) override; + void visit(BitCastPtr) override; + + // Calls. + void visit(IntrinsicsPtr) override; + void visit(ExternalCallPtr) override; + + // Vars. + void visit(LetPtr) override; + void visit(VarPtr) override; + + // Vector data types. + void visit(RampPtr) override; + void visit(BroadcastPtr) override; private: - std::unordered_set allocated_on_heap_; + int lane_; + std::unordered_map vector_vars_; +}; + +class TORCH_API CppCodeGen : public CodeGen { + public: + CppCodeGen( + StmtPtr stmt, + const std::vector& buffer_args, + at::Device device = at::kCPU, + const std::string& kernel_func_name = "func"); + + ~CppCodeGen() override; + + void call(const std::vector& args) override; + void call_raw(const std::vector& args) override; + + template + void operator()(const Ts&... ts) { + call(std::vector({CallArg(ts)...})); + } + + std::string getCodeText(const std::string& attr = "") override { + return oss_.str(); + } + + private: + void init(); + + std::ostream& os() { + return printer_->os(); + } + + std::ostringstream oss_; + std::unique_ptr printer_; + std::unique_ptr var_name_rewriter_; }; } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/cpp_intrinsics.h b/torch/csrc/jit/tensorexpr/cpp_intrinsics.h new file mode 100644 index 0000000000000..caeeed693ff38 --- /dev/null +++ b/torch/csrc/jit/tensorexpr/cpp_intrinsics.h @@ -0,0 +1,36 @@ +#pragma once + +namespace torch { +namespace jit { +namespace tensorexpr { + +constexpr auto cpp_intrinsics_definition = R"( +namespace std { + +template ::value, int>::type = 0> +T rsqrt(T v) { + return 1.0f / std::sqrt(v); +} + +template ::value, int>::type = 0> +T frac(T v) { + T intpart; + return std::modf(v, &intpart); +} + +template +To bitcast(const From& v) { + assert(sizeof(To) == sizeof(From)); + To res; + std::memcpy(&res, &v, sizeof(From)); + return res; +} + +} // namespace std +)"; + +} // namespace tensorexpr +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index 108236e2e17f8..a4f317f48e666 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -169,8 +169,12 @@ class TORCH_API Var : public ExprNode { return name_hint_; } - void set_name_hint(const std::string& name_hint) { - name_hint_ = name_hint; + void set_name_hint(const std::string& name) { + name_hint_ = name; + } + + void set_name_hint(std::string&& name) { + name_hint_ = name; } Var(std::string name_hint, Dtype dtype) diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h index f9fc7dcfc4246..7fe1fd1a07abb 100644 --- a/torch/csrc/jit/tensorexpr/ir.h +++ b/torch/csrc/jit/tensorexpr/ir.h @@ -684,6 +684,7 @@ enum IntrinsicsOp { kFrac, kIsNan, kRand, // We need more discussions on this. Should we consider stateful? + kMaxIntrinsicsOp, }; class TORCH_API Intrinsics : public ExprNode { @@ -864,8 +865,9 @@ class TORCH_API Intrinsics : public ExprNode { params_ = std::move(params); } - private: static int OpArgCount(IntrinsicsOp op_type); + + private: static Dtype IntrinsicsDtype(IntrinsicsOp op_type, Dtype dt1); static Dtype IntrinsicsDtype(IntrinsicsOp op_type, Dtype dt1, Dtype dt2); static Dtype IntrinsicsDtype( diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index 27b56e2f58146..2e1fc6e6952a7 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -25,6 +25,24 @@ void IRPrinter::print(Expr& expr) { void IRPrinter::print(Stmt& stmt) { stmt.accept(this); } +std::string IRPrinter::to_string(CompareSelectOperation op) { + switch (op) { + case CompareSelectOperation::kEQ: + return "=="; + case CompareSelectOperation::kNE: + return "!="; + case CompareSelectOperation::kGT: + return ">"; + case CompareSelectOperation::kGE: + return ">="; + case CompareSelectOperation::kLT: + return "<"; + case CompareSelectOperation::kLE: + return "<="; + default: + throw std::runtime_error("invalid compare select operator"); + } +} // TODO: change whether to include the parenthesis to the parent expression, // we need to look at the operator precedence to make the output simpler. @@ -137,28 +155,8 @@ void IRPrinter::visit(CompareSelectPtr v) { if (lhs_prec >= self_prec) { os() << ")"; } - switch (cmp_op) { - case CompareSelectOperation::kEQ: - os() << "=="; - break; - case CompareSelectOperation::kNE: - os() << "!="; - break; - case CompareSelectOperation::kGT: - os() << ">"; - break; - case CompareSelectOperation::kGE: - os() << ">="; - break; - case CompareSelectOperation::kLT: - os() << "<"; - break; - case CompareSelectOperation::kLE: - os() << "<="; - break; - default: - throw std::runtime_error("invalid compare select operator"); - } + + os() << to_string(cmp_op); if (rhs_prec >= self_prec) { os() << "("; @@ -230,6 +228,13 @@ void IRPrinter::visit(CastPtr v) { os() << ")"; } +void IRPrinter::visit(BitCastPtr v) { + auto dtype = v->dtype(); + os() << "BitCast<" << dtype.ToCppString() << ">("; + v->src_value()->accept(this); + os() << ")"; +} + void IRPrinter::visit(VarPtr v) { os() << name_manager_.get_unique_name(v); } @@ -439,7 +444,7 @@ void IRPrinter::visit(FreePtr v) { void IRPrinter::visit(LetPtr v) { os() << dtypeToCppString(v->dtype()) << " " << *v->var(); os() << " = " << *v->value(); - os() << ";"; + os() << ";" << std::endl; } void IRPrinter::visit(CondPtr v) { diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h index 321d1efe55457..327119dcc74e6 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.h +++ b/torch/csrc/jit/tensorexpr/ir_printer.h @@ -37,6 +37,7 @@ class TORCH_API IRPrinter : public IRVisitor { AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_PRINT_VISIT); #undef IMM_PRINT_VISIT void visit(CastPtr v) override; + void visit(BitCastPtr v) override; void visit(VarPtr v) override; void visit(RampPtr v) override; void visit(LoadPtr v) override; @@ -83,6 +84,8 @@ class TORCH_API IRPrinter : public IRVisitor { }; protected: + std::string to_string(CompareSelectOperation op); + UniqueNameManager* name_manager() { return &name_manager_; } From ad47fb88584ec4cc9ed6e5f01e1256d38020a1a3 Mon Sep 17 00:00:00 2001 From: Erjia Guan Date: Thu, 26 Aug 2021 10:21:48 -0700 Subject: [PATCH 258/530] Rename IterableAsDataPipe to IterableWrapper (#63981) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63981 Rename `IterableAsDataPipe` to `IterableWrapper` based on our naming convention `Op-er` Test Plan: Imported from OSS Reviewed By: VitalyFedyunin Differential Revision: D30554197 Pulled By: ejguan fbshipit-source-id: c2eacb20df5645d83ca165d6a1591f7e4791990f --- test/test_dataloader.py | 4 ++-- torch/utils/data/dataloader_experimental.py | 4 ++-- torch/utils/data/datapipes/iter/__init__.py | 4 ++-- torch/utils/data/datapipes/iter/utils.py | 5 ++++- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 01136b9e4bb07..65554632fd30f 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -26,7 +26,7 @@ ) from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL from torch.utils.data.dataset import random_split -from torch.utils.data.datapipes.iter import IterableAsDataPipe +from torch.utils.data.datapipes.iter import IterableWrapper from torch._utils import ExceptionWrapper from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS, IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest, @@ -1963,7 +1963,7 @@ def test_excessive_thread_creation_warning(self): class TestDataLoader2(TestCase): @skipIfNoDill def test_basics(self): - dp = IterableAsDataPipe(list(range(10))) + dp = IterableWrapper(list(range(10))) dl = DataLoader(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2) dl2 = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2) self.assertEquals(list(dl), list(dl2)) diff --git a/torch/utils/data/dataloader_experimental.py b/torch/utils/data/dataloader_experimental.py index 85028afd22124..ea085298bf00f 100644 --- a/torch/utils/data/dataloader_experimental.py +++ b/torch/utils/data/dataloader_experimental.py @@ -3,7 +3,7 @@ import torch.utils.data.backward_compatibility from torch.utils.data import DataLoader, IterDataPipe -from torch.utils.data.datapipes.iter import IterableAsDataPipe +from torch.utils.data.datapipes.iter import IterableWrapper class DataLoader2: def __new__(cls, @@ -69,7 +69,7 @@ def sharding_worker_init_fn(worker_init_fn, worker_id): else: if collate_fn is None: collate_fn = torch.utils.data._utils.collate.default_collate - datapipe = IterableAsDataPipe(data_loader).batch( + datapipe = IterableWrapper(data_loader).batch( batch_size, drop_last=drop_last).map(collate_fn) return datapipe diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py index 5af2ab661da40..f302fd3a2b7ea 100644 --- a/torch/utils/data/datapipes/iter/__init__.py +++ b/torch/utils/data/datapipes/iter/__init__.py @@ -43,7 +43,7 @@ ZipArchiveReaderIterDataPipe as ZipArchiveReader, ) from torch.utils.data.datapipes.iter.utils import ( - IterableAsDataPipeIterDataPipe as IterableAsDataPipe, + IterableWrapperIterDataPipe as IterableWrapper, ) __all__ = ['Batcher', @@ -55,7 +55,7 @@ 'FileLoader', 'Filter', 'HttpReader', - 'IterableAsDataPipe', + 'IterableWrapper', 'LineReader', 'Mapper', 'RoutedDecoder', diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py index ea241d9f2716c..ee04abc455fba 100644 --- a/torch/utils/data/datapipes/iter/utils.py +++ b/torch/utils/data/datapipes/iter/utils.py @@ -1,10 +1,13 @@ from torch.utils.data import IterDataPipe -class IterableAsDataPipeIterDataPipe(IterDataPipe): +class IterableWrapperIterDataPipe(IterDataPipe): def __init__(self, iterable): self.iterable = iterable def __iter__(self): for data in self.iterable: yield data + + def __len__(self): + return len(self.iterable) From 48c57b9b2ef5a65dac5ef9ba2a15f742bb7d06e5 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 26 Aug 2021 12:08:00 -0700 Subject: [PATCH 259/530] Leverage TensorPipe's automatic SHM address selection (#63028) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63028 TensorPipe until now required PyTorch to come up and provide a unique identifier to use as address for the UNIX domain socket used in the SHM transport. However the Linux kernel can automatically assign an available address (like it does with IP ports), and TensorPipe now supports it, so we can remove that useless PyTorch logic. Test Plan: CI Reviewed By: mrshenli Differential Revision: D30220352 fbshipit-source-id: 78e8a6ef5916b2a72df26cdc9cd367b9d083e821 --- .../csrc/distributed/rpc/tensorpipe_agent.cpp | 22 ++----------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index df42248639f94..8e7ad18c575f8 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -16,12 +16,6 @@ #include #include -#if TENSORPIPE_HAS_SHM_TRANSPORT -// Needed for ::getpid(), which is used to create a unique address. -#include -#include -#endif - namespace torch { namespace distributed { namespace rpc { @@ -209,22 +203,10 @@ C10_REGISTER_CREATOR(TensorPipeTransportRegistry, uv, makeUvTransport); #if TENSORPIPE_HAS_SHM_TRANSPORT -std::string createUniqueShmAddr() { - thread_local uint32_t threadLocalId = 0; - return c10::str( - "shm://tensorpipe_rpc_agent_", - std::this_thread::get_id(), - "_", - ::getpid(), - "_", - threadLocalId++); -} - std::unique_ptr makeShmTransport() { auto context = tensorpipe::transport::shm::create(); - std::string address = createUniqueShmAddr(); - return std::make_unique(TransportRegistration{ - std::move(context), kShmTransportPriority, std::move(address)}); + return std::make_unique( + TransportRegistration{std::move(context), kShmTransportPriority, ""}); } // The SHM implements connections using ringbuffers residing in anonymous shared From c5cc185b6d556d7d91fc0b038a7a74529b66b737 Mon Sep 17 00:00:00 2001 From: Ansley Ussery Date: Thu, 26 Aug 2021 12:14:32 -0700 Subject: [PATCH 260/530] Allow uncompiled strings as input to `checkScriptRaisesRegex` (#63901) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63901 cc gmagogsfm Test Plan: Imported from OSS Reviewed By: gmagogsfm Differential Revision: D30579472 Pulled By: ansley fbshipit-source-id: 59ee09c1f25278d4f6e51f626588251bd095c6ea --- test/jit/test_jit_utils.py | 15 ++++++++ torch/testing/_internal/jit_utils.py | 56 ++++++++++++++++++---------- 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/test/jit/test_jit_utils.py b/test/jit/test_jit_utils.py index 11d974bfe64c4..b344f82e96ced 100644 --- a/test/jit/test_jit_utils.py +++ b/test/jit/test_jit_utils.py @@ -77,3 +77,18 @@ def fn_hybrid_args(x, /, y, *args, **kwargs): self.assertEqual( [], torch._jit_internal.get_callable_argument_names(fn_hybrid_args)) + + def test_checkscriptassertraisesregex(self): + def fn(): + tup = (1, 2) + return tup[2] + + self.checkScriptRaisesRegex(fn, (), Exception, "range", name="fn") + + s = dedent(""" + def fn(): + tup = (1, 2) + return tup[2] + """) + + self.checkScriptRaisesRegex(s, (), Exception, "range", name="fn") diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py index 50d8dac23867b..4c521a8e4d9d5 100644 --- a/torch/testing/_internal/jit_utils.py +++ b/torch/testing/_internal/jit_utils.py @@ -375,35 +375,53 @@ def assertRaisesRegexWithHighlight(self, exception, regex, highlight): return _AssertRaisesRegexWithHighlightContext(self, exception, regex, highlight) def checkScriptRaisesRegex(self, script, inputs, exception, regex, - outputs=None, capture_output=False, profiling=ProfilingMode.PROFILING): + name=None, outputs=None, capture_output=False, + frames_up=1, profiling=ProfilingMode.PROFILING): """ Checks that a given function will throw the correct exception, - when executed with normal python, the string frontend, and the AST frontend + when executed with normal python, the string frontend, and the + AST frontend. Logic taken from `checkScript` (see comments there + for details) """ - with enable_profiling_mode_for_profiling_tests(): - # normal python + # Normal Python with self.assertRaisesRegex(exception, regex): - script(*inputs) - # string frontend + if isinstance(script, str): + frame = self.get_frame_vars(frames_up) + the_locals: Dict[str, Any] = {} + execWrapper(script, glob=frame, loc=the_locals) + frame.update(the_locals) + + python_fn = frame[name] + else: + python_fn = script + + python_fn(*inputs) + + # String frontend with self.assertRaisesRegex(exception, regex): - source = textwrap.dedent(inspect.getsource(script)) - cu = torch.jit.CompilationUnit(source) - ge = getattr(cu, script.__name__) - # profiling run + if isinstance(script, str): + cu = torch.jit.CompilationUnit(script, _frames_up=frames_up) + string_frontend = getattr(cu, name) + else: + source = textwrap.dedent(inspect.getsource(script)) + cu = torch.jit.CompilationUnit(source, _frames_up=frames_up) + string_frontend = getattr(cu, script.__name__) + with self.assertRaisesRegex(exception, regex): - ge(*inputs) + string_frontend(*inputs) # optimized run - ge(*inputs) - # python AST frontend - with self.assertRaisesRegex(exception, regex): - ge = torch.jit.script(script) - # profiling run + string_frontend(*inputs) + + # Python AST frontend + if not isinstance(script, str): with self.assertRaisesRegex(exception, regex): + ge = torch.jit.script(python_fn) + # profiling run + with self.assertRaisesRegex(exception, regex): + ge(*inputs) + # optimized run ge(*inputs) - # optimized run - ge(*inputs) - def checkBailouts(self, model, inputs, expected): state = model.get_debug_state() From 95d0b3199b2e0eb0516e439c8aa1a94b62113e1e Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Thu, 26 Aug 2021 12:48:01 -0700 Subject: [PATCH 261/530] Back out "[ONNX] Fix an issue that optimizations might adjust graph inputs unexpectedly. (#61280)" (#64004) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64004 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63904 Fixes T98808160 Test Plan: T98808160 Reviewed By: msaroufim Differential Revision: D30527450 fbshipit-source-id: 6262901a78ca929cecda1cf740893139aa26f1b4 --- .../expect/TestOperators.test_prelu.expect | 28 +++++----- ...ors.test_retain_param_name_disabled.expect | 52 +++++-------------- torch/_C/__init__.pyi.in | 2 +- torch/csrc/jit/passes/onnx/eval_peephole.cpp | 21 ++------ torch/csrc/jit/passes/onnx/eval_peephole.h | 3 +- torch/csrc/jit/python/init.cpp | 5 +- torch/onnx/__init__.py | 22 ++------ torch/onnx/utils.py | 19 +++---- 8 files changed, 45 insertions(+), 107 deletions(-) diff --git a/test/onnx/expect/TestOperators.test_prelu.expect b/test/onnx/expect/TestOperators.test_prelu.expect index be0328e5c61b7..e19623cfd4460 100644 --- a/test/onnx/expect/TestOperators.test_prelu.expect +++ b/test/onnx/expect/TestOperators.test_prelu.expect @@ -2,30 +2,20 @@ ir_version: 6 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { - node { - input: "weight" - output: "2" - name: "Unsqueeze_0" - op_type: "Unsqueeze" - attribute { - name: "axes" - ints: 1 - ints: 2 - type: INTS - } - } node { input: "input" - input: "2" + input: "4" output: "3" - name: "PRelu_1" + name: "PRelu_0" op_type: "PRelu" } name: "torch-jit-export" initializer { dims: 2 + dims: 1 + dims: 1 data_type: 1 - name: "weight" + name: "4" raw_data: "\000\000\200>\000\000\200>" } input { @@ -51,7 +41,7 @@ graph { } } input { - name: "weight" + name: "4" type { tensor_type { elem_type: 1 @@ -59,6 +49,12 @@ graph { dim { dim_value: 2 } + dim { + dim_value: 1 + } + dim { + dim_value: 1 + } } } } diff --git a/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect b/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect index aa9499e27ac49..5eeaa875feb0c 100644 --- a/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect +++ b/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect @@ -2,57 +2,33 @@ ir_version: 6 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { - node { - input: "1" - output: "3" - name: "Transpose_0" - op_type: "Transpose" - attribute { - name: "perm" - ints: 1 - ints: 0 - type: INTS - } - } node { input: "input.1" - input: "3" + input: "7" output: "4" - name: "MatMul_1" + name: "MatMul_0" op_type: "MatMul" } - node { - input: "2" - output: "5" - name: "Transpose_2" - op_type: "Transpose" - attribute { - name: "perm" - ints: 1 - ints: 0 - type: INTS - } - } node { input: "4" - input: "5" + input: "8" output: "6" - name: "MatMul_3" + name: "MatMul_1" op_type: "MatMul" } name: "torch-jit-export" initializer { - dims: 5 dims: 4 + dims: 5 data_type: 1 - name: "1" + name: "7" raw_data: "\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@" } initializer { - dims: 6 dims: 5 + dims: 6 data_type: 1 - name: "2" + name: "8" raw_data: "\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@" } input { @@ -72,32 +48,32 @@ graph { } } input { - name: "1" + name: "7" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 5 + dim_value: 4 } dim { - dim_value: 4 + dim_value: 5 } } } } } input { - name: "2" + name: "8" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 6 + dim_value: 5 } dim { - dim_value: 5 + dim_value: 6 } } } diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 0b6bb6b64e0a4..3629150d15090 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -326,7 +326,7 @@ def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ... def _jit_pass_onnx_fold_if(graph: Graph) -> None: ... def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ... def _jit_pass_inline_fork_wait(graph: Graph) -> None: ... -def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue], isAllowedToAdjustGraphInputs: _bool) -> Dict[str, IValue]: ... +def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... def _jit_pass_onnx_constant_fold(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> Dict[str, IValue]: ... def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ... diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.cpp b/torch/csrc/jit/passes/onnx/eval_peephole.cpp index 4bad9367af444..05afb69ef0f23 100644 --- a/torch/csrc/jit/passes/onnx/eval_peephole.cpp +++ b/torch/csrc/jit/passes/onnx/eval_peephole.cpp @@ -141,27 +141,14 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) { } } -void EvalPeepholeONNX( - Block* b, - ParamMap& paramsDict, - bool isAllowedToAdjustGraphInputs) { +void EvalPeepholeONNX(Block* b, ParamMap& paramsDict) { auto valsToParamsMap = buildValueToParamsMap(b, paramsDict); - - // Optimizations like fusing Conv and BatchNorm ops may adjust the graph - // inputs. If the graph inputs are not allowed to be adjusted, for example - // export_params is False, such optimizations will be skipped. - if (isAllowedToAdjustGraphInputs) { - fuseConvBatchNorm(b, valsToParamsMap); - } - + fuseConvBatchNorm(b, valsToParamsMap); buildParamsMapFromValueToParamsMap(valsToParamsMap, paramsDict); } -void EvalPeepholeONNX( - std::shared_ptr& g, - ParamMap& paramsDict, - bool isAllowedToAdjustGraphInputs) { - EvalPeepholeONNX(g->block(), paramsDict, isAllowedToAdjustGraphInputs); +void EvalPeepholeONNX(std::shared_ptr& g, ParamMap& paramsDict) { + EvalPeepholeONNX(g->block(), paramsDict); GRAPH_DUMP("After EvalPeepholeONNX:", g); } diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.h b/torch/csrc/jit/passes/onnx/eval_peephole.h index d953f2c2e5bda..6f8961d08fd5e 100644 --- a/torch/csrc/jit/passes/onnx/eval_peephole.h +++ b/torch/csrc/jit/passes/onnx/eval_peephole.h @@ -9,8 +9,7 @@ namespace jit { void EvalPeepholeONNX( std::shared_ptr& g, - std::map& paramDict, - bool isAllowedToAdjustGraphInputs); + std::map& paramDict); } // namespace jit diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 645fea2274fb2..7e43e511c786f 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -203,9 +203,8 @@ void initJITBindings(PyObject* module) { .def( "_jit_pass_onnx_eval_peephole", [](std::shared_ptr& graph, - std::map& paramsDict, - bool isAllowedToAdjustGraphInputs) { - EvalPeepholeONNX(graph, paramsDict, isAllowedToAdjustGraphInputs); + std::map& paramsDict) { + EvalPeepholeONNX(graph, paramsDict); return paramsDict; }, pybind11::return_value_policy::move) diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index e058acce1947d..b726b2b55e8b6 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -103,17 +103,11 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM export_params (bool, default True): if True, all parameters will be exported. Set this to False if you want to export an untrained model. In this case, the exported model will first take all of its parameters - as arguments, with the ordering as specified by ``model.state_dict().values()``. - This helps in stripping parameters from the model which is useful for training. - Besides, if this is False, any optimization that may adjust graph inputs will - be skipped - for example, Conv and BatchNorm fusion. + as arguments, with the ordering as specified by ``model.state_dict().values()`` verbose (bool, default False): if True, prints a description of the model being exported to stdout. training (enum, default TrainingMode.EVAL): - * ``TrainingMode.EVAL``: export the model in inference mode. In this case, optimizations - (e.g., fusing Conv and BatchNorm ops) may adjust graph inputs by modifying model params - and model param names. Such adjustment could be skipped by setting export_params = False - or keep_initializers_as_inputs = True. + * ``TrainingMode.EVAL``: export the model in inference mode. * ``TrainingMode.PRESERVE``: export the model in inference mode if model.training is False and in training mode if model.training is True. * ``TrainingMode.TRAINING``: export the model in training mode. Disables optimizations @@ -190,8 +184,6 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM do_constant_folding (bool, default False): Apply the constant-folding optimization. Constant-folding will replace some of the ops that have all constant inputs with pre-computed constant nodes. - Since this optimization adjusts model initializers, it will be disabled if - export_params = False or keep_initializers_as_inputs = True. example_outputs (T or a tuple of T, where T is Tensor or convertible to Tensor, default None): Must be provided when exporting a ScriptModule or ScriptFunction, ignored otherwise. Used to determine the type and shape of the outputs without tracing the execution of @@ -273,13 +265,9 @@ def forward(self, x): keep_initializers_as_inputs (bool, default None): If True, all the initializers (typically corresponding to parameters) in the - exported graph will also be added as inputs to the graph. - - If False, then initializers are not added as inputs to the graph, and only - the non-parameter inputs are added as inputs. Meanwhile, the optimization - that might adjust graph inputs will be skipped (e.g., fusing Conv and - BatchNorm ops), even when the user export this model in inference mode. - + exported graph will also be added as inputs to the graph. If False, + then initializers are not added as inputs to the graph, and only + the non-parameter inputs are added as inputs. This may allow for better optimizations (e.g. constant folding) by backends/runtimes. diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 7860e38034028..41ba20f3ad102 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -439,8 +439,7 @@ def _model_to_graph(model, args, verbose=False, example_outputs=None, _retain_param_name=False, do_constant_folding=True, _disable_torch_constant_prop=False, fixed_batch_size=False, - training=None, dynamic_axes=None, export_params=True, - keep_initializers_as_inputs=False): + training=None, dynamic_axes=None): r"""Converts model into an ONNX graph. Returns: @@ -499,12 +498,10 @@ def _model_to_graph(model, args, verbose=False, params_dict = _get_named_param_dict(graph, params) - allow_adjust_graph_inputs = (export_params and not keep_initializers_as_inputs) - if (training is None or training == TrainingMode.EVAL): - params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict, allow_adjust_graph_inputs) + if training is None or training == TrainingMode.EVAL: + params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict) - if do_constant_folding and allow_adjust_graph_inputs and \ - _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions: + if do_constant_folding and _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions: params_dict = torch._C._jit_pass_onnx_constant_fold(graph, params_dict, _export_onnx_opset_version) torch._C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph) @@ -572,9 +569,7 @@ def _export_to_pretty_string(model, args, f, export_params=True, verbose=False, output_names, operator_export_type, example_outputs, _retain_param_name, val_do_constant_folding, fixed_batch_size=fixed_batch_size, - training=training, - export_params=export_params, - keep_initializers_as_inputs=val_keep_init_as_ip) + training=training) return graph._pretty_print_onnx(params_dict, opset_version, False, operator_export_type, google_printer, @@ -690,9 +685,7 @@ def _export(model, args, f, export_params=True, verbose=False, training=None, val_do_constant_folding, fixed_batch_size=fixed_batch_size, training=training, - dynamic_axes=dynamic_axes, - export_params=export_params, - keep_initializers_as_inputs=val_keep_init_as_ip) + dynamic_axes=dynamic_axes) # TODO: Don't allocate a in-memory string for the protobuf defer_weight_export = export_type is not ExportTypes.PROTOBUF_FILE From cbfec02007775d96139d8a1b9d9f8a44fcede31c Mon Sep 17 00:00:00 2001 From: Don Jang Date: Thu, 26 Aug 2021 12:58:05 -0700 Subject: [PATCH 262/530] [Static Runtime] Add native op for aten::expand_as (#64024) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64024 `aten::expand_as` creates a view of the input tensor. This change adds its native op implementation for the static runtime. Test Plan: - Added `StaticRuntime.IndividualOps_ExpandAs` Reviewed By: hlu1 Differential Revision: D30546851 fbshipit-source-id: e53483048af890bc41b6192a1ab0c5ba0ee2bdc0 --- benchmarks/static_runtime/test_scripts.h | 6 ++++++ benchmarks/static_runtime/test_static_runtime.cc | 11 +++++++++++ torch/csrc/jit/runtime/static/native_ops.cpp | 16 ++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 90f93b20c94c0..ecdd491462f62 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -349,6 +349,12 @@ const std::string embedding_bag_max_last_offset = R"JIT( return torch.embedding_bag(a, b, c, False, 2, False, None, True) )JIT"; +const auto expand_as_script = R"JIT( + def forward(self, input: Tensor, other:Tensor): + a = input.expand_as(other) + return a.clone() +)JIT"; + const auto sign_tensor = R"JIT( def forward(self, input: Tensor): return torch.sign(input).clone() diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index f6ec677bbb7bc..4441b7d043db9 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -610,6 +610,17 @@ TEST(StaticRuntime, IndividualOps_Detach) { testStaticRuntime(detach_script_1, args, args2); } +TEST(StaticRuntime, IndividualOps_ExpandAs) { + auto a = at::randn({3,1}); + auto b = at::randn({3,2}); + auto c = at::randn({4,1}); + auto d = at::randn({4,2}); + std::vector args{a, b}; + std::vector args2{c, d}; + testStaticRuntime(expand_as_script, args); + testStaticRuntime(expand_as_script, args, args2); +} + TEST(StaticRuntime, IndividualOps_Full) { auto dtype = at::ScalarType::Int; auto cpu = at::Device(DeviceType::CPU); diff --git a/torch/csrc/jit/runtime/static/native_ops.cpp b/torch/csrc/jit/runtime/static/native_ops.cpp index 61a6554a3c5cc..7a1558dd70a00 100644 --- a/torch/csrc/jit/runtime/static/native_ops.cpp +++ b/torch/csrc/jit/runtime/static/native_ops.cpp @@ -370,6 +370,22 @@ REGISTER_NATIVE_OPERATOR_FUNCTOR( }; }); +REGISTER_NATIVE_OPERATOR_FUNCTOR( + aten::expand_as, + aten_expand_as, + [](Node* n) -> SROperator { + if (!n->matches(torch::schema( + "aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto& self = p_node->Input(0).toTensor(); + const auto& other = p_node->Input(1).toTensor(); + p_node->Output(0) = self.expand(other.sizes()); + }; + }); + REGISTER_NATIVE_OPERATOR_FUNCTOR( prim::isinstance, prim_isinstance, From 7cfbc85821e8928db570a0730437b96484ac7b60 Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Thu, 26 Aug 2021 13:06:46 -0700 Subject: [PATCH 263/530] [fx_acc] [fx2trt] add acc op mapper for argmin and converter for topk (#63823) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63823 Add mapper for `torch.argmin` which maps it to `acc_ops.flatten` (optional) + `acc_ops.topk` + `acc_ops.getitem` + `acc_ops.squeeze` (optional). This diff doesn't allow mapping if `dim=None && keepdim=True` in `torch.argmin`. Add fx2trt converter for `acc_ops.topk`. Test Plan: buck test mode/opt glow/fb/fx/oss_acc_tracer:test_acc_tracer -- test_argmin buck run mode/opt caffe2/torch/fb/fx2trt:test_topk Reviewed By: jfix71 Differential Revision: D30501771 fbshipit-source-id: 0babc45e69bac5e61ff0b9b4dfb98940398e3e57 --- .../fx2trt/converters/acc_ops_converters.py | 24 +++++++++ torch/fx/experimental/fx2trt/fx2trt.py | 4 +- torch/fx/experimental/fx_acc/acc_ops.py | 51 +++++++++++++++++++ 3 files changed, 76 insertions(+), 3 deletions(-) diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index 33a817d4ccdb5..ba370b2b067d4 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -717,6 +717,7 @@ def acc_ops_squeeze(network, target, args, kwargs, name): # dim, which is a very rare case. For now we just claim not supporting dim=None. assert dim is not None, "We don't support dim=None right now." + dim = dim % (len(input_val.shape) + (1 if network.has_implicit_batch_dimension else 0)) if network.has_implicit_batch_dimension: assert dim != 0, "We don't support squeeze batch dim when it's implicit." dim -= 1 @@ -796,6 +797,29 @@ def acc_ops_unsqueeze(network, target, args, kwargs, name): layer.name = name return layer.get_output(0) +@tensorrt_converter(acc_ops.topk) +def acc_ops_topk(network, target, args, kwargs, name): + input_val = kwargs["input"] + + if not isinstance(input_val, trt.tensorrt.ITensor): + raise RuntimeError(f"topk received input {input_val} that is not part " + "of the TensorRT region!") + + if kwargs["sorted"] and kwargs["k"] != 1: + raise RuntimeError("Currently we don't support sorted=True in topk.") + + if not network.has_implicit_batch_dimension and len(input_val.shape) <= 1: + raise RuntimeError("At least 2 dimensions are required for input to topk.") + + num_dims = len(input_val.shape) + (1 if network.has_implicit_batch_dimension else 0) + k = kwargs["k"] + dim = (kwargs["dim"] if kwargs["dim"] else -1) % num_dims + operation = trt.TopKOperation.MAX if kwargs["largest"] else trt.TopKOperation.MIN + layer = network.add_topk( + input_val, operation, k, get_axes_for_reduce_op(dim, network.has_implicit_batch_dimension) + ) + layer.name = name + return (layer.get_output(0), layer.get_output(1)) @tensorrt_converter(acc_ops.adaptive_avg_pool2d) def acc_ops_adaptive_avg_pool2d(network, target, args, kwargs, name): diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py index ede99fd6f1700..72497a7d2aafc 100644 --- a/torch/fx/experimental/fx2trt/fx2trt.py +++ b/torch/fx/experimental/fx2trt/fx2trt.py @@ -415,8 +415,6 @@ def output(self, target, args, kwargs): name = f"output{i}" output.name = name self.network.mark_output(output) - if self.fp16_mode: + if self.fp16_mode and output.dtype == trt.float32: output.dtype = trt.float16 - else: - output.dtype = trt.float32 self._output_names.append(name) diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py index 95fffaa479c9e..692ca6304910f 100644 --- a/torch/fx/experimental/fx_acc/acc_ops.py +++ b/torch/fx/experimental/fx_acc/acc_ops.py @@ -705,6 +705,57 @@ def batch_norm( def layer_norm(*, input, normalized_shape, weight, bias, eps): return nn.functional.layer_norm(**locals()) +def argmin_max_mapper_impl(node: torch.fx.Node, largest: bool) -> torch.fx.Node: + """ + Map torch.argmin or torch.argmax to acc_ops.flatten (depend on dim) + acc_ops.topk + + acc_ops.getitem + acc_ops.squeeze (depends on keepdim). + """ + input_node = node.kwargs["input"] + dim = node.kwargs["dim"] + keepdim = node.kwargs["keepdim"] + + if dim is None and keepdim: + raise RuntimeError("We currently don't support argmin/argmax with dim=None and keepdim=True") + + with node.graph.inserting_before(node): + if dim is None: + flatten_kwargs = {"input": node.kwargs["input"], "start_dim": 0, "end_dim": -1} + flatten_node = node.graph.call_function(flatten, kwargs=flatten_kwargs) + flatten_node.meta["type"] = torch.Tensor + input_node = flatten_node + dim = -1 + + topk_kwargs = {"input": input_node, "k": 1, "dim": dim, "largest": largest, "sorted": False} + topk_node = node.graph.call_function(topk, kwargs=topk_kwargs) + # It's actually more like NamedTuple but tuple here should be fine. + topk_node.meta["type"] = tuple + + getitem_kwargs = {"input": topk_node, "idx": 1} + getitem_node = node.graph.call_function(getitem, kwargs=getitem_kwargs) + getitem_node.meta["type"] = torch.Tensor + output_node = getitem_node + + if not keepdim: + squeeze_kwargs = {"input": getitem_node, "dim": dim} + output_node = node.graph.call_function(squeeze, kwargs=squeeze_kwargs) + + output_node.meta = node.meta.copy() + return output_node + +@register_custom_acc_mapper_fn( + op_and_target=("call_function", torch.argmin), + arg_replacement_tuples=[ + ("input", "input"), + ("dim", "dim"), + ("keepdim", "keepdim"), + ], +) +def torch_argmin_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node: + """ + Map torch.argmin to acc_ops.flatten (depend on dim) + acc_ops.topk + acc_ops.getitem + + acc_ops.squeeze (depends on keepdim). + """ + return argmin_max_mapper_impl(node, largest=False) @register_custom_acc_mapper_fn( op_and_target=("call_method", "split"), From 5a12cb611f40c2277587b955cc851b47ec60c18d Mon Sep 17 00:00:00 2001 From: Ilqar Ramazanli Date: Thu, 26 Aug 2021 13:29:03 -0700 Subject: [PATCH 264/530] To add Chained Scheduler to the list of PyTorch schedulers. (#63491) Summary: In this PR we are introducing ChainedScheduler which initially proposed in the discussion https://github.com/pytorch/pytorch/pull/26423#discussion_r329976246 . The idea is to provide a user friendly chaining method for schedulers, especially for the cases many of them are involved and we want to have a clean and easy to read interface for schedulers. This method will be even more crucial once CompositeSchedulers and Schedulers for different type of parameters are involved. The immediate application of Chained Scheduler is expected to happen in TorchVision Library to combine WarmUpLR and MultiStepLR https://github.com/pytorch/vision/blob/master/references/video_classification/scheduler.py#L5 . However, it can be expected that in many other use cases also this method could be applied. ### Example The usage is as simple as below: ```python sched=ChainedScheduler([ExponentialLR(self.opt, gamma=0.9), WarmUpLR(self.opt, warmup_factor=0.2, warmup_iters=4, warmup_method="constant"), StepLR(self.opt, gamma=0.1, step_size=3)]) ``` Then calling ```python sched.step() ``` would trigger step function for all three schedulers consecutively Partially resolves https://github.com/pytorch/vision/issues/4281 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63491 Reviewed By: datumbox, mruberry Differential Revision: D30576180 Pulled By: iramazanli fbshipit-source-id: b43f0749f55faab25079641b7d91c21a891a87e4 --- test/test_optim.py | 40 ++++++++++++++++++++++++++++++++++++- torch/optim/lr_scheduler.py | 38 +++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/test/test_optim.py b/test/test_optim.py index 01ec43bbea883..fe282ef33b4de 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -13,7 +13,7 @@ from torch import sparse from torch.optim.lr_scheduler import LambdaLR, MultiplicativeLR, StepLR, \ MultiStepLR, WarmUpLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, \ - _LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR + _LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR, ChainedScheduler from torch.optim.swa_utils import AveragedModel, SWALR, update_bn from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests, \ skipIfRocm @@ -1253,6 +1253,44 @@ def test_reduce_lr_on_plateau8(self): threshold=0.1, patience=5, cooldown=5) self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs) + def test_chained_lr1(self): + epochs = 10 + schedulers = [None] * 1 + targets = [[0.05] * 3 + [0.005] * 3 + [0.0005] * 3 + [0.00005] * 3] + schedulers[0] = StepLR(self.opt, gamma=0.1, step_size=3) + scheduler = ChainedScheduler(schedulers) + self._test([scheduler], targets, epochs) + + def test_chained_lr2(self): + epochs = 10 + schedulers = [None] * 1 + targets = [[0.02, 0.03, 0.04] + [0.05] * 9] + schedulers[0] = WarmUpLR(self.opt, warmup_factor=0.4, warmup_iters=3, warmup_method="linear") + scheduler = ChainedScheduler(schedulers) + self._test([scheduler], targets, epochs) + + def test_chained_lr3(self): + epochs = 10 + schedulers = [None] * 2 + targets = [[0.02, 0.03, 0.04, 0.05] + [0.005] * 4 + [0.0005] * 3 + [0.00005] * 3] + schedulers[0] = WarmUpLR(self.opt, warmup_factor=0.4, warmup_iters=3, warmup_method="linear") + schedulers[1] = MultiStepLR(self.opt, milestones=[4, 8, 10], gamma=0.1) + scheduler = ChainedScheduler(schedulers) + self._test([scheduler], targets, epochs) + + def test_chained_lr4(self): + epochs = 9 + schedulers = [None] * 3 + targets = [[0.05 * 0.2 * 0.9 ** x for x in range(3)] + + [0.05 * 0.2 * 0.9 ** 3 * 0.1] + + [0.05 * 0.9 ** x * 0.1 for x in range(4, 6)] + + [0.05 * 0.9 ** x * 0.01 for x in range(6, 9)]] + schedulers[0] = ExponentialLR(self.opt, gamma=0.9) + schedulers[1] = WarmUpLR(self.opt, warmup_factor=0.2, warmup_iters=4, warmup_method="constant") + schedulers[2] = StepLR(self.opt, gamma=0.1, step_size=3) + scheduler = ChainedScheduler(schedulers) + self._test([scheduler], targets, epochs) + def test_compound_step_and_multistep_lr(self): epochs = 10 schedulers = [None] * 2 diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 657a35ad681b0..761a4041668d6 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -603,6 +603,44 @@ def _get_closed_form_lr(self): for base_lr in self.base_lrs] +class ChainedScheduler(_LRScheduler): + """Chains list of learning rate schedulers. It takes a list of chainable learning + rate schedulers and performs consecutive step() functions belong to them by just + one call. + + Args: + schedulers (list): List of chained schedulers. + + Example: + >>> # Assuming optimizer uses lr = 1. for all groups + >>> # lr = 0.09 if epoch == 0 + >>> # lr = 0.081 if epoch == 1 + >>> # lr = 0.729 if epoch == 2 + >>> # lr = 0.6561 if epoch == 3 + >>> # lr = 0.59049 if epoch >= 4 + >>> scheduler1 = WarmUpLR(self.opt, warmup_factor=0.1, warmup_iters=2, warmup_method="constant") + >>> scheduler2 = ExponentialLR(self.opt, gamma=0.9) + >>> scheduler = ChainedScheduler([scheduler1, scheduler2]) + >>> for epoch in range(100): + >>> train(...) + >>> validate(...) + >>> scheduler.step() + """ + + def __init__(self, schedulers): + for scheduler_idx in range(1, len(schedulers)): + if (schedulers[scheduler_idx].optimizer != schedulers[0].optimizer): + raise ValueError( + "ChainedScheduler expects all schedulers to belong to the same optimizer, but " + "got schedulers at index {} and {} to be different".format(0, scheduler_idx) + ) + self.schedulers = list(schedulers) + + def step(self): + for scheduler in self.schedulers: + scheduler.step() + + class ReduceLROnPlateau(object): """Reduce learning rate when a metric has stopped improving. Models often benefit from reducing the learning rate by a factor From d8d8e4902a9ce3426e84817b936699f85a5f698e Mon Sep 17 00:00:00 2001 From: Can Balioglu Date: Thu, 26 Aug 2021 13:55:08 -0700 Subject: [PATCH 265/530] [torch/elastic] Pretty print the failure message captured by @record (#64036) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64036 This PR slightly revises the implementation of the internal `_format_failure()` method in order to pretty print the error message captured in a subprocess by the `record` annotation. With this PR a failure log is formatted as below: ``` Root Cause: [0]: time: 2021-08-26_17:12:07 rank: 0 (local_rank: 0) exitcode: 1 (pid: 8045) error_file: /tmp/torchelastic_6cj9eppm/6d9d844a-6ce4-4838-93ed-1639a9525b00_rec9kuv3/attempt_0/0/error.json msg: { "message": "ValueError: Test", "extraInfo": { "py_callstack": [ " File \"/data/home/balioglu/fail.py\", line 7, in \n main()\n", " File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 373, in wrapper\n error_handler.record_exception(e)\n", " File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/error_handler.py\", line 86, in record_exception\n _write_error(e, self._get_error_file_path())\n", " File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/error_handler.py\", line 26, in _write_error\n \"py_callstack\": traceback.format_stack(),\n" ], "timestamp": "1629997927" } } ``` in contrast to the old formatting: ``` Root Cause: [0]: time: 2021-08-26_17:15:50 rank: 0 (local_rank: 0) exitcode: 1 (pid: 9417) error_file: /tmp/torchelastic_22pwarnq/19f22638-848c-4b8f-8379-677f34fc44e7_u43o9vs7/attempt_0/0/error.json msg: "{'message': 'ValueError: Test', 'extraInfo': {'py_callstack': 'Traceback (most recent call last):\n File "/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 351, in wrapper\n return f(*args, **kwargs)\n File "/data/home/balioglu/fail.py", line 5, in main\n raise ValueError("BALIOGLU")\nValueError: BALIOGLU\n', 'timestamp': '1629998150'}}" ``` ghstack-source-id: 136761768 Test Plan: Run the existing unit tests. Reviewed By: kiukchung Differential Revision: D30579025 fbshipit-source-id: 37df0b7c7ec9b620355766122986c2c77e8495ae --- .../elastic/multiprocessing/errors/__init__.py | 17 +++++++++++++++-- .../multiprocessing/errors/error_handler.py | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/torch/distributed/elastic/multiprocessing/errors/__init__.py b/torch/distributed/elastic/multiprocessing/errors/__init__.py index 7746dbace9af5..ab0e0f3b7c874 100644 --- a/torch/distributed/elastic/multiprocessing/errors/__init__.py +++ b/torch/distributed/elastic/multiprocessing/errors/__init__.py @@ -165,7 +165,7 @@ def timestamp_isoformat(self): rank: ${rank} (local_rank: ${local_rank}) exitcode: ${exitcode} (pid: ${pid}) error_file: ${error_file} - msg: \"${message}\"""" + msg: ${message}""" # extra new lines before and after are intentional _MSG_FORMAT_TEMPLATE = """ @@ -258,6 +258,19 @@ def format_msg(self, boarder_delim="*", section_delim="="): def _format_failure( self, idx: int, rank: int, failure: ProcessFailure ) -> Tuple[str, int]: + if isinstance(failure.message, str): + msg = '"' + failure.message + '"' + else: + try: + dmp = json.dumps(failure.message, indent=2) + except ValueError: + msg = failure.message + else: + msg = os.linesep + # Indent by 4 chars. + for l in dmp.splitlines(): + msg += f" {l}{os.linesep}" + fmt = Template(_FAILURE_FORMAT_TEMPLATE).substitute( idx=idx, time=failure.timestamp_isoformat(), @@ -266,7 +279,7 @@ def _format_failure( exitcode=failure.exitcode, pid=failure.pid, error_file=failure.error_file, - message=failure.message, + message=msg, ) width = 0 for line in fmt.split("\n"): diff --git a/torch/distributed/elastic/multiprocessing/errors/error_handler.py b/torch/distributed/elastic/multiprocessing/errors/error_handler.py index 74586e9fd8523..2974355fae88c 100644 --- a/torch/distributed/elastic/multiprocessing/errors/error_handler.py +++ b/torch/distributed/elastic/multiprocessing/errors/error_handler.py @@ -23,7 +23,7 @@ def _write_error(e: BaseException, error_file: Optional[str]): "message": { "message": f"{type(e).__name__}: {e}", "extraInfo": { - "py_callstack": traceback.format_exc(), + "py_callstack": traceback.format_stack(), "timestamp": str(int(time.time())), }, } From aeec177833cb20e8c6177ef8dbcf02ddc37c8a32 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Thu, 26 Aug 2021 14:09:10 -0700 Subject: [PATCH 266/530] [JIT] UseVariadicOp takes list_idx parameter (#63915) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63915 Previously, this function only worked for variadic op substitutions of the form `op(list, args) -> variadic_op(list_1, ..., list_n, args)`. This change allows for transformations of the form `op(args_0, list, args_1) -> variadic_op(args_0, list_1, ..., list_n, args_1)`. Test Plan: `buck test caffe2/test/cpp/jit:jit -- Stack Concat` (tests exercising `list_idx != 0` will be added further up in this diff stack) Reviewed By: navahgar Differential Revision: D30529729 fbshipit-source-id: 568080679c3b40bdaedee56bef2e8a5ce7985d2f --- torch/csrc/jit/passes/variadic_ops.cpp | 47 +++++++++++++++++++------- torch/csrc/jit/passes/variadic_ops.h | 12 +++++++ 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/torch/csrc/jit/passes/variadic_ops.cpp b/torch/csrc/jit/passes/variadic_ops.cpp index 6f4d23cec7b66..a827d3a2371d8 100644 --- a/torch/csrc/jit/passes/variadic_ops.cpp +++ b/torch/csrc/jit/passes/variadic_ops.cpp @@ -14,8 +14,12 @@ class VariadicUpdater { explicit VariadicUpdater( std::shared_ptr graph, NodeKind op, - NodeKind variadic_op) - : graph_(std::move(graph)), op_(op), variadic_op_(variadic_op) {} + NodeKind variadic_op, + size_t list_idx = 0) + : graph_(std::move(graph)), + op_(op), + variadic_op_(variadic_op), + list_idx_(list_idx) {} bool run() { collectOpNodes(graph_->block()); @@ -39,21 +43,34 @@ class VariadicUpdater { } bool replaceWithVariadicOp(Node* op_node) { - if (op_node->input(0)->node()->kind() != prim::ListConstruct) { + const size_t num_inputs = op_node->inputs().size(); + TORCH_CHECK(list_idx_ < num_inputs); + if (op_node->input(list_idx_)->node()->kind() != prim::ListConstruct) { return false; } - auto list = op_node->input(0)->node(); + auto list = op_node->input(list_idx_)->node(); + const size_t list_len = list->inputs().size(); + // We do not transform ops whose list input can not be moved to the // position before op. This in turn implies that there is some mutation // of the input list before op. if (!getOrCreateAliasDb()->couldMoveBeforeTopologically(list, op_node)) { return false; } - std::vector inputs = list->inputs().vec(); - // Add non-list inputs - for (size_t i = 1; i < op_node->inputs().size(); ++i) { - inputs.push_back(op_node->input(i)); - } + + // Construct new inputs + std::vector inputs; + inputs.reserve(num_inputs + list_len - 1); + inputs.insert( + inputs.end(), + op_node->inputs().begin(), + op_node->inputs().begin() + list_idx_); + inputs.insert(inputs.end(), list->inputs().begin(), list->inputs().end()); + inputs.insert( + inputs.end(), + op_node->inputs().begin() + list_idx_ + 1, + op_node->inputs().end()); + auto var_op_node = op_node->owningGraph()->create(variadic_op_, inputs); GRAPH_UPDATE("Adding\n", *var_op_node); var_op_node->insertBefore(op_node); @@ -82,6 +99,8 @@ class VariadicUpdater { NodeKind op_; NodeKind variadic_op_; + + size_t list_idx_; }; } // namespace @@ -89,10 +108,11 @@ class VariadicUpdater { bool UseVariadicOp( const std::shared_ptr& graph, NodeKind op, - NodeKind variadic_op) { + NodeKind variadic_op, + size_t list_idx) { const std::string pass_name = std::string("variadic ") + op.toQualString(); GRAPH_DUMP("Before " + pass_name, graph); - bool changed = VariadicUpdater(graph, op, variadic_op).run(); + bool changed = VariadicUpdater(graph, op, variadic_op, list_idx).run(); if (changed) { GRAPH_DUMP("After " + pass_name, graph); } @@ -102,13 +122,14 @@ bool UseVariadicOp( bool RemoveListMutationAndUseVariadicOp( const std::shared_ptr& graph, NodeKind op, - NodeKind variadic_op) { + NodeKind variadic_op, + size_t list_idx) { bool changed_in_last_iter = true; bool changed = false; while (changed_in_last_iter) { changed_in_last_iter = RemoveListMutation(graph); changed_in_last_iter = - UseVariadicOp(graph, op, variadic_op) || changed_in_last_iter; + UseVariadicOp(graph, op, variadic_op, list_idx) || changed_in_last_iter; changed = changed || changed_in_last_iter; } return changed; diff --git a/torch/csrc/jit/passes/variadic_ops.h b/torch/csrc/jit/passes/variadic_ops.h index 20cc6648dddb4..e5f6a680c5039 100644 --- a/torch/csrc/jit/passes/variadic_ops.h +++ b/torch/csrc/jit/passes/variadic_ops.h @@ -19,5 +19,17 @@ TORCH_API bool UseVariadicStack(const std::shared_ptr& graph); TORCH_API bool RemoveListMutationAndUseVariadicStack( const std::shared_ptr& graph); +TORCH_API bool UseVariadicOp( + const std::shared_ptr& graph, + NodeKind op, + NodeKind variadic_op, + size_t list_idx = 0); + +TORCH_API bool RemoveListMutationAndUseVariadicOp( + const std::shared_ptr& graph, + NodeKind op, + NodeKind variadic_op, + size_t list_idx = 0); + } // namespace jit } // namespace torch From 7861dba7f697f91a19d7fa137a2e15799959e2ca Mon Sep 17 00:00:00 2001 From: Facebook Community Bot Date: Thu, 26 Aug 2021 15:18:37 -0700 Subject: [PATCH 267/530] Automated submodule update: FBGEMM (#62879) Summary: This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM). New submodule commit: https://github.com/pytorch/FBGEMM/commit/ce5470385723b0262b47250d6af05f1b734e4509 Pull Request resolved: https://github.com/pytorch/pytorch/pull/62879 Test Plan: Ensure that CI jobs succeed on GitHub before landing. Reviewed By: jspark1105 Differential Revision: D30154801 fbshipit-source-id: b2ce185da6f6cadf5128f82b15097d9e13e9e6a0 --- third_party/fbgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/fbgemm b/third_party/fbgemm index 10ec0d3388579..d4902e94367b9 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 10ec0d33885795e6b4cc9a17896ee3f25b48fa8e +Subproject commit d4902e94367b9f074cadd29d7dc5ef6b0c69c6c1 From 085278f8b141579c5d5481a8fb96c7dfa830b262 Mon Sep 17 00:00:00 2001 From: MengeTM <34686199+MengeTM@users.noreply.github.com> Date: Thu, 26 Aug 2021 15:32:06 -0700 Subject: [PATCH 268/530] Derivatives of relu (#63027) (#63089) Summary: Optimization of relu and leaky_relu derivatives for reduction of VRAM needed for the backward-passes Fixes https://github.com/pytorch/pytorch/issues/63027 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63089 Reviewed By: iramazanli Differential Revision: D30582049 Pulled By: albanD fbshipit-source-id: a9481fe8c10cbfe2db485e28ce80cabfef501eb8 --- tools/autograd/derivatives.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 49e574a1651ba..641471ebc8f06 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -1604,10 +1604,6 @@ self: soft_margin_loss_backward(grad, self, target, reduction) - name: relu(Tensor self) -> Tensor - self: threshold_backward(grad, self, 0) - -# NB: `output` instead of `self` saves memory. It avoids saving a copy of self. -- name: relu_(Tensor(a!) self) -> Tensor(a!) self: threshold_backward(grad, result, 0) - name: silu(Tensor self) -> Tensor From 49b782b2cb09c80a8c476287509a4f566cc597d2 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 26 Aug 2021 15:42:00 -0700 Subject: [PATCH 269/530] Add shard number to print_test_stats.py upload name (#64055) Summary: Now that the render test results job is gone, each shard on GHA is uploading a JSON test stats report. To ensure differentiation, this PR includes the shard number in the report name. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64055 Reviewed By: iramazanli Differential Revision: D30586869 Pulled By: janeyx99 fbshipit-source-id: fd19f347131deec51486bb0795e4e13ac19bc71a --- tools/stats/print_test_stats.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/stats/print_test_stats.py b/tools/stats/print_test_stats.py index 71df463b14516..1f4c33e8feb43 100755 --- a/tools/stats/print_test_stats.py +++ b/tools/stats/print_test_stats.py @@ -781,14 +781,16 @@ def assemble_s3_object( def send_report_to_s3(head_report: Version2Report) -> None: job = os.getenv('JOB_BASE_NAME', os.environ.get('CIRCLE_JOB')) + # SHARD_NUMBER is specific to GHA jobs, as the shard number would be included in CIRCLE_JOB already + shard = os.environ.get('SHARD_NUMBER', '') sha1 = os.environ.get('CIRCLE_SHA1') branch = os.environ.get('CIRCLE_BRANCH', '') now = datetime.datetime.utcnow().isoformat() if branch not in ['master', 'nightly'] and not branch.startswith("release/"): pr = os.environ.get('CIRCLE_PR_NUMBER', 'unknown') - key = f'pr_test_time/{pr}/{sha1}/{job}/{now}Z.json.bz2' # Z meaning UTC + key = f'pr_test_time/{pr}/{sha1}/{job}{shard}/{now}Z.json.bz2' # Z meaning UTC else: - key = f'test_time/{sha1}/{job}/{now}Z.json.bz2' # Z meaning UTC + key = f'test_time/{sha1}/{job}{shard}/{now}Z.json.bz2' # Z meaning UTC obj = get_S3_object_from_bucket('ossci-metrics', key) # use bz2 because the results are smaller than gzip, and the # compression time penalty we pay is only about half a second for From 49353e319cd6537509f75463f41ddf4ac70e3cb6 Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Thu, 26 Aug 2021 16:00:16 -0700 Subject: [PATCH 270/530] More sharded_tensor creation ops: harded_tensor.zeros, sharded_tensor.full, sharded_tensor.rand (#63732) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63732 Test Plan: $ python test/distributed/_sharded_tensor/test_sharded_tensor.py --v $ python test/distributed/_sharded_tensor/test_sharded_tensor.py TestCreateTensorFromParams --v $ python test/distributed/_sharded_tensor/test_sharded_tensor.py TestShardedTensorChunked --v Imported from OSS Differential Revision: D30472621 D30472621 Reviewed By: pritamdamania87 Pulled By: bowangbj fbshipit-source-id: fd8ebf9b815fdc292ad1aad521f9f4f454163d0e --- .../_sharded_tensor/test_sharded_tensor.py | 197 +++++++++++++++++- torch/distributed/_sharded_tensor/__init__.py | 183 +++++++++++++++- torch/distributed/_sharded_tensor/api.py | 42 +++- 3 files changed, 403 insertions(+), 19 deletions(-) diff --git a/test/distributed/_sharded_tensor/test_sharded_tensor.py b/test/distributed/_sharded_tensor/test_sharded_tensor.py index 6c03d9fdf631c..718b594c831ee 100644 --- a/test/distributed/_sharded_tensor/test_sharded_tensor.py +++ b/test/distributed/_sharded_tensor/test_sharded_tensor.py @@ -126,8 +126,9 @@ def wrapper(self): class TestCreateTensorFromParams(TestCase): @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed') def test_empty(self): + expected_dtype = torch.double tensor_properties = TensorProperties( - dtype=torch.double, + dtype=expected_dtype, layout=torch.strided, requires_grad=False, pin_memory=False, @@ -138,14 +139,15 @@ def test_empty(self): local_tensor = _create_tensor_from_params( 5, 10, local_device=local_device, tensor_init_params=tensor_init_params) self.assertEqual(local_device, local_tensor.device) - self.assertEqual(torch.double, local_tensor.dtype) + self.assertEqual(expected_dtype, local_tensor.dtype) self.assertEqual(torch.strided, local_tensor.layout) self.assertEqual(False, local_tensor.requires_grad) @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed') def test_ones(self): + expected_dtype = torch.double tensor_properties = TensorProperties( - dtype=torch.double, + dtype=expected_dtype, layout=torch.strided, requires_grad=False, pin_memory=False, @@ -153,9 +155,98 @@ def test_ones(self): tensor_init_params = TensorInitParams( create_op=CreateOp.ONES, tensor_properties=tensor_properties) local_device = torch.device('cuda:0') + h, w = 5, 10 local_tensor = _create_tensor_from_params( - 5, 10, local_device=local_device, tensor_init_params=tensor_init_params) - expected_tensor = torch.ones(5, 10, device=local_device, dtype=torch.double) + h, w, local_device=local_device, tensor_init_params=tensor_init_params) + expected_tensor = torch.ones(h, w, device=local_device, dtype=expected_dtype) + self.assertEqual(expected_tensor, local_tensor) + + @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed') + def test_zeros(self): + expected_dtype = torch.int32 + tensor_properties = TensorProperties( + dtype=expected_dtype, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, + ) + tensor_init_params = TensorInitParams(create_op=CreateOp.ZEROS, tensor_properties=tensor_properties, ) + local_device = torch.device('cuda:0') + h, w = 5, 10 + local_tensor = _create_tensor_from_params( + h, w, local_device=local_device, tensor_init_params=tensor_init_params) + expected_tensor = torch.zeros(h, w, device=local_device, dtype=expected_dtype) + self.assertEqual(expected_tensor, local_tensor) + + @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed') + def test_rand(self): + expected_dtype = torch.double + tensor_properties = TensorProperties( + dtype=expected_dtype, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, + ) + tensor_init_params = TensorInitParams(create_op=CreateOp.RAND, tensor_properties=tensor_properties, ) + local_device = torch.device('cuda:0') + h, w = 5, 10 + seed = 13 + torch.cuda.manual_seed(seed) + local_tensor = _create_tensor_from_params( + h, w, local_device=local_device, tensor_init_params=tensor_init_params) + # reset seed to ensure same random numbers are generated + torch.cuda.manual_seed(seed) + expected_tensor = torch.rand(h, w, device=local_device, dtype=expected_dtype) + self.assertEqual(expected_tensor, local_tensor) + + @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed') + def test_full_with_dtype_inferred(self): + fill_value = 23.5 + tensor_properties = TensorProperties( + # tensor's dtype can be inferred from fill_value + dtype=None, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, + ) + tensor_init_params = TensorInitParams( + create_op=CreateOp.FULL, + fill_value=fill_value, + tensor_properties=tensor_properties, ) + local_device = torch.device('cuda:0') + h, w = 5, 10 + local_tensor = _create_tensor_from_params( + h, w, local_device=local_device, tensor_init_params=tensor_init_params) + # local_tensor.dtype is inferred from fill_value (float32). + self.assertEqual(torch.float32, local_tensor.dtype) + expected_tensor = torch.full((h, w), fill_value=fill_value, device=local_device) + self.assertEqual(expected_tensor, local_tensor) + + @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed') + def test_full_with_dtype_overridden(self): + fill_value = 23.5 + tensor_properties = TensorProperties( + # tensor's dtype can be inferred from fill_value + dtype=torch.double, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, + ) + tensor_init_params = TensorInitParams( + create_op=CreateOp.FULL, + fill_value=fill_value, + tensor_properties=tensor_properties, ) + local_device = torch.device('cuda:0') + h, w = 5, 10 + local_tensor = _create_tensor_from_params( + h, w, local_device=local_device, tensor_init_params=tensor_init_params) + # local_tensor.dtype is overridden. + self.assertEqual(torch.double, local_tensor.dtype) + expected_tensor = torch.full((h, w), fill_value=fill_value, device=local_device, dtype=torch.double) self.assertEqual(expected_tensor, local_tensor) class TestShardedTensorChunked(ShardedTensorTestBase, MultiProcessTestCase): @@ -292,6 +383,102 @@ def test_create_sharded_tensor_with_ones(self): self.assertEqual((expected_h, w), local_shard.size()) self.assertEqual(local_shard, torch.ones(expected_h, w)) + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_create_sharded_tensor_with_zeros(self): + """ Test _sharded_tensor.zeros(...) """ + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + h, w = 10, 20 + sharded_tensor = _sharded_tensor.zeros(spec, h, w) + + # Validate local shard is initialized with torch.zeros + local_shards = sharded_tensor.local_shards() + self.assertEqual(1, len(local_shards)) + local_shard = local_shards[0].tensor + self.assertEqual(torch.device(f"cuda:{self.rank}"), local_shard.device) + # The split: for rank!=3 ceil(h/4)=3 for rank=3 1 + expected_h = 1 if self.rank == 3 else math.ceil(h / 4) + self.assertEqual((expected_h, w), local_shard.size()) + self.assertEqual(local_shard, torch.zeros(expected_h, w)) + + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_create_sharded_tensor_with_rand(self): + """ Test _sharded_tensor.rand(...) """ + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + h, w = 8, 2 + seed = 1234 + + expected_h = 2 + expected_device = torch.device(f"cuda:{self.rank}") + dtype = torch.double + torch.manual_seed(seed) + expected = torch.rand(expected_h, w, device=expected_device, dtype=dtype) + # reset seed to ensure the same random numbers are generated + torch.manual_seed(seed) + sharded_tensor = _sharded_tensor.rand(spec, h, w, dtype=dtype) + + # Validate local shard is initialized with torch.rand + local_shards = sharded_tensor.local_shards() + self.assertEqual(1, len(local_shards)) + local_shard = local_shards[0].tensor + self.assertEqual(expected_device, local_shard.device) + self.assertEqual((expected_h, w), local_shard.size()) + self.assertEqual(expected, local_shard) + + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_create_sharded_tensor_with_full(self): + """ Test _sharded_tensor.full(...) """ + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + h, w = 10, 20 + fill_value = 1234 + sharded_tensor = _sharded_tensor.full(spec, size=(h, w), fill_value=fill_value, dtype=torch.int32) + + # Validate local shard is initialized with torch.full + local_shards = sharded_tensor.local_shards() + self.assertEqual(1, len(local_shards)) + local_shard = local_shards[0].tensor + self.assertEqual(torch.device(f"cuda:{self.rank}"), local_shard.device) + # The split: for rank!=3 ceil(h/4)=3 for rank=3 1 + expected_h = 1 if self.rank == 3 else math.ceil(h / 4) + self.assertEqual((expected_h, w), local_shard.size()) + self.assertEqual(local_shard, + torch.full(size=(expected_h, w), fill_value=fill_value, dtype=torch.int32)) + + @with_comms @skip_if_lt_x_gpu(4) @requires_nccl() diff --git a/torch/distributed/_sharded_tensor/__init__.py b/torch/distributed/_sharded_tensor/__init__.py index 4cbdded8ba1c4..4f8646d54268c 100644 --- a/torch/distributed/_sharded_tensor/__init__.py +++ b/torch/distributed/_sharded_tensor/__init__.py @@ -1,7 +1,5 @@ -from typing import List +# coding=utf-8 -import torch -from torch.distributed._sharding_spec import ShardingSpec from .api import ( CreateOp, Shard, @@ -11,6 +9,9 @@ TensorProperties, load_with_process_group, ) +from torch.distributed._sharding_spec import ShardingSpec +from typing import List +import torch def empty(sharding_spec: ShardingSpec, @@ -23,7 +24,8 @@ def empty(sharding_spec: ShardingSpec, process_group=None, init_rrefs=False): """ - Creates an empty :class:`ShardedTensor`. Needs to be called on all ranks in an SPMD fashion. + Returns a :class:`ShardedTensor` filled with uninitialized data. + Needs to be called on all ranks in an SPMD fashion. Args: sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification @@ -74,7 +76,8 @@ def ones(sharding_spec: ShardingSpec, process_group=None, init_rrefs=False): """ - Creates a ones :class:`ShardedTensor`. Needs to be called on all ranks in an SPMD fashion. + Returns a :class:`ShardedTensor` with the scalar value 1. + Needs to be called on all ranks in an SPMD fashion. Args: sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification @@ -113,10 +116,172 @@ def ones(sharding_spec: ShardingSpec, init_rrefs=init_rrefs, ) -def init_from_local_shards(local_shards: List[Shard], - sharded_tensor_metadata: ShardedTensorMetadata, - process_group=None, - init_rrefs=False): + +def rand(sharding_spec: ShardingSpec, + *size, + dtype=None, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, + process_group=None, + init_rrefs=False): + """ + Returns a :class:`ShardedTensor` filled with random numbers from a uniform distribution on the + interval :math:`[0, 1)`. Needs to be called on all ranks in an SPMD fashion. + + Args: + sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification + describing how to shard the Tensor. + size (int...): a sequence of integers defining the shape of the output + tensor. Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_tensor_type`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + process_group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. + init_rrefs (bool, optional): Whether or not to initialize + :class:`torch.distributed.rpc.RRef`s pointing to remote shards. + Need to initialize the RPC Framework if specified as ``True``. + Default: ``False``. + + Returns: + A :class:`ShardedTensor` object on each rank + """ + tensor_properties = TensorProperties( + dtype=dtype, layout=layout, requires_grad=requires_grad, + pin_memory=pin_memory, memory_format=memory_format + ) + tensor_init_params = TensorInitParams(create_op=CreateOp.RAND, tensor_properties=tensor_properties, ) + return ShardedTensor( + sharding_spec, + *size, + tensor_init_params=tensor_init_params, + process_group=process_group, + init_rrefs=init_rrefs, + ) + + +def zeros(sharding_spec: ShardingSpec, + *size, + dtype=None, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, + process_group=None, + init_rrefs=False): + """ + Returns a :class:`ShardedTensor` filled with the scalar value 0. + Needs to be called on all ranks in an SPMD fashion. + + Args: + sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification + describing how to shard the Tensor. + size (int...): a sequence of integers defining the shape of the output + tensor. Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_tensor_type`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + process_group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. + init_rrefs (bool, optional): Whether or not to initialize + :class:`torch.distributed.rpc.RRef`s pointing to remote shards. + Need to initialize the RPC Framework if specified as ``True``. + Default: ``False``. + + Returns: + A :class:`ShardedTensor` object on each rank + """ + tensor_properties = TensorProperties( + dtype=dtype, layout=layout, requires_grad=requires_grad, + pin_memory=pin_memory, memory_format=memory_format, + ) + tensor_init_params = TensorInitParams(create_op=CreateOp.ZEROS, tensor_properties=tensor_properties, ) + return ShardedTensor( + sharding_spec, + *size, + tensor_init_params=tensor_init_params, + process_group=process_group, + init_rrefs=init_rrefs, + ) + + +def full(sharding_spec: ShardingSpec, + size, + fill_value=torch.types.Number, + dtype=None, + layout=torch.strided, + requires_grad=False, + pin_memory=False, + memory_format=torch.contiguous_format, + process_group=None, + init_rrefs=False): + """ + Creates a :class:`ShardedTensor` filled with fill_value. The tensor’s dtype + is inferred from fill_value. If dtype is specified, it will override the + inferred type from fill_value. Needs to be called on all ranks in an SPMD fashion. + + Args: + sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification + describing how to shard the Tensor. + size (int...): a list, tuple, or `torch.Size` of integers defining the shape of the + output tensor. + fill_value (Scalar) – the value to fill the output tensor with. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_tensor_type`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + process_group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. + init_rrefs (bool, optional): Whether or not to initialize + :class:`torch.distributed.rpc.RRef`s pointing to remote shards. + Need to initialize the RPC Framework if specified as ``True``. + Default: ``False``. + + Returns: + A :class:`ShardedTensor` object on each rank + """ + tensor_properties = TensorProperties( + dtype=dtype, layout=layout, requires_grad=requires_grad, + pin_memory=pin_memory, memory_format=memory_format, + ) + tensor_init_params = TensorInitParams( + create_op=CreateOp.FULL, fill_value=fill_value, tensor_properties=tensor_properties) + return ShardedTensor( + sharding_spec, + *size, + tensor_init_params=tensor_init_params, + process_group=process_group, + init_rrefs=init_rrefs, + ) + + +def init_from_local_shards( + local_shards: List[Shard], + sharded_tensor_metadata: ShardedTensorMetadata, + process_group=None, + init_rrefs=False): """ Creates an :class:`ShardedTensor` from local shards and the global metadata. Needs to be called on all ranks in an SPMD fashion. diff --git a/torch/distributed/_sharded_tensor/api.py b/torch/distributed/_sharded_tensor/api.py index ae1a3a9f38844..3b7476dc25bcf 100644 --- a/torch/distributed/_sharded_tensor/api.py +++ b/torch/distributed/_sharded_tensor/api.py @@ -22,7 +22,7 @@ check_tensor, validate_non_overlapping_shards_metadata ) - +from torch.types import Number # Tracking for sharded tensor objects. _sharded_tensor_lock = threading.Lock() @@ -143,17 +143,28 @@ def _register_remote_shards(sharded_tensor_id: int, rrefs: List[rpc.RRef[Shard]] class CreateOp(Enum): EMPTY = 0 - ONES = 1 + FULL = 1 + ONES = 2 + RAND = 3 + ZEROS = 4 @dataclass class TensorInitParams(object): """ Container for list of common params to create new local tensor. """ - __slots__ = ['create_op', 'tensor_properties'] - create_op: CreateOp - tensor_properties: TensorProperties + + # needed when create_op is FULL + # default set to False (not None) since None is incompatible with Number. + fill_value: Number = field(default=False) + + tensor_properties: TensorProperties = field( + default=TensorProperties(dtype=torch.get_default_dtype(), + layout=torch.strided, + requires_grad=False, + memory_format=torch.contiguous_format, + pin_memory=False)) class ShardedTensor(object): @@ -684,5 +695,26 @@ def _create_tensor_from_params(*size, local_device, tensor_init_params: TensorIn device=local_device, requires_grad=requires_grad, # NB: memory_format param is not accepted by torch.ones memory_format=memory_format, pin_memory=pin_memory,) + elif tensor_init_params.create_op == CreateOp.ZEROS: + return torch.zeros(*size, + dtype=dtype, + layout=layout, + device=local_device, + pin_memory=pin_memory, + requires_grad=requires_grad,) + elif tensor_init_params.create_op == CreateOp.RAND: + return torch.rand(*size, + dtype=dtype, + layout=layout, + device=local_device, + pin_memory=pin_memory, + requires_grad=requires_grad,) + elif tensor_init_params.create_op == CreateOp.FULL: + return torch.full(size=size, + fill_value=tensor_init_params.fill_value, + layout=layout, + dtype=dtype, + requires_grad=requires_grad, + device=local_device, ) else: raise ValueError(f'Unsupported create_op: {tensor_init_params.create_op}') From 92a154aa29186afea961a6fe491721229543535f Mon Sep 17 00:00:00 2001 From: soulitzer Date: Thu, 26 Aug 2021 16:00:21 -0700 Subject: [PATCH 271/530] Move variabletype functions around (#63330) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63330 - This is in preparation for templated/boxed autograd-not-implemented fallback - Make sure VariableTypeUtils does not depend on generated code - Lift `isFwGradDefined` into `autograd/functions/utils.cpp` so it's available to mobile builds - Removes `using namespace at` from VariableTypeUtils, previously we needed this for Templated version, but now its not strictly necessary but still a good change to avoid name conflicts if this header is included elsewhere in the future. Test Plan: Imported from OSS Reviewed By: heitorschueroff Differential Revision: D30518573 Pulled By: soulitzer fbshipit-source-id: a0fb904baafc9713de609fffec4b813f6cfcc000 --- tools/autograd/templates/VariableType.cpp | 1 + torch/csrc/autograd/FunctionsManual.cpp | 6 +-- torch/csrc/autograd/FunctionsManual.h | 1 - torch/csrc/autograd/VariableTypeManual.cpp | 5 ++- torch/csrc/autograd/VariableTypeUtils.h | 50 ++++++++++------------ torch/csrc/autograd/functions/utils.h | 5 +++ 6 files changed, 34 insertions(+), 34 deletions(-) diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 1ff3604ec21ea..605a700fb1a47 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -1,4 +1,5 @@ #include "torch/csrc/autograd/VariableTypeUtils.h" +#include "torch/csrc/autograd/generated/VariableType.h" #include "torch/csrc/autograd/FunctionsManual.h" #include diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 86639c13ea678..95170f073fc38 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -1,5 +1,7 @@ #include #include +#include + #include #include @@ -44,10 +46,6 @@ bool isDefined(const c10::optional& t) { return t.has_value() && t->defined(); } -bool isFwGradDefined(const c10::optional& t) { - return t.has_value() && t->defined() && t->_fw_grad(/*level */ 0).defined(); -} - Tensor toNonOptTensor(const c10::optional& t) { return t.has_value() ? *t : Tensor(); } diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index d397f55d15189..31a972e3f3280 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -31,7 +31,6 @@ struct IndexRangeGenerator { size_t i = 0; }; -bool isFwGradDefined(const c10::optional& t); Tensor toNonOptFwGrad(const c10::optional& t); Tensor toNonOptPrimal(const c10::optional& t); Tensor toNonOptTensor(const c10::optional& t); diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index f409daa9b83d6..25f05fc110177 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -100,7 +101,7 @@ Tensor _fw_primal(c10::DispatchKeySet ks, const Tensor & self, int64_t level) { if (grad_fn) { set_history(flatten_tensor_args( result ), grad_fn); } - if (generated::details::isFwGradDefined(self)) { + if (isFwGradDefined(self)) { // Modified from original codegen // We explicitly want to ignore the forward grad at the given level TORCH_CHECK(level == 0, "Invalid level given to _fw_primal"); @@ -131,7 +132,7 @@ Tensor & copy_(c10::DispatchKeySet ks, Tensor & self, const Tensor & src, bool n rebase_history(self , std::move(grad_fn)); if (isDifferentiableType(self.scalar_type()) && - (generated::details::isFwGradDefined(self) || generated::details::isFwGradDefined(src))) { + (isFwGradDefined(self) || isFwGradDefined(src))) { auto self_fw_grad = generated::details::toNonOptFwGrad(self); auto src_fw_grad = generated::details::toNonOptFwGrad(src); Tensor new_fw_grad; diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index bde2dc46352da..977e9e4cecd5c 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -1,14 +1,12 @@ #pragma once #include -#include #include #include #include #include #include -#include #include #include #include @@ -35,9 +33,6 @@ #endif #endif -using namespace at; -using namespace torch::autograd::generated; - namespace torch { namespace autograd { // The requires_grad argument is used to know if the inplace operation needs @@ -47,7 +42,7 @@ namespace torch { namespace autograd { // a = torch.rand(2) // b = torch.rand(2, requires_grad=True) // a.copy_(b) -inline void check_inplace(const Tensor& tensor, bool requires_grad) { +inline void check_inplace(const at::Tensor& tensor, bool requires_grad) { if (requires_grad && GradMode::is_enabled()) { auto diff_view_meta = impl::get_view_autograd_meta(tensor); if (diff_view_meta && diff_view_meta->has_bw_view()) { @@ -65,7 +60,7 @@ inline void check_inplace(const Tensor& tensor, bool requires_grad) { } } -inline void check_inplace(const TensorList tensors, bool requires_grad) { +inline void check_inplace(const at::TensorList tensors, bool requires_grad) { for (const auto& tensor : tensors) { check_inplace(tensor, requires_grad); } @@ -77,14 +72,14 @@ inline void throw_error_out_requires_grad(const char* name) { "but one of the arguments requires grad."); } -inline void throw_error_for_complex_autograd(const Tensor& tensor, const char* name) { +inline void throw_error_for_complex_autograd(const at::Tensor& tensor, const char* name) { if (tensor.requires_grad()) { TORCH_CHECK(!tensor.is_complex(), name, " does not support automatic differentiation for outputs with complex dtype."); } } -inline void throw_error_for_complex_autograd(const TensorList& tensorlist, const char* name) { +inline void throw_error_for_complex_autograd(const at::TensorList& tensorlist, const char* name) { for (const auto& tensor: tensorlist) { throw_error_for_complex_autograd(tensor, name); } @@ -114,7 +109,7 @@ inline void rebase_history(std::vector&& vars, std::shared_ptr g } } -inline void increment_version(const Tensor & t) { +inline void increment_version(const at::Tensor & t) { impl::bump_version(t); } @@ -138,8 +133,8 @@ template inline variable_list flatten_tensor_args(Args&&... ar } // See NOTE [ Autograd View Variables ] for details. -inline Tensor as_view(const Tensor & base, const Tensor & tensor, bool is_bw_differentiable, - bool is_fw_differentiable, std::function view_func=nullptr, +inline at::Tensor as_view(const at::Tensor & base, const at::Tensor & tensor, bool is_bw_differentiable, + bool is_fw_differentiable, std::function view_func=nullptr, CreationMeta creation_meta=CreationMeta::DEFAULT, bool allow_tensor_metadata_change=true) { // Note [View of inference tensor] // For inference tensor this code can only be hit outside InferenceMode @@ -202,7 +197,7 @@ inline Tensor as_view(const Tensor & base, const Tensor & tensor, bool is_bw_dif } // See NOTE [ Autograd View Variables ] for details. -inline std::vector as_view(const Tensor & base, std::vector& tensors, bool is_bw_differentiable, +inline std::vector as_view(const at::Tensor & base, std::vector& tensors, bool is_bw_differentiable, bool is_fw_differentiable, CreationMeta creation_meta=CreationMeta::DEFAULT) { // See Note [View of inference tensor] if (base.is_inference()) return tensors; @@ -228,7 +223,7 @@ inline std::vector as_view(const Tensor & base, std::vector& ten new_shared_info = ViewInfo(base, /* view_func */ nullptr); } - for(Tensor &tensor : tensors) { + for(at::Tensor &tensor : tensors) { if (is_fw_differentiable || is_bw_differentiable) { tensor = make_variable_differentiable_view(tensor, new_shared_info, c10::nullopt, /*shared_view_info*/ true, creation_meta); } else { @@ -282,7 +277,7 @@ inline std::vector as_view(const Tensor & base, std::vector& ten creation_meta = propagate_creation_meta(diff_view_meta->get_creation_meta(), creation_meta); } - for(Tensor &tensor : tensors) { + for(at::Tensor &tensor : tensors) { if (is_fw_differentiable || is_bw_differentiable) { tensor = make_variable_differentiable_view(tensor, new_bw_info, new_fw_info, /*shared_view_info*/ false, creation_meta); } else { @@ -292,20 +287,20 @@ inline std::vector as_view(const Tensor & base, std::vector& ten return tensors; } -inline void check_no_requires_grad(const Tensor& tensor, const char* name, +inline void check_no_requires_grad(const at::Tensor& tensor, const char* name, const char* fn_name="", bool check_grad_mode=true) { TORCH_CHECK(!(tensor.defined() && tensor.requires_grad()) || !(check_grad_mode && GradMode::is_enabled()), "The function '", fn_name, "' is not differentiable with respect to argument '", name, "'. This input cannot have requires_grad True."); } -inline void check_no_requires_grad(const c10::optional& tensor, const char* name, const char* fn_name="") { +inline void check_no_requires_grad(const c10::optional& tensor, const char* name, const char* fn_name="") { if (tensor.has_value()) { check_no_requires_grad(*tensor, name, fn_name); } } -inline void check_no_requires_grad(TensorList tensors, const char* name, const char* fn_name="") { +inline void check_no_requires_grad(at::TensorList tensors, const char* name, const char* fn_name="") { // GradMode check is expensive, so check it only once for TensorLists if (!GradMode::is_enabled()) { return; @@ -315,12 +310,12 @@ inline void check_no_requires_grad(TensorList tensors, const char* name, const c } } -inline void check_no_requires_grad(const c10::List>& tensors, const char* name, const char* fn_name="") { +inline void check_no_requires_grad(const c10::List>& tensors, const char* name, const char* fn_name="") { // GradMode check is expensive, so check it only once for TensorLists if (!GradMode::is_enabled()) { return; } - for (c10::optional tensor : tensors) { + for (c10::optional tensor : tensors) { if (tensor.has_value()) { check_no_requires_grad(*tensor, name, fn_name, /*check_grad_mode*/ false); } @@ -328,23 +323,23 @@ inline void check_no_requires_grad(const c10::List>& tenso } // Assumed that saved tensor lists are never inplace outputs -inline std::vector make_saved_variable_list(TensorList tensors) { - return fmap(tensors, [](const Tensor& tensor) -> SavedVariable { +inline std::vector make_saved_variable_list(at::TensorList tensors) { + return fmap(tensors, [](const at::Tensor& tensor) -> SavedVariable { return SavedVariable{tensor, false /* is output */}; }); } // Assumed that saved tensor lists are never inplace outputs inline std::vector make_saved_variable_list(const c10::List>& tensors) { - return fmap(tensors, [](const c10::optional& tensor) -> SavedVariable { + return fmap(tensors, [](const c10::optional& tensor) -> SavedVariable { if (tensor.has_value()) { return SavedVariable{*tensor, false /* is output */}; } else { - return SavedVariable{Tensor(), false /* is output */}; + return SavedVariable{at::Tensor(), false /* is output */}; } }); } -inline std::vector> to_args_sizes(TensorList tensors) { +inline std::vector> to_args_sizes(at::TensorList tensors) { std::vector> args_sizes(tensors.size()); for (const auto i : c10::irange(tensors.size())) { args_sizes[i] = tensors[i].sizes().vec(); @@ -352,11 +347,12 @@ inline std::vector> to_args_sizes(TensorList tensors) { return args_sizes; } -inline std::vector to_args_scalartypes(TensorList tensors) { - std::vector args_scalartypes(tensors.size()); +inline std::vector to_args_scalartypes(at::TensorList tensors) { + std::vector args_scalartypes(tensors.size()); for (const auto i : c10::irange(tensors.size())) { args_scalartypes[i] = tensors[i].scalar_type(); } return args_scalartypes; } + }} // namespace torch::autograd diff --git a/torch/csrc/autograd/functions/utils.h b/torch/csrc/autograd/functions/utils.h index 90811e2a30a37..331db5d32cb79 100644 --- a/torch/csrc/autograd/functions/utils.h +++ b/torch/csrc/autograd/functions/utils.h @@ -86,4 +86,9 @@ inline void set_history( set_history(variable, grad_fn); } } + +inline bool isFwGradDefined(const c10::optional& t) { + return t.has_value() && t->defined() && t->_fw_grad(/*level */ 0).defined(); +} + }} From dfa35ab3e710848353aa1d313c5d9127ed2ef745 Mon Sep 17 00:00:00 2001 From: Shijun Kong Date: Thu, 26 Aug 2021 16:06:17 -0700 Subject: [PATCH 272/530] [pytorch][quant][oss] Support 2-bit embedding_bag op "embedding_bag_2bit_rowwise_offsets" (#63658) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63658 Support 2-bit embedding_bag op "embedding_bag_2bit_rowwise_offsets" Reviewed By: jingsh, supriyar Differential Revision: D30454994 fbshipit-source-id: 7aa7bfe405c2ffff639d5658a35181036e162dc9 --- .../native/quantized/cpu/qembeddingbag.cpp | 127 +++++++++++++++--- aten/src/ATen/native/quantized/library.cpp | 1 + test/quantization/core/test_quantized_op.py | 30 +++++ 3 files changed, 142 insertions(+), 16 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index 7adf05a1782ce..6aae3ba02ae09 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -141,9 +141,10 @@ at::Tensor& embedding_lookup_fallback_impl( } template -at::Tensor& embedding_bag_4bit_impl( +at::Tensor& embedding_bag_nbit_impl( at::Tensor& output, const at::Tensor& weight, + const int bit_width, const at::Tensor& indices, const at::Tensor& offsets, bool pruned_weights, @@ -174,8 +175,9 @@ at::Tensor& embedding_bag_4bit_impl( const auto weight_sizes = weight.sizes(); const int64_t weight_size = weight_sizes[1]; + int NUM_ELEM_PER_BYTE = 8 / bit_width; const int64_t D = - (weight_size - 4) * 2; // NB: 2-byte fp16 scale and 2-byte zero_offset + (weight_size - 2 * sizeof(at::Half)) * NUM_ELEM_PER_BYTE; // NB: 2-byte fp16 scale and 2-byte zero_offset const int64_t M = offsets.sizes()[0]; int64_t output_size = M - 1; @@ -211,7 +213,7 @@ at::Tensor& embedding_bag_4bit_impl( if (!pruned_weights || fallback_to_no_sparse) { // Generate the fbgemm kernel auto kernel = fbgemm::GenerateEmbeddingSpMDMNBit( - /*bit rate=*/4, + /*bit rate=*/bit_width, /*block size=*/block_size, /*has weights=*/per_sample_weights_.has_value(), /*normalize_by_lengths=*/false, @@ -234,11 +236,13 @@ at::Tensor& embedding_bag_4bit_impl( TORCH_CHECK( success, - "FBGEMM GenerateEmbeddingSpMDMNBit kernel failed for 4-bit input"); + "FBGEMM GenerateEmbeddingSpMDMNBit kernel failed for ", + bit_width, + "-bit input"); } else { auto kernel = fbgemm::GenerateEmbeddingSpMDMNBitRowWiseSparse( - /*bit rate=*/4, + /*bit rate=*/bit_width, /*block_size=*/block_size, /*has weights=*/per_sample_weights_.has_value(), /*normalize_by_lengths=*/false, @@ -260,11 +264,14 @@ at::Tensor& embedding_bag_4bit_impl( /*compressed_indices_table=*/compressed_indices_mapping_data); TORCH_CHECK( success, - "FBGEMM GenerateEmbeddingSpMDMNBitRowWiseSparse kernel failed for 4-bit input"); + "FBGEMM GenerateEmbeddingSpMDMNBitRowWiseSparse kernel failed for ", + bit_width, + "-bit input"); } return output; #else - return embedding_lookup_fallback_impl( + if (bit_width == 4) { + return embedding_lookup_fallback_impl( weight, indices, offsets, @@ -275,6 +282,19 @@ at::Tensor& embedding_bag_4bit_impl( output_size, include_last_offset, (pruned_weights && !fallback_to_no_sparse)); + } + // bit_width == 2 + return embedding_lookup_fallback_impl( + weight, + indices, + offsets, + per_sample_weights_, + compressed_indices_mapping, + output, + D, + output_size, + include_last_offset, + (pruned_weights && !fallback_to_no_sparse)); #endif } @@ -519,9 +539,10 @@ at::Tensor& embedding_bag_byte_helper( is_embedding_op); } -at::Tensor& embedding_bag_4bit_helper( +at::Tensor& _embedding_bag_nbit_helper( at::Tensor& output, const at::Tensor& weight, + const int bit_width, const at::Tensor& indices, const c10::optional& offsets_in, bool pruned_weights, @@ -529,6 +550,10 @@ at::Tensor& embedding_bag_4bit_helper( const c10::optional& compressed_indices_mapping, bool include_last_offset) { c10::MaybeOwned offsets; + TORCH_CHECK( + bit_width == 4 || bit_width == 2, + "qembedding/qembedding_bag operator supports bit_width 2 or 4, got ", + bit_width); TORCH_CHECK( indices.dim() == 1 || indices.dim() == 2, "qembedding/qembedding_bag operator supports 1 or 2d indices, got ", @@ -539,14 +564,14 @@ at::Tensor& embedding_bag_4bit_helper( if (indices.dim() == 2) { TORCH_CHECK( !offsets_in.has_value(), - "embedding_bag_4bit operator: input is 2D, then offsets has to be None, as input is treated is a mini-batch of fixed length sequences."); + "embedding_bag_4bit/embedding_bag_2bit operator: input is 2D, then offsets has to be None, as input is treated is a mini-batch of fixed length sequences."); offsets = c10::MaybeOwned::owned(at::arange( 0, indices.numel(), indices.sizes()[1], indices.scalar_type())); } else { TORCH_CHECK( offsets_in.has_value(), - "embedding_bag_4bit operator expects offsets to be set for 1D indices."); + "embedding_bag_4bit/embedding_bag_2bit operator expects offsets to be set for 1D indices."); offsets = c10::MaybeOwned::borrowed(offsets_in.value()); } @@ -568,9 +593,10 @@ at::Tensor& embedding_bag_4bit_helper( // Using helper function to support different type combination without the // need to cast, which can be additional performance overhead if (indices.scalar_type() == at::kInt && offsets->scalar_type() == at::kInt) { - return embedding_bag_4bit_impl( + return embedding_bag_nbit_impl( output, weight, + bit_width, indices, *offsets, pruned_weights, @@ -579,9 +605,10 @@ at::Tensor& embedding_bag_4bit_helper( include_last_offset); } else if ( indices.scalar_type() == at::kInt && offsets->scalar_type() == at::kLong) { - return embedding_bag_4bit_impl( + return embedding_bag_nbit_impl( output, weight, + bit_width, indices, *offsets, pruned_weights, @@ -590,9 +617,10 @@ at::Tensor& embedding_bag_4bit_helper( include_last_offset); } else if ( indices.scalar_type() == at::kLong && offsets->scalar_type() == at::kInt) { - return embedding_bag_4bit_impl( + return embedding_bag_nbit_impl( output, weight, + bit_width, indices, *offsets, pruned_weights, @@ -600,9 +628,10 @@ at::Tensor& embedding_bag_4bit_helper( compressed_indices_mapping, include_last_offset); } - return embedding_bag_4bit_impl( + return embedding_bag_nbit_impl( output, weight, + bit_width, indices, *offsets, pruned_weights, @@ -650,9 +679,10 @@ at::Tensor PackedEmbeddingBagWeight::embeddingbag_4bit( } auto output = at::empty({0}, packed_w.options().dtype(at::kFloat)); - return embedding_bag_4bit_helper( + return _embedding_bag_nbit_helper( output, packed_w, + 4, indices, offsets_in, pruned_weights, @@ -709,9 +739,44 @@ Tensor& embedding_bag_4bit_rowwise_offsets_out( per_sample_weights_.value().scalar_type(), " instead") } - return embedding_bag_4bit_helper( + return _embedding_bag_nbit_helper( + output, + weight, + 4, + indices, + offsets_in, + pruned_weights, + per_sample_weights_.has_value() + ? per_sample_weights_.value().to(at::kFloat) + : per_sample_weights_, + compressed_indices_mapping, + include_last_offset); +} + +Tensor& embedding_bag_2bit_rowwise_offsets_out( + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const c10::optional& offsets_in, + const bool /* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset) { + + if (per_sample_weights_.has_value()) { + TORCH_CHECK( + (per_sample_weights_.value().scalar_type() == at::kFloat || + per_sample_weights_.value().scalar_type() == at::kHalf), + "Expect fp32 or fp16 weights, but found", + per_sample_weights_.value().scalar_type(), + " instead") + } + return _embedding_bag_nbit_helper( output, weight, + 2, indices, offsets_in, pruned_weights, @@ -784,6 +849,33 @@ Tensor embedding_bag_4bit_rowwise_offsets( return output; } +Tensor embedding_bag_2bit_rowwise_offsets( + const Tensor& weight, + const Tensor& indices, + const c10::optional& offsets_in, + const bool /* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset) { + + auto output = create_empty_from(weight, at::kFloat); + embedding_bag_2bit_rowwise_offsets_out( + output, + weight, + indices, + offsets_in, + false, // unused scale_grad_by_freq + 0, // unused mode + pruned_weights, + per_sample_weights_, + compressed_indices_mapping, + include_last_offset + ); + return output; +} + template class QEmbeddingBag final { public: @@ -869,6 +961,9 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) { m.impl( TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_rowwise_offsets"), embedding_bag_4bit_rowwise_offsets); + m.impl( + TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_rowwise_offsets"), + embedding_bag_2bit_rowwise_offsets); } } // namespace } // namespace native diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index 7cdb5cb35817a..8ead74f326ff2 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -128,6 +128,7 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_unpack(Tensor weight) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, bool pruned_weights=False) -> Tensor")); diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index 18212671aabaa..9243fe2440173 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -3318,6 +3318,9 @@ def embedding_bag_rowwise_offsets_run( if bit_rate == 4: pt_op = torch.ops.quantized.embedding_bag_4bit_rowwise_offsets pt_prepack_op = torch.ops.quantized.embedding_bag_4bit_prepack + elif bit_rate == 2: + pt_op = torch.ops.quantized.embedding_bag_2bit_rowwise_offsets + pt_prepack_op = torch.ops.quantized.embedding_bag_2bit_prepack weights = torch.from_numpy((np.random.random_sample(( num_embeddings, embedding_dim)) + 1).astype(np.float32)) @@ -3483,6 +3486,33 @@ def test_embedding_bag_4bit(self, num_embeddings, sparsity=sparsity, atol=0.1, rtol=1e-2) + """ Tests the correctness of the embedding_bag_2bit quantized operator """ + @given(num_embeddings=st.integers(10, 100), + embedding_dim=st.integers(5, 50).filter(lambda x: x % 8 == 0), + num_offsets=st.integers(1, 20), + use_32bit_indices=st.booleans(), + use_32bit_offsets=st.booleans(), + enable_per_sample_weights=st.booleans(), + include_last_offset=st.booleans(), + fallback_to_no_sparse=st.booleans(), + sparsity=st.sampled_from([0.0, 0.5, 0.7])) + def test_embedding_bag_2bit(self, num_embeddings, + embedding_dim, num_offsets, + use_32bit_indices, + use_32bit_offsets, + enable_per_sample_weights, + include_last_offset, + fallback_to_no_sparse, + sparsity): + self.embedding_bag_rowwise_offsets_run(2, num_embeddings, + embedding_dim, num_offsets, + use_32bit_indices, use_32bit_offsets, + enable_per_sample_weights, + include_last_offset, + fallback_to_no_sparse, + sparsity=sparsity, + atol=1.0, rtol=1e-1) + """ Tests the correctness of the quantized embedding lookup operator """ @given(num_embeddings=st.integers(10, 100), embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0)) From 0bd8d0951dcb4063c0f7552a7404bd7f0e7b6e6f Mon Sep 17 00:00:00 2001 From: Don Jang Date: Thu, 26 Aug 2021 16:28:35 -0700 Subject: [PATCH 273/530] [Static Runtime] Remove unnecessary fb::equally_split nodes (#64022) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64022 Test Plan: - Added unittest `StaticRuntime.RemoveEquallySplitListUnpack`. Reviewed By: hlu1 Differential Revision: D30472189 fbshipit-source-id: 36040b0146f4be9d0d0fda293f7205f43aad0b87 --- torch/csrc/jit/runtime/static/passes.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp index c8e1107199528..1133e3924c32a 100644 --- a/torch/csrc/jit/runtime/static/passes.cpp +++ b/torch/csrc/jit/runtime/static/passes.cpp @@ -412,6 +412,7 @@ void ReplaceWithCopy( // c10::AliasAnalysisKind::PURE_FUNCTION to make alias analysis work. void FuseListUnpack(std::shared_ptr& graph) { auto nodes = graph->nodes(); + std::vector equally_splits_to_remove; for (auto it = nodes.begin(); it != nodes.end(); ++it) { Node* node = *it; const char* node_qual_string = node->kind().toQualString(); @@ -445,8 +446,22 @@ void FuseListUnpack(std::shared_ptr& graph) { it_next.destroyCurrent(); // remove list_unpack node->eraseOutput(0); + + if (strcmp(node_qual_string, "fb::equally_split") == 0 && + node->outputs().size() == 1) { + // This captures a case of `y = fb::equally_split(x, 1, _)` where y + // becomes just an alias of x. + // If this case is found, replace y with x to avoid executing this op. + equally_splits_to_remove.push_back(node); + } } } + + for (Node* node : equally_splits_to_remove) { + node->output(0)->replaceAllUsesWith(node->input(0)); + node->destroy(); + } + #ifndef NDEBUG graph->lint(); AliasDb db2(graph); From ed573a8e08fadaa611d568294df14c0a96dc4a81 Mon Sep 17 00:00:00 2001 From: Jiewen Tan Date: Thu, 26 Aug 2021 16:49:13 -0700 Subject: [PATCH 274/530] Enable test_api IMethodTest in OSS (#63345) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63345 This diff did the following few things to enable the tests: 1. Exposed IMethod as TORCH_API. 2. Linked torch_deploy to test_api if USE_DEPLOY == 1. 3. Generated torch::deploy examples when building torch_deploy library. Test Plan: ./build/bin/test_api --gtest_filter=IMethodTest.* Reviewed By: ngimel Differential Revision: D30346257 Pulled By: alanwaketan fbshipit-source-id: 932ae7d45790dfb6e00c51893933a054a0fad86d --- .jenkins/pytorch/test.sh | 5 ++- test/cpp/api/CMakeLists.txt | 8 +++++ test/cpp/api/imethod.cpp | 44 ++++++++++++++++---------- torch/csrc/api/include/torch/imethod.h | 2 +- 4 files changed, 40 insertions(+), 19 deletions(-) diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index daa0da7eeca26..4eb1b35253c91 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -253,6 +253,7 @@ test_libtorch() { ln -sf "$TORCH_LIB_DIR"/libbackend_with_compiler.so "$TORCH_BIN_DIR" ln -sf "$TORCH_LIB_DIR"/libjitbackend_test.so "$TORCH_BIN_DIR" ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR" + ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR" ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR" ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR" @@ -275,7 +276,8 @@ test_libtorch() { python test/cpp/jit/tests_setup.py shutdown # Wait for background download to finish wait - OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" "$TORCH_BIN_DIR"/test_api --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml + # Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy. + OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml "$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml "$TORCH_BIN_DIR"/test_mobile_nnc --gtest_output=xml:$TEST_REPORTS_DIR/test_mobile_nnc.xml if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-py3* ]]; then @@ -488,6 +490,7 @@ test_torch_deploy() { ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR" ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR" "$TORCH_BIN_DIR"/test_deploy + "$TORCH_BIN_DIR"/test_api --gtest_filter='IMethodTest.*' assert_git_not_dirty } diff --git a/test/cpp/api/CMakeLists.txt b/test/cpp/api/CMakeLists.txt index 9bd9d6780fe7d..fc21afaef6a8a 100644 --- a/test/cpp/api/CMakeLists.txt +++ b/test/cpp/api/CMakeLists.txt @@ -41,6 +41,10 @@ set(TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/grad_mode.cpp ) +if(USE_DEPLOY) + list(APPEND TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/imethod.cpp) +endif() + if(USE_CUDA) list(APPEND TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/parallel.cpp) endif() @@ -59,6 +63,10 @@ if(USE_CUDA) target_compile_definitions(test_api PRIVATE "USE_CUDA") endif() +if(USE_DEPLOY) + target_link_libraries(test_api PRIVATE torch_deploy) +endif() + # Workaround for https://github.com/pytorch/pytorch/issues/40941 if(USE_OPENMP AND CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.0.0)) # Compiling transformer.cpp or pow_test.cpp with -O2+ and both -fuse-openmp and -faligned-newout any optimization diff --git a/test/cpp/api/imethod.cpp b/test/cpp/api/imethod.cpp index 3349d1b3a8a45..8673e55fb5629 100644 --- a/test/cpp/api/imethod.cpp +++ b/test/cpp/api/imethod.cpp @@ -8,30 +8,40 @@ using namespace ::testing; using namespace caffe2; -// TODO(T96218435): Enable the following tests in OSS. +const char* simple = "torch/csrc/deploy/example/generated/simple"; +const char* simpleJit = "torch/csrc/deploy/example/generated/simple_jit"; + +// TODO(jwtan): Try unifying cmake and buck for getting the path. +const char* path(const char* envname, const char* path) { + const char* env = getenv(envname); + return env ? env : path; +} + +// Run `python torch/csrc/deploy/example/generate_examples.py` before running the following tests. +// TODO(jwtan): Figure out a way to automate the above step for development. (CI has it already.) TEST(IMethodTest, CallMethod) { - auto script_model = torch::jit::load(getenv("SIMPLE_JIT")); - auto script_method = script_model.get_method("forward"); + auto scriptModel = torch::jit::load(path("SIMPLE_JIT", simpleJit)); + auto scriptMethod = scriptModel.get_method("forward"); torch::deploy::InterpreterManager manager(3); - torch::deploy::Package p = manager.load_package(getenv("SIMPLE")); - auto py_model = p.load_pickle("model", "model.pkl"); - torch::deploy::PythonMethodWrapper py_method(py_model, "forward"); + torch::deploy::Package package = manager.load_package(path("SIMPLE", simple)); + auto pyModel = package.load_pickle("model", "model.pkl"); + torch::deploy::PythonMethodWrapper pyMethod(pyModel, "forward"); auto input = torch::ones({10, 20}); - auto output_py = py_method({input}); - auto output_script = script_method({input}); - EXPECT_TRUE(output_py.isTensor()); - EXPECT_TRUE(output_script.isTensor()); - auto output_py_tensor = output_py.toTensor(); - auto output_script_tensor = output_script.toTensor(); - - EXPECT_TRUE(output_py_tensor.equal(output_script_tensor)); - EXPECT_EQ(output_py_tensor.numel(), 200); + auto outputPy = pyMethod({input}); + auto outputScript = scriptMethod({input}); + EXPECT_TRUE(outputPy.isTensor()); + EXPECT_TRUE(outputScript.isTensor()); + auto outputPyTensor = outputPy.toTensor(); + auto outputScriptTensor = outputScript.toTensor(); + + EXPECT_TRUE(outputPyTensor.equal(outputScriptTensor)); + EXPECT_EQ(outputPyTensor.numel(), 200); } TEST(IMethodTest, GetArgumentNames) { - auto scriptModel = torch::jit::load(getenv("SIMPLE_JIT")); + auto scriptModel = torch::jit::load(path("SIMPLE_JIT", simpleJit)); auto scriptMethod = scriptModel.get_method("forward"); auto& scriptNames = scriptMethod.getArgumentNames(); @@ -39,7 +49,7 @@ TEST(IMethodTest, GetArgumentNames) { EXPECT_STREQ(scriptNames[0].c_str(), "input"); torch::deploy::InterpreterManager manager(3); - torch::deploy::Package package = manager.load_package(getenv("SIMPLE")); + torch::deploy::Package package = manager.load_package(path("SIMPLE", simple)); auto pyModel = package.load_pickle("model", "model.pkl"); torch::deploy::PythonMethodWrapper pyMethod(pyModel, "forward"); diff --git a/torch/csrc/api/include/torch/imethod.h b/torch/csrc/api/include/torch/imethod.h index dfabf50ce7191..af010785a8016 100644 --- a/torch/csrc/api/include/torch/imethod.h +++ b/torch/csrc/api/include/torch/imethod.h @@ -4,7 +4,7 @@ namespace torch { -class IMethod { +class TORCH_API IMethod { /* IMethod provides a portable interface for torch methods, whether they are backed by torchscript or python/deploy. From 81764d1153c607e324390dcac107ea0970ba668c Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Thu, 26 Aug 2021 17:26:52 -0700 Subject: [PATCH 275/530] document that `torch.triangular_solve` has optional out= parameter (#63253) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63253 Fixes #57955 Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D30312134 Pulled By: dagitses fbshipit-source-id: 4f484620f5754f4324a99bbac1ff783c64cee6b8 --- torch/_torch_docs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index a4f3bdaef7df7..bbb8d981ab8a2 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -9679,7 +9679,7 @@ def merge_dicts(*dicts): add_docstr(torch.triangular_solve, r""" -triangular_solve(b, A, upper=True, transpose=False, unitriangular=False) -> (Tensor, Tensor) +triangular_solve(b, A, upper=True, transpose=False, unitriangular=False, *, out=None) -> (Tensor, Tensor) Solves a system of equations with a triangular coefficient matrix :math:`A` and multiple right-hand sides :math:`b`. @@ -9706,6 +9706,10 @@ def merge_dicts(*dicts): If True, the diagonal elements of :math:`A` are assumed to be 1 and not referenced from :math:`A`. Default: ``False``. +Keyword args: + out ((Tensor, Tensor), optional): tuple of two tensors to write + the output to. Ignored if `None`. Default: `None`. + Returns: A namedtuple `(solution, cloned_coefficient)` where `cloned_coefficient` is a clone of :math:`A` and `solution` is the solution :math:`X` to :math:`AX = b` From 0c9dce90ed6a12d81b0e769b76d6b0c282326823 Mon Sep 17 00:00:00 2001 From: Paul Johnson Date: Thu, 26 Aug 2021 17:28:35 -0700 Subject: [PATCH 276/530] [pytorch] add per_sample_weights support for embedding_bag_4bit_rowwise_offsets (#63605) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63605 Reviewed By: houseroad Differential Revision: D30434664 fbshipit-source-id: eb4cbae3c705f9dec5c073a56f0f23daee353bc1 --- .../native/quantized/cuda/embedding_bag.cu | 51 +++++++++++++------ 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/aten/src/ATen/native/quantized/cuda/embedding_bag.cu b/aten/src/ATen/native/quantized/cuda/embedding_bag.cu index 6d44ce0f41873..55b0b0d4f36d0 100644 --- a/aten/src/ATen/native/quantized/cuda/embedding_bag.cu +++ b/aten/src/ATen/native/quantized/cuda/embedding_bag.cu @@ -56,15 +56,15 @@ dequantize_intx(uint32_t packedVals, float2 scale_bias, uint8_t offset_bits) { template __forceinline__ __device__ void -accumulate_packed_intx(float4* acc, uint32_t packedVals, float2 scale_bias) { +accumulate_packed_intx(float4* acc, uint32_t packedVals, float2 scale_bias, float sample_weight) { constexpr uint8_t dims_per_byte = 8 / bits_per_dim; for (uint8_t i = 0; i < dims_per_byte; i++) { float4 res = dequantize_intx(packedVals, scale_bias, 4 * bits_per_dim * i /* offset_bits */); // Accumulate in float32. - acc[i].x += res.x; - acc[i].y += res.y; - acc[i].z += res.z; - acc[i].w += res.w; + acc[i].x += (res.x * sample_weight); + acc[i].y += (res.y * sample_weight); + acc[i].z += (res.z * sample_weight); + acc[i].w += (res.w * sample_weight); } } @@ -77,7 +77,7 @@ __global__ void embedding_bag_nbits_rowwise_offsets_kernel( const PackedTensorAccessor32 indices, const PackedTensorAccessor32 offsets, const bool /* pruned_weights */, - const c10::optional& per_sample_weights_, + const PackedTensorAccessor32 per_sample_weights_, const c10::optional& compressed_indices_mapping, const bool include_last_offset, PackedTensorAccessor32 output) { @@ -96,6 +96,8 @@ __global__ void embedding_bag_nbits_rowwise_offsets_kernel( const int32_t D_bytes = weight.size(1); + bool use_per_sample = per_sample_weights_.size(0) > 0; + int64_t indices_start = offsets[t * B + b]; int64_t indices_end; if (include_last_offset) { @@ -124,6 +126,7 @@ __global__ void embedding_bag_nbits_rowwise_offsets_kernel( } for (int32_t l = indices_start; l < indices_end; ++l) { int64_t idx = indices[l]; + float sample_weight = use_per_sample ? per_sample_weights_[l] : 1.0f; const uint8_t* __restrict__ row = &weights[idx * D_bytes]; float2 scale_bias; if (fp32_scale_bias) { @@ -138,7 +141,7 @@ __global__ void embedding_bag_nbits_rowwise_offsets_kernel( uint32_t v0 = reinterpret_cast(&row[byte_offset])[0]; - accumulate_packed_intx(accumulator, v0, scale_bias); + accumulate_packed_intx(accumulator, v0, scale_bias, sample_weight); } @@ -204,9 +207,11 @@ at::Tensor& embedding_bag_byte_impl( const int D = weight_sizes[1] - 8; // NB: -8 to account for scale and bias const int64_t M = offsets.sizes()[0]; TORCH_CHECK(D % 4 == 0); - TORCH_CHECK( - !per_sample_weights_.has_value(), - "Per sample weights not yet implemented for embedding_bag_byte_rowwise_offsets_cuda"); + if(per_sample_weights_.has_value()) { + TORCH_CHECK(per_sample_weights_.value().scalar_type() == at::kFloat, + "Per sample weights expected scalar type ", at::kFloat, " but got ", + per_sample_weights_.value().scalar_type()); + } TORCH_CHECK( !compressed_indices_mapping.has_value(), "Compressed indices mapping not yet implemented for embedding_bag_byte_rowwise_offsets_cuda"); @@ -215,6 +220,13 @@ at::Tensor& embedding_bag_byte_impl( int64_t output_size = include_last_offset ? M - 1 : M; + at::Tensor sample_weights; + if (per_sample_weights_.has_value()) { + sample_weights = per_sample_weights_.value(); + } else { + sample_weights = create_empty_from(output, kFloat); + } + const std::vector shape = {output_size, D}; at::native::resize_(output, shape, c10::nullopt); AT_DISPATCH_INDEX_TYPES( @@ -228,7 +240,7 @@ at::Tensor& embedding_bag_byte_impl( indices.packed_accessor32(), offsets.packed_accessor32(), false /* pruned_weights */, - per_sample_weights_, + sample_weights.packed_accessor32(), compressed_indices_mapping, include_last_offset, output.packed_accessor32()); @@ -377,9 +389,11 @@ at::Tensor& embedding_bag_4bit_impl( const int D = 2*(weight_sizes[1] - 4); // NB: -4 to account for scale and bias @fp16 const int64_t M = offsets.sizes()[0]; TORCH_CHECK(D % 8 == 0); - TORCH_CHECK( - !per_sample_weights_.has_value(), - "Per sample weights not yet implemented for embedding_bag_byte_rowwise_offsets_cuda"); + if(per_sample_weights_.has_value()) { + TORCH_CHECK(per_sample_weights_.value().scalar_type() == at::kFloat, + "Per sample weights expected scalar type ", at::kFloat, " but got ", + per_sample_weights_.value().scalar_type()); + } TORCH_CHECK( !compressed_indices_mapping.has_value(), "Compressed indices mapping not yet implemented for embedding_bag_byte_rowwise_offsets_cuda"); @@ -388,6 +402,13 @@ at::Tensor& embedding_bag_4bit_impl( int64_t output_size = include_last_offset ? M - 1 : M; + at::Tensor sample_weights; + if (per_sample_weights_.has_value()) { + sample_weights = per_sample_weights_.value(); + } else { + sample_weights = create_empty_from(output, kFloat); + } + const std::vector shape = {output_size, D}; at::native::resize_(output, shape, c10::nullopt); AT_DISPATCH_INDEX_TYPES( @@ -401,7 +422,7 @@ at::Tensor& embedding_bag_4bit_impl( indices.packed_accessor32(), offsets.packed_accessor32(), false /* pruned_weights */, - per_sample_weights_, + sample_weights.packed_accessor32(), compressed_indices_mapping, include_last_offset, output.packed_accessor32()); From a5f35ac7cd12c32227fdcd42979dc9c6aea7ba07 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 26 Aug 2021 17:36:56 -0700 Subject: [PATCH 277/530] Run through failures on trunk (#64063) Summary: This PR runs all the tests on trunk instead of stopping on first failure. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64063 Reviewed By: malfet, seemethere Differential Revision: D30592020 Pulled By: janeyx99 fbshipit-source-id: 318b225cdf918a98f73e752d1cc0227d9227f36c --- .github/templates/bazel_ci_workflow.yml.j2 | 2 ++ .github/templates/linux_ci_workflow.yml.j2 | 2 ++ .github/templates/windows_ci_workflow.yml.j2 | 1 + .../workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 2 ++ .../workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml | 2 ++ .../workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 2 ++ .../workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 2 ++ .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml | 2 ++ .../workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml | 2 ++ .../generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 2 ++ .../workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml | 1 + .github/workflows/generated-win-vs2019-cpu-py3.yml | 1 + .github/workflows/generated-win-vs2019-cuda10.1-py3.yml | 1 + .github/workflows/generated-win-vs2019-cuda11.3-py3.yml | 1 + 14 files changed, 23 insertions(+) diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2 index d25ffe6d8a7e5..f4e0034a0f5d6 100644 --- a/.github/templates/bazel_ci_workflow.yml.j2 +++ b/.github/templates/bazel_ci_workflow.yml.j2 @@ -29,6 +29,7 @@ on: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: !{{ build_environment }}-build-and-test NUM_TEST_SHARDS: !{{ num_test_shards }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: - name: Log in to ECR run: | @@ -121,6 +122,7 @@ on: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e CONTINUE_THROUGH_ERROR \ -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index d9af899b04b66..52c0a09a9e1c5 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -286,6 +286,7 @@ jobs: SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: - name: Log in to ECR run: | @@ -372,6 +373,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \ diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 1be7b325306d5..84a30bda92a36 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -195,6 +195,7 @@ jobs: https_proxy: "!{{ squid_proxy }}" RUN_SMOKE_TESTS_ONLY_ON_PR: !{{ only_run_smoke_tests_on_pull_request }} PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} needs: [build, generate-test-matrix, !{{ ciflow_config.root_job_name }}] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 769efcaa80a3b..c51f8f047e986 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -262,6 +262,7 @@ jobs: SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: - name: Log in to ECR run: | @@ -351,6 +352,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index ddd81c079df7c..014b1d1162d07 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -262,6 +262,7 @@ jobs: SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: - name: Log in to ECR run: | @@ -351,6 +352,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index 5a888d0104174..76b973eebce24 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -262,6 +262,7 @@ jobs: SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: - name: Log in to ECR run: | @@ -351,6 +352,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 25d74de36dc66..8114bd541fdb3 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -262,6 +262,7 @@ jobs: SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: - name: Log in to ECR run: | @@ -351,6 +352,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index 341f9e6da6e8c..b5f062c53cb05 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -262,6 +262,7 @@ jobs: SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: - name: Log in to ECR run: | @@ -351,6 +352,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index 7ca389635bf56..71a9bf76dac22 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -123,6 +123,7 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-py3.6-gcc7-bazel-test-build-and-test NUM_TEST_SHARDS: 1 + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: - name: Log in to ECR run: | @@ -220,6 +221,7 @@ jobs: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e CONTINUE_THROUGH_ERROR \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 470fdaaad4230..7b947790902ec 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -260,6 +260,7 @@ jobs: SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: - name: Log in to ECR run: | @@ -349,6 +350,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index ce4540b79cee7..6c87f40accd64 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -158,6 +158,7 @@ jobs: https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" RUN_SMOKE_TESTS_ONLY_ON_PR: False PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} needs: [build, generate-test-matrix, ciflow_should_run] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index d868d19d0fc2c..30f328ae71fdd 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -150,6 +150,7 @@ jobs: https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" RUN_SMOKE_TESTS_ONLY_ON_PR: False PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} needs: [build, generate-test-matrix, ciflow_should_run] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index 4d4550c9ce06b..72dd21dce3899 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -160,6 +160,7 @@ jobs: https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" RUN_SMOKE_TESTS_ONLY_ON_PR: True PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} needs: [build, generate-test-matrix, ciflow_should_run] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index c5ae48a888938..eb6e02fb5c2f3 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -160,6 +160,7 @@ jobs: https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" RUN_SMOKE_TESTS_ONLY_ON_PR: False PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} needs: [build, generate-test-matrix, ciflow_should_run] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} From 0e8c3c51d9a2a0d364f2707d4131ab12229dc826 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 26 Aug 2021 17:59:59 -0700 Subject: [PATCH 278/530] port glu to use structured kernel approach (#61800) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61800 resubmitting because the [last one](https://github.com/pytorch/pytorch/pull/61433) was unrecoverable due to making changes incorrectly in the stack Test Plan: Imported from OSS Reviewed By: iramazanli Differential Revision: D29812492 Pulled By: makslevental fbshipit-source-id: c3dfeacd1e00a526e24fbaab02dad48069d690ef --- aten/src/ATen/native/Activation.h | 2 +- aten/src/ATen/native/GatedLinearUnit.cpp | 32 +++++++++++----------- aten/src/ATen/native/cpu/Activation.cpp | 2 +- aten/src/ATen/native/cuda/Activation.cu | 2 +- aten/src/ATen/native/native_functions.yaml | 6 ++-- 5 files changed, 23 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/native/Activation.h b/aten/src/ATen/native/Activation.h index 01782fae1de3f..f0c6d82af2b29 100644 --- a/aten/src/ATen/native/Activation.h +++ b/aten/src/ATen/native/Activation.h @@ -51,7 +51,7 @@ DECLARE_DISPATCH(softshrink_fn, softshrink_stub); DECLARE_DISPATCH(shrink_backward_fn, shrink_backward_stub); DECLARE_DISPATCH(leaky_relu_fn, leaky_relu_stub); DECLARE_DISPATCH(leaky_relu_backward_fn, leaky_relu_backward_stub); -DECLARE_DISPATCH(activation_fn, glu_stub); +DECLARE_DISPATCH(structured_activation_fn, glu_stub); DECLARE_DISPATCH(activation_backward_fn, glu_backward_stub); DECLARE_DISPATCH(structured_activation_fn, silu_stub); DECLARE_DISPATCH(structured_activation_backward_fn, silu_backward_stub); diff --git a/aten/src/ATen/native/GatedLinearUnit.cpp b/aten/src/ATen/native/GatedLinearUnit.cpp index a0e2c16ed645f..c585caa71a011 100644 --- a/aten/src/ATen/native/GatedLinearUnit.cpp +++ b/aten/src/ATen/native/GatedLinearUnit.cpp @@ -3,12 +3,11 @@ #include namespace at { -namespace native { - -DEFINE_DISPATCH(glu_stub); -DEFINE_DISPATCH(glu_backward_stub); -Tensor& glu_out(const Tensor& self, int64_t dim, Tensor &result) { +namespace meta { +TORCH_META_FUNC(glu) ( + const Tensor& self, int64_t dim +) { // this can't pass anyway because a 0-dimensional tensor has "size" 1, which // can't be evenly halved, but give a nicer error message here. TORCH_CHECK(self.dim() > 0, "glu does not support 0-dimensional tensors"); @@ -16,23 +15,24 @@ Tensor& glu_out(const Tensor& self, int64_t dim, Tensor &result) { const int64_t nIn = self.size(wrap_dim); TORCH_CHECK(nIn % 2 == 0, "Halving dimension must be even, but dimension ", wrap_dim, " is size ", nIn); + // size output to half of input const int64_t selfSize = nIn / 2; - auto newSizes = self.sizes().vec(); - newSizes[wrap_dim] = selfSize; - result.resize_(newSizes); - // half tensor Tensor firstHalf = self.narrow(wrap_dim, 0, selfSize); Tensor secondHalf = self.narrow(wrap_dim, selfSize, selfSize); - - auto iter = TensorIterator::borrowing_binary_op(result, firstHalf, secondHalf); - glu_stub(iter.device_type(), iter); - return result; + build_borrowing_binary_op(maybe_get_output(), firstHalf, secondHalf); } +} // namespace meta + +namespace native { + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +DEFINE_DISPATCH(glu_stub); +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +DEFINE_DISPATCH(glu_backward_stub); -Tensor glu(const Tensor& self, int64_t dim) { - auto result = at::empty({0}, self.options()); - return at::glu_out(result, self, dim); +TORCH_IMPL_FUNC(glu_out) (const Tensor& self, int64_t dim, const Tensor& out) { + glu_stub(device_type(), *this); } Tensor& glu_backward_cpu_out(const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index fc5cc0d1924fb..34b54719fe502 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -519,7 +519,7 @@ void softplus_backward_kernel(TensorIteratorBase& iter, const Scalar& beta_, con }); } -void glu_kernel(TensorIterator& iter) { +void glu_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_cpu", [&] { using Vec = Vectorized; const scalar_t one_val(1); diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index 1229149d76aee..7c8783028a5ac 100644 --- a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -28,7 +28,7 @@ namespace native { // ----------------------------------- // glu forward // ----------------------------------- -void glu_kernel(TensorIterator& iter) { +void glu_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "glu_cuda", [&]() { using acc_t = at::acc_type; gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 4f7d7e66a7d5e..224d850c8004c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -8460,14 +8460,16 @@ CompositeExplicitAutograd: elu_ - func: glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: glu_out - func: glu(Tensor self, int dim=-1) -> Tensor + structured_delegate: glu.out + device_check: NoCheck # TensorIterator python_module: nn - dispatch: - CPU, CUDA: glu - func: glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn From 510d2ece81c18ba3f9bbcbc509effe281a77206c Mon Sep 17 00:00:00 2001 From: nikithamalgi Date: Thu, 26 Aug 2021 18:54:51 -0700 Subject: [PATCH 279/530] Merge script and _script_pdt API (#62420) Summary: Merge `torch.jit.script` and `torch.jit._script_pdt` API. This PR merges profile directed typing with script api Pull Request resolved: https://github.com/pytorch/pytorch/pull/62420 Reviewed By: iramazanli Differential Revision: D30579015 Pulled By: nikithamalgifb fbshipit-source-id: 99ba6839d235d61b2dd0144b466b2063a53ccece --- test/jit/test_pdt.py | 80 ++++++++++++------------- torch/jit/__init__.py | 1 - torch/jit/_script.py | 136 ++++++++++++++++++++++++++---------------- 3 files changed, 124 insertions(+), 93 deletions(-) diff --git a/test/jit/test_pdt.py b/test/jit/test_pdt.py index 57cd74faf432b..468eb2787814b 100644 --- a/test/jit/test_pdt.py +++ b/test/jit/test_pdt.py @@ -40,7 +40,7 @@ def forward(self, x) -> Any: make_global(TestPDTModel) pdt_model = TestPDTModel() inp: List[Tuple[Any, ...]] = [(20, ), (2.7, ), (False, ), ] - scripted_pdt_model = torch.jit._script_pdt(pdt_model, example_inputs={pdt_model: inp}) + scripted_pdt_model = torch.jit.script(pdt_model, example_inputs={pdt_model: inp}) self.assertEqual(scripted_pdt_model(50), pdt_model(50)) self.assertEqual(scripted_pdt_model(1.8), pdt_model(1.8)) self.assertTrue(scripted_pdt_model(True), pdt_model(True)) @@ -67,7 +67,7 @@ def forward(self, x): inner_pdt_model = NestedPDTInner() wrapped_pdt_model = NestedModulePDTWrapper(inner_pdt_model) inp: List[Tuple[Any, ...]] = [(20, ), (False, )] - scripted_pdt_model = torch.jit._script_pdt(wrapped_pdt_model, example_inputs={wrapped_pdt_model: inp}) + scripted_pdt_model = torch.jit.script(wrapped_pdt_model, example_inputs={wrapped_pdt_model: inp}) self.assertEqual(scripted_pdt_model(30), wrapped_pdt_model(30)) self.assertEqual(scripted_pdt_model(1.9), wrapped_pdt_model(1.9)) self.assertTrue(scripted_pdt_model(True), wrapped_pdt_model(True)) @@ -95,8 +95,8 @@ def forward(self, x): outer_pdt_model = NestedModulePDTOuter(inner_pdt_model) inner_input: List[Tuple[Any, ...]] = [(10, 10), (1.9, 20), ] outer_input: List[Tuple[Any, ...]] = [(20, ), (False, )] - scripted_pdt_model = torch.jit._script_pdt(outer_pdt_model, example_inputs={inner_pdt_model: inner_input, - outer_pdt_model: outer_input, }) + scripted_pdt_model = torch.jit.script(outer_pdt_model, example_inputs={inner_pdt_model: inner_input, + outer_pdt_model: outer_input, }) self.assertEqual(scripted_pdt_model(30), outer_pdt_model(30)) self.assertEqual(scripted_pdt_model(1.9), outer_pdt_model(1.9)) self.assertTrue(scripted_pdt_model(True), outer_pdt_model(True)) @@ -119,7 +119,7 @@ def fun(self, x): make_global(NestedFunctionInForward) pdt_model = NestedFunctionInForward() inp: List[Tuple[Any, ...]] = [(-1, ), (False, )] - scripted_pdt_model = torch.jit._script_pdt(pdt_model, example_inputs={pdt_model: inp}) + scripted_pdt_model = torch.jit.script(pdt_model, example_inputs={pdt_model: inp}) self.assertEqual(scripted_pdt_model(30), pdt_model(30)) self.assertEqual(scripted_pdt_model(True), pdt_model(True)) @@ -142,7 +142,7 @@ def fn(self, x, y) -> Any: make_global(TestModelWithExport) pdt_model = TestModelWithExport() inp: List[Tuple[Any, ...]] = [(20, 10, ), (2.7, 8.9, ), ] - scripted_pdt_model = torch.jit._script_pdt(pdt_model, example_inputs={pdt_model.fn: inp}) + scripted_pdt_model = torch.jit.script(pdt_model, example_inputs={pdt_model.fn: inp}) self.assertEqual(scripted_pdt_model.fn(10, 90), pdt_model.fn(10, 90)) self.assertEqual(scripted_pdt_model.fn(1.8, 2.2), pdt_model.fn(1.8, 2.2)) self.assertTrue(scripted_pdt_model.fn(torch.ones(1), 2), pdt_model.fn(torch.ones(1), 2)) @@ -155,7 +155,7 @@ def test_sum(self, a): make_global(PDTModel) pdt_model = PDTModel() inp: List[Tuple[Any, ...]] = [([10, 20, ], ), ] - scripted_pdt_model = torch.jit._script_pdt(PDTModel, example_inputs={pdt_model.test_sum: inp}) + scripted_pdt_model = torch.jit.script(PDTModel, example_inputs={pdt_model.test_sum: inp}) script_model = scripted_pdt_model() self.assertEqual(script_model.test_sum([10, 20, 30, ], ), pdt_model.test_sum([10, 20, 30, ], )) @@ -174,8 +174,8 @@ def test_substring(self, a, b): pdt_model = PDTModelWithManyMethods() list_inp: List[Tuple[Any, ...]] = [([1.2, 2.3, ], ), ] str_inp: List[Tuple[Any, ...]] = [("abc", "b", ), ] - scripted_pdt_model = torch.jit._script_pdt(PDTModelWithManyMethods, example_inputs={pdt_model.test_list_to_dict: list_inp, - pdt_model.test_substring: str_inp}) + scripted_pdt_model = torch.jit.script(PDTModelWithManyMethods, example_inputs={pdt_model.test_list_to_dict: list_inp, + pdt_model.test_substring: str_inp}) script_model = scripted_pdt_model() self.assertEqual(script_model.test_list_to_dict([1.1, 2.2, 3.3, ], ), pdt_model.test_list_to_dict([1.1, 2.2, 3.3, ], )) self.assertEqual(script_model.test_substring("helloworld", "world", ), pdt_model.test_substring("helloworld", "world", )) @@ -195,8 +195,8 @@ def test_find(self, a, b): pdt_model_two = PDTModelTwo() dict_inp: List[Tuple[Any, ...]] = [({1.2: True, 2.3: False, }, 1.2), ] list_inp: List[Tuple[Any, ...]] = [(["abc", "b", ], "c"), ] - scripted_pdt_model_one = torch.jit._script_pdt(PDTModelOne, example_inputs={pdt_model_one.test_find: dict_inp}) - scripted_pdt_model_two = torch.jit._script_pdt(PDTModelTwo, example_inputs={pdt_model_two.test_find: list_inp}) + scripted_pdt_model_one = torch.jit.script(PDTModelOne, example_inputs={pdt_model_one.test_find: dict_inp}) + scripted_pdt_model_two = torch.jit.script(PDTModelTwo, example_inputs={pdt_model_two.test_find: list_inp}) script_model_one, script_model_two = scripted_pdt_model_one(), scripted_pdt_model_two() self.assertEqual(script_model_one.test_find({1.1: True, 2.2: True, 3.3: False, }, 4.4), @@ -209,28 +209,28 @@ def test_sum(a, b): return a + b make_global(test_sum) - scripted_fn_add = torch.jit._script_pdt(test_sum, example_inputs=[(3, 4)]) + scripted_fn_add = torch.jit.script(test_sum, example_inputs=[(3, 4)]) self.assertEqual(scripted_fn_add(10, 2), test_sum(10, 2)) def test_sub(a, b): return a - b make_global(test_sub) - scripted_fn_sub = torch.jit._script_pdt(test_sub, example_inputs=[(3.9, 4.10)]) + scripted_fn_sub = torch.jit.script(test_sub, example_inputs=[(3.9, 4.10)]) self.assertEqual(scripted_fn_sub(6.5, 2.9), test_sub(6.5, 2.9)) def test_mul(a, b): return a * b make_global(test_mul) - scripted_fn_mul = torch.jit._script_pdt(test_mul, example_inputs=[(-10, 9)]) + scripted_fn_mul = torch.jit.script(test_mul, example_inputs=[(-10, 9)]) self.assertEqual(scripted_fn_mul(-1, 3), test_mul(-1, 3)) def test_args_complex(real, img): return torch.complex(real, img) make_global(test_args_complex) - scripted_fn_complex = torch.jit._script_pdt(test_args_complex, example_inputs=[(torch.rand(3, 4), torch.rand(3, 4))]) + scripted_fn_complex = torch.jit.script(test_args_complex, example_inputs=[(torch.rand(3, 4), torch.rand(3, 4))]) arg1, arg2 = torch.rand(3, 4), torch.rand(3, 4) self.assertEqual(scripted_fn_complex(arg1, arg2), test_args_complex(arg1, arg2)) @@ -241,7 +241,7 @@ def test_bool(a): return 0 make_global(test_bool) - scripted_fn_bool = torch.jit._script_pdt(test_bool, example_inputs=[(True,)]) + scripted_fn_bool = torch.jit.script(test_bool, example_inputs=[(True,)]) self.assertEqual(scripted_fn_bool(True), test_bool(True)) def test_str(a): @@ -251,7 +251,7 @@ def test_str(a): return True make_global(test_str) - scripted_fn_str = torch.jit._script_pdt(test_str, example_inputs=[("",)]) + scripted_fn_str = torch.jit.script(test_str, example_inputs=[("",)]) self.assertEqual(scripted_fn_str("abc"), test_str("abc")) def test_pdt_list_and_tuple(self): @@ -260,24 +260,24 @@ def test_list_and_tuple(a): make_global(test_list_and_tuple) - scripted_fn_float_list_input = torch.jit._script_pdt(test_list_and_tuple, example_inputs=[([4.9, 8.9],)]) + scripted_fn_float_list_input = torch.jit.script(test_list_and_tuple, example_inputs=[([4.9, 8.9],)]) self.assertEqual(scripted_fn_float_list_input([11.9, 7.6]), test_list_and_tuple([11.9, 7.6])) - scripted_fn_bool_list_input = torch.jit._script_pdt(test_list_and_tuple, example_inputs=[([True, False, True],)]) + scripted_fn_bool_list_input = torch.jit.script(test_list_and_tuple, example_inputs=[([True, False, True],)]) self.assertEqual(scripted_fn_bool_list_input([True, True, True]), test_list_and_tuple([True, True, True])) - scripted_fn_int_list_input = torch.jit._script_pdt(test_list_and_tuple, example_inputs=[([3, 4, 5], )]) + scripted_fn_int_list_input = torch.jit.script(test_list_and_tuple, example_inputs=[([3, 4, 5], )]) self.assertEqual(scripted_fn_int_list_input([1, 2, 3]), test_list_and_tuple([1, 2, 3])) - scripted_fn_float_tuple_input = torch.jit._script_pdt(test_list_and_tuple, example_inputs=[((4.9, 8.9),)]) + scripted_fn_float_tuple_input = torch.jit.script(test_list_and_tuple, example_inputs=[((4.9, 8.9),)]) self.assertEqual(scripted_fn_float_tuple_input((11.9, 7.6)), test_list_and_tuple((11.9, 7.6))) - scripted_fn_bool_tuple_input = torch.jit._script_pdt(test_list_and_tuple, - example_inputs=[((True, False, True),)]) + scripted_fn_bool_tuple_input = torch.jit.script(test_list_and_tuple, + example_inputs=[((True, False, True),)]) self.assertEqual(scripted_fn_bool_tuple_input((True, True, True)), test_list_and_tuple((True, True, True))) - scripted_fn_int_tuple_input = torch.jit._script_pdt(test_list_and_tuple, example_inputs=[((3, 4, 5), )]) + scripted_fn_int_tuple_input = torch.jit.script(test_list_and_tuple, example_inputs=[((3, 4, 5), )]) self.assertEqual(scripted_fn_int_tuple_input((1, 2, 3)), test_list_and_tuple((1, 2, 3))) def test_nested_list_and_tuple(self): @@ -295,22 +295,22 @@ def test_nested_tuple(inp): make_global(test_nested_list, test_nested_tuple) list_inp = [[1, 2, 3, ], [5, 6, 7, ]] - scripted_fn = torch.jit._script_pdt(test_nested_list, example_inputs=[(list_inp, ), ]) + scripted_fn = torch.jit.script(test_nested_list, example_inputs=[(list_inp, ), ]) inp = [[0, 4, 7, ], [8, 11, ], [6, -1, -20, ]] self.assertEqual(scripted_fn(inp, ), test_nested_list(inp, )) list_inp = ([1, 2, 3, ], [5, 6, 7, ]) - scripted_fn = torch.jit._script_pdt(test_nested_list, example_inputs=[(list_inp, ), ]) + scripted_fn = torch.jit.script(test_nested_list, example_inputs=[(list_inp, ), ]) inp = ([0, 4, 7, ], [8, 11, ], [6, -1, -20, ]) self.assertEqual(scripted_fn(inp, ), test_nested_list(inp, )) tup_inp = [(1.0, 2.6, 3.7, ), (5.7, 6.1, 1.7, )] - scripted_fn = torch.jit._script_pdt(test_nested_tuple, example_inputs=[(tup_inp, ), ]) + scripted_fn = torch.jit.script(test_nested_tuple, example_inputs=[(tup_inp, ), ]) inp = [(1.0, 4.1, 7.4, ), (4.8, 1.1, -1.2, ), (6.3, -1.3, -2.0, )] self.assertEqual(scripted_fn(inp, ), test_nested_tuple(inp, )) tup_inp = ((True, False, True, ), (False, False, False, )) - scripted_fn = torch.jit._script_pdt(test_nested_tuple, example_inputs=[(tup_inp, ), ]) + scripted_fn = torch.jit.script(test_nested_tuple, example_inputs=[(tup_inp, ), ]) inp = ((True, True, True, ), (False, False, True, )) self.assertEqual(scripted_fn(inp, ), test_nested_tuple(inp, )) @@ -324,11 +324,11 @@ def test_dict_int_list(a): make_global(test_dict, test_dict_int_list) str_bool_inp = {'foo' : True, 'bar': False} - scripted_fn = torch.jit._script_pdt(test_dict, example_inputs=[(str_bool_inp,)]) + scripted_fn = torch.jit.script(test_dict, example_inputs=[(str_bool_inp,)]) self.assertEqual(scripted_fn({'foo' : False, 'bar': True}, ), test_dict({'foo' : False, 'bar': True}, )) str_list_inp = {0 : [True, False], 1: [False, True]} - scripted_fn = torch.jit._script_pdt(test_dict_int_list, example_inputs=[(str_list_inp,)]) + scripted_fn = torch.jit.script(test_dict_int_list, example_inputs=[(str_list_inp,)]) self.assertEqual(scripted_fn({0 : [False, False], 1: [True, True]}, ), test_dict_int_list({0 : [False, False], 1: [True, True]}, )) @@ -349,14 +349,14 @@ def test_multiple_type_refinement(a): make_global(test_multiple_types, test_multiple_type_refinement) - scripted_fn = torch.jit._script_pdt(test_multiple_types, example_inputs=[(1,), ("abc", ), (8.9,), ([3, 4, 5], )]) + scripted_fn = torch.jit.script(test_multiple_types, example_inputs=[(1,), ("abc", ), (8.9,), ([3, 4, 5], )]) self.assertEqual(scripted_fn(10), test_multiple_types(10)) self.assertEqual(scripted_fn("def"), test_multiple_types("def")) self.assertEqual(scripted_fn(7.89999), test_multiple_types(7.89999)) self.assertEqual(scripted_fn([10, 11, 14]), test_multiple_types([10, 11, 14])) - scripted_fn = torch.jit._script_pdt(test_multiple_type_refinement, example_inputs=[(1,), ("abc", ), (8.9,), - ([3, 4, 5],), (True, ), ({"a": True}, ), ]) + scripted_fn = torch.jit.script(test_multiple_type_refinement, example_inputs=[(1,), ("abc", ), (8.9,), + ([3, 4, 5],), (True, ), ({"a": True}, ), ]) self.assertEqual(scripted_fn(10), test_multiple_type_refinement(10)) self.assertEqual(scripted_fn("def"), test_multiple_type_refinement("def")) self.assertEqual(scripted_fn(7.89999), test_multiple_type_refinement(7.89999)) @@ -381,7 +381,7 @@ def test_model(a, m): make_global(UserDefinedClass, test_model) user_class = UserDefinedClass() - scripted_fn = torch.jit._script_pdt(test_model, example_inputs=[(10, user_class, ), (10.9, user_class, ), ]) + scripted_fn = torch.jit.script(test_model, example_inputs=[(10, user_class, ), (10.9, user_class, ), ]) self.assertEqual(scripted_fn(100, user_class, ), test_model(100, user_class)) self.assertEqual(scripted_fn(1.9, user_class, ), test_model(1.9, user_class)) @@ -403,7 +403,7 @@ def test_model_with_args(a, m): make_global(ClassWithArgs, test_model_with_args) user_class = ClassWithArgs(False) - scripted_fn = torch.jit._script_pdt(test_model_with_args, example_inputs=[(10, user_class, ), (10.9, user_class, ), ]) + scripted_fn = torch.jit.script(test_model_with_args, example_inputs=[(10, user_class, ), (10.9, user_class, ), ]) self.assertEqual(scripted_fn(100, ClassWithArgs(True), ), test_model_with_args(100, ClassWithArgs(True))) def test_nn_parameter_as_arg(self): @@ -420,7 +420,7 @@ def forward(self, y): make_global(TestNNParameter) pdt_model = TestNNParameter() - scripted_fn = torch.jit._script_pdt(pdt_model, example_inputs={pdt_model: [(10, ), ], }) + scripted_fn = torch.jit.script(pdt_model, example_inputs={pdt_model: [(10, ), ], }) self.assertEqual(scripted_fn(20), pdt_model(20)) def test_fx_tracing_with_typing(self): @@ -434,7 +434,7 @@ def forward(self, a) -> FXModelOutput: make_global(FXModel, FXModelOutput) pdt_model = FXModel() - scripted_fn = torch.jit._script_pdt(pdt_model, example_inputs={pdt_model: [([10, 20, ], ), ], }) + scripted_fn = torch.jit.script(pdt_model, example_inputs={pdt_model: [([10, 20, ], ), ], }) self.assertEqual(scripted_fn([20]), pdt_model([20])) def test_nonetype_as_optional_of_type(self): @@ -446,11 +446,11 @@ def test_none(a) -> Any: make_global(test_none) - scripted_fn = torch.jit._script_pdt(test_none, example_inputs=[(None, ), (10.6, )]) + scripted_fn = torch.jit.script(test_none, example_inputs=[(None, ), (10.6, )]) self.assertEqual(scripted_fn(30.9, ), test_none(30.9, )) - scripted_fn = torch.jit._script_pdt(test_none, example_inputs=[(None, ), (10, )]) + scripted_fn = torch.jit.script(test_none, example_inputs=[(None, ), (10, )]) self.assertEqual(scripted_fn(2, ), test_none(2, )) - scripted_fn = torch.jit._script_pdt(test_none, example_inputs=[(None, ), (torch.Tensor(1), )]) + scripted_fn = torch.jit.script(test_none, example_inputs=[(None, ), (torch.Tensor(1), )]) self.assertEqual(scripted_fn(torch.ones(1), ), test_none(torch.ones(1), )) diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index c9fd886c7336d..f7fa58bd36434 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -20,7 +20,6 @@ ) from torch.jit._script import ( script, - _script_pdt, Attribute, ScriptModule, script_method, diff --git a/torch/jit/_script.py b/torch/jit/_script.py index 3d173ae27bd01..09801bab938a7 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -984,57 +984,6 @@ def call_prepare_scriptable_func(obj): memo: Dict[int, torch.nn.Module] = {} return call_prepare_scriptable_func_impl(obj, memo) - -def _script_pdt(obj, optimize=None, _frames_up=0, _rcb=None, - example_inputs: Union[List[Tuple], Dict[Callable, List[Tuple]], None] = None): - # This is a private API, intended for internal use only. Usage of this API is only for experimental - # purposes only and is highly discouraged. - global type_trace_db - if not _enabled: - return obj - - if optimize is not None: - warnings.warn( - "`optimize` is deprecated and has no effect. Use `with torch.jit.optimized_execution() instead" - ) - - # No-op for modules and functions that are already scripted - if isinstance(obj, ScriptModule): - return obj - if isinstance(obj, ScriptFunction): - return obj - - if example_inputs: - # If MonkeyType is installed, enable profile directed type annotation - # Check if example_inputs are defined and generate call traces - # for the method by running eager mode version of the method with - # the provide example inputs. This logs all the traces in type_trace_db - type_trace_db = JitTypeTraceStore() - if monkeytype_trace: - monkeytype_config = JitTypeTraceConfig(type_trace_db) - with monkeytype_trace(monkeytype_config): - if isinstance(example_inputs, Dict): - # If the obj is an nn.Module or a class, then each method is - # executed with the arguments provided in the example inputs. - # example inputs here will be of type Dict(class.method, (arguments)) - # This is used to infer type annotations for those methods - # which are not called directly under the hood of monkeytype. - for module, example_input in example_inputs.items(): - for example in example_input: - module(*example) - elif isinstance(example_inputs, List): - for examples in example_inputs: - obj(*examples) - else: - warnings.warn("Error: Unable to infer types. Please format the inputs to type `List[Tuple]`" - " or `Dict[Callable, List[Tuple]]` to be run with MonkeyType.") - else: - warnings.warn("Warning: monkeytype is not installed. Please install https://github.com/Instagram/MonkeyType " - "to enable Profile-Directed Typing in TorchScript. Refer to " - "https://github.com/Instagram/MonkeyType/blob/master/README.rst to install MonkeyType. ") - return script(obj, optimize, _frames_up, _rcb) - - def create_script_dict(obj): """ Create a ``torch._C.ScriptDict`` instance with the data from ``obj``. @@ -1065,7 +1014,8 @@ def create_script_list(obj, type_hint=None): return torch._C.ScriptList(obj) # type: ignore[attr-defined] -def script(obj, optimize=None, _frames_up=0, _rcb=None): +def script(obj, optimize=None, _frames_up=0, _rcb=None, + example_inputs: Union[List[Tuple], Dict[Callable, List[Tuple]], None] = None): r""" Scripting a function or ``nn.Module`` will inspect the source code, compile it as TorchScript code using the TorchScript compiler, and return a :class:`ScriptModule` or @@ -1083,6 +1033,8 @@ def script(obj, optimize=None, _frames_up=0, _rcb=None): Args: obj (callable, class, or ``nn.Module``): The ``nn.Module``, function, class type, dictionary, or list to compile. + example_inputs (Union[List[Tuple], Dict[Callable, List[Tuple]], None]): Provide example inputs + to annotate the arguments for a function or ``nn.Module``. Returns: If ``obj`` is ``nn.Module``, ``script`` returns @@ -1124,6 +1076,34 @@ def foo(x, y): ... + ****Scripting a function using example_inputs** + Example inputs can be used to annotate a function arguments. + + Example (annotating a function before scripting): + + .. testcode:: + + import torch + + def test_sum(a, b): + return a + b + + # Annotate the arguments to be int + scripted_fn = torch.jit.script(test_sum, example_inputs=[(3, 4)]) + + print(type(scripted_fn)) # torch.jit.ScriptFunction + + # See the compiled graph as Python code + print(scripted_fn.code) + + # Call the function using the TorchScript interpreter + scripted_fn(20, 100) + + .. testoutput:: + :hide: + + ... + **Scripting an nn.Module** Scripting an ``nn.Module`` by default will compile the ``forward`` method and recursively compile any methods, submodules, and functions called by ``forward``. If a ``nn.Module`` only uses @@ -1210,7 +1190,30 @@ def forward(self, input): scripted_module = torch.jit.script(MyModule()) print(scripted_module.some_entry_point(torch.randn(2, 2))) print(scripted_module(torch.randn(2, 2))) + + Example ( Annotating forward of nn.Module using example_inputs):: + + import torch + import torch.nn as nn + from typing import NamedTuple + + class MyModule(NamedTuple): + result: List[int] + + class TestNNModule(torch.nn.Module): + def forward(self, a) -> MyModule: + result = MyModule(result=a) + return result + + pdt_model = TestNNModule() + + # Runs the pdt_model in eager model with the inputs provided and annotates the arguments of forward + scripted_model = torch.jit.script(pdt_model, example_inputs={pdt_model: [([10, 20, ], ), ], }) + + # Run the scripted_model with actual inputs + print(scripted_model([20])) """ + global type_trace_db if not _enabled: return obj @@ -1227,6 +1230,35 @@ def forward(self, input): if isinstance(obj, ScriptFunction): return obj + if example_inputs: + # If MonkeyType is installed, enable profile directed type annotation + # Check if example_inputs are defined and generate call traces + # for the method by running eager mode version of the method with + # the provide example inputs. This logs all the traces in type_trace_db + type_trace_db = JitTypeTraceStore() + if monkeytype_trace: + monkeytype_config = JitTypeTraceConfig(type_trace_db) + with monkeytype_trace(monkeytype_config): + if isinstance(example_inputs, Dict): + # If the obj is an nn.Module or a class, then each method is + # executed with the arguments provided in the example inputs. + # example inputs here will be of type Dict(class.method, (arguments)) + # This is used to infer type annotations for those methods + # which are not called directly under the hood of monkeytype. + for module, example_input in example_inputs.items(): + for example in example_input: + module(*example) + elif isinstance(example_inputs, List): + for examples in example_inputs: + obj(*examples) + else: + raise ValueError("Error: Unable to infer types. Please format the inputs to type `List[Tuple]`" + " or `Dict[Callable, List[Tuple]]` to be run with MonkeyType.") + else: + warnings.warn("Warning: monkeytype is not installed. Please install https://github.com/Instagram/MonkeyType " + "to enable Profile-Directed Typing in TorchScript. Refer to " + "https://github.com/Instagram/MonkeyType/blob/master/README.rst to install MonkeyType. ") + if isinstance(obj, torch.nn.Module): obj = call_prepare_scriptable_func(obj) return torch.jit._recursive.create_script_module( From 65e6194aeb3269a182cfe2c05c122159da12770f Mon Sep 17 00:00:00 2001 From: Can Balioglu Date: Thu, 26 Aug 2021 20:16:10 -0700 Subject: [PATCH 280/530] Introduce the torchrun entrypoint (#64049) Summary: This PR introduces a new `torchrun` entrypoint that simply "points" to `python -m torch.distributed.run`. It is shorter and less error-prone to type and gives a nicer syntax than a rather cryptic `python -m ...` command line. Along with the new entrypoint the documentation is also updated and places where `torch.distributed.run` are mentioned are replaced with `torchrun`. cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64049 Reviewed By: cbalioglu Differential Revision: D30584041 Pulled By: kiukchung fbshipit-source-id: d99db3b5d12e7bf9676bab70e680d4b88031ae2d --- docs/source/elastic/quickstart.rst | 34 ++++++++++----------- docs/source/elastic/run.rst | 2 +- docs/source/elastic/train_script.rst | 2 +- setup.py | 1 + torch/distributed/launch.py | 6 ++-- torch/distributed/run.py | 44 ++++++++++++++-------------- 6 files changed, 45 insertions(+), 44 deletions(-) diff --git a/docs/source/elastic/quickstart.rst b/docs/source/elastic/quickstart.rst index 1d22426d06a8b..f7c1ebf7dd0de 100644 --- a/docs/source/elastic/quickstart.rst +++ b/docs/source/elastic/quickstart.rst @@ -5,13 +5,13 @@ To launch a **fault-tolerant** job, run the following on all nodes. .. code-block:: bash - python -m torch.distributed.run - --nnodes=NUM_NODES - --nproc_per_node=TRAINERS_PER_NODE - --rdzv_id=JOB_ID - --rdzv_backend=c10d - --rdzv_endpoint=HOST_NODE_ADDR - YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...) + torchrun + --nnodes=NUM_NODES + --nproc_per_node=TRAINERS_PER_NODE + --rdzv_id=JOB_ID + --rdzv_backend=c10d + --rdzv_endpoint=HOST_NODE_ADDR + YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...) To launch an **elastic** job, run the following on at least ``MIN_SIZE`` nodes @@ -19,13 +19,13 @@ and at most ``MAX_SIZE`` nodes. .. code-block:: bash - python -m torch.distributed.run - --nnodes=MIN_SIZE:MAX_SIZE - --nproc_per_node=TRAINERS_PER_NODE - --rdzv_id=JOB_ID - --rdzv_backend=c10d - --rdzv_endpoint=HOST_NODE_ADDR - YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...) + torchrun + --nnodes=MIN_SIZE:MAX_SIZE + --nproc_per_node=TRAINERS_PER_NODE + --rdzv_id=JOB_ID + --rdzv_backend=c10d + --rdzv_endpoint=HOST_NODE_ADDR + YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...) ``HOST_NODE_ADDR``, in form [:] (e.g. node1.example.com:29400), specifies the node and the port on which the C10d rendezvous backend should be @@ -46,6 +46,6 @@ ideally you should pick a node that has a high bandwidth. Learn more about writing your distributed training script `here `_. -If ``torch.distributed.run`` does not meet your requirements you may use our -APIs directly for more powerful customization. Start by taking a look at the -`elastic agent `_ API). +If ``torchrun`` does not meet your requirements you may use our APIs directly +for more powerful customization. Start by taking a look at the +`elastic agent `_ API. diff --git a/docs/source/elastic/run.rst b/docs/source/elastic/run.rst index fb870fae41f58..284fc7f755311 100644 --- a/docs/source/elastic/run.rst +++ b/docs/source/elastic/run.rst @@ -1,6 +1,6 @@ .. _launcher-api: -torch.distributed.run (Elastic Launch) +torchrun (Elastic Launch) ====================================== .. automodule:: torch.distributed.run diff --git a/docs/source/elastic/train_script.rst b/docs/source/elastic/train_script.rst index 263f2df659574..04225d79067a8 100644 --- a/docs/source/elastic/train_script.rst +++ b/docs/source/elastic/train_script.rst @@ -4,7 +4,7 @@ Train script ------------- If your train script works with ``torch.distributed.launch`` it will continue -working with ``torch.distributed.run`` with these differences: +working with ``torchrun`` with these differences: 1. No need to manually pass ``RANK``, ``WORLD_SIZE``, ``MASTER_ADDR``, and ``MASTER_PORT``. diff --git a/setup.py b/setup.py index a20098232af3c..6d9ed53dc66aa 100644 --- a/setup.py +++ b/setup.py @@ -854,6 +854,7 @@ def make_relative_rpath_args(path): 'console_scripts': [ 'convert-caffe2-to-onnx = caffe2.python.onnx.bin.conversion:caffe2_to_onnx', 'convert-onnx-to-caffe2 = caffe2.python.onnx.bin.conversion:onnx_to_caffe2', + 'torchrun = torch.distributed.run:main', ] } diff --git a/torch/distributed/launch.py b/torch/distributed/launch.py index 4f29edd10d521..6173abb2c9ecf 100644 --- a/torch/distributed/launch.py +++ b/torch/distributed/launch.py @@ -4,7 +4,7 @@ .. warning:: - This module is going to be deprecated in favor of :ref:`torch.distributed.run `. + This module is going to be deprecated in favor of :ref:`torchrun `. The utility can be used for single-node distributed training, in which one or more processes per node will be spawned. The utility can be used for either @@ -177,8 +177,8 @@ def launch(args): def main(args=None): warnings.warn( "The module torch.distributed.launch is deprecated\n" - "and will be removed in future. Use torch.distributed.run.\n" - "Note that --use_env is set by default in torch.distributed.run.\n" + "and will be removed in future. Use torchrun.\n" + "Note that --use_env is set by default in torchrun.\n" "If your script expects `--local_rank` argument to be set, please\n" "change it to read from `os.environ['LOCAL_RANK']` instead. See \n" "https://pytorch.org/docs/stable/distributed.html#launch-utility for \n" diff --git a/torch/distributed/run.py b/torch/distributed/run.py index d4428a0cde3c1..c6e84d6f65f4b 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -7,7 +7,7 @@ # LICENSE file in the root directory of this source tree. """ -``torch.distributed.run`` provides a superset of the functionality as ``torch.distributed.launch`` +``torchrun`` provides a superset of the functionality as ``torch.distributed.launch`` with the following additional functionalities: 1. Worker failures are handled gracefully by restarting all workers. @@ -18,33 +18,33 @@ -Transitioning from torch.distributed.launch to torch.distributed.run +Transitioning from torch.distributed.launch to torchrun ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``torch.distributed.run`` supports the same arguments as ``torch.distributed.launch`` **except** +``torchrun`` supports the same arguments as ``torch.distributed.launch`` **except** for ``--use_env`` which is now deprecated. To migrate from ``torch.distributed.launch`` -to ``torch.distributed.run`` follow these steps: +to ``torchrun`` follow these steps: 1. If your training script is already reading ``local_rank`` from the ``LOCAL_RANK`` environment variable. Then you need simply omit the ``--use_env`` flag, e.g.: - +--------------------------------------------------------------------+------------------------------------------------------+ - | ``torch.distributed.launch`` | ``torch.distributed.run`` | - +====================================================================+======================================================+ - | | | - | .. code-block:: shell-session | .. code-block:: shell-session | - | | | - | $ python -m torch.distributed.launch --use_env train_script.py | $ python -m torch.distributed.run train_script.py | - | | | - +--------------------------------------------------------------------+------------------------------------------------------+ + +--------------------------------------------------------------------+--------------------------------------------+ + | ``torch.distributed.launch`` | ``torchrun`` | + +====================================================================+============================================+ + | | | + | .. code-block:: shell-session | .. code-block:: shell-session | + | | | + | $ python -m torch.distributed.launch --use_env train_script.py | $ torchrun train_script.py | + | | | + +--------------------------------------------------------------------+--------------------------------------------+ 2. If your training script reads local rank from a ``--local_rank`` cmd argument. Change your training script to read from the ``LOCAL_RANK`` environment variable as demonstrated by the following code snippet: +-------------------------------------------------------+----------------------------------------------------+ - | ``torch.distributed.launch`` | ``torch.distributed.run`` | + | ``torch.distributed.launch`` | ``torchrun`` | +=======================================================+====================================================+ | | | | .. code-block:: python | .. code-block:: python | @@ -59,12 +59,12 @@ | | | +-------------------------------------------------------+----------------------------------------------------+ -The aformentioned changes suffice to migrate from ``torch.distributed.launch`` to ``torch.distributed.run``. -To take advantage of new features such as elasticity, fault-tolerance, and error reporting of ``torch.distributed.run`` +The aformentioned changes suffice to migrate from ``torch.distributed.launch`` to ``torchrun``. +To take advantage of new features such as elasticity, fault-tolerance, and error reporting of ``torchrun`` please refer to: -* :ref:`elastic_train_script` for more information on authoring training scripts that are ``torch.distributed.run`` compliant. -* the rest of this page for more information on the features of ``torch.distributed.run``. +* :ref:`elastic_train_script` for more information on authoring training scripts that are ``torchrun`` compliant. +* the rest of this page for more information on the features of ``torchrun``. @@ -75,7 +75,7 @@ :: - >>> python -m torch.distributed.run + >>> torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_TRAINERS @@ -85,7 +85,7 @@ :: - >>> python -m torch.distributed.run + >>> torchrun --nnodes=$NUM_NODES --nproc_per_node=$NUM_TRAINERS --rdzv_id=$JOB_ID @@ -104,7 +104,7 @@ :: - >>> python -m torch.distributed.run + >>> torchrun --nnodes=1:4 --nproc_per_node=$NUM_TRAINERS --rdzv_id=$JOB_ID @@ -186,7 +186,7 @@ of the worker is specified in the ``WorkerSpec``. 5. ``LOCAL_WORLD_SIZE`` - The local world size (e.g. number of workers running locally); equals to - ``--nproc_per_node`` specified on ``torch.distributed.run``. + ``--nproc_per_node`` specified on ``torchrun``. 6. ``WORLD_SIZE`` - The world size (total number of workers in the job). From 63c90ec3bf6c9445a36199f65e0523a5e6532b0d Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Thu, 26 Aug 2021 20:54:54 -0700 Subject: [PATCH 281/530] [torch/deploy] add torch.distributed to build (#63918) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63918 Previously we were building with `USE_DISTRIBUTED` off, because c10d was built as a separately library for historical reasons. Since then, lw has merged the c10d build into libtorch, so this is fairly easy to turn on. Differential Revision: D30492442 **NOTE FOR REVIEWERS**: This PR has internal Facebook specific changes or comments, please review them on [Phabricator](https://our.intern.facebook.com/intern/diff/D30492442/)! D30492442 D30492442 Test Plan: added a unit test Reviewed By: wconstab Pulled By: suo fbshipit-source-id: 843b8fcf349a72a7f6fcbd1fcc8961268690fb8c --- tools/build_variables.bzl | 5 +- torch/CMakeLists.txt | 99 +++++++---- .../csrc/deploy/example/generate_examples.py | 3 + torch/csrc/deploy/test_deploy.cpp | 12 ++ torch/csrc/deploy/test_deploy_gpu.cpp | 12 ++ torch/csrc/distributed/c10d/frontend.cpp | 161 +++++------------- torch/csrc/distributed/c10d/frontend.hpp | 24 ++- torch/csrc/distributed/c10d/frontend_cuda.cpp | 136 +++++++++++++++ torch/csrc/distributed/c10d/frontend_cuda.hpp | 12 ++ torch/csrc/distributed/c10d/init.cpp | 28 +-- .../distributed/rpc/request_callback_impl.cpp | 1 + 11 files changed, 315 insertions(+), 178 deletions(-) create mode 100644 torch/csrc/distributed/c10d/frontend_cuda.cpp create mode 100644 torch/csrc/distributed/c10d/frontend_cuda.hpp diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 3f6225358ac97..650830b3143f0 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -340,6 +340,7 @@ libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_tr # These files are the only ones that are supported on Windows. libtorch_distributed_base_sources = [ + "torch/csrc/distributed/c10d/frontend.cpp", "torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/default_comm_hooks.cpp", "torch/csrc/distributed/c10d/FileStore.cpp", @@ -351,6 +352,7 @@ libtorch_distributed_base_sources = [ "torch/csrc/distributed/c10d/ProcessGroupGloo.cpp", "torch/csrc/distributed/c10d/ProcessGroupMPI.cpp", "torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp", + "torch/csrc/distributed/c10d/quantization/quantization.cpp", "torch/csrc/distributed/c10d/reducer.cpp", "torch/csrc/distributed/c10d/sequence_num.cpp", "torch/csrc/distributed/c10d/Store.cpp", @@ -548,6 +550,7 @@ libtorch_cuda_distributed_base_sources = [ # These files are only supported on Linux (and others) but not on Windows. libtorch_cuda_distributed_extra_sources = [ + "torch/csrc/distributed/c10d/frontend_cuda.cpp", "torch/csrc/distributed/c10d/NCCLUtils.cpp", "torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp", "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp", @@ -735,10 +738,8 @@ libtorch_python_core_sources = [ ] libtorch_python_distributed_core_sources = [ - "torch/csrc/distributed/c10d/frontend.cpp", "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/python_comm_hook.cpp", - "torch/csrc/distributed/c10d/quantization/quantization.cpp", ] libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 761605fadcce8..7c086855612ca 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -214,11 +214,78 @@ add_custom_command( WORKING_DIRECTORY "${TORCH_ROOT}" ) +if(USE_DISTRIBUTED) + if(WIN32) + append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) + else() + append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS) + endif() + # Disable certain warnings for GCC-9.X + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + endif() + # NCCL is a private dependency of libtorch, but libtorch_python includes + # some private headers of libtorch, which in turn include NCCL. As a hacky + # alternative to making NCCL a public dependency of libtorch, we make it + # a private dependency of libtorch_python as well. + if(USE_NCCL) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) + endif() + # Same for MPI. + if(USE_MPI) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES}) + endif() + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) + +endif() + +if(USE_NCCL AND NOT WIN32) + list(APPEND TORCH_PYTHON_SRCS + ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) +endif() + # WARNING- any TORCH_PYTHON_COMPILE_DEFINITIONS above this line # affect both torch_python and DEPLOY interpreter. if(USE_DEPLOY) add_library(torch_python_obj OBJECT ${TORCH_PYTHON_SRCS}) + if(USE_DISTRIBUTED) + # Set c10d-related compile definitions. For a "normal" build of + # libtorch_python, these are set on libtorch as PUBLIC so they are + # automatically propagated when libtorch_python links against libtorch. But + # since in the deploy build we are intentionally *not* linking against + # libtorch, we need to set them manually here. + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) + if(USE_GLOO AND USE_C10D_GLOO) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D_GLOO) + endif() + if(USE_NCCL AND USE_C10D_NCCL) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D_NCCL) + # Put nccl headers on the include path. We are specifically only setting + # include dirs here instead of linking against __caffe2_nccl wholesale + # to ensure we aren't accidentally replicating the nccl lib. + target_include_directories(torch_python_obj PRIVATE $) + endif() + if(USE_MPI AND USE_C10D_MPI) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D_MPI) + endif() + + # Pass USE_RPC in order to reduce use of + # #if defined(USE_DISTRIBUTED) && !defined(_WIN32) + # need to be removed when RPC is supported + if(NOT WIN32) + target_compile_definitions(torch_cpu PUBLIC USE_RPC) + endif() + if(USE_TENSORPIPE) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_TENSORPIPE) + endif() + + # Set c10d-related include directories as well. + target_include_directories(torch_python_obj PRIVATE $) + endif() target_compile_definitions(torch_python_obj PRIVATE "-DTHP_BUILD_MAIN_LIB -DUSE_DEPLOY") target_compile_definitions(torch_python_obj PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS}) @@ -268,38 +335,6 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set_source_files_properties(${TORCH_SRC_DIR}/csrc/utils/throughput_benchmark.cpp PROPERTIES COMPILE_FLAGS -Wno-attributes) endif() -if(USE_DISTRIBUTED) - if(WIN32) - append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) - else() - append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS) - endif() - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - endif() - # NCCL is a private dependency of libtorch, but libtorch_python includes - # some private headers of libtorch, which in turn include NCCL. As a hacky - # alternative to making NCCL a public dependency of libtorch, we make it - # a private dependency of libtorch_python as well. - if(USE_NCCL) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) - endif() - # Same for MPI. - if(USE_MPI) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES}) - endif() - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) -endif() - -if(USE_NCCL AND NOT WIN32) - list(APPEND TORCH_PYTHON_SRCS - ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) -endif() - add_library(torch_python SHARED ${TORCH_PYTHON_SRCS}) if(HAVE_SOVERSION) set_target_properties(torch_python PROPERTIES diff --git a/torch/csrc/deploy/example/generate_examples.py b/torch/csrc/deploy/example/generate_examples.py index 65f244373d954..0f279d922157c 100644 --- a/torch/csrc/deploy/example/generate_examples.py +++ b/torch/csrc/deploy/example/generate_examples.py @@ -79,3 +79,6 @@ def save(name, model, model_jit=None, eg=None, featurestore_meta=None): e.save_pickle("fn", "fn.pkl", load_library) generate_fx_example() + + with PackageExporter(p / "uses_distributed") as e: + e.save_source_string("uses_distributed", "import torch.distributed; assert torch.distributed.is_available()") diff --git a/torch/csrc/deploy/test_deploy.cpp b/torch/csrc/deploy/test_deploy.cpp index a004db1e0d232..53456cacca2ad 100644 --- a/torch/csrc/deploy/test_deploy.cpp +++ b/torch/csrc/deploy/test_deploy.cpp @@ -366,3 +366,15 @@ TEST(TorchpyTest, SharedLibraryLoad) { } } #endif + +TEST(TorchpyTest, UsesDistributed) { + const auto model_filename = path( + "USES_DISTRIBUTED", + "torch/csrc/deploy/example/generated/uses_distributed"); + torch::deploy::InterpreterManager m(1); + torch::deploy::Package p = m.load_package(model_filename); + { + auto I = p.acquire_session(); + I.self.attr("import_module")({"uses_distributed"}); + } +} diff --git a/torch/csrc/deploy/test_deploy_gpu.cpp b/torch/csrc/deploy/test_deploy_gpu.cpp index 8287d1683edca..4e990adcd9e89 100644 --- a/torch/csrc/deploy/test_deploy_gpu.cpp +++ b/torch/csrc/deploy/test_deploy_gpu.cpp @@ -53,3 +53,15 @@ TEST(TorchDeployGPUTest, SimpleModel) { ASSERT_TRUE(ref_output.allclose(output, 1e-03, 1e-05)); } + +TEST(TorchDeployGPUTest, UsesDistributed) { + const auto model_filename = path( + "USES_DISTRIBUTED", + "torch/csrc/deploy/example/generated/uses_distributed"); + torch::deploy::InterpreterManager m(1); + torch::deploy::Package p = m.load_package(model_filename); + { + auto I = p.acquire_session(); + I.self.attr("import_module")({"uses_distributed"}); + } +} diff --git a/torch/csrc/distributed/c10d/frontend.cpp b/torch/csrc/distributed/c10d/frontend.cpp index b65cba79884af..e5b59f28982f6 100644 --- a/torch/csrc/distributed/c10d/frontend.cpp +++ b/torch/csrc/distributed/c10d/frontend.cpp @@ -3,10 +3,11 @@ #include #include #include -#include #include #include #include +#include +#include #include #include @@ -17,10 +18,6 @@ #include #endif -#ifdef USE_C10D_NCCL -#include -#endif - #ifdef USE_C10D_MPI #include #endif @@ -29,6 +26,20 @@ namespace c10d { namespace { +// Constant initialization, so it is guaranteed to be initialized before +// static initialization calls which may invoke registerNCCLProcessGroupProvider +const NCCLProcessGroupProvider stubProvider; +constexpr const NCCLProcessGroupProvider* defaultStubProviderAddr = + &stubProvider; +inline const NCCLProcessGroupProvider*& getNCCLProcessGroupProviderAddress() { + static const NCCLProcessGroupProvider* stubs_ = defaultStubProviderAddr; + return stubs_; +} + +const NCCLProcessGroupProvider* GetNCCLProcessGroupProvider() { + return getNCCLProcessGroupProviderAddress(); +} + void maybePreprocessComplexTensor(at::Tensor& tensor) { if(!tensor.is_complex()) { return; @@ -63,6 +74,11 @@ void assertReduceOpSupportsComplexTensor(ReduceOp op) { } // namespace anonymous +void registerNCCLProcessGroupProvider(NCCLProcessGroupProvider* provider) { + getNCCLProcessGroupProviderAddress() = provider; +} + + std::string Backend::get(const std::string& backend_type) { return backend_type; } @@ -207,17 +223,7 @@ c10::intrusive_ptr DistributedC10d::newProcessGroupHelper( "Attempting to create GLOO-based process group while GLOO is either not enabled or built"); #endif // USE_C10D_GLOO } else if (backend == "nccl") { -#ifdef USE_C10D_NCCL - auto options = ProcessGroupNCCL::Options::create(); - - options->is_high_priority_stream = false; - options->timeout = timeout; - pg = c10::make_intrusive( - prefix_store, rank, world_size, options); -#else - AT_ERROR( - "Attempting to create NCCL-based process group while NCCL is either not enabled or built"); -#endif // USE_C10D_NCCL + pg = GetNCCLProcessGroupProvider()->get(prefix_store, rank, world_size, timeout); } else { // TODO: discuss to figure out how to extend this to third party backends? AT_ERROR("Unsupported backend type: ", backend); @@ -1008,7 +1014,7 @@ void initCustomClassBindings() { .def( "broadcast", [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self, - std::vector data) { return self->broadcast(data); + std::vector data) { return self->broadcast(data); }) */ .def( @@ -1045,14 +1051,14 @@ void initCustomClassBindings() { .def( "allreduce", [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self, - at::Tensor& tensor, - c10::intrusive_ptr<::c10d::ReduceOp> op) { + at::Tensor& tensor, + c10::intrusive_ptr<::c10d::ReduceOp> op) { ::c10d::AllreduceOptions opts; opts.reduceOp = *op; std::vector tensors = {tensor}; return self->allreduce(tensors, opts); - } - ) + } + ) */ // TODO: make AllreduceCoalescedOptions compatible with TorchBind to // provide the full API in python. @@ -1098,8 +1104,8 @@ void initCustomClassBindings() { .def( "allgather", [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self, - std::vector output, - at::Tensor input) { + std::vector output, + at::Tensor input) { std::vector> outputs = { std::move(output)}; std::vector inputs = {std::move(input)}; @@ -1121,8 +1127,8 @@ void initCustomClassBindings() { .def( "gather", [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self, - std::vector> output_tensors, - std::vector input_tensors) { + std::vector> output_tensors, + std::vector input_tensors) { ::c10d::GatherOptions opts; return self->gather(output_tensors, input_tensors, opts); }) @@ -1145,8 +1151,8 @@ void initCustomClassBindings() { .def( "scatter", [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self, - std::vector outputTensors, - std::vector> inputTensors) { + std::vector outputTensors, + std::vector> inputTensors) { ::c10d::ScatterOptions opts; self->scatter(outputTensors, inputTensors, opts); }) @@ -1169,8 +1175,8 @@ void initCustomClassBindings() { // TODO: Enable this method when TorchBind supports ReduceScatterOptions. .def( "reduce_scatter", [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self, - std::vector outputTensors, - std::vector> inputTensors) { + std::vector outputTensors, + std::vector> inputTensors) { ::c10d::ReduceScatterOptions opts; return self->reduce_scatter(outputTensors, inputTensors, opts); }) @@ -1241,95 +1247,6 @@ void initCustomClassBindings() { return self->barrier(opts); }); -#ifdef USE_C10D_NCCL - // XXX: Ideally the Options of ProcessGroupNCCL should be - // bound using `def_readwrite` like in pybind11, but we - // didn't do that because: 1. no milisecond support yet - // 2. no def_readwrite or property support yet. - // TODO: make this binding the same as pybind11 - static const auto ProcessGroupNCCLOptionsTorchBind = - torch::class_<::c10d::ProcessGroupNCCL::Options>( - "dist_c10d", "ProcessGroupNCCLOptions") - .def(torch::init([](int64_t timeout, bool isHighPriorityStream) { - auto opTimeout = std::chrono::milliseconds(timeout); - auto opts = - ::c10d::ProcessGroupNCCL::Options::create(isHighPriorityStream); - opts->timeout = opTimeout; - return opts; - })); - - static const auto ProcessGroupNCCLTorchBind = - torch::class_<::c10d::ProcessGroupNCCL>("dist_c10d", "ProcessGroupNCCL") - .def_pickle( - [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { - auto base_process_group = - ::c10::static_intrusive_pointer_cast<::c10d::ProcessGroup>(self); - auto name = - ::c10d::DistributedC10d::get()->getNameOfProcessGroup(self); - return std::vector{name}; - }, - [](std::vector state) { - TORCH_CHECK( - state.size() == 1, - "Expecting exactly 1 state when restoring ProcessGroupNCCL, got: ", - state.size()); - const auto& process_group_name = state.front(); - auto base_process_group = - ::c10d::DistributedC10d::get()->getProcessGroupByName( - process_group_name); - TORCH_CHECK( - base_process_group.defined(), - "Needed process group not found, ", - "please create a process group with name: ", - process_group_name); - c10::intrusive_ptr<::c10d::ProcessGroupNCCL> - process_group_nccl = ::c10::dynamic_intrusive_pointer_cast< - ::c10d::ProcessGroupNCCL>(base_process_group); - TORCH_CHECK( - process_group_nccl.defined(), - "Process group ", - process_group_name, - " isn't configured for NCCL backend"); - return process_group_nccl; - }) - .def(torch::init( - [](const c10::intrusive_ptr<::c10d::Store>& store, - int64_t rank, - int64_t size, - c10::intrusive_ptr<::c10d::ProcessGroupNCCL::Options> options, - const std::string& name) { - auto pg = c10::make_intrusive<::c10d::ProcessGroupNCCL>( - store, rank, size, options); - ::c10d::DistributedC10d::get()->registerProcessGroupName( - pg, name); - return pg; - })) - .def( - "alltoall_base", - [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self, - at::Tensor output, - at::Tensor input, - std::vector outputSplitSizes, - std::vector inputSplitSizes) { - return self->alltoall_base( - output, - input, - outputSplitSizes, - inputSplitSizes, - ::c10d::AllToAllOptions()); - }) - .def( - "size", - [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { - return (int64_t)self->getSize(); - }) - .def( - "rank", - [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { - return (int64_t)self->getRank(); - }); -#endif - static const auto DistributedC10dFrontendTorchBind = torch::class_<::c10d::DistributedC10d>("dist_c10d", "frontend") .def(torch::init([]() { return ::c10d::DistributedC10d::get(); })) @@ -1344,4 +1261,12 @@ void initCustomClassBindings() { &::c10d::DistributedC10d::getNameOfProcessGroup); } +TORCH_LIBRARY(q, m) { + m.def("_Bfloat16QuantizedToFloat(Tensor input) -> Tensor"); + m.def("_FloatToBfloat16Quantized(Tensor input) -> Tensor"); +} +TORCH_LIBRARY_IMPL(q, CPU, m) { + m.impl("_Bfloat16QuantizedToFloat", ::torch::distributed::c10d::quantization::_bfloat16_to_float_cpu); + m.impl("_FloatToBfloat16Quantized", ::torch::distributed::c10d::quantization::_float_to_bfloat16_cpu); +} } // namespace c10d diff --git a/torch/csrc/distributed/c10d/frontend.hpp b/torch/csrc/distributed/c10d/frontend.hpp index c90cc077b2823..b39d8b7a444bf 100644 --- a/torch/csrc/distributed/c10d/frontend.hpp +++ b/torch/csrc/distributed/c10d/frontend.hpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -259,7 +260,26 @@ class TORCH_PYTHON_API DistributedC10d : public torch::CustomClassHolder { int64_t group_count_; }; -// Must be called to initialize Torchbind bindings for c10d. -void initCustomClassBindings(); +// This class exists as a way to allow us to split NCCL-specific code into a +// different file. frontend_cuda.cpp will, if USE_C10D_NCCL is defined, +// override this NCCLProcessGroupProvider with one that will actually do +// something. +struct TORCH_API NCCLProcessGroupProvider { + virtual c10::intrusive_ptr get( + c10::intrusive_ptr /*prefix_store*/, + int64_t /*rank*/, + int64_t /*world_size*/, + std::chrono::milliseconds /*timeout*/) const { + AT_ERROR( + "Attempting to create NCCL-based process group while NCCL is either not enabled or built"); + } + + virtual ~NCCLProcessGroupProvider() = default; +}; + +TORCH_API void registerNCCLProcessGroupProvider( + NCCLProcessGroupProvider* provider); + +TORCH_API void initCustomClassBindings(); } // namespace c10d diff --git a/torch/csrc/distributed/c10d/frontend_cuda.cpp b/torch/csrc/distributed/c10d/frontend_cuda.cpp new file mode 100644 index 0000000000000..1b42f13b3c8df --- /dev/null +++ b/torch/csrc/distributed/c10d/frontend_cuda.cpp @@ -0,0 +1,136 @@ +#include + +#ifdef USE_C10D_NCCL + +#include +#include +#include +#include +#include + +namespace c10d { + +void initCustomClassBindingsNccl() { + // XXX: Ideally the Options of ProcessGroupNCCL should be + // bound using `def_readwrite` like in pybind11, but we + // didn't do that because: 1. no milisecond support yet + // 2. no def_readwrite or property support yet. + // TODO: make this binding the same as pybind11 + static const auto ProcessGroupNCCLOptionsTorchBind = + torch::class_<::c10d::ProcessGroupNCCL::Options>( + "dist_c10d", "ProcessGroupNCCLOptions") + .def(torch::init([](int64_t timeout, bool isHighPriorityStream) { + auto opTimeout = std::chrono::milliseconds(timeout); + auto opts = + ::c10d::ProcessGroupNCCL::Options::create(isHighPriorityStream); + opts->timeout = opTimeout; + return opts; + })); + + static const auto ProcessGroupNCCLTorchBind = + torch::class_<::c10d::ProcessGroupNCCL>("dist_c10d", "ProcessGroupNCCL") + .def_pickle( + [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + auto base_process_group = + ::c10::static_intrusive_pointer_cast<::c10d::ProcessGroup>( + self); + auto name = + ::c10d::DistributedC10d::get()->getNameOfProcessGroup(self); + return std::vector{name}; + }, + [](std::vector state) { + TORCH_CHECK( + state.size() == 1, + "Expecting exactly 1 state when restoring ProcessGroupNCCL, got: ", + state.size()); + const auto& process_group_name = state.front(); + auto base_process_group = + ::c10d::DistributedC10d::get()->getProcessGroupByName( + process_group_name); + TORCH_CHECK( + base_process_group.defined(), + "Needed process group not found, ", + "please create a process group with name: ", + process_group_name); + c10::intrusive_ptr<::c10d::ProcessGroupNCCL> + process_group_nccl = ::c10::dynamic_intrusive_pointer_cast< + ::c10d::ProcessGroupNCCL>(base_process_group); + TORCH_CHECK( + process_group_nccl.defined(), + "Process group ", + process_group_name, + " isn't configured for NCCL backend"); + return process_group_nccl; + }) + .def(torch::init( + [](const c10::intrusive_ptr<::c10d::Store>& store, + int64_t rank, + int64_t size, + c10::intrusive_ptr<::c10d::ProcessGroupNCCL::Options> options, + const std::string& name) { + auto pg = c10::make_intrusive<::c10d::ProcessGroupNCCL>( + store, rank, size, options); + ::c10d::DistributedC10d::get()->registerProcessGroupName( + pg, name); + return pg; + })) + .def( + "alltoall_base", + [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self, + at::Tensor output, + at::Tensor input, + std::vector outputSplitSizes, + std::vector inputSplitSizes) { + return self->alltoall_base( + output, + input, + outputSplitSizes, + inputSplitSizes, + ::c10d::AllToAllOptions()); + }) + .def( + "size", + [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + return (int64_t)self->getSize(); + }) + .def( + "rank", + [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + return (int64_t)self->getRank(); + }); +} + +namespace { +struct RealNCCLProcessGroupProvider : public NCCLProcessGroupProvider { + c10::intrusive_ptr get( + c10::intrusive_ptr prefix_store, + int64_t rank, + int64_t world_size, + std::chrono::milliseconds timeout) const override { + auto options = ProcessGroupNCCL::Options::create(); + options->is_high_priority_stream = false; + options->timeout = timeout; + return c10::make_intrusive( + prefix_store, rank, world_size, options); + } +}; + +struct RegisterNCCLProcessGroupProvider { + RegisterNCCLProcessGroupProvider() { + static RealNCCLProcessGroupProvider provider; + registerNCCLProcessGroupProvider(&provider); + } +}; + +RegisterNCCLProcessGroupProvider reg; + +} // namespace +#define DISPATCH_TO_CUDA(name, function) \ + m.impl(name, torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(function))) +TORCH_LIBRARY_IMPL(q, CUDA, m) { + DISPATCH_TO_CUDA("_Bfloat16QuantizedToFloat", ::torch::distributed::c10d::quantization::_bfloat16_to_float_cuda); + DISPATCH_TO_CUDA("_FloatToBfloat16Quantized", ::torch::distributed::c10d::quantization::_float_to_bfloat16_cuda); +} +} // namespace c10d + +#endif // USE_C10D_NCCL diff --git a/torch/csrc/distributed/c10d/frontend_cuda.hpp b/torch/csrc/distributed/c10d/frontend_cuda.hpp new file mode 100644 index 0000000000000..a790f2e847b0d --- /dev/null +++ b/torch/csrc/distributed/c10d/frontend_cuda.hpp @@ -0,0 +1,12 @@ +#pragma once + +#ifdef USE_C10D_NCCL +#include + +namespace c10d { + +TORCH_API void initCustomClassBindingsNccl(); + +} + +#endif diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 6b52d3c058384..4bac0ca46edc4 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -17,7 +17,7 @@ #ifdef USE_C10D_NCCL #include -#include +#include #endif #ifdef USE_C10D_MPI @@ -35,7 +35,6 @@ #include #include -#include #include #include #include @@ -233,6 +232,9 @@ void _register_builtin_comm_hook( PyObject* c10d_init(PyObject* _unused, PyObject* noargs) { C10_LOG_API_USAGE_ONCE("c10d.python.import"); ::c10d::initCustomClassBindings(); +#ifdef USE_C10D_NCCL + ::c10d::initCustomClassBindingsNccl(); +#endif auto c10d_module = THPObjectPtr(PyImport_ImportModule("torch.distributed")); if (!c10d_module) { @@ -1646,28 +1648,6 @@ static PyMethodDef methods[] = { // NOLINT PyMethodDef* python_functions() { return methods; } - -namespace quantization { -TORCH_LIBRARY(q, m) { - m.def("_Bfloat16QuantizedToFloat(Tensor input) -> Tensor"); - m.def("_FloatToBfloat16Quantized(Tensor input) -> Tensor"); -} - TORCH_LIBRARY_IMPL(q, CPU, m) { - m.impl("_Bfloat16QuantizedToFloat", _bfloat16_to_float_cpu); - m.impl("_FloatToBfloat16Quantized", _float_to_bfloat16_cpu); - } - -#ifdef USE_C10D_NCCL - #define DISPATCH_TO_CUDA(name, function) \ - m.impl(name, torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(function))) - TORCH_LIBRARY_IMPL(q, CUDA, m) { - DISPATCH_TO_CUDA("_Bfloat16QuantizedToFloat", _bfloat16_to_float_cuda); - DISPATCH_TO_CUDA("_FloatToBfloat16Quantized", _float_to_bfloat16_cuda); - } -#endif - -} // namespace quantization - } // namespace c10d } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/rpc/request_callback_impl.cpp b/torch/csrc/distributed/rpc/request_callback_impl.cpp index 7001209be9851..5fbe63ede321c 100644 --- a/torch/csrc/distributed/rpc/request_callback_impl.cpp +++ b/torch/csrc/distributed/rpc/request_callback_impl.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include From c7027f19efbb2f7b274c9e5fc0e87fe4b084e6ae Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Thu, 26 Aug 2021 21:05:56 -0700 Subject: [PATCH 282/530] [quant][fx] Add support for dynamic linear + relu fusion (INT8) (#63799) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63799 Add a new module that can be used for module swap with the nni.LinearReLU module in convert function. Supports INT8 currently (since FP16 op doesn't have relu fusion yet). Fixes #55393 Test Plan: python test/test_quantization.py test_dynamic_fusion Imported from OSS Reviewed By: heitorschueroff Differential Revision: D30502812 fbshipit-source-id: 3668e4f001a0626d469e17ac323acf582ee28a51 --- .../eager/test_quantize_eager_ptq.py | 18 +++++++ test/quantization/fx/test_quantize_fx.py | 54 ++++++++++++++++++- .../intrinsic/quantized/dynamic/__init__.py | 1 + .../quantized/dynamic/modules/__init__.py | 6 +++ .../quantized/dynamic/modules/linear_relu.py | 47 ++++++++++++++++ torch/nn/quantized/dynamic/modules/linear.py | 9 +++- .../quantization/fx/quantization_patterns.py | 13 +++-- torch/quantization/ns/mappings.py | 3 ++ torch/quantization/quantization_mappings.py | 2 + .../testing/_internal/common_quantization.py | 8 +++ 10 files changed, 154 insertions(+), 7 deletions(-) create mode 100644 torch/nn/intrinsic/quantized/dynamic/__init__.py create mode 100644 torch/nn/intrinsic/quantized/dynamic/modules/__init__.py create mode 100644 torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py index 1824da514b733..10cbd928b2b36 100644 --- a/test/quantization/eager/test_quantize_eager_ptq.py +++ b/test/quantization/eager/test_quantize_eager_ptq.py @@ -42,6 +42,7 @@ EmbeddingBagModule, EmbeddingModule, EmbeddingWithLinear, + LinearReluLinearModel, ) # annotated models @@ -995,6 +996,23 @@ def checkQuantized(model): model = quantize_dynamic(NestedModel().eval(), qconfig_dict) checkQuantized(model) + def test_linear_relu_fusion(self): + dtype = torch.qint8 + model = LinearReluLinearModel().eval() + qconfig = default_dynamic_qconfig + qconfig_dict = {'' : qconfig} + torch.quantization.fuse_modules(model, [['fc1', 'relu']], inplace=True) + prepare_dynamic(model, qconfig_dict) + convert_dynamic(model) + + def checkQuantized(model): + self.checkDynamicQuantizedLinearRelu(model.fc1, dtype) + self.checkDynamicQuantizedLinear(model.fc2, dtype) + self.checkScriptable(model, self.calib_data, check_save_load=True) + self.checkNoQconfig(model) + + checkQuantized(model) + @given(qconfig=st.sampled_from([per_channel_dynamic_qconfig, default_dynamic_qconfig]), dtype=st.sampled_from([torch.qint8, torch.float16])) def test_quantized_rnn(self, qconfig, dtype): diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 08474d2bc1d19..cdf2e7bea4328 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -6,6 +6,7 @@ import torch.nn.quantized.dynamic as nnqd import torch.nn.intrinsic as nni import torch.nn.intrinsic.quantized as nniq +import torch.nn.intrinsic.quantized.dynamic as nniqd import torch.multiprocessing as mp # graph mode quantization based on fx @@ -2883,6 +2884,57 @@ def forward(self, x): self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence) self.checkGraphModuleNodes(m_ref, expected_node_occurrence=node_occurrence_ref) + @skipIfNoFBGEMM + def test_dynamic_with_fusion(self): + """ + Tests that dynamic quantization APIs work with Linear + Relu fusion + """ + class LinearRelu(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 5) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = self.linear(x) + return self.relu(x) + + class Linear(torch.nn.Module): + def __init__(self): + super().__init__() + self.w = torch.ones(5, 5) + self.b = torch.zeros(5) + + def forward(self, x): + return torch.nn.functional.linear(x, self.w, self.b) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.mods1 = torch.nn.Sequential(LinearRelu(), LinearRelu()) + self.mods2 = Linear() + self.relu = F.relu + + def forward(self, x): + x = self.mods1(x) + x = self.mods2(x) + x = self.relu(x) + return x + + model = M().eval() + qconfig = { + "": default_dynamic_qconfig, + } + m = prepare_fx(model, qconfig) + m = convert_fx(m) + m(torch.rand(5, 5)) + node_list = [ + ns.call_module(nniqd.LinearReLU), + ns.call_module(nniqd.LinearReLU), + ns.call_function(torch.ops.quantized.linear_relu_dynamic), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops @@ -2956,7 +3008,7 @@ def forward(self, x): } quant_type_to_qlinear_relu_fun = { # we don't have linear_relu_dynamic - QuantType.DYNAMIC: ns.call_function(torch.ops.quantized.linear_dynamic), + QuantType.DYNAMIC: ns.call_function(torch.ops.quantized.linear_relu_dynamic), QuantType.STATIC: ns.call_function(torch.ops.quantized.linear_relu), QuantType.QAT: ns.call_function(torch.ops.quantized.linear_relu), } diff --git a/torch/nn/intrinsic/quantized/dynamic/__init__.py b/torch/nn/intrinsic/quantized/dynamic/__init__.py new file mode 100644 index 0000000000000..3d79bdbfe8320 --- /dev/null +++ b/torch/nn/intrinsic/quantized/dynamic/__init__.py @@ -0,0 +1 @@ +from .modules import * # noqa: F403 diff --git a/torch/nn/intrinsic/quantized/dynamic/modules/__init__.py b/torch/nn/intrinsic/quantized/dynamic/modules/__init__.py new file mode 100644 index 0000000000000..ce571862b4275 --- /dev/null +++ b/torch/nn/intrinsic/quantized/dynamic/modules/__init__.py @@ -0,0 +1,6 @@ +import torch +from .linear_relu import LinearReLU + +__all__ = [ + 'LinearReLU', +] diff --git a/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py b/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py new file mode 100644 index 0000000000000..04c4c954810fd --- /dev/null +++ b/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py @@ -0,0 +1,47 @@ +import torch +import torch.nn.quantized.dynamic as nnqd +import torch.nn.intrinsic as nni + +class LinearReLU(nnqd.Linear): + r""" + A LinearReLU module fused from Linear and ReLU modules that can be used + for dynamic quantization. + Supports both, FP16 and INT8 quantization. + + We adopt the same interface as :class:`torch.nn.quantized.dynamic.Linear`. + + Attributes: + Same as torch.nn.quantized.dynamic.Linear + + Examples:: + + >>> m = nn.intrinsic.quantized.dynamic.LinearReLU(20, 30) + >>> input = torch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + _FLOAT_MODULE = nni.LinearReLU # type: ignore[assignment] + + def __init__(self, in_features, out_features, bias=True, dtype=torch.qint8): + super().__init__(in_features, out_features, bias, dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self._packed_params.dtype == torch.qint8: + # TODO check if we should set reduce_rage = True by default here + Y = torch.ops.quantized.linear_relu_dynamic( + x, self._packed_params._packed_params, reduce_range=True) + # TODO Support this in a later PR + # elif self._packed_params.dtype == torch.float16: + # Y = torch.ops.quantized.linear_relu_dynamic_fp16( + # x, self._packed_params._packed_params) + else: + raise RuntimeError('Unsupported dtype on dynamic quantized linear relu!') + return Y.to(x.dtype) + + def _get_name(self): + return 'DynamicQuantizedLinearReLU' + + @classmethod + def from_float(cls, mod): + return super(LinearReLU, cls).from_float(mod) diff --git a/torch/nn/quantized/dynamic/modules/linear.py b/torch/nn/quantized/dynamic/modules/linear.py index 07cfdfe2846cc..ee153b10d2466 100644 --- a/torch/nn/quantized/dynamic/modules/linear.py +++ b/torch/nn/quantized/dynamic/modules/linear.py @@ -1,5 +1,6 @@ import torch import torch.nn.quantized as nnq +import torch.nn.intrinsic as nni from torch.nn.quantized.modules.utils import _quantize_weight class Linear(nnq.Linear): @@ -79,11 +80,15 @@ def from_float(cls, mod): mod (Module): a float module, either produced by torch.quantization utilities or provided by the user """ - float_modules = [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear] + float_modules = [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear, + torch.nn.intrinsic.modules.fused.LinearReLU] + assert type(mod) in float_modules, \ 'nn.quantized.dynamic.Linear.from_float only works for one of' + \ str([float_mod.__name__ for float_mod in float_modules]) assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + if type(mod) == nni.LinearReLU: + mod = mod[0] if mod.qconfig is not None and mod.qconfig.weight is not None: weight_observer = mod.qconfig.weight() else: @@ -102,6 +107,6 @@ def from_float(cls, mod): qweight = mod.weight.float() else: raise RuntimeError('Unsupported dtype specified for dynamic quantized Linear!') - qlinear = Linear(mod.in_features, mod.out_features, dtype=dtype) + qlinear = cls(mod.in_features, mod.out_features, dtype=dtype) qlinear.set_weight_bias(qweight, mod.bias) return qlinear diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 09ca190a73668..b7c39ca92cead 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -1022,9 +1022,14 @@ def convert(self, elif dtypes in [(torch.float32, torch.qint8, torch.quint8), (torch.float32, torch.float16, None)]: # choose linear dynamic or linear dynamic fp16 op based on weight dtype - qlinear_op = torch.ops.quantized.linear_dynamic \ - if weight_dtype == torch.qint8 \ - else torch.ops.quantized.linear_dynamic_fp16 + if weight_dtype == torch.qint8: + if self.relu_node: + qlinear_op = torch.ops.quantized.linear_relu_dynamic + else: + qlinear_op = torch.ops.quantized.linear_dynamic + else: # TODO add support for fp16 + relu fusion in a later PR + qlinear_op = torch.ops.quantized.linear_dynamic_fp16 + linear_input = load_arg(quantized=torch.float)(self.linear_node.args[0]) qlinear_args = (linear_input, packed_weight) # type: ignore[assignment] op_out = quantized_graph.create_node( @@ -1033,7 +1038,7 @@ def convert(self, # TODO: may need to change the key to Node regenerate the map in each transformation, # since we might not be able to rely on the name node_name_to_scope[op_out.name] = node_name_to_scope[self.linear_node.name] - if self.relu_node: + if self.relu_node and weight_dtype is not torch.qint8: op_out = quantized_graph.create_node("call_function", torch.nn.functional.relu, (op_out,), {}) return op_out else: diff --git a/torch/quantization/ns/mappings.py b/torch/quantization/ns/mappings.py index 399ddca22668e..e97d77119d00e 100644 --- a/torch/quantization/ns/mappings.py +++ b/torch/quantization/ns/mappings.py @@ -8,6 +8,7 @@ import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd import torch.nn.intrinsic.quantized as nniq +import torch.nn.intrinsic.quantized.dynamic as nniqd import torch.nn.intrinsic.qat as nniqat import torch.nn.intrinsic as nni import torch.nn.qat as nnqat @@ -70,6 +71,7 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]: nnq.Linear, nni.LinearReLU, nniq.LinearReLU, + nniqd.LinearReLU, nnqat.Linear, nnqd.Linear, nniqat.LinearReLU, @@ -529,6 +531,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nniqat.ConvReLU2d, nniqat.ConvReLU3d, nniqat.LinearReLU, + nniqd.LinearReLU, ]) MODS_IO_TYPE_INT8: Set[NSNodeTargetType] = set([ diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py index 6179398b7398a..775d40bb23efa 100644 --- a/torch/quantization/quantization_mappings.py +++ b/torch/quantization/quantization_mappings.py @@ -6,6 +6,7 @@ import torch.nn.functional as F import torch.nn.intrinsic as nni import torch.nn.intrinsic.quantized as nniq +import torch.nn.intrinsic.quantized.dynamic as nniqd import torch.nn.intrinsic.quantized._reference as nniqr import torch.nn.intrinsic.qat as nniqat import torch.nn.quantized as nnq @@ -122,6 +123,7 @@ nn.GRU: nnqd.GRU, nn.LSTMCell: nnqd.LSTMCell, nn.RNNCell: nnqd.RNNCell, + nni.LinearReLU: nniqd.LinearReLU, } # Allowlist for propagating the qconfig diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index 6b2d1dd13a33d..77512f7ef445a 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -5,6 +5,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +import torch.nn.intrinsic.quantized.dynamic as nniqd import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd from torch.nn.intrinsic import _FusedModule @@ -422,6 +423,13 @@ def checkDynamicQuantizedLinear(self, mod, dtype): self.assertEqual(type(mod), nnqd.Linear) self.assertEqual(mod._packed_params.dtype, dtype) + def checkDynamicQuantizedLinearRelu(self, mod, dtype): + r"""Checks that mod has been swapped for an nnqd.Linear + module, the bias is float. + """ + self.assertEqual(type(mod), nniqd.LinearReLU) + self.assertEqual(mod._packed_params.dtype, dtype) + def check_eager_serialization(self, ref_model, loaded_model, x): # Check state dict serialization and torch.save APIs model_dict = ref_model.state_dict() From 975f4ccad6fb7ca13c50ee628ec3fb3a77a64893 Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Thu, 26 Aug 2021 21:05:56 -0700 Subject: [PATCH 283/530] [quant] support linear_relu_dynamic for qnnpack backend (#63820) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63820 Adds support in the operator directly to call relu operator if relu fusion is enabled. Once QNNPACK natively supports relu fusion in the linear_dynamic this can be removed Test Plan: python test/test_quantization.py TestDynamicQuantizedLinear.test_qlinear Imported from OSS Reviewed By: vkuzo Differential Revision: D30502813 fbshipit-source-id: 3352ee5f73e482b6d1941f389d720a461b84ba23 --- aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp | 6 ++++++ test/quantization/core/test_quantized_op.py | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp index da64197fb4577..23c6158889db2 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -349,6 +349,12 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) { TORCH_INTERNAL_ASSERT( runStatus == pytorch_qnnp_status_success, "failed to run QNNPACK Linear operator"); + + // Call the relu operator here until qlinear dynamic in QNNPACK + // supports it natively. + if (ReluFused) { + output.relu_(); + } return output; } diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index 9243fe2440173..86fe3509ab4fb 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -2606,7 +2606,6 @@ class TestDynamicQuantizedLinear(TestCase): def test_qlinear(self, batch_size, input_channels, output_channels, use_bias, use_relu, use_multi_dim_input, use_channelwise, reduce_range): if torch.backends.quantized.engine == 'qnnpack': - use_relu = False reduce_range = False qlinear_prepack = torch.ops.quantized.linear_prepack From cec44aa574e06e8aa1096b62a7c6d7c4dda8a3f5 Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Thu, 26 Aug 2021 21:05:56 -0700 Subject: [PATCH 284/530] [quant] Add op support for linear_relu_dynamic_fp16 (#63824) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63824 Add a fused operator implementation that will work with the quantization fusion APIs. Once FBGEMM FP16 kernel supports relu fusion natively we can remove the addition from the PT operator. Test Plan: python test/test_quantization.py Imported from OSS Reviewed By: heitorschueroff Differential Revision: D30503514 fbshipit-source-id: 6bf3bd53f47ffaa3f1d178eaad8cc980a7f5258a --- .../native/quantized/cpu/qlinear_dynamic.cpp | 11 +++++-- aten/src/ATen/native/quantized/library.cpp | 1 + test/quantization/core/test_quantized_op.py | 32 +++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp index 23c6158889db2..3331a0387111c 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -451,8 +451,14 @@ class QLinearDynamicFp16 final { TORCH_CHECK( fbgemm::fbgemmSupportedCPU(), "Your CPU doesn't support FBGEMM."); - TORCH_INTERNAL_ASSERT(!ReluFused); - return packed_weight->apply_dynamic(std::move(input)); + auto output = packed_weight->apply_dynamic(std::move(input)); + + // Call the relu operator here until fp16 linear dynamic in FBGEMM + // supports it natively. + if (ReluFused) { + output.relu_(); + } + return output; } #else // USE_FBGEMM static at::Tensor run( @@ -471,6 +477,7 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::linear_dynamic"), TORCH_FN(QLinearDynamicInt8::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu_dynamic"), TORCH_FN(QLinearDynamicInt8::run)); m.impl(TORCH_SELECTIVE_NAME("quantized::linear_dynamic_fp16"), TORCH_FN(QLinearDynamicFp16::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu_dynamic_fp16"), TORCH_FN(QLinearDynamicFp16::run)); } TORCH_LIBRARY_IMPL(_quantized, CPU, m) { diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index 8ead74f326ff2..3dcf75b1ccb32 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -142,6 +142,7 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack")); diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index 86fe3509ab4fb..49b7c96847612 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -2782,6 +2782,38 @@ def test_qlinear_legacy(self, batch_size, input_channels, output_channels): self.assertEqual(Y_fp32, Y_fp32_ref, msg="torch.ops.quantized.fbgemm_linear_dynamic results are off") + @skipIfNoFBGEMM + def test_qlinear_dynamic_fp16(self): + + options = itertools.product( + (2, 4), # batch_size + (4, 5, 12), # input_channels + (4, 7, 8), # output_channels + (True, False), # use_bias + (True, False), # use_relu + ) + for batch_size, input_channels, output_channels, use_bias, use_relu in options: + qlinear_prepack = torch.ops.quantized.linear_prepack_fp16 + if use_relu: + qlinear_dynamic = torch.ops.quantized.linear_relu_dynamic_fp16 + else: + qlinear_dynamic = torch.ops.quantized.linear_dynamic_fp16 + + x = torch.randn(batch_size, input_channels) + w = torch.randn(output_channels, input_channels) + bias = torch.randn(output_channels) if use_bias else None + + w_packed = qlinear_prepack(w, bias) + out = qlinear_dynamic(x, w_packed) + + # qlinear_dynamic_fp16 uses FP32 activation tensors and FP16 weight tensors + # output is FP32 + w_fp16 = w.to(torch.float16).to(torch.float32) + ref = F.linear(x, w_fp16, bias) + if use_relu: + ref.relu_() + + self.assertEqual(out, ref) class TestDynamicQuantizedRNNOp(TestCase): """Tests the correctness of the dynamic quantized lstm/gru.""" From 294db0603fef315c8f6ac95e30f8ce6b5cce2b5a Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Thu, 26 Aug 2021 21:05:56 -0700 Subject: [PATCH 285/530] [quant] Add support for linear_relu fusion for FP16 dynamic quant (#63826) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63826 Support the conversion of the intrinsic linearRelu module to the quantized dynamic LinearReLU module Verify the support works for both linear module and functional linear fusion Test Plan: python test/test_quantization.py test_dynamic_with_fusion Imported from OSS Reviewed By: iramazanli Differential Revision: D30503513 fbshipit-source-id: 70446797e9670dfef7341cba2047183d6f88b70f --- test/quantization/fx/test_quantize_fx.py | 33 ++++++++++++------- .../quantized/dynamic/modules/linear_relu.py | 7 ++-- .../quantization/fx/quantization_patterns.py | 9 ++--- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index cdf2e7bea4328..762919eeb04ea 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -2922,18 +2922,24 @@ def forward(self, x): return x model = M().eval() - qconfig = { - "": default_dynamic_qconfig, + + dynamic_quantized_ops = { + float16_dynamic_qconfig: torch.ops.quantized.linear_relu_dynamic_fp16, + default_dynamic_qconfig: torch.ops.quantized.linear_relu_dynamic } - m = prepare_fx(model, qconfig) - m = convert_fx(m) - m(torch.rand(5, 5)) - node_list = [ - ns.call_module(nniqd.LinearReLU), - ns.call_module(nniqd.LinearReLU), - ns.call_function(torch.ops.quantized.linear_relu_dynamic), - ] - self.checkGraphModuleNodes(m, expected_node_list=node_list) + for config in [float16_dynamic_qconfig, default_dynamic_qconfig]: + qconfig = { + "": config + } + m = prepare_fx(model, qconfig) + m = convert_fx(m) + m(torch.rand(5, 5)) + node_list = [ + ns.call_module(nniqd.LinearReLU), + ns.call_module(nniqd.LinearReLU), + ns.call_function(dynamic_quantized_ops[config]), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): @@ -3089,7 +3095,10 @@ def forward(self, x): if is_reference: qlinear_fun = ns.call_function(torch.nn.functional.linear) else: - qlinear_fun = ns.call_function(torch.ops.quantized.linear_dynamic_fp16) + if has_relu: + qlinear_fun = ns.call_function(torch.ops.quantized.linear_relu_dynamic_fp16) + else: + qlinear_fun = ns.call_function(torch.ops.quantized.linear_dynamic_fp16) prepare_node_occurrence = { # weight ns.call_module(torch.quantization.PlaceholderObserver): 1 diff --git a/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py b/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py index 04c4c954810fd..c30b3109ef601 100644 --- a/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py +++ b/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py @@ -31,10 +31,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # TODO check if we should set reduce_rage = True by default here Y = torch.ops.quantized.linear_relu_dynamic( x, self._packed_params._packed_params, reduce_range=True) - # TODO Support this in a later PR - # elif self._packed_params.dtype == torch.float16: - # Y = torch.ops.quantized.linear_relu_dynamic_fp16( - # x, self._packed_params._packed_params) + elif self._packed_params.dtype == torch.float16: + Y = torch.ops.quantized.linear_relu_dynamic_fp16( + x, self._packed_params._packed_params) else: raise RuntimeError('Unsupported dtype on dynamic quantized linear relu!') return Y.to(x.dtype) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index b7c39ca92cead..6362961ad8daa 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -1027,8 +1027,11 @@ def convert(self, qlinear_op = torch.ops.quantized.linear_relu_dynamic else: qlinear_op = torch.ops.quantized.linear_dynamic - else: # TODO add support for fp16 + relu fusion in a later PR - qlinear_op = torch.ops.quantized.linear_dynamic_fp16 + else: + if self.relu_node: + qlinear_op = torch.ops.quantized.linear_relu_dynamic_fp16 + else: + qlinear_op = torch.ops.quantized.linear_dynamic_fp16 linear_input = load_arg(quantized=torch.float)(self.linear_node.args[0]) qlinear_args = (linear_input, packed_weight) # type: ignore[assignment] @@ -1038,8 +1041,6 @@ def convert(self, # TODO: may need to change the key to Node regenerate the map in each transformation, # since we might not be able to rely on the name node_name_to_scope[op_out.name] = node_name_to_scope[self.linear_node.name] - if self.relu_node and weight_dtype is not torch.qint8: - op_out = quantized_graph.create_node("call_function", torch.nn.functional.relu, (op_out,), {}) return op_out else: assert dtypes == (torch.float16, torch.float16, None) From 3f1c8094707f695cf0cf51c795b18093e0a3ab86 Mon Sep 17 00:00:00 2001 From: Ansha Yu Date: Thu, 26 Aug 2021 23:17:42 -0700 Subject: [PATCH 286/530] [static runtime] port c2 argmin kernel (#63632) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63632 Local benchmarking with 1 input repeated 10k iter on 290331537_4 local net. Reduces argmin runtime by about 80% and and local net execution by about ~0.71-0.77ms. Before: ``` I0826 17:25:53.972786 1104614 PyTorchPredictorBenchLib.cpp:313] PyTorch run finished. Milliseconds per iter: 7.37599. Iters per second: 135.57 ``` ``` Static runtime ms per iter: 8.22086. Iters per second: 121.642 Time per node type: 4.13527 ms. 50.9157%. fb::sigrid_transforms_torch_bind (1 nodes, out variant) 0.868506 ms. 10.6935%. aten::argmin (1 nodes, out variant) ... ``` After: ``` I0826 17:17:54.165174 1064079 PyTorchPredictorBenchLib.cpp:313] PyTorch run finished. Milliseconds per iter: 6.66724. Iters per second: 149.987 ``` ``` Static runtime ms per iter: 7.68172. Iters per second: 130.179 Time per node type: 4.1452 ms. 54.0612%. fb::sigrid_transforms_torch_bind (1 nodes, out variant) 0.656778 ms. 8.56562%. fb::quantized_linear (8 nodes) 0.488229 ms. 6.36741%. static_runtime::to_copy (827 nodes, out variant) 0.372678 ms. 4.86042%. aten::argmin (1 nodes, out variant) ...Time per node type: 3.39387 ms. 53.5467%. fb::sigrid_transforms_torch_bind (1 nodes, out variant) 0.636216 ms. 10.0379%. fb::quantized_linear (8 nodes, out variant) 0.410535 ms. 6.47721%. fb::clip_ranges_to_gather_to_offsets (304 nodes, out variant) 0.212721 ms. 3.3562%. fb::clip_ranges_gather_sigrid_hash_precompute_v3 (157 nodes, out variant) 0.173736 ms. 2.74111%. aten::matmul (1 nodes, out variant) 0.150514 ms. 2.37474%. aten::argmin (1 nodes, out variant) ``` P447422384 Test Plan: Test with local replayer sending traffic to `ansha_perf_test_0819.test`, and compare outputs to jit interpreter. Start compute tier: ``` RUN_UUID=ansha_perf_test_0819.test.storage JOB_EXPIRE_TIME=864000 MODEL_ID=290331537_4 PREDICTOR_TAG= PREDICTOR_VERSION=405 PREDICTOR_TYPE=CPU ADDITIONAL_FLAGS="--enable_disagg_file_split=true --enable_adx=false --load_remote_file_locally=true --pytorch_predictor_static_runtime_whitelist_by_id=290331537" GFLAGS_CONFIG_PATH=sigrid/predictor/gflags/predictor_gflags_ads_perf_cpu_pyper SMC_TIER_NAME=sigrid.predictor.perf.ansha_per_test_0819.test.storage CLUSTER=tsp_rva ENTITLEMENT_NAME=ads_ranking_infra_test_t6 PREDICTOR_LOCAL_DIRECTORY= ICET_CONFIG_PATH= NNPI_COMPILATION_CONFIG_FILE= NUM_TASKS=1 NNPI_NUM_WORKERS=0 tw job start /data/users/ansha/fbsource/fbcode/tupperware/config/admarket/sigrid/predictor/predictor_perf_canary.tw ``` Start nnpi tier: ``` RUN_UUID=ansha_perf_test_0819.test JOB_EXPIRE_TIME=247200 MODEL_ID=290331537_4 PREDICTOR_TAG= PREDICTOR_VERSION=343 PREDICTOR_TYPE=NNPI_TWSHARED ADDITIONAL_FLAGS="--torch_glow_min_fusion_group_size=30 --pytorch_storage_tier_replayer_sr_connection_options=overall_timeout:1000000,processing_timeout:1000000 --predictor_storage_smc_tier=sigrid.predictor.perf.ansha_perf_test_0819.test.storage --pytorch_predictor_static_runtime_whitelist_by_id=290331537" GFLAGS_CONFIG_PATH=sigrid/predictor/gflags/predictor_gflags_ads_perf_glow_nnpi_pyper_v1 SMC_TIER_NAME=sigrid.predictor.perf.ansha_perf_test_0819.test CLUSTER=tsp_rva ENTITLEMENT_NAME=ads_ranking_infra_test_t17 PREDICTOR_LOCAL_DIRECTORY= ICET_CONFIG_PATH= NNPI_COMPILATION_CONFIG_FILE= NUM_TASKS=1 NNPI_NUM_WORKERS=0 tw job start /data/users/ansha/fbsource/fbcode/tupperware/config/admarket/sigrid/predictor/predictor_perf_canary.tw ``` ```buck test caffe2/benchmarks/static_runtime:static_runtime_cpptest -- StaticRuntime.IndividualOps_Argmin --print-passing-details``` Compared outputs to jit interpreter to check for no differences greater than 1e-3 (with nnc on) https://www.internalfb.com/intern/diff/view-version/136824794/ Reviewed By: hlu1 Differential Revision: D30445635 fbshipit-source-id: 048de8867ac72f764132295d1ebfa843cde2fa27 --- torch/csrc/jit/runtime/static/ops.cpp | 94 ++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 4d34ed9388364..484c4b03ad64c 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -178,6 +179,94 @@ Tensor& linear_out( return output; } +Tensor& c2_argmin_out( + Tensor& output, + const Tensor& input, + const int64_t dim, + const bool keepdim) { + const auto ndim = input.dim(); + int64_t dim_ = maybe_wrap_dim(dim, ndim); + TORCH_CHECK(dim_ >= 0 && dim_ < ndim); + + const auto in_dims = input.sizes(); + + c10::SmallVector out_dims; + out_dims.reserve(ndim); + int prev_size = 1; + int next_size = 1; + for (int i = 0; i < dim_; ++i) { + out_dims.push_back(in_dims[i]); + prev_size *= in_dims[i]; + } + if (keepdim) { + out_dims.push_back(1); + } + for (auto i = dim_ + 1; i < ndim; ++i) { + out_dims.push_back(in_dims[i]); + next_size *= in_dims[i]; + } + at::native::resize_(output, out_dims, c10::nullopt); + + const auto n = in_dims[dim_]; + + if (next_size == 1) { + AT_DISPATCH_ALL_TYPES_AND2( + kHalf, kBFloat16, input.scalar_type(), "argmin_input", [&]() { + const auto in_ptr = input.data_ptr(); + const auto out_ptr = output.data_ptr(); + // input is a [prev_size, n] tensor. + // output is a [prev_size,] tensor. + // Thus, access is contiguous/coalesced. + for (int i = 0; i < prev_size; ++i) { + auto v = std::min_element( + in_ptr + i * n, + in_ptr + (i + 1) * n, + [](scalar_t a, scalar_t b) { + // if a is nan, then a is *less* than b with LessOrNan + // semantics + if (at::_isnan(a)) { + return true; + } + // if a is not nan and b is nan, then a is not less than b + // with LessOrNan semantics otherwise, act normally. If `b` is + // NaN then a < b will always return false, so this is + // equivalent to the first snippet. + return a < b; + }); + out_ptr[i] = std::distance(in_ptr + i * n, v); + } + }); + } else { + AT_DISPATCH_ALL_TYPES_AND2( + kHalf, kBFloat16, input.scalar_type(), "argmin_input", [&]() { + const auto less_or_nan = native::detail::LessOrNan{}; + + const auto in_ptr = input.data_ptr(); + const auto out_ptr = output.data_ptr(); + + std::memset(out_ptr, 0, prev_size * next_size * sizeof(int64_t)); + + for (int i = 0; i < prev_size; ++i) { + const scalar_t* cur_in_ptr = in_ptr + i * n * next_size + next_size; + for (int k = 1; k < n; ++k) { + for (int j = 0; j < next_size; ++j) { + int64_t* cur_out_ptr = out_ptr + i * next_size + j; + if (less_or_nan( + *cur_in_ptr, + in_ptr + [i * n * next_size + *cur_out_ptr * next_size + j], + *cur_out_ptr, + k)) { + *cur_out_ptr = k; + } + ++cur_in_ptr; + } + } + } + }); + } + return output; +} } // namespace native } // namespace at @@ -1209,6 +1298,10 @@ REGISTER_OPERATOR_FUNCTOR(aten::argmin, aten_argmin, [](Node* n) -> SROperator { } else { auto& out_t = p_node->Output(0).toTensor(); fastResizeToZero(out_t); + if (in0_t.is_contiguous() && dim.has_value()) { + at::native::c2_argmin_out(out_t, in0_t, dim.value(), keepdim); + return; + } at::cpu::argmin_out(out_t, in0_t, dim, keepdim); } }; @@ -1533,6 +1626,5 @@ REGISTER_OPERATOR_FUNCTOR( } }; }); - } // namespace jit } // namespace torch From 3c3bba4169067a7340ff1d786a6b61282cf26820 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Fri, 27 Aug 2021 01:39:14 -0700 Subject: [PATCH 287/530] [Static Runtime] Use F14FastMap/F14FastSet (#63999) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63999 Use folly::F14FastMap/F14FastSet instead of std::unordered_map/unordered_set in the Static Runtime code base. folly::F14FastMap/F14FastSet implements the same APIs as std::unordered_map/unordered_set but faster. For details see https://github.com/facebook/folly/blob/master/folly/container/F14.md Reviewed By: d1jang Differential Revision: D30566149 fbshipit-source-id: 20a7fa2519e4dde96fb3fc61ef6c92bf6d759383 --- torch/csrc/jit/runtime/static/impl.cpp | 83 ++++++++++---------- torch/csrc/jit/runtime/static/impl.h | 33 ++++++-- torch/csrc/jit/runtime/static/ops.cpp | 3 +- torch/csrc/jit/runtime/static/te_wrapper.cpp | 5 +- 4 files changed, 72 insertions(+), 52 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index cb9342b364cc5..b3e1eb116dc7d 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -104,8 +104,8 @@ bool mayContainAlias(AliasDb& db, const Value* a, const Value* b) { bool mayContainAlias( AliasDb& db, - const std::unordered_set& a, - const std::unordered_set& b) { + const FastSet& a, + const FastSet& b) { std::vector as; std::vector bs; as.reserve(a.size()); @@ -122,11 +122,11 @@ bool mayContainAlias( } // Get set of all inputs/outputs/constants (always alive) and their aliases -std::unordered_set GetAlwaysAliveValues( +FastSet GetAlwaysAliveValues( const std::shared_ptr& graph, AliasDb& db) { // a set of Values whose live-range exceed current inference - std::unordered_set always_alive; + FastSet always_alive; // mark inputs, constants, outputs as always_alive for (const auto* input : graph->inputs()) { @@ -148,7 +148,7 @@ std::unordered_set GetAlwaysAliveValues( // constants are already in the always_alive set if (node->kind() != prim::Constant) { for (const auto* v : node->outputs()) { - if (mayContainAlias(db, ValueSet{v}, always_alive)) { + if (mayContainAlias(db, {v}, always_alive)) { always_alive.insert(v); } } @@ -158,22 +158,22 @@ std::unordered_set GetAlwaysAliveValues( } // Map each value to all values that are alive at the same time. -using LivenessMap = std::unordered_map>; +using LivenessMap = FastMap>; // The algorithm does a traversal of the execution graph // while keeping track of the live values. LivenessMap GetLivenessMap( const std::shared_ptr& graph, - const std::unordered_set& always_alive, + const FastSet& always_alive, AliasDb& db) { // map a Value to a set of Values that overlap live-ranges with the Value's - std::unordered_map> liveness_map; + FastMap> liveness_map; // map Values to its creation order in graph (Note: only traverse top-level // nodes such that nodes under control-flows are represented by top-level // block nodes) std::vector values_in_creation_order; - std::unordered_map values_to_idx_in_creation_order; + FastMap values_to_idx_in_creation_order; for (const auto* node : graph->nodes()) { for (const auto* v : node->outputs()) { values_to_idx_in_creation_order[v] = values_in_creation_order.size(); @@ -184,10 +184,10 @@ LivenessMap GetLivenessMap( // presence of a Value in live_values_use_chain means the Value alive // Value mapped to set of Nodes that may use the Value (i.e., use-chain of // Value) - std::unordered_map> live_values_use_chain; + FastMap> live_values_use_chain; // Node mapped to set of Values that the Node may use (i.e., def-chain of node // inputs) - std::unordered_map> live_nodes_def_chain; + FastMap> live_nodes_def_chain; // add v to the current liveness_map std::function add_live_value_fn = [&](const Value* v) { @@ -320,12 +320,12 @@ LivenessMap GetLivenessMap( std::pair, std::vector> GetMemoryPlanningCandidates(const std::shared_ptr& graph) { // for determinism - std::unordered_set seen_values; + FastSet seen_values; std::vector all_values; - std::unordered_set can_reuse; + FastSet can_reuse; // values used by unsupported ops (as either inputs or outputs) // these need to be removed from "can_reuse" after analyzing all nodes - std::unordered_set cannot_reuse; + FastSet cannot_reuse; for (auto* n : graph->nodes()) { bool can_reuse_inputs_outputs = canReuseInputsOutputs(n); for (const auto* v : n->inputs()) { @@ -388,10 +388,9 @@ GetMemoryPlanningCandidates(const std::shared_ptr& graph) { // // NB: This is a deterministic implementation, which makes it easier to tune // and debug. -std::unordered_map> -GenerateSameStorageValues( +FastMap> GenerateSameStorageValues( const LivenessMap& alive_during, - const std::unordered_set& always_alive, + const FastSet& always_alive, const std::pair, std::vector>& optimizable, AliasDb& db) { @@ -399,8 +398,7 @@ GenerateSameStorageValues( const auto& all_values = optimizable.second; // map Value* to a set Value* that can share the same storage with it - std::unordered_map> - same_storage_values; + FastMap> same_storage_values; // make new_v and old_v map to the same storage (i.e., add to each other's // same_storage_values set) @@ -589,9 +587,9 @@ StaticModule::StaticModule( } // map Value* to IValue (from inputs or prim::Constant) or null - std::unordered_map value_to_ivalue; + FastMap value_to_ivalue; // map Value* to its SSA definition IR - std::unordered_map value_to_ssa_def; + FastMap value_to_ssa_def; // N inputs map to the first N entries in storage for (const auto i : c10::irange(graph_->inputs().size())) { @@ -1165,8 +1163,7 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) { TORCH_CHECK(inputs_[i].isNone(), "Input ", i, " was not cleaned up"); } - std::unordered_set output_ivalues( - outputs_.begin(), outputs_.end()); + FastSet output_ivalues(outputs_.begin(), outputs_.end()); for (const auto n : c10::irange(nodes_.size())) { auto& pnode = nodes_[n]; for (const auto i : c10::irange(pnode.outputs().size())) { @@ -1202,13 +1199,13 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) { static void assign_storage_to_managed_tensors( StaticRuntime* runtime, - const std::unordered_set& managed_tensor_values, - const std::unordered_map>& + const FastSet& managed_tensor_values, + const FastMap>& value_to_same_storage_values, std::vector>>& managed_tensors) { // map Value to index to managed_storage, where multiple values can // map to the same index (i.e., sharing the same storage) - std::unordered_map value_to_storage_idx; + FastMap value_to_storage_idx; // Snapshot of the current memory state for (auto& pnode : runtime->nodes()) { @@ -1218,19 +1215,21 @@ static void assign_storage_to_managed_tensors( if (managed_tensor_values.count(val)) { TORCH_CHECK(ival.isTensor()); at::Tensor* tensor = &ival.toTensor(); - - if (value_to_storage_idx.count(val)) { - managed_tensors[value_to_storage_idx[val]].second.emplace_back( - tensor); + auto f = value_to_storage_idx.find(val); + if (f != value_to_storage_idx.end()) { + auto storage_idx = f->second; + managed_tensors[storage_idx].second.emplace_back(tensor); } else { auto p = std::make_pair>(0, {tensor}); managed_tensors.emplace_back(std::move(p)); // first of a group, update the value_to_storage_idx map with the // index - if (value_to_same_storage_values.count(val)) { + auto f = value_to_same_storage_values.find(val); + if (f != value_to_same_storage_values.end()) { auto storage_idx = managed_tensors.size() - 1; - for (const auto* v : value_to_same_storage_values.at(val)) { + const auto& same_storage_values = f->second; + for (const auto* v : same_storage_values) { value_to_storage_idx[v] = storage_idx; } } @@ -1242,14 +1241,14 @@ static void assign_storage_to_managed_tensors( MemoryPlanner::MemoryPlanner( StaticRuntime* runtime, - const std::unordered_map>& + const FastMap>& value_to_same_storage_values, - const std::unordered_set& external_values, + const FastSet& external_values, bool enable_out_variant, bool manage_graph_output_memory) { // collect register indices of outputs of ops with out variant - std::unordered_set managed_tensor_values; - std::unordered_set leaked_values; + FastSet managed_tensor_values; + FastSet leaked_values; if (enable_out_variant) { for (ProcessedNode& pnode : runtime->nodes()) { if (pnode.has_out_variant()) { @@ -1260,7 +1259,7 @@ MemoryPlanner::MemoryPlanner( } // Types are stored in the underlying TorchScript IR const auto& type = out_v->type(); - if (type->cast()) { + if (type->castRaw()) { managed_tensor_values.insert(out_v); } else if (isOptimizableContainerType(pnode.node())) { // We "leak" certain container types because their allocations take @@ -1273,7 +1272,7 @@ MemoryPlanner::MemoryPlanner( } // collect unmanaged output ivalues - std::unordered_set unmanaged_ivalues; + FastSet unmanaged_ivalues; for (ProcessedNode& pnode : runtime->nodes()) { for (const auto i : c10::irange(pnode.outputs().size())) { // Types are stored in the underlying TorchScript IR @@ -1295,9 +1294,11 @@ MemoryPlanner::MemoryPlanner( } // copy to unmanaged_ivalues_ - for (IValue* out : unmanaged_ivalues) { - unmanaged_ivalues_.emplace_back(out); - } + unmanaged_ivalues_.reserve(unmanaged_ivalues.size()); + unmanaged_ivalues_.insert( + unmanaged_ivalues_.begin(), + unmanaged_ivalues.begin(), + unmanaged_ivalues.end()); if (enable_out_variant) { ::torch::jit::assign_storage_to_managed_tensors( diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index b16cfefbc0b60..6cff047b4d2ce 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -9,9 +9,26 @@ #include #include +#ifdef FBCODE_CAFFE2 +#include +#include +#endif + namespace torch { namespace jit { +#ifdef FBCODE_CAFFE2 +template +using FastMap = folly::F14FastMap; +template +using FastSet = folly::F14FastSet; +#else +template +using FastMap = std::unordered_map; +template +using FastSet = std::unordered_set; +#endif + TORCH_API bool canEnableStaticRuntime( const std::shared_ptr& graph); @@ -127,7 +144,7 @@ class TORCH_API StaticModule { size_t num_inputs() const; size_t num_outputs() const; - const std::unordered_map>& index_map() const { + const FastMap>& index_map() const { return node_inputs_ssa_def_map_; } @@ -147,12 +164,12 @@ class TORCH_API StaticModule { return schema_; } - const std::unordered_map>& + const FastMap>& values_share_same_storage() const { return value_to_same_storage_values_; } - const std::unordered_set& external_values() const { + const FastSet& external_values() const { return external_values_; } @@ -178,14 +195,14 @@ class TORCH_API StaticModule { // a vector of ssa_defs corresponding to graph->outputs() std::vector output_ssa_defs_; // map a node idx (in graph order) to a vector of ssa_defs for node inputs - std::unordered_map> node_inputs_ssa_def_map_; + FastMap> node_inputs_ssa_def_map_; // Bookkeeping for MemoryPlanner in StaticRuntime // values whose live-time exceeds that of running one inference (e.g., input, // output, prim::Constants, and their aliases) - std::unordered_set external_values_; + FastSet external_values_; // map a value to the set of values that may share the same storage with it - std::unordered_map> + FastMap> value_to_same_storage_values_; }; @@ -323,8 +340,8 @@ class MemoryPlanner { public: explicit MemoryPlanner( StaticRuntime* runtime, - const std::unordered_map>&, - const std::unordered_set& external_values, + const FastMap>&, + const FastSet& external_values, bool enable_out_variant, bool manage_graph_output_memory); // disable copying and moving diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 484c4b03ad64c..54c04566a6eaf 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -288,7 +289,7 @@ bool disableUnsafeMathOp(const char* op_name) { // not guarantee bit exactness vs the jit interpreter. Note aten::relu is not // included even though it uses NNC because the results of relu should always // match. - static const std::unordered_set fast_ops{ + static const FastSet fast_ops{ "aten::add", "aten::tanh", "aten::sigmoid", "aten::logit"}; return fast_ops.count(op_name) > 0; } diff --git a/torch/csrc/jit/runtime/static/te_wrapper.cpp b/torch/csrc/jit/runtime/static/te_wrapper.cpp index d8b494c9d4a23..acd1fb758da0a 100644 --- a/torch/csrc/jit/runtime/static/te_wrapper.cpp +++ b/torch/csrc/jit/runtime/static/te_wrapper.cpp @@ -2,6 +2,7 @@ #include #include +#include namespace torch { namespace jit { @@ -79,8 +80,8 @@ std::mutex& getNNCCacheMutex() { return nncCacheMutex; } -std::unordered_map>& getNNCCache() { - static std::unordered_map> nncCache; +FastMap>& getNNCCache() { + static FastMap> nncCache; return nncCache; } From c90b3cb1dabe712aa07e082b3735f1f2a9134c9b Mon Sep 17 00:00:00 2001 From: Don Jang Date: Fri, 27 Aug 2021 02:43:22 -0700 Subject: [PATCH 288/530] [Static Runtime] Manage temporary Tensors for aten::layer_norm (#64078) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64078 This change converts `aten::layer_norm -> output Tensor` to `static_runtime::layer_norm -> (output Tensor, temp1 Tensor, tmp2 Tensor)` to manage `tmp1` and `tmp2` Tensors by the static runtime. Currently the out-variant of `aten::layer_norm` creates two temporary Tensors inside it: ``` at::Tensor mean = create_empty_from({M}, *X); at::Tensor rstd = create_empty_from({M}, *X); ``` that the static runtime misses an opportunity to manage. This change puts them into (unused) output Tensors of a new placeholder op `static_runtime::layer_norm` so that the static runtime can mange them since the static runtime as of now chooses to manage only output tensors. Test Plan: - Enhanced `StaticRuntime.LayerNorm` to ensure that `static_runtime::layer_norm` gets activated. - Confirmed that the new op gets activated during testing: ``` V0825 12:51:50.017890 2265227 impl.cpp:1396] Switch to out variant for node: %8 : Tensor, %9 : Tensor, %10 : Tensor = static_runtime::layer_norm(%input.1, %normalized_shape.1, %4, %4, %5, %3) ``` Reviewed By: hlu1 Differential Revision: D30486475 fbshipit-source-id: 5121c44ab58c2d8a954aa0bbd9dfeb7468347a2d --- .../static_runtime/test_static_runtime.cc | 7 ++ torch/csrc/jit/runtime/static/impl.cpp | 1 + torch/csrc/jit/runtime/static/ops.cpp | 117 +++++++++++------- torch/csrc/jit/runtime/static/passes.cpp | 33 +++++ torch/csrc/jit/runtime/static/passes.h | 3 + 5 files changed, 113 insertions(+), 48 deletions(-) diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 4441b7d043db9..0d4202464bc61 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -209,6 +209,13 @@ TEST(StaticRuntime, EmbeddingBag) { } TEST(StaticRuntime, LayerNorm) { +#ifdef FBCODE_CAFFE2 + script::Module module("module"); + module.define(layer_norm_with_weights); + torch::jit::StaticModule smodule(module); + ASSERT_EQ(getNodeWithKind(smodule, "aten::layer_norm"), nullptr); + ASSERT_NE(getNodeWithKind(smodule, "static_runtime::layer_norm"), nullptr); +#endif const auto a = torch::rand({1, 2, 2, 2}); const auto b = torch::rand({3, 2, 2, 2}); for (int normalized_size : {2, 3}) { diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index b3e1eb116dc7d..643842a74691c 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -74,6 +74,7 @@ void OptimizeGraph( if (opts.enable_out_variant) { FuseListUnpack(graph); ReplaceWithCopy(graph); + EnableStaticRuntimeLayerNorm(graph); } #endif ConstantPropagation(graph); diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 54c04566a6eaf..7e78b77246f3b 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -1308,55 +1308,76 @@ REGISTER_OPERATOR_FUNCTOR(aten::argmin, aten_argmin, [](Node* n) -> SROperator { }; }); -REGISTER_OPERATOR_FUNCTOR(aten::layer_norm, aten_layer_norm, [](Node* n) -> SROperator { - if (!n->matches(torch::schema( - "aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor"))) { - LogAndDumpSchema(n); - return nullptr; - } - return [](ProcessedNode* p_node) { - // ignore Input(5): `bool cudnn_enable=True` - const auto& input = p_node->Input(0).toTensor(); - const auto normalized_shape = p_node->Input(1).toIntVector(); - auto weight_opt = p_node->Input(2).toOptional(); - auto bias_opt = p_node->Input(3).toOptional(); - float eps = p_node->Input(4).toDouble(); - - c10::MaybeOwned weight_maybe_owned = - at::borrow_from_optional_tensor(weight_opt); - const at::Tensor& weight = *weight_maybe_owned; - c10::MaybeOwned bias_maybe_owned = - at::borrow_from_optional_tensor(bias_opt); - const at::Tensor& bias = *bias_maybe_owned; - - auto M_N = at::native::_check_layer_norm_inputs( - input, normalized_shape, weight, bias); - auto M = M_N.first; - auto N = M_N.second; - auto X = input.expect_contiguous(); - auto gamma = weight.expect_contiguous(); - auto beta = bias.expect_contiguous(); - - if (p_node->Output(0).isNone()) { - p_node->Output(0) = at::native::empty_like( - *X, - c10::nullopt /* dtype */, - c10::nullopt /* layout */, - c10::nullopt /* device */, - c10::nullopt /* pin_memory */, - at::MemoryFormat::Contiguous); - } else { - at::native::resize_( - p_node->Output(0).toTensor(), X->sizes(), c10::nullopt); - } - at::Tensor& output = p_node->Output(0).toTensor(); - at::Tensor mean = create_empty_from({M}, *X); - at::Tensor rstd = create_empty_from({M}, *X); +REGISTER_OPERATOR_FUNCTOR( + static_runtime::layer_norm, + aten_layer_norm, + [](Node* n) -> SROperator { + if (!n->matches(torch::schema( + "static_runtime::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> (Tensor,Tensor,Tensor)"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + // ignore Input(5): `bool cudnn_enable=True` + const auto& input = p_node->Input(0).toTensor(); + const auto normalized_shape = p_node->Input(1).toIntVector(); + auto weight_opt = p_node->Input(2).toOptional(); + auto bias_opt = p_node->Input(3).toOptional(); + float eps = p_node->Input(4).toDouble(); + + c10::MaybeOwned weight_maybe_owned = + at::borrow_from_optional_tensor(weight_opt); + const at::Tensor& weight = *weight_maybe_owned; + c10::MaybeOwned bias_maybe_owned = + at::borrow_from_optional_tensor(bias_opt); + const at::Tensor& bias = *bias_maybe_owned; + + auto M_N = at::native::_check_layer_norm_inputs( + input, normalized_shape, weight, bias); + auto M = M_N.first; + auto N = M_N.second; + auto X = input.expect_contiguous(); + auto gamma = weight.expect_contiguous(); + auto beta = bias.expect_contiguous(); - at::native::layer_norm_cpu_out( - output, mean, rstd, input, normalized_shape, *gamma, *beta, eps, M, N); - }; -}); + if (p_node->Output(0).isNone()) { + p_node->Output(0) = at::native::empty_like( + *X, + c10::nullopt /* dtype */, + c10::nullopt /* layout */, + c10::nullopt /* device */, + c10::nullopt /* pin_memory */, + at::MemoryFormat::Contiguous); + } else { + at::native::resize_( + p_node->Output(0).toTensor(), X->sizes(), c10::nullopt); + } + if (p_node->Output(1).isNone()) { + p_node->Output(1) = create_empty_from({M}, *X); + } else { + at::native::resize_(p_node->Output(1).toTensor(), {M}, c10::nullopt); + } + if (p_node->Output(2).isNone()) { + p_node->Output(2) = create_empty_from({M}, *X); + } else { + at::native::resize_(p_node->Output(2).toTensor(), {M}, c10::nullopt); + } + at::Tensor& output = p_node->Output(0).toTensor(); + at::Tensor mean = p_node->Output(1).toTensor(); + at::Tensor rstd = p_node->Output(2).toTensor(); + at::native::layer_norm_cpu_out( + output, + mean, + rstd, + input, + normalized_shape, + *gamma, + *beta, + eps, + M, + N); + }; + }); REGISTER_OPERATOR_FUNCTOR(aten::norm, aten_norm, [](Node* n) -> SROperator { if (!n->matches(torch::schema( diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp index 1133e3924c32a..5099dc1ba6e2b 100644 --- a/torch/csrc/jit/runtime/static/passes.cpp +++ b/torch/csrc/jit/runtime/static/passes.cpp @@ -303,6 +303,9 @@ TORCH_LIBRARY_FRAGMENT(static_runtime, m) { "static_runtime::to_copy.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor"); m.def( "static_runtime::to_copy.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor"); + m.def(torch::schema( + "static_runtime::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> (Tensor, Tensor, Tensor)", + c10::AliasAnalysisKind::PURE_FUNCTION)); } bool HasInplaceOp(std::shared_ptr& graph, const AliasDb& alias_db) { @@ -469,5 +472,35 @@ void FuseListUnpack(std::shared_ptr& graph) { #endif } +void EnableStaticRuntimeLayerNorm(std::shared_ptr& graph) { + const c10::Symbol static_runtime_layer_norm_symbol = + c10::Symbol::fromQualString("static_runtime::layer_norm"); + auto nodes = graph->nodes(); + std::vector> replacement; + for (auto it = nodes.begin(); it != nodes.end(); ++it) { + Node* old_node = *it; + if (!old_node->matches(torch::schema( + "aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor"))) { + continue; + } + TORCH_CHECK(old_node->outputs().size() == 1); + auto* new_node = graph->create( + static_runtime_layer_norm_symbol, + /*layer_norm*/ 1 + /*mean*/ 1 + /*rst=*/1); + new_node->insertBefore(old_node); + for (auto* input : old_node->inputs()) { + new_node->addInput(input); + } + replacement.emplace_back(old_node, new_node); + } + for (const auto& p : replacement) { + auto* old_node = p.first; + auto* new_node = p.second; + new_node->output(0)->copyMetadata(old_node->output(0)); + old_node->output(0)->replaceAllUsesWith(new_node->output(0)); + old_node->destroy(); + } +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/runtime/static/passes.h b/torch/csrc/jit/runtime/static/passes.h index 11ab4bdc7c46a..a42bc97f19618 100644 --- a/torch/csrc/jit/runtime/static/passes.h +++ b/torch/csrc/jit/runtime/static/passes.h @@ -13,6 +13,9 @@ TORCH_API void ReplaceWithCopy( std::shared_ptr& graph, bool outputs_are_immutable = true); +TORCH_API void EnableStaticRuntimeLayerNorm( + std::shared_ptr& graph); + TORCH_API bool HasInplaceOp( std::shared_ptr& graph, const AliasDb& alias_db); From f2c47cf4dbbdd0cafc1bd2118121c6eda3947f3f Mon Sep 17 00:00:00 2001 From: Harut Movsisyan Date: Fri, 27 Aug 2021 03:03:32 -0700 Subject: [PATCH 289/530] [Static Runtime] Out version for fmod (#64046) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64046 Test Plan: Confirm out variant is used: ``` > //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1 V0826 23:31:30.321382 193428 impl.cpp:1395] Switch to out variant for node: %4 : Tensor = aten::fmod(%a.1, %b.1) ``` Reviewed By: mikeiovine Differential Revision: D30581228 fbshipit-source-id: dfab9a16ff8afd40b29338037769f938f154bf74 --- benchmarks/static_runtime/test_scripts.h | 10 +++++++ .../static_runtime/test_static_runtime.cc | 27 +++++++++++++++++++ torch/csrc/jit/runtime/static/ops.cpp | 25 +++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index ecdd491462f62..477b191b24156 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -762,3 +762,13 @@ const std::string quantize_script = R"IR( %1249: Tensor = aten::dequantize(%1254) return (%1249) )IR"; + +const auto fmod_tensor = R"JIT( + def forward(self, a: Tensor, b: Tensor): + return torch.fmod(a, b).clone() +)JIT"; + +const auto fmod_scalar = R"JIT( + def forward(self, a: Tensor, b: int): + return torch.fmod(a, b).clone() +)JIT"; diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 0d4202464bc61..bd213c78dac1a 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1230,3 +1230,30 @@ TEST(StaticRuntime, IndividualOps_VarStack) { testStaticRuntime(var_stack_script, args1, args2); } + +TEST(StaticRuntime, IndividualOps_FmodTensor) { + // fmod tensor version + auto a = at::randn({2, 3}); + auto b = at::randn({2, 3}); + std::vector args0{a, b}; + testStaticRuntime(fmod_tensor, args0); + + // check for dynamic shapes + auto c = at::randn({4, 3, 2}); + auto d = at::randn({4, 3, 2}); + std::vector args1{c, d}; + testStaticRuntime(fmod_tensor, args0, args1); +} + +TEST(StaticRuntime, IndividualOps_FmodScalar) { + auto a = at::randn({2, 3}); + + // fmod scalar version + std::vector args2{a, 3}; + testStaticRuntime(fmod_scalar, args2); + + // check for dynamic shapes + auto c = at::randn({4, 3, 2}); + std::vector args3{c, 4}; + testStaticRuntime(fmod_scalar, args2, args3); +} diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 7e78b77246f3b..36f796fb2f256 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -1611,6 +1611,31 @@ REGISTER_OPERATOR_FUNCTOR(aten::linear, aten_linear, [](Node* n) -> SROperator { }; }); +REGISTER_OPERATOR_FUNCTOR(aten::fmod, aten_fmod, [](Node* n) -> SROperator { + if (!n->matches(torch::schema( + "aten::fmod.Scalar(Tensor self, Scalar other) -> Tensor")) && + !n->matches(torch::schema( + "aten::fmod.Tensor(Tensor self, Tensor other) -> Tensor"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto& in0_t = p_node->Input(0).toTensor(); + const auto& in1_t = p_node->Input(1).isTensor() + ? p_node->Input(1).toTensor() + : at::native::wrapped_scalar_tensor(p_node->Input(1).toScalar()); + + if (p_node->Output(0).isNone()) { + p_node->Output(0) = at::cpu::fmod(in0_t, in1_t); + } else { + auto& out_t = p_node->Output(0).toTensor(); + fastResizeToZero(out_t); + + at::cpu::fmod_out(out_t, in0_t, in1_t); + } + }; +}); + namespace { void check_cat_no_zero_dim(const std::vector& tensors) { From ad8eddbd808a97ac518ffd5b51d2c925803a1a3f Mon Sep 17 00:00:00 2001 From: gmagogsfm Date: Fri, 27 Aug 2021 08:49:54 -0700 Subject: [PATCH 290/530] More robust check of whether a class is defined in torch (#64083) Summary: This would prevent bugs for classes that 1) Is defined in a module that happens to start with `torch`, say `torchvision` 2) Is defined in torch but with an import alias like `import torch as th` Pull Request resolved: https://github.com/pytorch/pytorch/pull/64083 Reviewed By: soulitzer Differential Revision: D30598369 Pulled By: gmagogsfm fbshipit-source-id: 9d3a7135737b2339c9bd32195e4e69a9c07549d4 --- torch/jit/_monkeytype_config.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/torch/jit/_monkeytype_config.py b/torch/jit/_monkeytype_config.py index f0e4613e82fd1..9957541ff25d1 100644 --- a/torch/jit/_monkeytype_config.py +++ b/torch/jit/_monkeytype_config.py @@ -1,6 +1,10 @@ + +import torch + import inspect import typing import pathlib +import sys from typing import Optional, Iterable, List, Dict from collections import defaultdict from types import CodeType @@ -15,6 +19,18 @@ except ImportError: _IS_MONKEYTYPE_INSTALLED = False +# Checks whether a class is defind in `torch.*` modules +def is_torch_native_class(cls): + if not hasattr(cls, '__module__'): + return False + + parent_modules = cls.__module__.split('.') + if not parent_modules: + return False + + root_module = sys.modules.get(parent_modules[0]) + return root_module is torch + def get_type(type): """ Helper function which converts the given type to a torchScript acceptable format. @@ -28,7 +44,7 @@ def get_type(type): # typing.List is not accepted by TorchScript. type_to_string = str(type) return type_to_string.replace(type.__module__ + '.', '') - elif type.__module__.startswith('torch'): + elif is_torch_native_class(type): # If the type is a subtype of torch module, then TorchScript expects a fully qualified name # for the type which is obtained by combining the module name and type name. return type.__module__ + '.' + type.__name__ From a43e7a51d7b4c89096510473becffc934644403f Mon Sep 17 00:00:00 2001 From: Aswin Murali Date: Fri, 27 Aug 2021 09:02:22 -0700 Subject: [PATCH 291/530] Adds return type annotation for fork_rng function (#63724) Summary: Fixes https://github.com/pytorch/pytorch/issues/63723 Since it's a generator function the type annotation shall be `Generator`. ![image](https://user-images.githubusercontent.com/47299190/130318830-29ef9529-0daa-463c-90b2-1b11f63ade8a.png) Pull Request resolved: https://github.com/pytorch/pytorch/pull/63724 Reviewed By: iramazanli Differential Revision: D30543098 Pulled By: heitorschueroff fbshipit-source-id: ebdd34749defe1e26c899146786a0357ab4b4b9b --- torch/random.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/random.py b/torch/random.py index d774634478697..f5156bf48730d 100644 --- a/torch/random.py +++ b/torch/random.py @@ -1,4 +1,5 @@ import contextlib +from typing import Generator import warnings from torch._C import default_generator @@ -65,7 +66,7 @@ def initial_seed() -> int: @contextlib.contextmanager -def fork_rng(devices=None, enabled=True, _caller="fork_rng", _devices_kw="devices"): +def fork_rng(devices=None, enabled=True, _caller="fork_rng", _devices_kw="devices") -> Generator: """ Forks the RNG, so that when you return, the RNG is reset to the state that it was previously in. From 22d38bd10d998edc033d268846eaa2dd395dcb55 Mon Sep 17 00:00:00 2001 From: Hanton Yang Date: Fri, 27 Aug 2021 09:23:45 -0700 Subject: [PATCH 292/530] [OSS] Enable Metal in PyTorch MacOS nightly builds (#63718) Summary: Build on https://github.com/pytorch/pytorch/pull/63825 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63718 Test Plan: 1.Add `ci/binaries` label to PR, so the CI will build those nightly builds 2.Make sure the following CI jobs build with `USE_PYTORCH_METAL_EXPORT` option is `ON`: ``` ci/circleci: binary_macos_arm64_conda_3_8_cpu_nightly_build ci/circleci: binary_macos_arm64_conda_3_9_cpu_nightly_build ci/circleci: binary_macos_arm64_wheel_3_8_cpu_nightly_build ci/circleci: binary_macos_arm64_wheel_3_9_cpu_nightly_build ci/circleci: binary_macos_conda_3_6_cpu_nightly_build ci/circleci: binary_macos_conda_3_7_cpu_nightly_build ci/circleci: binary_macos_conda_3_8_cpu_nightly_build ci/circleci: binary_macos_conda_3_9_cpu_nightly_build ci/circleci: binary_macos_libtorch_3_7_cpu_nightly_build ci/circleci: binary_macos_wheel_3_6_cpu_nightly_build ci/circleci: binary_macos_wheel_3_7_cpu_nightly_build ci/circleci: binary_macos_wheel_3_8_cpu_nightly_build ci/circleci: binary_macos_wheel_3_9_cpu_nightly_build ``` 3.Test `conda` and `wheel` builds locally on [HelloWorld-Metal](https://github.com/pytorch/ios-demo-app/tree/master/HelloWorld-Metal) demo with [(Prototype) Use iOS GPU in PyTorch](https://pytorch.org/tutorials/prototype/ios_gpu_workflow.html) (1) conda ``` conda install https://15667941-65600975-gh.circle-artifacts.com/0/Users/distiller/project/final_pkgs/pytorch-1.10.0.dev20210826-py3.8_0.tar.bz2 ``` (2) wheel ``` pip3 install https://15598647-65600975-gh.circle-artifacts.com/0/Users/distiller/project/final_pkgs/torch-1.10.0.dev20210824-cp38-none-macosx_10_9_x86_64.whl ``` Reviewed By: xta0 Differential Revision: D30593167 Pulled By: hanton fbshipit-source-id: 471da204e94b29c11301c857c50501307a5f0785 --- .circleci/scripts/binary_macos_build.sh | 3 +++ CMakeLists.txt | 5 +++++ aten/src/ATen/CMakeLists.txt | 29 +++++++++++++++---------- cmake/Summary.cmake | 1 + 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/.circleci/scripts/binary_macos_build.sh b/.circleci/scripts/binary_macos_build.sh index c402cdd008013..c5cdfa9f09080 100755 --- a/.circleci/scripts/binary_macos_build.sh +++ b/.circleci/scripts/binary_macos_build.sh @@ -14,6 +14,9 @@ chmod +x "$build_script" # Build cat >"$build_script" < Date: Fri, 27 Aug 2021 09:31:36 -0700 Subject: [PATCH 293/530] [bazel] GPU-support: add @local_config_cuda and @cuda (#63604) Summary: ## Context We take the first step at tackling the GPU-bazel support by adding bazel external workspaces `local_config_cuda` and `cuda`, where the first one has some hardcoded values and lists of files, and the second one provides a nicer, high-level wrapper that maps into the already expected by pytorch bazel targets that are guarded with `if_cuda` macro. The prefix `local_config_` signifies the fact that we are breaking the bazel hermeticity philosophy by explicitly relaying on the CUDA installation that is present on the machine. ## Testing Notice an important scenario that is unlocked by this change: compilation of cpp code that depends on cuda libraries (i.e. cuda.h and so on). Before: ``` sergei.vorobev@cs-sv7xn77uoy-gpu-1628706590:~/src/pytorch4$ bazelisk build --define=cuda=true //:c10 ERROR: /home/sergei.vorobev/src/pytorch4/tools/config/BUILD:12:1: no such package 'tools/toolchain': BUILD file not found in any of the following directories. Add a BUILD file to a directory to mark it as a package. - /home/sergei.vorobev/src/pytorch4/tools/toolchain and referenced by '//tools/config:cuda_enabled_and_capable' ERROR: While resolving configuration keys for //:c10: Analysis failed ERROR: Analysis of target '//:c10' failed; build aborted: Analysis failed INFO: Elapsed time: 0.259s INFO: 0 processes. FAILED: Build did NOT complete successfully (2 packages loaded, 2 targets configured) ``` After: ``` sergei.vorobev@cs-sv7xn77uoy-gpu-1628706590:~/src/pytorch4$ bazelisk build --define=cuda=true //:c10 INFO: Analyzed target //:c10 (6 packages loaded, 246 targets configured). INFO: Found 1 target... Target //:c10 up-to-date: bazel-bin/libc10.lo bazel-bin/libc10.so INFO: Elapsed time: 0.617s, Critical Path: 0.04s INFO: 0 processes. INFO: Build completed successfully, 1 total action ``` The `//:c10` target is a good testing one for this, because it has such cases where the [glob is different](https://github.com/pytorch/pytorch/blob/075024b9a34904ec3ecdab3704c3bcaa329bdfea/BUILD.bazel#L76-L81), based on do we compile for CUDA or not. ## What is out of scope of this PR This PR is a first in a series of providing the comprehensive GPU bazel build support. Namely, we don't tackle the [cu_library](https://github.com/pytorch/pytorch/blob/11a40ad915d4d3d8551588e303204810887fcf8d/tools/rules/cu.bzl#L2) implementation here. This would be a separate large chunk of work. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63604 Reviewed By: soulitzer Differential Revision: D30442083 Pulled By: malfet fbshipit-source-id: b2a8e4f7e5a25a69b960a82d9e36ba568eb64595 --- .bazelrc | 6 +- .github/scripts/generate_ci_workflows.py | 2 +- ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml | 2 +- .jenkins/pytorch/build.sh | 4 + WORKSPACE | 13 +- third_party/cuda.BUILD | 43 ++ third_party/tensorflow_cuda_bazel_build/BUILD | 0 .../tensorflow_cuda_bazel_build/README.md | 5 + .../tensorflow_cuda_bazel_build/WORKSPACE | 1 + .../tensorflow_cuda_bazel_build/cuda/BUILD | 451 ++++++++++++++++++ tools/config/BUILD | 1 - tools/rules/workspace.bzl | 25 + 12 files changed, 548 insertions(+), 5 deletions(-) create mode 100644 third_party/cuda.BUILD create mode 100644 third_party/tensorflow_cuda_bazel_build/BUILD create mode 100644 third_party/tensorflow_cuda_bazel_build/README.md create mode 100644 third_party/tensorflow_cuda_bazel_build/WORKSPACE create mode 100755 third_party/tensorflow_cuda_bazel_build/cuda/BUILD diff --git a/.bazelrc b/.bazelrc index ecfe8fd0efcd0..310eb293389dc 100644 --- a/.bazelrc +++ b/.bazelrc @@ -3,7 +3,11 @@ build --copt=-I. build --copt=-isystem --copt bazel-out/k8-fastbuild/bin # Configuration to disable tty features for environments like CI - build:no-tty --curses no build:no-tty --progress_report_interval 10 build:no-tty --show_progress_rate_limit 10 + +# Configuration to build with GPU support +build:gpu --define=cuda=true +# define a separate build folder for faster switching between configs +build:gpu --platform_suffix=-gpu diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index cd7065dbfaa47..f1819dbac589d 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -497,7 +497,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: CIWorkflow( arch="linux", build_environment="linux-xenial-py3.6-gcc7-bazel-test", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7", test_runner_type=LINUX_CPU_TEST_RUNNER, on_pull_request=True, ciflow_config=CIFlowConfig( diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index 71a9bf76dac22..233144210dbcd 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -15,7 +15,7 @@ on: env: BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc7-bazel-test - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index d7b66e7c9177e..085cf5152e6f1 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -224,7 +224,11 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then get_bazel + # first build the whole torch for CPU-only tools/bazel build --config=no-tty :torch + # then build selected set of targets with GPU-support. + # TODO: eventually this should converge to building the whole :torch with GPU-support + tools/bazel build --config=no-tty --config=gpu :c10 else # check that setup.py would fail with bad arguments echo "The next three invocations are expected to fail with invalid command error messages." diff --git a/WORKSPACE b/WORKSPACE index 6f5028d4d0912..9396a3451c360 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,7 +1,7 @@ workspace(name = "pytorch") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") -load("//tools/rules:workspace.bzl", "new_patched_local_repository") +load("//tools/rules:workspace.bzl", "new_patched_local_repository", "new_empty_repository") http_archive( name = "bazel_skylib", @@ -170,3 +170,14 @@ protobuf_deps() load("@rules_python//python:repositories.bzl", "py_repositories") py_repositories() + +local_repository( + name = "local_config_cuda", + path = "third_party/tensorflow_cuda_bazel_build", +) + +# Wrapper to expose local_config_cuda in an agnostic way +new_empty_repository( + name = "cuda", + build_file = "//third_party:cuda.BUILD", +) diff --git a/third_party/cuda.BUILD b/third_party/cuda.BUILD new file mode 100644 index 0000000000000..0c58b34a52e74 --- /dev/null +++ b/third_party/cuda.BUILD @@ -0,0 +1,43 @@ +""" +Collect all the CUDA stuff from @local_config_cuda in a single target +for convenience. +""" + +cc_library( + name = "cuda", + visibility = ["//visibility:public"], + deps = [ + "@local_config_cuda//cuda:cublas", + "@local_config_cuda//cuda:cuda_driver", + "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cudart", + "@local_config_cuda//cuda:cufft", + "@local_config_cuda//cuda:curand", + ], +) + +cc_library( + name = "cupti", + deps = [ + "@local_config_cuda//cuda:cupti_headers", + "@local_config_cuda//cuda:cupti_link", + ], +) + +[ + alias( + name = lib, + actual = "@local_config_cuda//cuda:{}".format(lib), + visibility = ["//visibility:public"], + ) + for lib in [ + "cublas", + "cufft", + "cusolver", + "cusparse", + "curand", + "nvrtc", + "cuda_driver", + "nvToolsExt", + ] +] diff --git a/third_party/tensorflow_cuda_bazel_build/BUILD b/third_party/tensorflow_cuda_bazel_build/BUILD new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/third_party/tensorflow_cuda_bazel_build/README.md b/third_party/tensorflow_cuda_bazel_build/README.md new file mode 100644 index 0000000000000..439e195d8e44e --- /dev/null +++ b/third_party/tensorflow_cuda_bazel_build/README.md @@ -0,0 +1,5 @@ +# Config for CUDA + +This is a checked-in copy of the auto-generated config for building CUDA code with bazel. The content of this folder was generated from https://github.com/tensorflow/tensorflow `./configure` execution and then edited manually to fit the pytorch needs. + +The LICENSE for the TensorFlow project is APACHE 2. The full LICENSE file could be found here https://github.com/tensorflow/tensorflow/blob/master/LICENSE. diff --git a/third_party/tensorflow_cuda_bazel_build/WORKSPACE b/third_party/tensorflow_cuda_bazel_build/WORKSPACE new file mode 100644 index 0000000000000..59369ce679c14 --- /dev/null +++ b/third_party/tensorflow_cuda_bazel_build/WORKSPACE @@ -0,0 +1 @@ +workspace(name = "local_config_cuda") diff --git a/third_party/tensorflow_cuda_bazel_build/cuda/BUILD b/third_party/tensorflow_cuda_bazel_build/cuda/BUILD new file mode 100755 index 0000000000000..f7271af2750b8 --- /dev/null +++ b/third_party/tensorflow_cuda_bazel_build/cuda/BUILD @@ -0,0 +1,451 @@ +licenses([ + "restricted", + "reciprocal", + "notice", +]) # MPL2, portions GPL v3, LGPL v3, BSD-like + +package(default_visibility = ["//visibility:public"]) + +config_setting( + name = "using_nvcc", + values = { + "define": "using_cuda_nvcc=true", + }, +) + +config_setting( + name = "using_clang", + values = { + "define": "using_cuda_clang=true", + }, +) + +# Equivalent to using_clang && -c opt. +config_setting( + name = "using_clang_opt", + values = { + "define": "using_cuda_clang=true", + "compilation_mode": "opt", + }, +) + +config_setting( + name = "darwin", + values = {"cpu": "darwin"}, +) + +cc_library( + name = "cuda_headers", + hdrs = [ + ":cuda-include", + ":cudnn-include", + ], + includes = [ + ".", + "include", + ], +) + +cc_library( + name = "cudnn_headers", + hdrs = [ + ":cudnn-include", + ], + includes = [ + ".", + "include", + ], +) + +cc_library( + name = "cudart_static", + linkopts = [ + "-L/usr/local/cuda/lib64", + ], +) + +cc_library( + name = "cuda_driver", + linkopts = ["-lcuda"], + deps = [":linker_search_path"], +) + +# Provides the RPATH for Nvidia-less sytems to be able to run binaries linked to libcuda. +cc_library( + name = "driver_stub_runtime", + linkopts = [ + "-Wl,-rpath,/usr/local/cuda/lib64/stubs", + ], + deps = [":cuda_driver"], +) + +cc_library( + name = "linker_search_path", + linkopts = [ + "-L/usr/local/cuda/lib64", + "-L/usr/local/cuda/lib64/stubs", + "-Wl,-rpath-link,/usr/local/cuda/lib64", + "-Wl,-rpath-link,/usr/local/cuda/lib64/stubs", + ], +) + +[ + cc_library( + name = libname, + linkopts = ["-l" + libname] + (["-lgomp"] if (libname == "cusolver") else []), + linkstatic = True, + deps = [":linker_search_path"], + ) + for libname in [ + "cublas", + "cudart", + "cudnn", + "cufft", + "curand", + "cusolver", + "cusparse", + "nvrtc", + "nvToolsExt", + ] +] + +cc_library( + name = "cuda", + deps = [ + ":cublas", + ":cuda_headers", + ":cudart", + ":cudnn", + ":cufft", + ":curand", + ":nvToolsExt", + ], +) + +# NVIDIA Performance Primitives (http://docs.nvidia.com/cuda/npp/modules.html)) +# used by OpenCV +cc_library( + name = "nppi", + linkopts = [ + "-lnppc", + "-lnppial", + "-lnppicom", + "-lnppidei", + "-lnppif", + "-lnppig", + "-lnppim", + "-lnppist", + "-lnppitc", + "-lnpps", + ], + linkstatic = True, + deps = [":linker_search_path"], +) + +# NVIDIA Management Library +cc_library( + name = "nvml", + linkopts = [ + "-lnvidia-ml", + "-Wl,-rpath,/usr/lib/nvidia-410", + "-Wl,-rpath,/usr/lib/nvidia-390", + "-Wl,-rpath,/usr/lib/nvidia-387", + "-Wl,-rpath,/usr/lib/nvidia-384", + ], + deps = [":linker_search_path"], +) + +cc_library( + name = "cupti_headers", + hdrs = [ + ":cuda-extras", + ], + includes = [ + ".", + "extras/CUPTI/include/", + ], +) + +# cupti .so exposed at linktime +cc_library( + name = "cupti_link", + linkopts = [ + "-L/usr/local/cuda/extras/CUPTI/lib64", + "-lcupti", + ], +) + +cc_library( + name = "libdevice_root", + data = [":cuda-nvvm"], +) + +CUDA_INCLUDES_FILES = [ + "include/builtin_types.h", + "include/channel_descriptor.h", + "include/CL/cl_egl.h", + "include/CL/cl_ext.h", + "include/CL/cl_gl_ext.h", + "include/CL/cl_gl.h", + "include/CL/cl.h", + "include/CL/cl.hpp", + "include/CL/cl_platform.h", + "include/CL/opencl.h", + "include/common_functions.h", + "include/cooperative_groups.h", + "include/cooperative_groups_helpers.h", + "include/crt/common_functions.h", + "include/crt/device_double_functions.h", + "include/crt/device_double_functions.hpp", + "include/crt/device_functions.h", + "include/crt/device_functions.hpp", + "include/crt/func_macro.h", + "include/crt/host_config.h", + "include/crt/host_defines.h", + "include/crt/host_runtime.h", + "include/crt/math_functions.h", + "include/crt/math_functions.hpp", + "include/crt/mma.h", + "include/crt/mma.hpp", + "include/crt/nvfunctional", + "include/crt/sm_70_rt.h", + "include/crt/sm_70_rt.hpp", + "include/crt/storage_class.h", + # TODO: figure out why on a CI machine with CUDA 10.2 it's not present + # "include/cublas_api.h", + # "include/cublas.h", + # "include/cublas_v2.h", + # "include/cublasXt.h", + "include/cuComplex.h", + "include/cuda_device_runtime_api.h", + "include/cudaEGL.h", + "include/cuda_egl_interop.h", + "include/cuda_fp16.h", + "include/cuda_fp16.hpp", + "include/cudaGL.h", + "include/cuda_gl_interop.h", + "include/cuda.h", + "include/cudalibxt.h", + "include/cuda_occupancy.h", + "include/cuda_profiler_api.h", + "include/cudaProfiler.h", + "include/cudart_platform.h", + "include/cuda_runtime_api.h", + "include/cuda_runtime.h", + "include/cuda_surface_types.h", + "include/cuda_texture_types.h", + "include/cudaVDPAU.h", + "include/cuda_vdpau_interop.h", + "include/cufft.h", + "include/cufftw.h", + "include/cufftXt.h", + "include/curand_discrete2.h", + "include/curand_discrete.h", + "include/curand_globals.h", + "include/curand.h", + "include/curand_kernel.h", + "include/curand_lognormal.h", + "include/curand_mrg32k3a.h", + "include/curand_mtgp32dc_p_11213.h", + "include/curand_mtgp32.h", + "include/curand_mtgp32_host.h", + "include/curand_mtgp32_kernel.h", + "include/curand_normal.h", + "include/curand_normal_static.h", + "include/curand_philox4x32_x.h", + "include/curand_poisson.h", + "include/curand_precalc.h", + "include/curand_uniform.h", + "include/cusolver_common.h", + "include/cusolverDn.h", + "include/cusolverRf.h", + "include/cusolverSp.h", + "include/cusolverSp_LOWLEVEL_PREVIEW.h", + "include/cusparse.h", + "include/cusparse_v2.h", + "include/device_atomic_functions.h", + "include/device_atomic_functions.hpp", + "include/device_double_functions.h", + "include/device_functions.h", + "include/device_launch_parameters.h", + "include/device_types.h", + "include/driver_functions.h", + "include/driver_types.h", + "include/fatBinaryCtl.h", + "include/fatbinary.h", + "include/host_config.h", + "include/host_defines.h", + "include/library_types.h", + "include/math_constants.h", + "include/math_functions.h", + "include/mma.h", + "include/nppcore.h", + "include/nppdefs.h", + "include/npp.h", + "include/nppi_arithmetic_and_logical_operations.h", + "include/nppi_color_conversion.h", + "include/nppi_compression_functions.h", + "include/nppi_computer_vision.h", + "include/nppi_data_exchange_and_initialization.h", + "include/nppi_filtering_functions.h", + "include/nppi_geometry_transforms.h", + "include/nppi.h", + "include/nppi_linear_transforms.h", + "include/nppi_morphological_operations.h", + "include/nppi_statistics_functions.h", + "include/nppi_support_functions.h", + "include/nppi_threshold_and_compare_operations.h", + "include/npps_arithmetic_and_logical_operations.h", + "include/npps_conversion_functions.h", + "include/npps_filtering_functions.h", + "include/npps.h", + "include/npps_initialization.h", + "include/npps_statistics_functions.h", + "include/npps_support_functions.h", + # Note: CUDA 10.0 only + # "include/nppversion.h", + # TODO: figure out why on a CI machine with CUDA 10.2 it's not present + # "include/nvblas.h", + "include/nvfunctional", + "include/nvgraph.h", + "include/nvjpeg.h", + "include/nvml.h", + "include/nvrtc.h", + "include/nvToolsExtCuda.h", + "include/nvToolsExtCudaRt.h", + "include/nvToolsExt.h", + "include/nvToolsExtMeta.h", + "include/nvToolsExtSync.h", + "include/nvtx3/nvToolsExtCuda.h", + "include/nvtx3/nvToolsExtCudaRt.h", + "include/nvtx3/nvToolsExt.h", + "include/nvtx3/nvToolsExtOpenCL.h", + "include/nvtx3/nvToolsExtSync.h", + "include/nvtx3/nvtxDetail/nvtxImplCore.h", + "include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h", + "include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h", + "include/nvtx3/nvtxDetail/nvtxImpl.h", + "include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h", + "include/nvtx3/nvtxDetail/nvtxImplSync_v3.h", + "include/nvtx3/nvtxDetail/nvtxInitDecls.h", + "include/nvtx3/nvtxDetail/nvtxInitDefs.h", + "include/nvtx3/nvtxDetail/nvtxInit.h", + "include/nvtx3/nvtxDetail/nvtxLinkOnce.h", + "include/nvtx3/nvtxDetail/nvtxTypes.h", + "include/sm_20_atomic_functions.h", + "include/sm_20_atomic_functions.hpp", + "include/sm_20_intrinsics.h", + "include/sm_20_intrinsics.hpp", + "include/sm_30_intrinsics.h", + "include/sm_30_intrinsics.hpp", + "include/sm_32_atomic_functions.h", + "include/sm_32_atomic_functions.hpp", + "include/sm_32_intrinsics.h", + "include/sm_32_intrinsics.hpp", + "include/sm_35_atomic_functions.h", + "include/sm_35_intrinsics.h", + "include/sm_60_atomic_functions.h", + "include/sm_60_atomic_functions.hpp", + "include/sm_61_intrinsics.h", + "include/sm_61_intrinsics.hpp", + # CUDA 10.0 only + # "include/sobol_direction_vectors.h", + "include/surface_functions.h", + "include/surface_functions.hpp", + "include/surface_indirect_functions.h", + "include/surface_indirect_functions.hpp", + "include/surface_types.h", + "include/texture_fetch_functions.h", + "include/texture_fetch_functions.hpp", + "include/texture_indirect_functions.h", + "include/texture_indirect_functions.hpp", + "include/texture_types.h", + "include/vector_functions.h", + "include/vector_functions.hpp", + "include/vector_types.h", +] + +genrule( + name = "cuda-include", + outs = CUDA_INCLUDES_FILES, + cmd = " && ".join([ + "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p) + for p in CUDA_INCLUDES_FILES + ]), + local = True, + tags = ["no-cache"], +) + +CUDA_NVVM_FILES = [ + "nvvm/bin/cicc", + "nvvm/include/nvvm.h", + "nvvm/lib64/libnvvm.so", + "nvvm/lib64/libnvvm.so.3", + "nvvm/lib64/libnvvm.so.3.3.0", + "nvvm/libdevice/libdevice.10.bc", +] + +genrule( + name = "cuda-nvvm", + outs = CUDA_NVVM_FILES, + cmd = " && ".join([ + "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p) + for p in CUDA_NVVM_FILES + ]), + local = True, + tags = ["no-cache"], +) + +CUDA_EXTRAS_FILES = [ + "extras/CUPTI/include/cuda_stdint.h", + "extras/CUPTI/include/cupti.h", + "extras/CUPTI/include/cupti_activity.h", + "extras/CUPTI/include/cupti_callbacks.h", + "extras/CUPTI/include/cupti_driver_cbid.h", + "extras/CUPTI/include/cupti_events.h", + "extras/CUPTI/include/cupti_metrics.h", + "extras/CUPTI/include/cupti_nvtx_cbid.h", + "extras/CUPTI/include/cupti_result.h", + "extras/CUPTI/include/cupti_runtime_cbid.h", + "extras/CUPTI/include/cupti_version.h", + "extras/CUPTI/include/generated_cuda_gl_interop_meta.h", + "extras/CUPTI/include/generated_cuda_meta.h", + "extras/CUPTI/include/generated_cuda_runtime_api_meta.h", + "extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h", + "extras/CUPTI/include/generated_cudaGL_meta.h", + "extras/CUPTI/include/generated_cudaVDPAU_meta.h", + "extras/CUPTI/include/generated_nvtx_meta.h", + "extras/CUPTI/include/GL/gl.h", + "extras/CUPTI/include/GL/glew.h", + "extras/CUPTI/include/GL/glext.h", + "extras/CUPTI/include/GL/glu.h", + "extras/CUPTI/include/GL/glut.h", + "extras/CUPTI/include/GL/glx.h", + "extras/CUPTI/include/GL/glxext.h", + "extras/CUPTI/include/GL/wglew.h", + "extras/CUPTI/include/GL/wglext.h", + "extras/CUPTI/include/openacc/cupti_openacc.h", +] + +genrule( + name = "cuda-extras", + outs = CUDA_EXTRAS_FILES, + cmd = " && ".join([ + "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p) + for p in CUDA_EXTRAS_FILES + ]), + local = True, + tags = ["no-cache"], +) + +genrule( + name = "cudnn-include", + outs = [ + "include/cudnn.h", + ], + cmd = """ + ln -s /usr/include/cudnn.h $(@D)/cudnn.h""", + local = True, + tags = ["no-cache"], +) + diff --git a/tools/config/BUILD b/tools/config/BUILD index a8f9d0452fce8..ba13eda2bba7b 100644 --- a/tools/config/BUILD +++ b/tools/config/BUILD @@ -13,7 +13,6 @@ selects.config_setting_group( name = "cuda_enabled_and_capable", match_all = [ ":cuda", - "//tools/toolchain:is_cuda_capable", ], ) diff --git a/tools/rules/workspace.bzl b/tools/rules/workspace.bzl index 59e12e8d92d03..34317bec25f5d 100644 --- a/tools/rules/workspace.bzl +++ b/tools/rules/workspace.bzl @@ -27,3 +27,28 @@ pkg_tar(name = "content", srcs = glob(["**"])) path = path, ) _patched_rule(name = name, **kwargs) + +def _new_empty_repository_impl(repo_ctx): + build_file = repo_ctx.attr.build_file + build_file_content = repo_ctx.attr.build_file_content + if not (bool(build_file) != bool(build_file_content)): + fail("Exactly one of 'build_file' or 'build_file_content' is required") + + if build_file_content: + repo_ctx.file("BUILD", build_file_content) + elif build_file: + repo_ctx.template("BUILD", repo_ctx.attr.build_file, {}) + +new_empty_repository = repository_rule( + attrs = { + "build_file": attr.label(allow_files = True), + "build_file_content": attr.string(), + }, + implementation = _new_empty_repository_impl, +) + +"""Create an empty repository with the supplied BUILD file. + +This is mostly useful to create wrappers for specific target that we want +to be used with the '@' syntax. +""" From a9983ac09c9772fa426224b080090083b4c9607b Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Fri, 27 Aug 2021 09:37:10 -0700 Subject: [PATCH 294/530] Refactor structured set_output in Register{DispatchKey}.cpp (#62188) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62188 These parts of the `set_output` code are identical for all operators in the kernel registration files. So, this moves them from being copied into every class to two helper functions at the top of the file. Test Plan: Imported from OSS Reviewed By: soulitzer Differential Revision: D29962045 Pulled By: albanD fbshipit-source-id: 753b8aac755f3c91b77ffa2c30a89ac91a84b7c4 --- .../ATen/templates/RegisterDispatchKey.cpp | 2 + tools/codegen/dest/__init__.py | 1 + tools/codegen/dest/register_dispatch_key.py | 130 +++++++++++------- tools/codegen/gen.py | 1 + tools/codegen/gen_backend_stubs.py | 1 + 5 files changed, 84 insertions(+), 51 deletions(-) diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp index 1abc3ee391ae2..16caf5326c711 100644 --- a/aten/src/ATen/templates/RegisterDispatchKey.cpp +++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp @@ -43,6 +43,8 @@ namespace at { // at namespace already. namespace { +${dispatch_helpers} + ${dispatch_anonymous_definitions} TORCH_LIBRARY_IMPL(aten, ${DispatchKey}, m) { diff --git a/tools/codegen/dest/__init__.py b/tools/codegen/dest/__init__.py index ab4bada277572..441e4426cf29e 100644 --- a/tools/codegen/dest/__init__.py +++ b/tools/codegen/dest/__init__.py @@ -1,2 +1,3 @@ from .register_dispatch_key import RegisterDispatchKey as RegisterDispatchKey +from .register_dispatch_key import gen_registration_helpers as gen_registration_helpers from .native_functions import compute_native_function_declaration as compute_native_function_declaration diff --git a/tools/codegen/dest/register_dispatch_key.py b/tools/codegen/dest/register_dispatch_key.py index a943f51ba5898..784ee56e765fb 100644 --- a/tools/codegen/dest/register_dispatch_key.py +++ b/tools/codegen/dest/register_dispatch_key.py @@ -23,6 +23,79 @@ from tools.codegen.api.translate import translate from tools.codegen.selective_build.selector import SelectiveBuilder + +def gen_create_out_helper(backend_index: BackendIndex) -> List[str]: + if backend_index.dispatch_key == DispatchKey.Meta: + # TODO: dedupe this with below + core = """ +if (strides.empty()) { + return at::empty(sizes, options.device(at::kMeta)); +} else { + return at::empty_strided(sizes, strides, options.device(at::kMeta)); +} +""" + else: + expanded_topts = "optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), " \ + "options.device_opt(), options.pinned_memory_opt()" + empty_init = "" + if backend_index.dispatch_key == DispatchKey.CPU: + empty_impl = "at::native::empty_cpu" + empty_strided_impl = "at::native::empty_strided_cpu" + elif backend_index.dispatch_key == DispatchKey.CUDA: + empty_init = "globalContext().lazyInitCUDA();" + empty_impl = "at::native::empty_cuda" + empty_strided_impl = "at::native::empty_strided_cuda" + elif backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd: + empty_impl = "at::empty" + empty_strided_impl = "at::empty_strided" + else: + return [] + core = f""" + {empty_init} + if (strides.empty()) {{ + return {empty_impl}(sizes, {expanded_topts}, options.memory_format_opt()); + }} else {{ + // TODO: assert options.memory_format_opt() is nullopt (debug only?) + return {empty_strided_impl}(sizes, strides, {expanded_topts}); + }} +""" + return [f""" +Tensor create_out(IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {{ +{core} +}} +"""] + + +def gen_resize_out_helper(backend_index: BackendIndex) -> List[str]: + return [""" +void resize_out(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) { + TORCH_CHECK(options.dtype() == out.dtype(), + "Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead"); + TORCH_CHECK(options.device() == out.device(), + "Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead"); + const bool resized = at::native::resize_output(out, sizes); + // Only restride if a resize occurred; otherwise we ignore the (advisory) + // strides from the meta function and directly use the output tensor's + // preexisting strides + if (resized) { + if (!strides.empty()) { + TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); + at::native::as_strided_(out, sizes, strides); + } else if (options.memory_format_opt().has_value()) { + out.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); + } + } +} +"""] + + +def gen_registration_helpers(backend_index: BackendIndex) -> List[str]: + return [ + *gen_create_out_helper(backend_index), + *gen_resize_out_helper(backend_index) + ] + + # Generates Register{dispatch}.cpp (e.g., RegisterCPU.cpp). # # - The primary function of this file is to register all of the @@ -344,62 +417,17 @@ def gen_class_set_output_body(self, k: SchemaKind) -> str: maybe_set_guard_line = maybe_set_guard = '' if k is SchemaKind.functional: - if self.backend_index.dispatch_key == DispatchKey.Meta: - # TODO: dedupe this with below - return """ -if (strides.empty()) { - outputs_[output_idx] = at::empty(sizes, options.device(at::kMeta)); -} else { - outputs_[output_idx] = at::empty_strided(sizes, strides, options.device(at::kMeta)); -} -""" - else: - expanded_topts = "optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), " \ - "options.device_opt(), options.pinned_memory_opt()" - empty_init = "" - if self.backend_index.dispatch_key == DispatchKey.CPU: - empty_impl = "at::native::empty_cpu" - empty_strided_impl = "at::native::empty_strided_cpu" - elif self.backend_index.dispatch_key == DispatchKey.CUDA: - empty_init = "globalContext().lazyInitCUDA();" - empty_impl = "at::native::empty_cuda" - empty_strided_impl = "at::native::empty_strided_cuda" - elif self.backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd: - empty_impl = "at::empty" - empty_strided_impl = "at::empty_strided" - else: - raise AssertionError("unsupported dispatch key") - return f"""{maybe_set_guard_line} -{empty_init} -if (strides.empty()) {{ - outputs_[output_idx] = {empty_impl}(sizes, {expanded_topts}, options.memory_format_opt()); -}} else {{ - // TODO: assert options.memory_format_opt() is nullopt (debug only?) - outputs_[output_idx] = {empty_strided_impl}(sizes, strides, {expanded_topts}); -}} -""" + assert self.backend_index.dispatch_key in ( + DispatchKey.Meta, DispatchKey.CPU, DispatchKey.CUDA, + DispatchKey.CompositeExplicitAutograd) + return f"""{maybe_set_guard_line} +outputs_[output_idx] = create_out(sizes, strides, options);""" elif k is SchemaKind.inplace: return maybe_set_guard elif k is SchemaKind.out: return f"""{maybe_set_guard_line} const auto& out = outputs_[output_idx].get(); -TORCH_CHECK(options.dtype() == out.dtype(), - "Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead"); -TORCH_CHECK(options.device() == out.device(), - "Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead"); -bool resized = at::native::resize_output(outputs_[output_idx], sizes); -// Only restride if a resize occurred; otherwise we ignore the (advisory) -// strides from the meta function and directly use the output tensor's -// preexisting strides -if (resized) {{ - if (!strides.empty()) {{ - TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); - at::native::as_strided_(outputs_[output_idx], sizes, strides); - }} else if (options.memory_format_opt().has_value()) {{ - outputs_[output_idx].get().unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); - }} -}} -""" +resize_out(out, sizes, strides, options);""" else: assert_never(k) diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index ffa4ed7a1c70e..203b5a99c356c 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -1100,6 +1100,7 @@ def make_file_manager(install_dir: str) -> FileManager: 'namespaced_headers': f'#include ' if dispatch_key in functions_keys else '', 'DispatchKey': dispatch_key, 'dispatch_namespace': dispatch_key.lower(), + 'dispatch_helpers': dest.gen_registration_helpers(backend_indices[dispatch_key]), 'dispatch_namespaced_definitions': list(concatMap( dest.RegisterDispatchKey( backend_indices[dispatch_key], diff --git a/tools/codegen/gen_backend_stubs.py b/tools/codegen/gen_backend_stubs.py index 51f81c702e122..5fad11c343804 100644 --- a/tools/codegen/gen_backend_stubs.py +++ b/tools/codegen/gen_backend_stubs.py @@ -231,6 +231,7 @@ def make_file_manager(install_dir: str) -> FileManager: 'namespaced_headers': '', 'DispatchKey': dispatch_key, 'dispatch_namespace': dispatch_key.lower(), + 'dispatch_helpers': dest.gen_registration_helpers(backend_indices[dispatch_key]), 'dispatch_namespaced_definitions': list(concatMap( dest.RegisterDispatchKey( backend_indices[dispatch_key], From 3abb6060910186f38fc1eef2a2169639533f0c54 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Fri, 27 Aug 2021 09:49:39 -0700 Subject: [PATCH 295/530] Add doc for nn.MultiMarginLoss (shape, example) (#63760) Summary: Fixes https://github.com/pytorch/pytorch/issues/63747 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63760 Reviewed By: malfet Differential Revision: D30541581 Pulled By: jbschlosser fbshipit-source-id: 99560641e614296645eb0e51999513f57dfcfa98 --- torch/nn/modules/loss.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index 03732b6d192d8..af1da83eeef5b 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1323,7 +1323,7 @@ class MultiMarginLoss(_WeightedLoss): The loss function then becomes: .. math:: - \text{loss}(x, y) = \frac{\sum_i \max(0, w[y] * (\text{margin} - x[y] + x[i]))^p)}{\text{x.size}(0)} + \text{loss}(x, y) = \frac{\sum_i \max(0, w[y] * (\text{margin} - x[y] + x[i]))^p}{\text{x.size}(0)} Args: p (int, optional): Has a default value of :math:`1`. :math:`1` and :math:`2` @@ -1347,6 +1347,20 @@ class MultiMarginLoss(_WeightedLoss): elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` and :attr:`reduce` are in the process of being deprecated, and in the meantime, specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + + Shape: + - Input: :math:`(N, C)` or :math:`(C)`, where :math:`N` is the batch size and :math:`C` is the number of classes. + - Target: :math:`(N)` or :math:`()`, where each value is :math:`0 \leq \text{targets}[i] \leq C-1`. + - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the target. + + Examples:: + + >>> loss = nn.MultiMarginLoss() + >>> x = torch.tensor([[0.1, 0.2, 0.4, 0.8]]) + >>> y = torch.tensor([3]) + >>> loss(x, y) + >>> # 0.25 * ((1-(0.8-0.1)) + (1-(0.8-0.2)) + (1-(0.8-0.4))) + tensor(0.3250) """ __constants__ = ['p', 'margin', 'reduction'] margin: float From babd4499783abc699faf36f3a72a9fc491e0e572 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Fri, 27 Aug 2021 10:10:48 -0700 Subject: [PATCH 296/530] [JIT] Add aten::slice optimization (#63049) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63049 Given a graph produced from a function like this: ``` def foo(): li = [1, 2, 3, 4, 5, 6] return li[0:2] ``` This pass produces a graph like this: ``` def foo(): li = [1, 2] return li ``` These changes are mostly adapted from https://github.com/pytorch/pytorch/pull/62297/ Test Plan: `buck test //caffe2/test:jit -- TestPeephole` Reviewed By: eellison Differential Revision: D30231044 fbshipit-source-id: d12ee39f68289a574f533041a5adb38b2f000dd5 --- test/jit/test_peephole.py | 74 +++++++++++++- .../csrc/jit/passes/peephole_list_idioms.cpp | 97 +++++++++++++------ torch/csrc/jit/passes/peephole_list_idioms.h | 8 ++ 3 files changed, 148 insertions(+), 31 deletions(-) diff --git a/test/jit/test_peephole.py b/test/jit/test_peephole.py index 23de44807761c..ecb4a06dfe0b9 100644 --- a/test/jit/test_peephole.py +++ b/test/jit/test_peephole.py @@ -2,7 +2,7 @@ from torch.testing._internal.jit_utils import JitTestCase, RUN_CUDA, _inline_everything from torch import nn from torch.testing import FileCheck -from typing import List +from typing import Callable, List import unittest @@ -721,3 +721,75 @@ def foo(): self.run_pass("peephole", foo.graph) FileCheck().check("DictConstruct").check("len").run(foo.graph) self.assertEqual(foo(), 1) + + def test_peephole_slice_all_three_args(self): + def foo(x: int): + return [1, 2, x, 4, 5, 6, 7][-5:6:2] + + graph = torch.jit.script(foo).graph + self.run_pass("peephole", graph) + FileCheck().check_not("aten::slice").run(graph) + self.checkScript(foo, (3, )) + + def test_peephole_slice_one_empty_arg(self): + def check_helper(fn: Callable[[int], None]) -> None: + graph = torch.jit.script(fn).graph + self.run_pass("peephole", graph) + FileCheck().check_not("aten::slice").run(graph) + self.checkScript(fn, (3, )) + + def foo(x: int): + return [1, 2, x, 4, 5, 6, 7][1::2] + + check_helper(foo) + + def foo(x: int): + return [1, 2, x, 4, 5, 6, 7][:5:3] + + check_helper(foo) + + def foo(x: int): + return [1, 2, x, 4, 5, 6, 7][0:4] + + check_helper(foo) + + def test_peephole_slice_two_empty_args(self): + def check_helper(fn: Callable[[int], None]) -> None: + graph = torch.jit.script(fn).graph + self.run_pass("peephole", graph) + FileCheck().check_not("aten::slice").run(graph) + self.checkScript(fn, (3, )) + + def foo(x: int): + return [1, 2, x, 4, 5, 6, 7][::2] + + check_helper(foo) + + def foo(x: int): + return [1, 2, x, 4, 5, 6, 7][:5] + + check_helper(foo) + + def foo(x: int): + return [1, 2, x, 4, 5, 6, 7][1:] + + check_helper(foo) + + def test_peephole_slice_optimization_not_applied_list_modified(self): + @torch.jit.script + def foo(): + li = [1, 2, 3, 4, 5, 6, 7] + li[0] = 0 + return li[2:5] + + self.run_pass("peephole", foo.graph) + FileCheck().check("aten::slice").run(foo.graph) + + def test_peephole_slice_optimization_not_applied_non_const_args(self): + @torch.jit.script + def foo(x: int, y: int): + li = [1, 2, 3, 4, 5, 6, 7] + return li[x:y] + + self.run_pass("peephole", foo.graph) + FileCheck().check("aten::slice").run(foo.graph) diff --git a/torch/csrc/jit/passes/peephole_list_idioms.cpp b/torch/csrc/jit/passes/peephole_list_idioms.cpp index f33f388259d20..ec3d249b8b1be 100644 --- a/torch/csrc/jit/passes/peephole_list_idioms.cpp +++ b/torch/csrc/jit/passes/peephole_list_idioms.cpp @@ -7,7 +7,9 @@ #include #include #include +#include #include +#include namespace torch { namespace jit { @@ -57,7 +59,7 @@ struct ListLenRefiner { } auto first_input = n->input(0); - if (first_input->type()->cast() && + if (first_input->type()->castRaw() && !mutated_lists_.count(first_input)) { if (!li_with_len_use.count(first_input)) { li_with_len_use.insert(first_input); @@ -172,7 +174,7 @@ struct PeepholeOptimizeListIdiomsImpl { private: void checkForMutatedList(Value* v) { - if (v->type()->cast() && aliasDb_->hasWriters(v)) { + if (v->type()->castRaw() && aliasDb_->hasWriters(v)) { mutated_lists_.insert(v); } } @@ -191,6 +193,43 @@ struct PeepholeOptimizeListIdiomsImpl { } } + bool optimizeSlice(Node* slice_node, Node* list_construct_node) { + auto start_val = toIValue(slice_node->input(1)); + auto end_val = toIValue(slice_node->input(2)); + auto step_val = toIValue(slice_node->input(3)); + + // All args must be constant to apply this optimization. + if (start_val == c10::nullopt || end_val == c10::nullopt || + step_val == c10::nullopt) { + return false; + } + + int64_t start = start_val->isInt() ? start_val->to() + : std::numeric_limits::max(); + int64_t end = end_val->isInt() ? end_val->to() + : std::numeric_limits::max(); + int64_t step = step_val->isInt() ? step_val->to() : 1; + + size_t list_size = list_construct_node->inputs().size(); + size_t num_values = slice_indices_adjust(list_size, &start, &end, step); + + WithInsertPoint guard(slice_node); + auto slice_list_construct = + graph_->insertNode(graph_->create(prim::ListConstruct)); + slice_list_construct->output()->setType(slice_node->output()->type()); + for (size_t i = start, j = 0; j < num_values; ++j) { + slice_list_construct->addInput(list_construct_node->input(i)); + i += step; + } + + slice_node->output()->replaceAllUsesWith(slice_list_construct->output()); + if (mutated_lists_.count(slice_node->output())) { + mutated_lists_.insert(slice_list_construct->output()); + } + + return true; + } + bool runBlock(Block* block) { bool changed = false; for (Node* node : block->nodes()) { @@ -200,7 +239,7 @@ struct PeepholeOptimizeListIdiomsImpl { // only optimizing list ops if (node->inputs().size() == 0 || - !node->input(0)->type()->cast()) { + !node->input(0)->type()->castRaw()) { continue; } @@ -211,36 +250,33 @@ struct PeepholeOptimizeListIdiomsImpl { continue; } + auto list_creation_node = first_input->node(); + if (list_creation_node->kind() != prim::ListConstruct) { + continue; + } + if (node->kind() == aten::len) { - if (first_input->node()->kind() == prim::ListConstruct) { - WithInsertPoint guard(node); - node->output()->replaceAllUsesWith(graph_->insertConstant( - static_cast(first_input->node()->inputs().size()))); - changed = true; - } + WithInsertPoint guard(node); + node->output()->replaceAllUsesWith(graph_->insertConstant( + static_cast(first_input->node()->inputs().size()))); + changed = true; } else if (node->kind() == aten::__getitem__) { - auto list_creation_node = first_input->node(); - if (list_creation_node->kind() == prim::ListConstruct) { - if (auto index = toIValue(node->input(1))) { - size_t list_size = list_creation_node->inputs().size(); - if (auto norm_index = normalizeIndex(index->toInt(), list_size)) { - node->output()->replaceAllUsesWith( - list_creation_node->input(*norm_index)); - changed = true; - } + if (auto index = toIValue(node->input(1))) { + size_t list_size = list_creation_node->inputs().size(); + if (auto norm_index = normalizeIndex(index->toInt(), list_size)) { + node->output()->replaceAllUsesWith( + list_creation_node->input(*norm_index)); + changed = true; } } } else if (node->kind() == prim::ListUnpack) { - auto list_creation_node = first_input->node(); - if (list_creation_node->kind() == prim::ListConstruct) { - // if sizes are unequal it's a runtime error - if (list_creation_node->inputs().size() != node->outputs().size()) { - continue; - } - for (size_t i = 0; i < node->outputs().size(); ++i) { - node->output(i)->replaceAllUsesWith(list_creation_node->input(i)); - changed = true; - } + // if sizes are unequal it's a runtime error + if (list_creation_node->inputs().size() != node->outputs().size()) { + continue; + } + for (size_t i = 0; i < node->outputs().size(); ++i) { + node->output(i)->replaceAllUsesWith(list_creation_node->input(i)); + changed = true; } } else if (node->kind() == aten::add) { if (node->inputs().size() != 2) { @@ -251,8 +287,7 @@ struct PeepholeOptimizeListIdiomsImpl { if (mutated_lists_.count(second_input)) { continue; } - if (first_input->node()->kind() != prim::ListConstruct || - second_input->node()->kind() != prim::ListConstruct) { + if (second_input->node()->kind() != prim::ListConstruct) { continue; } WithInsertPoint guard(node); @@ -270,6 +305,8 @@ struct PeepholeOptimizeListIdiomsImpl { mutated_lists_.insert(list_construct->output()); } changed = true; + } else if (node->kind() == aten::slice) { + changed |= optimizeSlice(node, first_input->node()); } } return changed; diff --git a/torch/csrc/jit/passes/peephole_list_idioms.h b/torch/csrc/jit/passes/peephole_list_idioms.h index c8add4849d4ce..d20df9571db01 100644 --- a/torch/csrc/jit/passes/peephole_list_idioms.h +++ b/torch/csrc/jit/passes/peephole_list_idioms.h @@ -51,6 +51,14 @@ namespace jit { // // This is only applied to lists that are not modified. // +// 5. Slice +// Given a function like this: +// def foo(): +// return [1, 2, 3, 4, 5][0:2] +// This pass produces (after deadcode elimination): +// def foo(): +// return [1, 2] +// // Currently this is invoked as part of PeepholeOptimize // return true if graph is modified. // If `refine_list_len` is true will attempt to refine the len of lists through From eca87f729d071d12ccb31dd2c958a989d8ac17af Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Fri, 27 Aug 2021 10:16:02 -0700 Subject: [PATCH 297/530] Added reference tests to ReductionOpInfo (#62900) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62900 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D30408815 Pulled By: heitorschueroff fbshipit-source-id: 6a1f82ac281920ff7405a42f46ccd796e60af9d6 --- aten/src/ATen/native/cpu/ReduceOpsKernel.cpp | 29 +-- test/test_reductions.py | 111 ++++++++++- .../_internal/common_methods_invocations.py | 179 +++++++++++++----- 3 files changed, 258 insertions(+), 61 deletions(-) diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 89d2fb21fb511..01ed54e56fc73 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -163,24 +163,29 @@ static void std_var_kernel_impl(TensorIterator& iter, int64_t correction, bool t } static void prod_kernel_impl(TensorIterator& iter) { - // Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] + // Workaround for the error: '*' in boolean context, suggest '&&' instead + // [-Werror=int-in-bool-context] if (iter.dtype() == ScalarType::Bool) { using scalar_t = bool; binary_kernel_reduce_vec( - iter, - [=](scalar_t a, scalar_t b) -> scalar_t { return a && b; }, - [=](Vectorized a, Vectorized b) { return a && b; }, - // NOLINTNEXTLINE(bugprone-argument-comment) - /*identity=*/1); - } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "prod_cpu", [&] { - binary_kernel_reduce_vec( iter, - [=](scalar_t a, scalar_t b) -> scalar_t { return a * b; }, - [=](Vectorized a, Vectorized b) { return a * b; }, + [=](scalar_t a, scalar_t b) + __ubsan_ignore_undefined__ -> scalar_t { return a && b; }, + [=](Vectorized a, Vectorized b) + __ubsan_ignore_undefined__ { return a && b; }, // NOLINTNEXTLINE(bugprone-argument-comment) /*identity=*/1); - }); + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "prod_cpu", [&] { + binary_kernel_reduce_vec( + iter, + [=](scalar_t a, scalar_t b) + __ubsan_ignore_undefined__ -> scalar_t { return a * b; }, + [=](Vectorized a, Vectorized b) + __ubsan_ignore_undefined__ { return a * b; }, + // NOLINTNEXTLINE(bugprone-argument-comment) + /*identity=*/1); + }); } } diff --git a/test/test_reductions.py b/test/test_reductions.py index e716336e4afe7..eed7f732051cd 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -10,7 +10,7 @@ from torch._six import inf, nan from torch.testing import ( - integral_types_and, floating_and_complex_types_and) + integral_types_and, floating_and_complex_types_and, get_all_dtypes) from torch.testing._internal.common_utils import ( TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict, IS_WINDOWS, make_tensor) @@ -296,6 +296,115 @@ def test_empty_tensor_nonempty_slice(self, device, op: ReductionOpInfo): result = op(t, *args, dim=dim, **kwargs) self.assertEqual(result.shape, _reduced_shape(t.shape, dim)) + def _test_noncontiguous(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_kwargs): + """Helper method to test noncontiguous input tensors.""" + assert not t.is_contiguous() + + t_contig = t.contiguous() + for args, kwargs in op.generate_args_kwargs(t_contig, **reduction_kwargs): + kwargs.update(reduction_kwargs) + result = op(t, *args, **kwargs) + expected = op(t_contig, *args, **kwargs) + self.assertEqual(result, expected) + + @ops(reduction_ops) + def test_noncontiguous_innermost(self, device, dtype, op: ReductionOpInfo): + """Tests reducing along noncontiguous innermost dimension.""" + t = make_tensor((10, 10), device, dtype) + self._test_noncontiguous(op, t[:, ::2], dim=1) + + @ops(reduction_ops) + def test_noncontiguous_outermost(self, device, dtype, op: ReductionOpInfo): + """Tests reducing along noncontiguous outermost dimension.""" + t = make_tensor((10, 10), device, dtype) + self._test_noncontiguous(op, t[::2, :], dim=0) + + @ops(reduction_ops) + def test_noncontiguous_all(self, device, dtype, op: ReductionOpInfo): + """Tests reducing all dimensions of a noncontiguous tensor.""" + t = make_tensor((5, 5, 5), device, dtype) + self._test_noncontiguous(op, t[::2, ::3, 1:-1:2]) + + @ops(reduction_ops) + def test_noncontiguous_transposed(self, device, dtype, op: ReductionOpInfo): + """Tests reducing a transposed tensor.""" + t = make_tensor((5, 5), device, dtype) + self._test_noncontiguous(op, t.T) + + @ops(reduction_ops) + def test_noncontiguous_expanded(self, device, dtype, op: ReductionOpInfo): + """Tests reducing a tensor with expanded singleton dimensions.""" + t = make_tensor((2, 3), device, dtype) + self._test_noncontiguous(op, t.unsqueeze(1).expand(-1, 5, -1)) + + # NumPy does not support BFloat16 so we don't test that against reference + # implementations. We also don't compare dtypes or test for different + # keepdim because we already have other tests covering those. + # The test_reference_testing in test_ops.py only uses the samples from + # sample_inputs_func which do not test as exhaustively as these tests. + + def _test_ref(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_kwargs): + """Compares op against op.ref for the given input and reduction kwargs""" + for args, kwargs in op.generate_args_kwargs(t, **reduction_kwargs): + kwargs.update(reduction_kwargs) + result = op(t, *args, **kwargs) + expected = op.ref(t.detach().cpu().numpy(), *args, **kwargs) + self.assertEqual(result, expected, exact_dtype=False) + + @ops(filter(lambda op: op.ref is not None, reduction_ops), + allowed_dtypes=get_all_dtypes(include_bfloat16=False)) + def test_ref_scalar_input(self, device, dtype, op: ReductionOpInfo): + """Compares op against reference for scalar input tensors""" + self._test_ref(op, make_tensor([], device, dtype)) + + @ops(filter(lambda op: op.ref is not None, reduction_ops), + allowed_dtypes=get_all_dtypes(include_bfloat16=False)) + def test_ref_small_input(self, device, dtype, op: ReductionOpInfo): + """Compares op against reference for small input tensors""" + t = make_tensor((5, 3, 4, 2), device, dtype, exclude_zero=True) + self._test_ref(op, t) + for dim in [0, 1, 3] + ([[0, 2], [1, 3]] if op.supports_multiple_dims else []): + self._test_ref(op, t, dim=dim) + + @ops(filter(lambda op: op.ref is not None, reduction_ops), + allowed_dtypes=[torch.float32]) + def test_ref_large_input_1D(self, device, dtype, op: ReductionOpInfo): + """Compares op against reference for a large 1D input tensor to check stability""" + self._test_ref(op, make_tensor((2 ** 20,), device, dtype, low=-1, high=2, exclude_zero=True)) + + @ops(filter(lambda op: op.ref is not None, reduction_ops), + allowed_dtypes=[torch.float32]) + def test_ref_large_input_2D(self, device, dtype, op: ReductionOpInfo): + """Compares op against reference for a large 2D input tensor to test parallelism""" + t = make_tensor((32, 2 ** 16), device, dtype, low=-1, high=2, exclude_zero=True) + self._test_ref(op, t, dim=1) + + @ops(filter(lambda op: op.ref is not None, reduction_ops), + allowed_dtypes=[torch.float32]) + def test_ref_large_input_64bit_indexing(self, device, dtype, op: ReductionOpInfo): + """Compares op against reference for a very large input tensor that requires 64 bit indexing""" + self._test_ref(op, make_tensor((275000000,), device, dtype, low=-1, high=2, exclude_zero=True)) + + @ops(filter(lambda op: op.ref is not None, reduction_ops), + allowed_dtypes=get_all_dtypes(include_bfloat16=False)) + def test_ref_duplicate_values(self, device, dtype, op: ReductionOpInfo): + """Compares op against reference for input tensors with duplicate values""" + t = make_tensor((8, 8), device, dtype, exclude_zero=True) + t[::2, ::2] = t[1::2, 1::2] + self._test_ref(op, t) + self._test_ref(op, t, dim=0) + self._test_ref(op, t, dim=1) + + @ops(filter(lambda op: op.ref is not None, reduction_ops), + allowed_dtypes=[torch.float32, torch.complex64]) + def test_ref_extremal_values(self, device, dtype, op: ReductionOpInfo): + """Compares op against reference for input tensors with extremal values""" + t = make_tensor((10,), device, dtype, exclude_zero=True) + extremals = [0, 1] + [nan, inf, -inf] if torch.is_floating_point(t) else [] + for extremal in extremals: + t[5] = extremal + self._test_ref(op, t) + ########################################################################### # TODO: Legacy tests - port to ReductionOpInfo ########################################################################### diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 4331c92d56599..2230808b5fd43 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -187,6 +187,8 @@ def _np(t): return tuple(map(to_numpy, x)) elif isinstance(x, dict): return {k: to_numpy(v) for k, v in x.items()} + elif isinstance(x, torch.dtype): + return torch.empty(0, dtype=x).numpy().dtype elif isinstance(x, (numbers.Number, bool, str)): return x @@ -782,8 +784,8 @@ def _generate_reduction_inputs(device, dtype, requires_grad): """Generates input tensors for testing reduction operators""" yield make_tensor([], device, dtype, requires_grad=requires_grad) yield make_tensor([2], device, dtype, requires_grad=requires_grad) - yield make_tensor([2, 3], device, dtype, requires_grad=requires_grad, noncontiguous=True) - yield make_tensor([3, 2, 1, 5], device, dtype, requires_grad=requires_grad) + yield make_tensor([3, 5], device, dtype, requires_grad=requires_grad, noncontiguous=True) + yield make_tensor([3, 2, 1, 2], device, dtype, requires_grad=requires_grad) def _generate_reduction_kwargs(ndim, supports_multiple_dims=True): @@ -927,6 +929,8 @@ def sample_inputs_func(*args, **kwargs): # Override OpInfo defaults and call base class __init__ kwargs.setdefault('inplace_variant', None) kwargs.setdefault('sample_inputs_func', sample_inputs_func) + kwargs.setdefault('default_test_dtypes', ( + torch.uint8, torch.int64, torch.float16, torch.bfloat16, torch.float32, torch.complex64)) super(ReductionOpInfo, self).__init__(name, **kwargs) self.identity = identity @@ -4080,38 +4084,6 @@ def generator(): return list(generator()) -def sample_inputs_prod(op_info, device, dtype, requires_grad): - def make_arg(shape): - # shrink values to be in the interval [-1, +1] for better precision in gradgradcheck - return make_tensor(shape, device, dtype, low=-1, high=+1, requires_grad=requires_grad) - - def prod_single_zero(): - result = make_arg(2 * (S,)) - with torch.no_grad(): - result[0, 1] = 0 - return result - - # will not be needed once OpInfo tests support Iterables - def sample_generator(): - for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad): - yield SampleInput(sample.input) # only Tensor, ignore other inputs - yield sample - sample.kwargs['keepdim'] = True - yield sample - yield SampleInput(prod_single_zero()) - yield SampleInput(make_arg((3, 3, 3)), args=(1,)) - yield SampleInput(make_arg((3, 3, 3)), args=(1,), kwargs={'keepdim': True}) - - # test zero scalar tensor - zero = make_arg(()) - with torch.no_grad(): - zero.zero_() - yield SampleInput(zero) - yield SampleInput(zero, args=(0,)) - yield SampleInput(zero, args=(0,), kwargs={'keepdim': True}) - - return list(sample_generator()) - def sample_inputs_nextafter(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -5521,6 +5493,53 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): return op(input.triu() if upper else input.tril(), upper) +def reference_reduction_numpy(f, supports_keepdims=True): + """Wraps a NumPy reduction operator. + + The wrapper function will forward dim and keepdim kwargs to the wrapped + function as the NumPy equivalent axis and keepdims kwargs. + + Args: + f: NumPy reduction operator to wrap + supports_keepdims (bool, optional): Whether the NumPy operator accepts + keepdims parameter. If it does not, the wrapper will manually unsqueeze + the reduced dimensions if it was called with keepdim=True. Defaults to True. + + Returns: + Wrapped function + """ + @wraps(f) + def wrapper(x: np.ndarray, *args, **kwargs): + # Copy keys into a set + keys = set(kwargs.keys()) + + dim = kwargs.pop('dim', None) + keepdim = kwargs.pop('keepdim', False) + + if 'dim' in keys: + if x.ndim == 0: + # NumPy reductions don't accept dim=0 for scalar inputs + for i in dim if isinstance(dim, tuple) else (dim,): + assert i in {0, -1} + kwargs['axis'] = None + else: + kwargs['axis'] = tuple(dim) if isinstance(dim, Sequence) else dim + + if 'keepdim' in keys and supports_keepdims: + kwargs['keepdims'] = keepdim + + result = f(x, *args, **kwargs) + + # Unsqueeze reduced dimensions if NumPy does not support keepdims + if keepdim and not supports_keepdims and x.ndim > 0: + dim = list(range(x.ndim)) if dim is None else dim + result = np.expand_dims(result, dim) + + return result + + return wrapper + + # Operator database (sorted alphabetically) op_db: List[OpInfo] = [ UnaryUfuncInfo('abs', @@ -7039,15 +7058,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_out=False, supports_forward_ad=True, sample_inputs_func=sample_inputs_max_min_reduction_no_dim,), - # TODO(@heitorschueroff) Add test for dtype kwarg - OpInfo('mean', - dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16), - assert_autodiffed=True, - supports_forward_ad=True, - sample_inputs_func=sample_inputs_reduction, - # Need to skip out test because one of the overload for mean does not support it - # TODO(@heitorschueroff) fix this when implementing ReductionInfo - skips=(SkipInfo('TestCommon', 'test_out'),)), OpInfo('quantile', dtypes=floating_types(), sample_inputs_func=sample_inputs_reduction_quantile), @@ -8890,6 +8900,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_autograd=False, result_dtype=torch.bool, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + ref=reference_reduction_numpy(np.all), skips=( # FIXME: does not support passing keepdim without dim SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -8897,7 +8908,8 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): SkipInfo('TestReductions', 'test_dim_none'), SkipInfo('TestReductions', 'test_dim_none_keepdim'), # FIXME: uint8 input returns uint8 instead of bool - SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]), + SkipInfo('TestReductions', 'test_result_dtype', + dtypes=[torch.uint8]), ), ), ReductionOpInfo( @@ -8908,6 +8920,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_autograd=False, result_dtype=torch.bool, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + ref=reference_reduction_numpy(np.any), skips=( # FIXME: does not support passing keepdim without dim SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -8915,14 +8928,15 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): SkipInfo('TestReductions', 'test_dim_none'), SkipInfo('TestReductions', 'test_dim_none_keepdim'), # FIXME: uint8 input returns uint8 instead of bool - SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]), + SkipInfo('TestReductions', 'test_result_dtype', + dtypes=[torch.uint8]), ), ), ReductionOpInfo( 'amax', nan_policy='propagate', dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), - ref=lambda a, dim=None, keepdim=False, **kwargs: np.amax(a, axis=dim, keepdims=keepdim, **kwargs), + ref=reference_reduction_numpy(np.amax), skips=( # FIXME: sum reduces all dimensions when dim=[] SkipInfo('TestReductions', 'test_dim_empty'), @@ -8933,7 +8947,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): 'amin', nan_policy='propagate', dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), - ref=lambda a, dim=None, keepdim=False, **kwargs: np.amin(a, axis=dim, keepdims=keepdim, **kwargs), + ref=reference_reduction_numpy(np.amin), skips=( # FIXME: sum reduces all dimensions when dim=[] SkipInfo('TestReductions', 'test_dim_empty'), @@ -8946,6 +8960,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_autograd=False, result_dtype=torch.int64, dtypes=all_types_and(torch.float16, torch.bfloat16), + ref=reference_reduction_numpy(np.argmax, supports_keepdims=False), skips=( # FIXME: keepdim parameter is ignored when dim=None SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -8958,6 +8973,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_autograd=False, result_dtype=torch.int64, dtypes=all_types_and(torch.float16, torch.bfloat16), + ref=reference_reduction_numpy(np.argmin, supports_keepdims=False), skips=( # FIXME: keepdim parameter is ignored when dim=None SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -8972,6 +8988,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): result_dtype=torch.int64, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_reduction_count_nonzero, + ref=reference_reduction_numpy(np.count_nonzero), skips=( # FIXME: count_nonzero does not accept keepdim kwarg SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -8985,6 +9002,35 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): SkipInfo('TestReductions', 'test_dim_empty'), ), ), + ReductionOpInfo( + 'mean', + nan_policy='propagate', + supports_out=False, + supports_forward_ad=True, + assert_autodiffed=True, + promotes_int_to_float=True, + dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16), + ref=reference_reduction_numpy(np.mean), + decorators=( + # FIXME: fix precision + DecorateInfo(toleranceOverride({ + torch.float16: tol(atol=1e-05, rtol=1e-02), + }), 'TestReductions', 'test_noncontiguous_all'), + DecorateInfo(toleranceOverride({ + torch.float16: tol(atol=1e-05, rtol=1e-02), + }), 'TestReductions', 'test_ref_small_input'), + ), + skips=( + # FIXME: prod does not support passing keepdim without passing dim + SkipInfo('TestReductions', 'test_dim_default_keepdim'), + # FIXME: prod reduces all dimensions when dim=[] + SkipInfo('TestReductions', 'test_dim_empty'), + SkipInfo('TestReductions', 'test_dim_empty_keepdim'), + # FIXME: prod does not support passing None to dim + SkipInfo('TestReductions', 'test_dim_none'), + SkipInfo('TestReductions', 'test_dim_none_keepdim'), + ), + ), ReductionOpInfo( 'prod', identity=1, @@ -8995,7 +9041,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, dtypes=all_types_and_complex_and(torch.bool), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), - sample_inputs_func=sample_inputs_prod, + ref=reference_reduction_numpy(np.prod), skips=( # FIXME: prod does not support passing keepdim without passing dim SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -9005,6 +9051,11 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): # FIXME: prod does not support passing None to dim SkipInfo('TestReductions', 'test_dim_none'), SkipInfo('TestReductions', 'test_dim_none_keepdim'), + # FIXME: improve precision, failing with nan != inf + SkipInfo('TestReductions', 'test_ref_small_input', + dtypes=[torch.float16, torch.complex64]), + SkipInfo('TestReductions', 'test_ref_duplicate_values', + dtypes=[torch.uint8, torch.float16, torch.complex64]), ), ), ReductionOpInfo( @@ -9015,6 +9066,22 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_forward_ad=True, promotes_int_to_int64=True, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + ref=reference_reduction_numpy(np.sum), + decorators=( + # FIXME: fix precision + DecorateInfo(toleranceOverride({ + torch.float16: tol(atol=1e-05, rtol=1e-02), + }), 'TestReductions', 'test_noncontiguous_all'), + DecorateInfo(toleranceOverride({ + torch.float16: tol(atol=1e-03, rtol=1e-02), + }), 'TestReductions', 'test_ref_small_input'), + DecorateInfo(toleranceOverride({ + torch.float32: tol(atol=1e-03, rtol=1e-03), + }), 'TestReductions', 'test_ref_large_input_64bit_indexing'), + DecorateInfo(toleranceOverride({ + torch.float16: tol(atol=1e-05, rtol=1e-02), + }), 'TestReductions', 'test_ref_duplicate_values'), + ), skips=( # FIXME: sum does not support passing keepdim without passing dim SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -9033,6 +9100,22 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_out=False, promotes_int_to_int64=True, dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), + ref=reference_reduction_numpy(np.nansum), + decorators=( + # FIXME: fix precision + DecorateInfo(toleranceOverride({ + torch.float16: tol(atol=1e-05, rtol=1e-02), + }), 'TestReductions', 'test_noncontiguous_all'), + DecorateInfo(toleranceOverride({ + torch.float16: tol(atol=1e-03, rtol=1e-02), + }), 'TestReductions', 'test_ref_small_input'), + DecorateInfo(toleranceOverride({ + torch.float32: tol(atol=1e-03, rtol=1e-03), + }), 'TestReductions', 'test_ref_large_input_64bit_indexing'), + DecorateInfo(toleranceOverride({ + torch.float16: tol(atol=1e-05, rtol=1e-02), + }), 'TestReductions', 'test_ref_duplicate_values'), + ), skips=( # FIXME: nansum does not support passing keepdim without passing dim SkipInfo('TestReductions', 'test_dim_default_keepdim'), From 538c30a713a1ee2a3d654c3e1cdf9cc20b7d8c72 Mon Sep 17 00:00:00 2001 From: Pierluigi Taddei Date: Fri, 27 Aug 2021 10:36:08 -0700 Subject: [PATCH 298/530] [caffe2] fixes to allow stricter compilation flag (#64016) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64016 In order to increase the strictness of the compilation for some target depending on caffe2 we need to fix some errors uncovered when rising such flags. This change introduces the required override tokens for virtual destructors Test Plan: CI. Moreover targets depending on caffe2 using clang strict warnings now compile Reviewed By: kalman5 Differential Revision: D30541714 fbshipit-source-id: 564af31b4a9df3536d7d6f43ad29e1d0c7040551 --- aten/src/ATen/CPUGeneratorImpl.h | 2 +- aten/src/ATen/core/builtin_function.h | 2 +- c10/core/StorageImpl.h | 2 +- caffe2/serialize/istream_adapter.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/CPUGeneratorImpl.h b/aten/src/ATen/CPUGeneratorImpl.h index f8b43a04c73c0..69dbb8b8de973 100644 --- a/aten/src/ATen/CPUGeneratorImpl.h +++ b/aten/src/ATen/CPUGeneratorImpl.h @@ -10,7 +10,7 @@ namespace at { struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl { // Constructors CPUGeneratorImpl(uint64_t seed_in = default_rng_seed_val); - ~CPUGeneratorImpl() = default; + ~CPUGeneratorImpl() override = default; // CPUGeneratorImpl methods std::shared_ptr clone() const; diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index de30f9b7e179f..600c16bb6e5d4 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -123,7 +123,7 @@ struct BuiltinOpFunction : public Function { return *this; } - ~BuiltinOpFunction() {} + ~BuiltinOpFunction() override {} private: c10::QualifiedName name_; diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h index ff29b68dc4dad..bea717d7ee50f 100644 --- a/c10/core/StorageImpl.h +++ b/c10/core/StorageImpl.h @@ -68,7 +68,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target { StorageImpl() = delete; StorageImpl(StorageImpl&& other) = default; StorageImpl(const StorageImpl&) = delete; - ~StorageImpl() = default; + ~StorageImpl() override = default; void reset() { data_ptr_.clear(); diff --git a/caffe2/serialize/istream_adapter.h b/caffe2/serialize/istream_adapter.h index 8960d5535c885..680c288a15f2e 100644 --- a/caffe2/serialize/istream_adapter.h +++ b/caffe2/serialize/istream_adapter.h @@ -16,7 +16,7 @@ class TORCH_API IStreamAdapter final : public ReadAdapterInterface { size_t size() const override; size_t read(uint64_t pos, void* buf, size_t n, const char* what = "") const override; - ~IStreamAdapter(); + ~IStreamAdapter() override; private: std::istream* istream_; From 6ab3a210983b7eee417e7cd92a8ad2677065e470 Mon Sep 17 00:00:00 2001 From: BBuf <1182563586@qq.com> Date: Fri, 27 Aug 2021 10:42:24 -0700 Subject: [PATCH 299/530] fix resize bug (#61166) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: I think the original intention here is to only take effect in the case of align_corners (because output_size = 1 and the divisor will be 0), but it affects non-align_corners too. For example: ```python input = torch.tensor( np.arange(1, 5, dtype=np.int32).reshape((1, 1, 2, 2)) ) m = torch.nn.Upsample(scale_factor=0.5, mode="bilinear") of_out = m(input) ``` The result we expect should be [[[[2.5]]]] but pytorch get [[[[1.0]]]] which is different from OpenCV and PIL, this pr try to fixed it。 Pull Request resolved: https://github.com/pytorch/pytorch/pull/61166 Reviewed By: malfet Differential Revision: D30543178 Pulled By: heitorschueroff fbshipit-source-id: 21a4035483981986b0ae4a401ef0efbc565ccaf1 --- aten/src/ATen/native/UpSample.h | 16 ++++++++++------ aten/src/ATen/native/cuda/UpSample.cuh | 15 ++++++++++----- test/test_nn.py | 7 +++++++ 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index e50b053949d37..602abcebbe3a0 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -251,12 +251,16 @@ static inline scalar_t area_pixel_compute_scale( bool align_corners, const c10::optional scale) { // see Note [area_pixel_compute_scale] - if (output_size > 1) { - return align_corners - ? static_cast(input_size - 1) / (output_size - 1) - : compute_scales_value(scale, input_size, output_size); - } else { - return scalar_t(0); + if(align_corners){ + if(output_size > 1) { + return static_cast(input_size - 1) / (output_size - 1); + } + else { + return static_cast(0); + } + } + else{ + return compute_scales_value(scale, input_size, output_size); } } diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh index 71443e19755d5..c69a2597b74bb 100644 --- a/aten/src/ATen/native/cuda/UpSample.cuh +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -94,11 +94,16 @@ __host__ __forceinline__ static accscalar_t area_pixel_compute_scale( int output_size, bool align_corners, const c10::optional scale) { - if (output_size > 1) { - return align_corners ? (accscalar_t)(input_size - 1) / (output_size - 1) - : compute_scales_value(scale, input_size, output_size); - } else { - return static_cast(0); + if(align_corners) { + if(output_size > 1) { + return (accscalar_t)(input_size - 1) / (output_size - 1); + } + else { + return static_cast(0); + } + } + else{ + return compute_scales_value(scale, input_size, output_size); } } diff --git a/test/test_nn.py b/test/test_nn.py index c6fe0b28b30ac..4e01c94d4c971 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -10475,6 +10475,13 @@ def test_upsamplingTrilinear3d_spatial_invariance(self): out_t_5 = m(in_t_9[:, :, :5, :5, :5]) self.assertEqual(out_t_9[:, :, :15, :15, :15], out_t_5) + def test_upsampling_small_scale(self): + m = torch.nn.Upsample(scale_factor=0.5, mode="bilinear") + in_t = torch.arange(1, 5, dtype=torch.float64).reshape(1, 1, 2, 2) + out_t = m(in_t) + expected_out_t = torch.tensor([[[[2.5]]]]) + self.assertEqual(expected_out_t, out_t) + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_interpolate_illegal_memory_access(self): in_s = 45 From 9f1f22b9bc04318789b9e79c237f93eecbdc1bfd Mon Sep 17 00:00:00 2001 From: Don Jang Date: Fri, 27 Aug 2021 10:42:50 -0700 Subject: [PATCH 300/530] [Static Runtime] Add out variant of quantized::embedding_bag_byte_prepack (#64081) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64081 This change add an out variant of `quantized::embedding_bag_byte_prepack`. Test Plan: - Added `ShapeInferenceTest.QEmbeddingBagByteUnpack`. - Observed ``` V0824 13:38:49.723708 1322143 impl.cpp:1394] Switch to out variant for node: %2 : Tensor = quantized::embedding_bag_byte_prepack(%input) ``` Reviewed By: hlu1 Differential Revision: D30504216 fbshipit-source-id: 1d9d428e77a15bcc7da373d65e7ffabaf9c6caf2 --- .../quantized/cpu/qembeddingbag_prepack.cpp | 36 ++++++++++++------- .../quantized/cpu/qembeddingbag_prepack.h | 11 ++++++ benchmarks/static_runtime/test_scripts.h | 8 +++++ .../static_runtime/test_static_runtime.cc | 8 +++++ torch/csrc/jit/runtime/static/ops.cpp | 23 ++++++++++++ 5 files changed, 73 insertions(+), 13 deletions(-) create mode 100644 aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.h diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index 5d9abce940f58..614e274b5493d 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -122,7 +124,6 @@ c10::intrusive_ptr PackedEmbeddingBagWeight::prepack( namespace at { namespace native { -namespace { // Note - This is a temporary pack function for embedding bag which quantizes // and packs the float weight tensor. In the next step it will be replaced by a @@ -184,7 +185,7 @@ namespace { // // [[50. , 60.00000035], // [70. , 80.00000035]]]) -Tensor qembeddingbag_byte_prepack(const Tensor& weight) { +Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) { // The "last" dimension of an N-Dimensioned batch of embedding bags is // quantization channel. E.g. for a 2D embedding bag, this has // [ row, col ] dimensions, for batched of embedding bags, dimensions might be @@ -208,17 +209,12 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { const int32_t embedding_cols = weight_sizes[cols_dim]; // Add 8 bytes per column to store FP32 scale and zero_point per row. const int32_t output_columns = embedding_cols + 2 * sizeof(float); - Tensor weight_contig = weight.contiguous(weight.suggest_memory_format()); + const auto weight_contig = weight.expect_contiguous(weight.suggest_memory_format()); // Adjust output dimensions to account for FP32 scale and zero_points. std::vector output_shape = weight_sizes.vec(); output_shape[cols_dim] = output_columns; - - // Allocate output packed weights - auto output = at::empty( - output_shape, - weight_contig.options().dtype(at::kByte), - weight_contig.suggest_memory_format()); + at::native::resize_(output, output_shape, c10::nullopt); auto* output_data = output.data_ptr(); #ifdef USE_FBGEMM @@ -246,10 +242,9 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { } #else - const auto float_weight = weight_contig.scalar_type() == at::ScalarType::Half - ? weight_contig.to(at::ScalarType::Float) - : weight_contig; - const auto weight_data = float_weight.data_ptr(); + const auto weight_data = weight_contig->scalar_type() == at::ScalarType::Half + ? weight_contig->to(at::ScalarType::Float).data_ptr() + : weight_contig->data_ptr(); constexpr float kEpsilon = 1e-8f; for (auto row: c10::irange(embedding_rows)) { const float* input_row = weight_data + row * embedding_cols; @@ -276,6 +271,21 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { return output; } +Tensor qembeddingbag_byte_prepack(const Tensor& weight) { + const auto weight_contig = weight.expect_contiguous(weight.suggest_memory_format()); + auto output = at::detail::empty_cpu( + {0}, + at::kByte, + weight_contig->layout(), + weight_contig->device(), + c10::nullopt, + c10::nullopt); + qembeddingbag_byte_prepack_out(output, weight); + return output; +} + +namespace { + // TODO: Extend support to N-D batched embeddings, similar to qembeddingbag_byte_prepack Tensor _qembeddingbag_nbit_prepack_helper( const Tensor& weight, diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.h b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.h new file mode 100644 index 0000000000000..c52cbae4f2c80 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.h @@ -0,0 +1,11 @@ +#include + +namespace at { +namespace native { + +Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight); + +Tensor qembeddingbag_byte_prepack(const Tensor& weight); + +} // namespace native +} // namespace at diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 477b191b24156..bcc975b79cf25 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -772,3 +772,11 @@ const auto fmod_scalar = R"JIT( def forward(self, a: Tensor, b: int): return torch.fmod(a, b).clone() )JIT"; + +const std::string embedding_bag_byte_prepack_script = R"IR( + graph(%input: Tensor): + %none : None = prim::Constant() + %output: Tensor = quantized::embedding_bag_byte_prepack(%input) + %res: Tensor = aten::clone(%output, %none) + return (%res) +)IR"; diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index bd213c78dac1a..1e987a9fab58e 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1257,3 +1257,11 @@ TEST(StaticRuntime, IndividualOps_FmodScalar) { std::vector args3{c, 4}; testStaticRuntime(fmod_scalar, args2, args3); } + +TEST(StaticRuntime, QEmbeddingBagByteUnpack) { + auto a = torch::randn({8, 16}, at::ScalarType::Float); + auto b = torch::randn({8*2, 16*2}, at::ScalarType::Float); + + testStaticRuntime(embedding_bag_byte_prepack_script, {a}); + testStaticRuntime(embedding_bag_byte_prepack_script, {a},{b}); +} diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 36f796fb2f256..f171d2889f551 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -761,6 +762,7 @@ REGISTER_OPERATOR_FUNCTOR( include_last_offset); }; }); + REGISTER_OPERATOR_FUNCTOR( quantized::embedding_bag_4bit_rowwise_offsets, embedding_bag_4bit_rowwise_offsets, @@ -799,6 +801,27 @@ REGISTER_OPERATOR_FUNCTOR( }; }); +REGISTER_OPERATOR_FUNCTOR( + quantized::embedding_bag_byte_prepack, + embedding_bag_byte_prepack, + [](Node* n) -> SROperator { + if (!n->matches(torch::schema( + "quantized::embedding_bag_byte_prepack(Tensor weight) -> Tensor"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto& weight = p_node->Input(0).toTensor(); + if (p_node->Output(0).isNone()) { + p_node->Output(0) = at::native::qembeddingbag_byte_prepack(weight); + return; + } + auto& out_t = p_node->Output(0).toTensor(); + fastResizeToZero(out_t); + at::native::qembeddingbag_byte_prepack_out(out_t, weight); + }; + }); + // The out variant takes precedence over native REGISTER_OPERATOR_FUNCTOR(aten::narrow_copy, aten_narrow_copy, [](Node* n) -> SROperator { if (!n->matches(torch::schema( From 19c1b45f25af32fc6f6d6da315f0055ab7e30222 Mon Sep 17 00:00:00 2001 From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" Date: Fri, 27 Aug 2021 11:18:52 -0700 Subject: [PATCH 301/530] Detect out argument in the schema (#62755) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62755 After this change, out argument can be checked by calling is_out() Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D30415256 Pulled By: tugsbayasgalan fbshipit-source-id: b2e1fa46bab7c813aaede1f44149081ef2df566d --- aten/src/ATen/core/function_schema.h | 10 ++++++++ aten/src/ATen/core/function_schema_inl.h | 32 ++++++++++++++++++++---- test/cpp/jit/test_misc.cpp | 22 ++++++++++++++++ test/test_function_schema.py | 21 ++++++++++++++++ 4 files changed, 80 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index 68e177a225d76..a7b514990185b 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -34,6 +34,9 @@ struct Argument { default_value_(std::move(default_value)), kwarg_only_(kwarg_only), alias_info_(std::move(alias_info)) { + // this is an softly-enforced invariant for out arguments. + bool is_alias = alias_info_.has_value() && alias_info_.value().isWrite(); + is_out_ = kwarg_only_ && is_alias; } const std::string& name() const { return name_; @@ -50,6 +53,11 @@ struct Argument { bool kwarg_only() const { return kwarg_only_; } + + bool is_out() const { + return is_out_; + } + const c10::optional& alias_info() const { return alias_info_; } @@ -116,6 +124,8 @@ struct Argument { // is this only specifiable as a keyword argument? bool kwarg_only_; c10::optional alias_info_; + // marks if the argument is out variant of the schema + bool is_out_; }; inline bool operator==(const Argument& lhs, const Argument& rhs) { diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index 168ecb4f3dc17..6e26e8c14cdab 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -51,6 +51,16 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) return out; } +inline size_t findFirstOutArg(const std::vector& args) { + // find the start of out args in the schema + for (size_t out_start_idx = 0; out_start_idx < args.size(); out_start_idx++) { + if (args.at(out_start_idx).is_out()) { + return out_start_idx; + } + } + return args.size(); +} + inline bool Argument::isBackwardCompatibleWith( const Argument& old, std::ostream* why_not) const { @@ -121,17 +131,20 @@ inline bool FunctionSchema::isBackwardCompatibleWith( } } - // Make sure that all the old arguments have their corresponding backward - // compatible arguments in this schema. - for (size_t i = 0; i < old.arguments().size(); ++i) { + // we want to test both out and default args seperately + size_t old_out_start_idx = findFirstOutArg(old.arguments()); + size_t new_out_start_idx = findFirstOutArg(arguments()); + + // make sure among the default args, they are backward compatible + for (size_t i = 0; i < old_out_start_idx; i++) { if (!arguments().at(i).isBackwardCompatibleWith( old.arguments().at(i), why_not)) { return false; } } - // Validate that all new arguments provided a default value. - for (size_t i = old.arguments().size(); i < arguments().size(); ++i) { + // // Validate that all new arguments provided has a default value + for (size_t i = old_out_start_idx; i < new_out_start_idx; ++i) { if (!arguments().at(i).default_value()) { if (why_not) { *why_not @@ -144,6 +157,15 @@ inline bool FunctionSchema::isBackwardCompatibleWith( } } + // now compare the out args + for (size_t i = old_out_start_idx; i < old.arguments().size(); i++) { + if (!arguments() + .at(i - old_out_start_idx + new_out_start_idx) + .isBackwardCompatibleWith(old.arguments().at(i), why_not)) { + return false; + } + } + return true; } diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 82f70fee1dd20..9f8a732f550f4 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -520,6 +520,28 @@ TEST(SchemaParserTest, NestedArrays) { .getElementType())); } +TEST(SchemaParserTest, OutVariant) { + auto schema_with_out = parseSchema( + "at::foo(Tensor self, *, Tensor(a!) f, Tensor(b!) l) -> (Tensor(a!) f, Tensor(b!) l)"); + ASSERT_TRUE(schema_with_out.arguments().at(1).is_out()); + ASSERT_TRUE(schema_with_out.arguments().at(2).is_out()); + + auto schema_without_out = + parseSchema("at::foo(Tensor self, *, int scalar) -> (int)"); + + for (const auto& arg : schema_without_out.arguments()) { + ASSERT_TRUE(!arg.is_out()); + } + + auto schema_with_is_write = parseSchema( + "aten::ne_.Scalar(Tensor(a!) self, Scalar other) -> (Tensor(a!))"); + + for (const auto& arg : schema_with_is_write.arguments()) { + ASSERT_TRUE(!arg.is_out()); + } +} + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) TEST(SchemaParserTest, NamedReturns) { // named returns parseSchema("at::what(Tensor! i_will_be_written_to) -> ()"); diff --git a/test/test_function_schema.py b/test/test_function_schema.py index 0451debebd196..7c7a0f77cb922 100644 --- a/test/test_function_schema.py +++ b/test/test_function_schema.py @@ -86,6 +86,27 @@ def test_backward_compatible_arguments(self): new_schema = parse_schema('any(Tensor self, *, Tensor b, int[] c) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) + def test_backward_compatible_with_smart_serialization(self): + # cases where out arg is provided + old_schema = parse_schema('foo(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)') + new_schema_same_out = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)') + new_schema_wrong_default = parse_schema('foo(Tensor self, *, int b=1, int a, Tensor(a!) out) -> Tensor(a!)') + new_schema_more_out = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)') + new_schema_wrong_pos = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(b!) b, Tensor(a!) out) -> Tensor(a!)') + self.assertTrue(new_schema_same_out.is_backward_compatible_with(old_schema)) + self.assertTrue(new_schema_more_out.is_backward_compatible_with(old_schema)) + self.assertFalse(new_schema_wrong_default.is_backward_compatible_with(old_schema)) + self.assertFalse(new_schema_wrong_pos.is_backward_compatible_with(old_schema)) + + # cases where out arg is not provided + old_schema_without_arg = parse_schema('foo(Tensor self, int a, int b=1) -> int') + new_schema_without_arg = parse_schema('foo(Tensor self, int a, int b=1, int c=2) -> int') + new_schema_without_arg_multiple_default = parse_schema('foo(Tensor self, int a, int b=1, int c=2, int d=3) -> int') + new_schema_without_arg_wrong_pos = parse_schema('foo(Tensor self, int a, int c=2, int b=1) -> int') + self.assertTrue(new_schema_without_arg.is_backward_compatible_with(old_schema_without_arg)) + self.assertTrue(new_schema_without_arg_multiple_default.is_backward_compatible_with(old_schema_without_arg)) + self.assertFalse(new_schema_without_arg_wrong_pos.is_backward_compatible_with(old_schema_without_arg)) + def test_string_optional_parameter_default_value(self): schema_a = parse_schema("example::op(str? order=\"NCHW\") -> (Tensor)") schema_b = parse_schema(str(schema_a)) From 196fd3ee7ae7d053803e4f03a37ddd00ee3fd433 Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Fri, 27 Aug 2021 11:28:03 -0700 Subject: [PATCH 302/530] Modules note v2 (#63963) Summary: This PR expands the [note on modules](https://pytorch.org/docs/stable/notes/modules.html) with additional info for 1.10. It adds the following: * Examples of using hooks * Examples of using apply() * Examples for ParameterList / ParameterDict * register_parameter() / register_buffer() usage * Discussion of train() / eval() modes * Distributed training overview / links * TorchScript overview / links * Quantization overview / links * FX overview / links * Parametrization overview / link to tutorial Pull Request resolved: https://github.com/pytorch/pytorch/pull/63963 Reviewed By: albanD Differential Revision: D30606604 Pulled By: jbschlosser fbshipit-source-id: c1030b19162bcb5fe7364bcdc981a2eb6d6e89b4 --- docs/source/notes/modules.rst | 316 ++++++++++++++++++++++++++++++++-- 1 file changed, 303 insertions(+), 13 deletions(-) diff --git a/docs/source/notes/modules.rst b/docs/source/notes/modules.rst index 4eba02231b1ac..c1d978dc78115 100644 --- a/docs/source/notes/modules.rst +++ b/docs/source/notes/modules.rst @@ -117,7 +117,7 @@ multiple modules: Note that :class:`~torch.nn.Sequential` automatically feeds the output of the first ``MyLinear`` module as input into the :class:`~torch.nn.ReLU`, and the output of that as input into the second ``MyLinear`` module. As -shown, it is limited to in-order chaining of modules. +shown, it is limited to in-order chaining of modules with a single input and output. In general, it is recommended to define a custom module for anything beyond the simplest use cases, as this gives full flexibility on how submodules are used for a module's computation. @@ -258,16 +258,32 @@ It's also easy to move all parameters to a different device or change their prec dynamic_net(torch.randn(5, device='cuda', dtype=torch.float64)) : tensor([6.5166], device='cuda:0', dtype=torch.float64, grad_fn=) -These examples show how elaborate neural networks can be formed through module composition. To allow for -quick and easy construction of neural networks with minimal boilerplate, PyTorch provides a large library of -performant modules within the :mod:`torch.nn` namespace that perform computation commonly found within neural -networks, including pooling, convolutions, loss functions, etc. +More generally, an arbitrary function can be applied to a module and its submodules recursively by +using the :func:`~torch.nn.Module.apply` function. For example, to apply custom initialization to parameters +of a module and its submodules: + +.. code-block:: python + + # Define a function to initialize Linear weights. + # Note that no_grad() is used here to avoid tracking this computation in the autograd graph. + @torch.no_grad() + def init_weights(m): + if isinstance(m, nn.Linear): + nn.init.xavier_normal_(m.weight) + m.bias.fill_(0.0) + + # Apply the function recursively on the module and its submodules. + dynamic_net.apply(init_weights) + +These examples show how elaborate neural networks can be formed through module composition and conveniently +manipulated. To allow for quick and easy construction of neural networks with minimal boilerplate, PyTorch +provides a large library of performant modules within the :mod:`torch.nn` namespace that perform common neural +network operations like pooling, convolutions, loss functions, etc. In the next section, we give a full example of training a neural network. For more information, check out: -* Recursively :func:`~torch.nn.Module.apply` a function to a module and its submodules * Library of PyTorch-provided modules: `torch.nn `_ * Defining neural net modules: https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_module.html @@ -295,6 +311,12 @@ Optimizers from :mod:`torch.optim`: loss.backward() optimizer.step() + # After training, switch the module to eval mode to do inference, compute performance metrics, etc. + # (see discussion below for a description of training and evaluation modes) + ... + net.eval() + ... + In this simplified example, the network learns to simply output zero, as any non-zero output is "penalized" according to its absolute value by employing :func:`torch.abs` as a loss function. While this is not a very interesting task, the key parts of training are present: @@ -321,6 +343,38 @@ value of ``l1``\ 's ``weight`` parameter shows that its values are now much clos [ 0.0030], [-0.0008]], requires_grad=True) +Note that the above process is done entirely while the network module is in "training mode". Modules default to +training mode and can be switched between training and evaluation modes using :func:`~torch.nn.Module.train` and +:func:`~torch.nn.Module.eval`. They can behave differently depending on which mode they are in. For example, the +:class:`~torch.nn.BatchNorm` module maintains a running mean and variance during training that are not updated +when the module is in evaluation mode. In general, modules should be in training mode during training +and only switched to evaluation mode for inference or evaluation. Below is an example of a custom module +that behaves differently between the two modes: + +.. code-block:: python + + class ModalModule(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + if self.training: + # Add a constant only in training mode. + return x + 1. + else: + return x + + + m = ModalModule() + x = torch.randn(4) + + print('training mode output: {}'.format(m(x))) + : tensor([1.6614, 1.2669, 1.0617, 1.6213, 0.5481]) + + m.eval() + print('evaluation mode output: {}'.format(m(x))) + : tensor([ 0.6614, 0.2669, 0.0617, 0.6213, -0.4519]) + Training neural networks can often be tricky. For more information, check out: * Using Optimizers: https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_optim.html. @@ -409,12 +463,127 @@ Both persistent and non-persistent buffers are affected by model-wide device / d Buffers of a module can be iterated over using :func:`~torch.nn.Module.buffers` or :func:`~torch.nn.Module.named_buffers`. +.. code-block:: python + + for buffer in m.named_buffers(): + print(buffer) + +The following class demonstrates the various ways of registering parameters and buffers within a module: + +.. code-block:: python + + class StatefulModule(nn.Module): + def __init__(self): + super().__init__() + # Setting a nn.Parameter as an attribute of the module automatically registers the tensor + # as a parameter of the module. + self.param1 = nn.Parameter(torch.randn(2)) + + # Alternative string-based way to register a parameter. + self.register_parameter('param2', nn.Parameter(torch.randn(3))) + + # Reserves the "param3" attribute as a parameter, preventing it from being set to anything + # except a parameter. "None" entries like this will not be present in the module's state_dict. + self.register_parameter('param3', None) + + # Registers a list of parameters. + self.param_list = nn.ParameterList([nn.Parameter(torch.randn(2)) for i in range(3)]) + + # Registers a dictionary of parameters. + self.param_dict = nn.ParameterDict({ + 'foo': nn.Parameter(torch.randn(3)), + 'bar': nn.Parameter(torch.randn(4)) + }) + + # Registers a persistent buffer (one that appears in the module's state_dict). + self.register_buffer('buffer1', torch.randn(4), persistent=True) + + # Registers a non-persistent buffer (one that does not appear in the module's state_dict). + self.register_buffer('buffer2', torch.randn(5), persistent=False) + + # Reserves the "buffer3" attribute as a buffer, preventing it from being set to anything + # except a buffer. "None" entries like this will not be present in the module's state_dict. + self.register_buffer('buffer3', None) + + # Adding a submodule registers its parameters as parameters of the module. + self.linear = nn.Linear(2, 3) + + m = StatefulModule() + + # Save and load state_dict. + torch.save(m.state_dict(), 'state.pt') + m_loaded = StatefulModule() + m_loaded.load_state_dict(torch.load('state.pt')) + + # Note that non-persistent buffer "buffer2" and reserved attributes "param3" and "buffer3" do + # not appear in the state_dict. + print(m_loaded.state_dict()) + : OrderedDict([('param1', tensor([-0.0322, 0.9066])), + ('param2', tensor([-0.4472, 0.1409, 0.4852])), + ('buffer1', tensor([ 0.6949, -0.1944, 1.2911, -2.1044])), + ('param_list.0', tensor([ 0.4202, -0.1953])), + ('param_list.1', tensor([ 1.5299, -0.8747])), + ('param_list.2', tensor([-1.6289, 1.4898])), + ('param_dict.bar', tensor([-0.6434, 1.5187, 0.0346, -0.4077])), + ('param_dict.foo', tensor([-0.0845, -1.4324, 0.7022])), + ('linear.weight', tensor([[-0.3915, -0.6176], + [ 0.6062, -0.5992], + [ 0.4452, -0.2843]])), + ('linear.bias', tensor([-0.3710, -0.0795, -0.3947]))]) + For more information, check out: * Saving and loading: https://pytorch.org/tutorials/beginner/saving_loading_models.html * Serialization semantics: https://pytorch.org/docs/master/notes/serialization.html * What is a state dict? https://pytorch.org/tutorials/recipes/recipes/what_is_state_dict.html +Module Initialization +--------------------- + +By default, parameters and floating-point buffers for modules provided by :mod:`torch.nn` are initialized during +module instantiation as 32-bit floating point values on the CPU using an initialization scheme determined to +perform well historically for the module type. For certain use cases, it may be desired to initialize with a different +dtype, device (e.g. GPU), or initialization technique. + +Examples: + +.. code-block:: python + + # Initialize module directly onto GPU. + m = nn.Linear(5, 3, device='cuda') + + # Initialize module with 16-bit floating point parameters. + m = nn.Linear(5, 3, dtype=torch.half) + + # Skip default parameter initialization and perform custom (e.g. orthogonal) initialization. + m = torch.nn.utils.skip_init(nn.Linear, 5, 3) + nn.init.orthogonal_(m.weight) + +Note that the device and dtype options demonstrated above also apply to any floating-point buffers registered +for the module: + +.. code-block:: python + + m = nn.BatchNorm2d(3, dtype=torch.half) + print(m.running_mean) + : tensor([0., 0., 0.], dtype=torch.float16) + +While module writers can use any device or dtype to initialize parameters in their custom modules, good practice is +to use ``dtype=torch.float`` and ``device='cpu'`` by default as well. Optionally, you can provide full flexibility +in these areas for your custom module by conforming to the convention demonstrated above that all +:mod:`torch.nn` modules follow: + +* Provide a ``device`` constructor kwarg that applies to any parameters / buffers registered by the module. +* Provide a ``dtype`` constructor kwarg that applies to any parameters / floating-point buffers registered by + the module. +* Only use initialization functions (i.e. functions from :mod:`torch.nn.init`) on parameters and buffers within the + module's constructor. Note that this is only required to use :func:`~torch.nn.utils.skip_init`; see + `this page `_ for an explanation. + +For more information, check out: + +* Skipping module parameter initialization: https://pytorch.org/tutorials/prototype/skip_param_init.html + Module Hooks ------------ @@ -443,16 +612,137 @@ All hooks allow the user to return an updated value that will be used throughout Thus, these hooks can be used to either execute arbitrary code along the regular module forward/backward or modify some inputs/outputs without having to change the module's ``forward()`` function. +Below is an example demonstrating usage of forward and backward hooks: + +.. code-block:: python + + torch.manual_seed(1) + + def forward_pre_hook(m, inputs): + # Allows for examination and modification of the input before the forward pass. + # Note that inputs are always wrapped in a tuple. + input = inputs[0] + return input + 1. + + def forward_hook(m, inputs, output): + # Allows for examination of inputs / outputs and modification of the outputs + # after the forward pass. Note that inputs are always wrapped in a tuple while outputs + # are passed as-is. + + # Residual computation a la ResNet. + return output + inputs[0] + + def backward_hook(m, grad_inputs, grad_outputs): + # Allows for examination of grad_inputs / grad_outputs and modification of + # grad_inputs used in the rest of the backwards pass. Note that grad_inputs and + # grad_outputs are always wrapped in tuples. + new_grad_inputs = [torch.ones_like(gi) * 42. for gi in grad_inputs] + return new_grad_inputs + + # Create sample module & input. + m = nn.Linear(3, 3) + x = torch.randn(2, 3, requires_grad=True) + + # ==== Demonstrate forward hooks. ==== + # Run input through module before and after adding hooks. + print('output with no forward hooks: {}'.format(m(x))) + : output with no forward hooks: tensor([[-0.5059, -0.8158, 0.2390], + [-0.0043, 0.4724, -0.1714]], grad_fn=) + + # Note that the modified input results in a different output. + forward_pre_hook_handle = m.register_forward_pre_hook(forward_pre_hook) + print('output with forward pre hook: {}'.format(m(x))) + : output with forward pre hook: tensor([[-0.5752, -0.7421, 0.4942], + [-0.0736, 0.5461, 0.0838]], grad_fn=) + + # Note the modified output. + forward_hook_handle = m.register_forward_hook(forward_hook) + print('output with both forward hooks: {}'.format(m(x))) + : output with both forward hooks: tensor([[-1.0980, 0.6396, 0.4666], + [ 0.3634, 0.6538, 1.0256]], grad_fn=) + + # Remove hooks; note that the output here matches the output before adding hooks. + forward_pre_hook_handle.remove() + forward_hook_handle.remove() + print('output after removing forward hooks: {}'.format(m(x))) + : output after removing forward hooks: tensor([[-0.5059, -0.8158, 0.2390], + [-0.0043, 0.4724, -0.1714]], grad_fn=) + + # ==== Demonstrate backward hooks. ==== + m(x).sum().backward() + print('x.grad with no backwards hook: {}'.format(x.grad)) + : x.grad with no backwards hook: tensor([[ 0.4497, -0.5046, 0.3146], + [ 0.4497, -0.5046, 0.3146]]) + + # Clear gradients before running backward pass again. + m.zero_grad() + x.grad.zero_() + + m.register_full_backward_hook(backward_hook) + m(x).sum().backward() + print('x.grad with backwards hook: {}'.format(x.grad)) + : x.grad with backwards hook: tensor([[42., 42., 42.], + [42., 42., 42.]]) + Advanced Features ----------------- PyTorch also provides several more advanced features that are designed to work with modules. All these functionalities -are "inherited" when writing a new module. In-depth discussion of these features can be found in the links below. +are available for custom-written modules, with the small caveat that certain features may require modules to conform +to particular constraints in order to be supported. In-depth discussion of these features and the corresponding +requirements can be found in the links below. -For more information, check out: +Distributed Training +******************** + +Various methods for distributed training exist within PyTorch, both for scaling up training using multiple GPUs +as well as training across multiple machines. Check out the +`distributed training overview page `_ for +detailed information on how to utilize these. + +Profiling Performance +********************* + +The `PyTorch Profiler `_ can be useful for identifying +performance bottlenecks within your models. It measures and outputs performance characteristics for +both memory usage and time spent. + +Improving Performance with Quantization +*************************************** + +Applying quantization techniques to modules can improve performance and memory usage by utilizing lower +bitwidths than floating-point precision. Check out the various PyTorch-provided mechanisms for quantization +`here `_. + +Improving Memory Usage with Pruning +*********************************** + +Large deep learning models are often over-parametrized, resulting in high memory usage. To combat this, PyTorch +provides mechanisms for model pruning, which can help reduce memory usage while maintaining task accuracy. The +`Pruning tutorial `_ describes how to utilize +the pruning techniques PyTorch provides or define custom pruning techniques as necessary. + +Deploying with TorchScript +************************** + +When deploying a model for use in production, the overhead of Python can be unacceptable due to its poor +performance characteristics. For cases like this, +`TorchScript `_ provides a way to load +and run an optimized model program from outside of Python, such as within a C++ program. + +Parametrizations +**************** + +For certain applications, it can be beneficial to constrain the parameter space during model training. For example, +enforcing orthogonality of the learned parameters can improve convergence for RNNs. PyTorch provides a mechanism for +applying `parametrizations `_ such as this, and +further allows for custom constraints to be defined. + +Transforming Modules with FX +**************************** -* Profiling: https://pytorch.org/tutorials/beginner/profiler.html -* Pruning: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html -* Quantization: https://pytorch.org/tutorials/recipes/quantization.html -* Exporting modules to TorchScript (e.g. for usage from C++): - https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html +The `FX `_ component of PyTorch provides a flexible way to transform +modules by operating directly on module computation graphs. This can be used to programmatically generate or +manipulate modules for a broad array of use cases. To explore FX, check out these examples of using FX for +`convolution + batch norm fusion `_ and +`CPU performance analysis `_. From 101a6263309ae2f9e52947c7d02d630e1190b6c3 Mon Sep 17 00:00:00 2001 From: mrshenli Date: Fri, 27 Aug 2021 11:28:31 -0700 Subject: [PATCH 303/530] Improve `distributed.get_rank()` API docstring (#63296) Summary: See discussion in https://pytorch.slack.com/archives/CBHSWPNM7/p1628792389008600 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63296 Reviewed By: cbalioglu Differential Revision: D30332042 Pulled By: mrshenli fbshipit-source-id: 3a642fda2e106fd35b67709ed2adb60e408854c2 --- torch/distributed/distributed_c10d.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index fac096e339e71..302114e1c7bb6 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -800,7 +800,8 @@ def destroy_process_group(group=None): def get_rank(group=None): """ - Returns the rank of current process group + Returns the rank of the current process in the provided ``group`` or the + default group if none was provided. Rank is a unique identifier assigned to each process within a distributed process group. They are always consecutive integers ranging from 0 to From 6257f5b168782e026cc19788e171f06a8d962afb Mon Sep 17 00:00:00 2001 From: Karen Zhou Date: Fri, 27 Aug 2021 11:51:09 -0700 Subject: [PATCH 304/530] [pruner] add README to repo (#64099) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64099 adding readme to pruner in OSS ghstack-source-id: 136867516 Test Plan: should not affect behavior Reviewed By: z-a-f Differential Revision: D30608045 fbshipit-source-id: 3e9899a853395b2e91e8a69a5d2ca5f3c2acc646 --- .../ao/sparsity/experimental/pruner/README.md | 93 ++++++++++++++++++ .../experimental/pruner/images/prune_1.png | Bin 0 -> 170728 bytes .../experimental/pruner/images/prune_2.png | Bin 0 -> 100490 bytes .../experimental/pruner/images/prune_3.png | Bin 0 -> 87843 bytes .../experimental/pruner/images/prune_4.png | Bin 0 -> 132748 bytes 5 files changed, 93 insertions(+) create mode 100644 torch/ao/sparsity/experimental/pruner/README.md create mode 100644 torch/ao/sparsity/experimental/pruner/images/prune_1.png create mode 100644 torch/ao/sparsity/experimental/pruner/images/prune_2.png create mode 100644 torch/ao/sparsity/experimental/pruner/images/prune_3.png create mode 100644 torch/ao/sparsity/experimental/pruner/images/prune_4.png diff --git a/torch/ao/sparsity/experimental/pruner/README.md b/torch/ao/sparsity/experimental/pruner/README.md new file mode 100644 index 0000000000000..da0afb0bf3fb1 --- /dev/null +++ b/torch/ao/sparsity/experimental/pruner/README.md @@ -0,0 +1,93 @@ +# Intro + +The Base Pruner inherits from the Base Sparsifier. + + +# Motivation + +Sparsifying weights allows us to skip some of the multiplications during the dot product (i.e. in the Linear layers), which ultimately translates into faster inference. With structured pruning, whole rows/columns of a tensor would be zeroed-out. This translates into model transformation (not just tensor transformation). Logically, the process of structured pruning is similar to removing some of the input/output channels in the layer completely. + +![prune logic](./images/prune_1.png) + + +# Design Choices + + +## Eager Mode + +**PruningParametrization:** After pruning, the shape of the weight changes (some of the output channels are pruned). That means the output of the current layer will have less output layers compared to the original. This means that the next layer should have less input channels. + +Consider an example below: + +![prune example](./images/prune_2.png) + +The dot product of the masked matrix A (weight) and matrix B (activation) produces the zeros at the sparse locations. However, if we remove the zeros, as in the example shown earlier, the result will change: + +![prune result](./images/prune_3.png) + +The resulting matrix is of different shape (2x2 vs. 4x2). + +**Forward Hook - ActivationReconstruction **(aka re-inserting zeros): To reconstruct the activation with the original shape, we will undo the sparsification before pushing that activation to the next layer. We do this with a forward hook -- forward hooks are functions that are called on the activation after the computation is complete. + +![prune reconstruction](./images/prune_4.png) + +**Forward Hook - Bias**: + +If the layer has a bias, it must be added to the activation AFTER zeros have been re-inserted, i.e. after the `ActivationReconstruction` forward hook. + +The pruner prunes the entire channel by default (weight & corresponding bias), so indices of the bias corresponding to pruned indices will be zeroed out. + + + +# Eager Mode APIs & Code Snippets + +Supported modules: nn.Linear, nn.Conv2d, nn.BatchNorm2d* + +* when provided in `config` with corresponding Conv2d layer + +`BasePruner`: base class with abstract method `update_mask` that computes the new pruner mask for all modules (see Write Your Own Pruner). The base pruner prunes the entire channel by default (weight & corresponding bias); if you don’t want the bias to be pruned, then set `also_prune_bias` to be False. + +`prepare`: registers the pruning parametrization (called `PruningParametrization`) to each module layer of the model; also adds forward hooks for bias support and re-inserting zeros to the output so the next layer received the correct size input. + +Note: for BatchNorm2d layers, the parametrization `ZeroesParametrization` is attached instead since its weight is 1d, so removing channels would affect the input dimension as well. `ZeroesParametrization` zeroes out channels rather than removing them like `PruningParametrization`. We need this when `also_prune_bias=True`, so BatchNorm2d channels get pruned with their corresponding Conv2d channels. + + +``` +pruner = ImplementedPruner(defaults=None, also_prune_bias=True) +pruner.prepare(model, config) +``` + + +`step`: applies `update_mask` logic (i.e. prunes the weight matrix) + + +``` +pruner.step() +``` + + +`squash_mask`: applies the parametrization one last time to the weight matrix, and then removes the pruning parametrization from the model + + +``` +pruner.squash_mask() +``` + + + +# Write Your Own Pruner + +To write a custom pruner, one could inherit from the `BasePruner` and implement some of the methods. For example, if implementing a pruner that computes the mask by randomly pruning ⅓ of channels: + + +``` +class ImplementedPruner(BasePruner): + def update_mask(self, layer, **kwargs): + param = layer.parametrizations.weight[0] # PruningParametrization + all_outputs = param.original_outputs + prune = random.sample(all_outputs, len(all_outputs) // 3) + param.pruned_outputs.update(prune) +``` + + +It is the responsibility of the base class to call the `self.update_mask` when appropriate. diff --git a/torch/ao/sparsity/experimental/pruner/images/prune_1.png b/torch/ao/sparsity/experimental/pruner/images/prune_1.png new file mode 100644 index 0000000000000000000000000000000000000000..f7f48759225729e932c79b7825d4c91cff6fd725 GIT binary patch literal 170728 zcmeFYgY)y9e)c z-{*Rt-+BIl=Wxw9+r9VNd#!KBXGN*0DB?Y!cz}e2gs1dWUIPgU%^wK~)f@{0IMZ&l zYm0=0YhxoPr=}z)N2liMWNBk>frRulDn%DlPqUvi=aq(%br9B5>=x`FCGQAcq9GRr z$0ec>J&VPb?GCq4Z?BDE)p6IBAFoBKE{ts9jBeo})q3Yq^?{Y$=`l*<9>Yr!X>4mn>2pJ&Fp_t&ufDH5mfdq(ikM0KzCetChrI84)0AgiT*i?GWbVAY z15oJ(Zj0Y0Z|APvX}hC87NbQnr|Kd{5rcqMnUUZpzD1n0NS`#E#F&hVyPgni_ZtTi zRA{Vnk}MW=k+aWFJG}*u2A%UGnNuXN45G*!(M;AnIM+{9JQC>QCJNF+3H-$yiF(11 zgJf#4(|LHv%;qB2*|f`+w0}{o(p1T4dm*V6KPAk_uU3q^aK$gB zHGaIqupFi!Yme(v|44-SW!Y%o!w5I}@DlWF_K{8I$M#4Ek3ULs$~x5O3$q#S;*<@) zbD6fi(a$Mes`xR5`b353)$pCdqxIqozTwB}rVqNbBFi3h_45(fJ>KCQRf^P@i)DT2 zVv1pJ9u2j))*B&DgSnceuOg$Gm8*lq3C6oZ1!_-Dx7( zqr%PA{tIW9;`4*&HvR^m*j1W&Ge!7SNIfuS{f@nyNg6KB$lB+CuX2e1dZVQp(P1MDZJw^)P9{UV|iySdlyRuf4C}1 zfbO~y@Z^rO|HC(PIGSzO9egAt4zxBL_CoA-{{(f`i9&c%zCxTA%(9>n>MD`bB!d^Uddu`HV$Yx~0{ zS}R*|>o@q-tadkkep*$QxW6(5L=W=^Z268v>qj(7PktnE$_KM1lJ`mGKhzH@m8+xE?yKlCSr>K1 zw|U~o2ui|V$2-LCR--76c>h88xAP>XMV`7fgE-CbW0t(I-@fkHFXMGSHI2VK7JY-< z5?}X`XWS9jm;y{J(3{0e8ynJXYNN-@+-YgTq(`n3_0t#+E^X93TdA_csM19)isKc` zAxfm$aRF#CX)D+$V!zAo;O)8gHT_~(0Mi(D51L{yZICipu8-k4{X^mbtU2rmtb%1j zTb5=PqNwMexW*~>y0uJ&UBq4ZTu@zjs-mW!`99xaPI?;tNKpfcEwzhr<88E(ol=uh z?^E5UyZJQL7&Eau3ivDqNlfbv>->k@hw6tUhXSrVPcltk9OQLrpuMws_xYW}JB$IQ zXA+;&isfoQW2g}q%}k0-y4vEMJUpQ}DGLt|4-2p8vBs-XNK$C%k&dgO00~L|aJRu1 z#CM>2A^Ym_oUGlag)c3iC;kW@8^58D%54obd;*aTkl7zMw-&c^6+(YsBg7$SCsbkG z`s0(ezFk!1+?YTP?o4hsfs-GB=fDNv-8b@5ucXQi>kNw)X5-XnJzWZ5!-%L zGB`gi`NP1ZwY#~yd8*?VL%(Txw_(gxs+KbC;PXN4)Zx_EI+i+lrIvcyuNJDl=w0c> zluqg8R^1w#*}XNbY5Ot)YbXoinyMSCn?22;Uf}L?tbYA@LCEJ*OM+kg4aU_Veql&K zNZ-efm1M@P-y5u+^UKrXvAjcG>E?PHm69pqVZ`o3gz zt8y!L+jO0qP)SWsRprpNh7^Nj*p?btC!=HhW*OF2M|VVLsAe&iXFO;u3qM`9viI>6 z_7S<#j#Q51TvzMEjHk6?E?|3z6Nklx?SMUjr~Kd**(u#TgCs|@cr!+}8V9Uur#`Y%Ek9#~-i$9j^YnFey(bzLCs~nlY*<2EA&2 zI;HAt&3=u!_eS`=sJNqh_wRy(@&kd@(MP;Q%*;E_pt<1Wdcgz7mAQQ<7L*odtS6Q`hxd; zqQpv{d|6a;6y^`?KWu-Ren5^)y;GX^a2chi(1XxP(4{n!3#ihI1+@83 zRs|z$V!lXm;PPBsh)ZWfTD-$AV#Ne#M$AX&`EfE*_)5I%Hg+y*=3*G4Jx4x{^k*Dh z8KgCEz*;lY_$y3W`>#J;Bm_jC7d7hD=;`>n1>FU|z&Qw84daTLCo6k>Tr02U+B*69 zYw7Wk>Fw$fr0Le+)@>1(NCDqYD!GKxc)69G9Fq^|&=V3Xt*>!SYR=ZT2sbsM% zr~c9)LviMd(4Sys3=E#-K&A@$K`%Pz~dZpUJeYXS3=?X=}w=w^!YD(_T;7A~gdHg>suDHkip zRQq)I>ukBW`s-JIwtm|yNfDWp*Od4-uyO>M9a(v=fm2J8``WKz7m+zfY)7f@ z**;e%*})PmUNL)wdnxl%twyJ9=dv5LrvW<-N464&yvHluir5Kz(lChsh5YTh+tqc! zae>9?P>!~gm(Suf;iAh$_cYlq&u1PViI}^|3%47NCSAnoLwGXUgk@Z zSKMhLh2RYjp)C*NBmFKy;$3ct6W8$|-650r@ZVKjhM4^Ht}<#yDh$d#6v&xCYm=U~ zkERo`A5cB-G7fBP0twgl2`nsV83;m&7>l) zu>;3gNNC6uNa(;3GVql~ru_G@0x~NS>iu~XB%}x%B(#6sqXPUQ{t|&NV$9!P)Z}m^ zOyGY+z!&-f<-hJm^Z$VQU&pBCz%?XUO*thc;8)Yk)xyHj&DzO5Qa2k8oWOQ|s_%w` zM9PHtA}eV;0!|`N*=Xsx>%Dj`Zsz2`ZDQ_ZYQYV4a7H`_NfIg!96DIIo6tcW>>b_2 zp;8R@cZdVWh|@d_boaNo+etC#y-=f*b8@wy6XxdS=4Ft6Ku1R>>1u8%t|71Rui?P| zq!_H--JQjGc)Yy4xV;3pom{PW_{7A-czF4F`1!ejJGk7u9o_h$h@OFxk0`8U&~9~hKiQUiRXvXNKS z0)7E0L;N9E1OKr8{RNJZeRGxeSSyi`WRR5PWwoHlJK31&kFV2xG zqhnVJO?f1?S&yba{n}M0k%>Zq&(|Ma zTjT%Zd?IBch!WuwA5lvjqyA~Wk%6JK!^ngL|MdB-hC<{o?r!{(>z}VD!t=!bCptZo z=}?WpFKuM{XF3x33u6A`LghcmVLv_pGZv)%?Vqp9bR@w3c_CQttN`(;#VfZ`|A8?8 zb)KG&Q~l#Y34-vM=vE5kFm)D){)rF-2&SU`Gt>T;ssEkCpES0 zjs>23cinq?QV}pi?KN)_LGz}NtFjHnvmKpmI+Boa=Vm`3KrZX{dJ{7D-mctcqCoXq zz}?l7$G*gp@6k*8l-&{Q|rvfIIl0iS3Mcb2;_h(JI%kuH|ger>C|t z=P(H5OIN-Gf?QV)i*06|Lhgr&DTktd!rtdN_4(-cVpxO%dpxxUrYFwlS3o18p&usl+GvIOXVhL&$}K* z8~g68W#7-^C*P>#(R2_djrG-DQ-$AeIa*tr0PO~Zwf%~U9GaV#AU(_c;=TEP>s7gh zzE16oZ!gDdFTy=c_G{EQTLF!ckZ- z0%q8r@zd#FUxtf=jEmky5gS8$3y-Mlp|G2dV#xuET%b0zt9F1M6Df2MDRY{*4GZah z_cP0M__pw?Zt;y>rpGV+s|7qB+VdH`0t2M?}|fUpY2xY!Mww_!&$>2nm8t>4BN~^*rMu zxpxrOgvgG2cLxap)83oeWP8^!Ty^QxZbN)^BjVe*(`68B$f(p=7FYdpu;;W%#LNSZ z(lIBH-g|hIs&5A@H{cddwB?2AaPK^+Rgg?>&%WQav|MhL+;AT~2ESQPQnX*~iaPzF z%!BvrTxn5GRr(^I+I>v+^mksu?3YYO+k!0D=qk={<4kST($BYYfQ3c@Zj{iZ5$jcmqxgA= z|K~nz8S6%aAJVH(DtqMmozBQ&L$DcL!Xl4qI`JjC_B(E`_Gt%QqA2qj5bWBO0ld6K zZ1ODYy_X;3N0Fqy9d`0Jli2}(=roAbdk*^Dywxi&# zlSS{NYb~PjyL(kUL3_z`y?i3Kuyl8Owq}BSFpUqncAm4VXxTBNz0%v3_`-+~yf!)@ z+#pAj+-3hunpQOE8wtNlcZtJ3#@!K#{a8@~^CJdrCjk`CXXiFc4Ifiu043rH(!quh z$X>RY6oFN`HuoMwyIHek7n11rdu?%2w`k91_{gQg69 zh5^-km|?W|W}8c5uYQfe4yR}eb9=WFnd$nW=~M)QKw}yxWe@$W0gb=)YWnjfbM+%K z+gfitHKlT^^kGs0E3uifqD~4h6Wnls`t4&At%y-{C=QBrTnCMJw^xiDI3jzDit%5S zd0Hk@U)mOZLEm}@pYYU_!y`o|{EdWCM(W>(qV)2@EJ5lQcf&eSX29*mZfVU(^t7|2 zs2X5@2!i;5fK;2Lq}6Q#933}j9C1 zc35P|K7!ga{RXfE`^p5BJJo~2Oe*tZgcC`?nj3=vZa9K1PSo$$K_^es><1NZ0&F~( zVQdp}=Phw_JmFCLQ^)R-_$3gky5B3qFE zQ@D3ISd~cp@0alL60%MOthF`X4zSDzP<#>NdT6`UpGv~)#bi**F zkI0pNL5oHHN}x7YKBaqT&Ue7Ax`ar(tajDQI$7H1x}WPjfPn3sS}(pVxQ$Bvnwzr= zxEM6D2lVG#?XQ=Eqz2CIqCI1nktI(r=4z1U~nXZZ?jOe5Iel|B+NPdGe0>)^h=M-tiGZ$66OZ9W?DoOAdV##{YU=haNZW)7vuRvC0);lT=~6ILo| zrNJ%>sRV4N3l8gi5CtdVw?7xY=C`dyLb4#gYSMkcT6NCzWj-^wNHF@?E2>>tl%N^2pC_guLre zCE}{6UwtqK=t~Ex_2(3w>IoHzVV&0=+TKQW6J6@Bo_y#;q=Rr|owL8zVu%;b@%zmw z@8rR(u?<;9v*)Z$0eQ38`~KpL4p0yJ>Q7)%mH-DonwS&hM!q}7B7!O|BfHO7q>+C@ zGlVQ%EuM80=H)_B=uwI$6$6>vq1jaa=PRc%M%Nt*2{hC$9|(yRn19FDPi^Yv+67vxhl)-~J00deYJAkpFfmhATj1=zI zuf-4gvdO}FoM3~d`~kta4PbFse@SJBTqFlFU)J7g!GQ8jl~s zJd5q~KPfS?0JKI0mZLEPM1$$xJMlwt{mVmVt(zTw?JW zEhzK4sk5AyH6IRS_()-FyYDXZxXp$T^%{7tFx)n(z2l?V)%hLcBj0n9*2-DcMU=r4 z5$D!K;%Cy@G2Ox;K(-=47FvTRGS6GiXdt46kf0B^Xi$yM_t6VrSg4f)GY^~g8aa*% z=IAet3u!$j)m`#Nyvo>ziTruz321!To`d^n23d|CcKAJ(Z@mFK=X~Zw<35A;OG-c6 zQKn|UA#UsQP|dXm!aZC@Zy>P64XOHZ9#-R1Xr9x~5+V@X<@NpQH8#-=V>}mcxZfY0 zy1mA2j3%zSX$Ci*;tX83IyX>D?EZos!%;fhOq|Ff_|uc@h^?*R`d0d2x?|2}cn7@} zLuGsI@AuA~cp{+$y;y5+ zINCir!23z`=(jvXxZUzcfh!?oLrN^!4-I)GqfWq_a;xYQxGNE-X8C~F`OSQa;il*b z?tq@HIT5G(Xbo`3~bermFntK;N)Gq(<7@9 z+$oNe!ZW4)-h`)t3~itvC9igB!lv-87hHO&GkSF&RFHs9=j%}BoUtI4d%L^Ck_s9_ z+LDu!@dl>@92M3K2y-gK1_W7b-Moc_f1N}bU7+=9kata-@ND6}B~?fds+6vf|NEcu zmRJxh+{vS#75+laDSC}zXGGyI)Z2-}J}YtBWCG5knoaY|@l}^J*O5$LTTP^{Keqn3 z%MU9oO|T>ujg8S(5llVtlw)pbGU00fJ0x|(NB$NeP9YTjcShCNgtU3feH)5VM>&GV{fojB{^rKMaM~ z_Fgot#YOq~uEp&H0vL}9c1bmY zuoaCm=N5Rn2|dw^bY6OxUaiALmiHi7-AeqTjD-?!9Yg$a*TQU+&2F{lx&m1s;4)ZG79iR<7d(<>TQks zFoMQe_Pkpv@zQR0B6fMiQDz3@4-die-^NIBXgT>sI6o5>l~LV&{vxP>kfhy7Dn|{? z=EQ;zb`WMco9bC3#Up~hTL8H!@;1oCqIqGgae4q2f$`ZhG-;?iYn*=eLmPxa(d1y~ z=3vHGW40w_q{yFVzFFVB+zG#w5E)>2MG;iFAi)Xi_i8<&o->J{#{HEN=Kr?CJ4IV3 z>U+~pbxf6%tXm9!S9`Qs|~gs`o!9Vx#0Um5&u7FQ5MQdzw#_=_EKrC9?yyPs7ahl~&I~e&bnQn6!;~41D3He%1UO*K&A!-1 z=2jsdge8J1=0r_Eq#Ebej-HDZ-8^j?(|sZ1Q?d)u3}KgcmhGSS>5ms(jr=XKrYJ1H zzo&iLVmc*zljeWn%EO35eUx?TW8aYet+lKEb1#inXKf9jD!K$f8JMKAROMcAyolPs z^jFEfFC&KxTMY&zrQ=Q7=$*e-5^V!=YeD~^TyMnb$Xbo_Q&_Hu9NG#MB^lfjeQ0X4gl@>6yEw)7Q9L;nK;r}rI9O$S=;WJm z^ZU-^;3@&8*=~r#sLq@v*VpwUD-uT!#+S_;AIHg-e&$RxGSC=x2hFEmzFsOba0@u5 zh!<0oRDo|M*qSn&sRGE~__o}f>V6!(xG@>IsP5*C6%$!_+spm^`mpc0uDdjUFL&$t zYWk6L;F9%dWsh%p)`WA>viS9$5^iWao~NxaEopY?dM+VrhXryG*qPVHs_VYH5+LUzc85bY*l>s zNSSVq?mSr*@8Gkw!H)}m@5A-_aC;z(f-Oe!-DPA(I!1al6P&Sc3ickRTe!5y95W6w zI4t`*SLD}Y#E}aypDLDC*gFMeMJS(teBJt$8;vEQXqsk1m@l+<>I zeUgJ2;u5ZBZdmW(>3GZ-{lvgqBSUkxWW${|cp7UFWv)1QCblFAOU6ki%#GPG+T%*O zdlP;(6aXa<=tKV@I{JDjib;`E&Xu0g&4lY}cB++B+4Zp3T>7ZR$I${!6hVQHs=mXsYl+t(n)IvvY;HS2VGk(35k(zkJUD9(brOqJ6q>jgKHXDR6eo> z20h_?P<=&5Q%S9Wo!c2k*2`SUXNr1|#M?$J5K6yVeomCv_pr<5f^#?KW6vqtN1Bxm z0>7tMI(Api0r3ZaXfWv0>7P;sTpyT(>ez|Q1?;)y#_U4wE=Cop5^VisWLy&>je}&u zT{5@A#5_z`8T6bDrayOh!XNpGnRV4C+R~_(QHRl@EC6M-RK8q7aNnDEFqJL0OXa1E z0<4VuK1&*g$D|~69{aTfWAh~}a$ozVaI9c#rqm3St0?oDCI*0HBw+2}nVnkD8a68q z!I>n+@nJ1KhAFd&is#OwnX&1{&dhSUPI1#uk299nBh&2zIC2>$B~83@V*jE=s*CJVDmJMnLl&sB_f%ZK?~24A)@7l9i+CW(;pG# zaP?!^c;sPlT+<;Dq?4fHbWW{ZO+F`6InB%VoMpphKBdPKreeoIL0N4tRL^7Z5XkUl z$GI}WT|*9Rg9CNGQ}Raekh#*)2HiL_Znkd_4~C(tsw&tkUm@a&`m4h$yiFfTCjw^z zOr_yH<>=xO;>+=Eg<~L(q+EOa8P!$pkRe|+}ilfjL+(fML z_rDLPcniKL`9JNXB$ZR$(~d0 z#?+A^Vq8l?z{znY7c2g)_Xwi#zJznP^~oQ%n^^)SS}atjLJBdx70k7i&n(~;PO>%* zw4hHf50*dA=(ZlF@V+}x=y$?NFgGxHSZy8~D?k${b`)3jN&v!lM`p}&SjB)Md*kH>!9NIM=rS@uZ#41VPKxo{>VAR1#i&UHz zoa~imhASB%NP#P{yh$*3CM(I1^b-5+s^m+UZL8zf8gw$OQpQvA!CJdmn-B`$u$fR; zfx5rZ;+pJQRq3iPT@2->x;{=8vr1Nam0+CMuFH2zcxjQn2*b3Y&o0RnMM41SW2adR z=^^z80ffwiQiMPHM|6O=(lkqdF+FwaiyMm5V^Ky{fr5*y4x{QddVAMhx|mnup0#Ta z{iL%xKz=f!kD3@ZrB*>x13($MVAAbNWS%rJPzU5hG>+{ge^MgUga8?+MMGRjc-Nt; z(NNc&s^0J*DXl<2GmM8Abm*InMe|uUf6k6plQ)z8=1{;4XH&r90a;Tk@L*swpu6Z0 zaKj(BUlkM@Cz?Rjmx@cuDSCK4Odh>8%oJWBe;XPhyM;toO(MCB>Q|*=kW)JoVI)g%f0+O&2oi4(Y;|x zZV)4pqsf}5VU@jlJ_Kv6N2pG?J$_*%I+5u@O2z~G`x)D&e;x*hVj}aL_Jx!YO0G$W0iVCdE5b$i>RP+p;k11O)T-#_k^`vr zdLwFRi|NIRi1cXe1daEaK!}&Zdo>zKmXk6z?jw5$i9h^&#YQZgnE{Um6RePe-ie*} z2T0c)!IPX%+8NRLf=981#oN=q7-O}g%SIJkabnH^b*d7IB`|6X2)Hoz*r>L}Z zEwrk`2RSK_@V@WXyx-krN?_ zQO@l|qIDZ?T_~n~m0kEb%>=?3Z^6@D2c{}IhaqeziqS2ur!!V(T#)s~z}_sYlPw&r z949gl`bWgZ)O=w~nYqOWWY35?>pq}nYrr5R|5o>@btu(}LJ+%8kmTrY=CP;$Di zWg|oAbHrom4a0JxTi8*8mDb2ZrwH9r{Wx%Xm(OE4Bx!x&?O3iO4Sls?W;*d^7SvDf zctYTAKoB z=hjM$%|8%M+$kqo8jUok(oFDSEv;NS%W>D)$M5EyvhMBYCO)y^tGSg!VRwxD1ZJo~ zg^-m9i)Z!`Z9#=WN#UD>ss{(>MNHSm>ua(>Ns%j&y2|=E;diVF-k~eX%QFl-thj21 zP%%1d6SbP-2$yF|D>I?aA%r-0xj@EKm0tu|!Q{hkkK84YC)kX}HH3Kr*691y!ahSP z5d-Qfc91Uz&W&Xb!`Djctc*F|9Uj{g@(@9YD$x2N2rhc- zojpRfxH)qa0K*K%Izpvqq&+IGzOgumJTGs(+!9q?NkeJbym}&$p&H4-IcT~!E;3>R z6I@y;>hy|z@_xenxtYNwvA^Dz-r7hzGV+P#OrEb{qgbJ_yxq0o#)r`01M*EI2k_I6 ziz}z+CXS4Hs|(>%pv550GmZNm!wLi4idB8zcKzSN02h5=P_SrwHM%@)Y;(5PqF8VP z{$wKfsT?L?gqoN8+mbP3Sx_c^eAD91`VIPoj2y~W4U{w(c`Nze@##g8cWS=r;;mCW zrw^0pp`5WKh>CtNS8Wc|PU>l~6v6FpSybacRAE8#UCRZ3YrKwBM|*A=fL3Z7%P=-5R^9KHzE$@* z3HcJZ0ZLd!?M9+Asjm-y>q+Xbx8(|N$Bal5$y)93iDgWAZ}Jm z|6Un}d6&C7*p!}!1>t-W?8jua6-KSxSkL_2nWz2}-PlDPJf|_4<-xA3vneG0^{);C ziwjCq21EPJY%fFSfzhHaA`>t@nF~q+v*e}m10FOrey8?0ag!i2Is`@Pg&23fl125XWsG8W~|hJYrt#@p@}dDVx}q6q~bx!stxwv(j} zWd!sTA^`RY}N z)5-3nUAwYfh8PEa6n=C3yH#+;e3e1Zj)Z) zXG8m7kT^6s0UQPw@YtH zTp!om5FW%bGkFf_VHrGvzj-{?upErDS~DPqwb=|0q3f9Mr-jspILzsv2VoC}=%t1^ z2YyI~!qaYFneF&7XqYh9HLdk4H(oRR``0+(ABvK+UO>l~z;`9(!5}4}-_xde zGSCi~tl6P{58eo*CT6WQv>qJ+`z>eqE0{qUHzg@-Vt^e`-Fc;7__pzFVW{Zv$%9{5 zl~5bloIyiC?VVfc3aS5cuZP#|#pb>bQjiG!RQLal8*MVGMGf8q;vJecZm80e7L^^K z-EJAWF-9meRaZp29Ujodp{=@(yGds45~V+qP>{ap2cy`3XL^iS z=ZCCr?H%*Zf<~9-d)+292GjG)ls|48;HHlC>6F~C$5GzCn3O`cT5=A{EVFU+Zk_$} zW?l0HLL#d}YYOR!9eyKAlh(;W-v&jhDXHAmX-D%+ zAGP!t6BtIScO9D>qwg0_7lVXO(^mKX`9OiQNnTK+?H-|V&f>znT#v8!yM~D9&y7bZ zV@|49j<*k&=F>%tV8*TTkG?DYS!}X4+v78J%zBbb`qG--ZJfEMt%ZI_>cvnPr(5T= zWO`kLVsJyH88>fFjCA7x)s3s!&w048$h)$Ff?k7bezKKMhA<0uXU~tll6o$&0z(-U zRNtyA%Y1OV(wlVCE~+JtIG=}IbDVKa+`Z-$AY&g+v+K?}JZBk~6iFPeahsvAD>c~* zGCfYZ_+wh$5E0b?H|7?7-y6b5hG9x8ZRXe*Nx1nQ2Oq{)8jOANxdU3?XTEo1%DvTn z@q3ZOCGP3m`duT`n7b6i%;=@%-mnEX(uN4zjSmcGlYT0`Am`110W{Jr(GK&vUKnbS zy*ywLPQB!fTT=h4K9Q<<-iQD4ft*pbHBAjiIp2^b87Gd|`60gqEZyvr^cgPy8Id5! z)u7(tR*r0KsJpJ-vcAon+ddPg3P$63F>KpvrNaBIUZp{e!r^3j!DSCN+VVa{a7B5x zT`1SA^Jh!N&{|nWOcUNo0rm@Z5`)FR^o7<@V4Flp;1{+H zF$XRmB|{phXB&uzJ=Jc-0`QhSC>Zt;QzT{s#J-YMdaEf`?ZTW5j7(xQ5KgZUWRG>6 zwQ#bYz9dF7>|vnIoQC0%yJzXR)%uR# zPr=9-PXPR78%6nRx?7YGF81$O#fQop)A1HM&dW_&g*YJka_S-7p7NLNB{6_e!TG{x zb}JCh-Pgrmef{I77HE@v@u`8=bimnymy;;3qkqJ{~cowp{AO@Tt5U??b4B$a&C?@%ot_|n{6$-v89 z1<^YV4${?6ZhxvnVH#juYK(R$hOtKa%AT+Tr0h~yQC&$rsL0!;@2ShxqqyOpeH;Kb zJ|~2rcP4y(%(*f^`Qw<(?0v&y&P2ji&ZOm;pAW7)gW;NG0jL*LBUn=TURLEqjLgG7 zef@?sQ>l_nspv@wYpb1ajEBIeFlmOTM3OV9W`wvm?=)31Y^XT4x4P~G?(}A#HhP${ z6TT{{Zi^&SK7DF7QX6cCam8TBxXz0)7Lwkf>7!x7s1tLeBV< z4vGGk)*Bsqe8O#K95-^l&a~*TO@q(hs63LHKzuyepcD`H!@EC_RVLZ; zgW~ns0jkm5yCL;*zqQ^3&`QKbLVBmLfmgD1GY}?_K7bgM^jAjw?&R~e(K0ZjSb0gh z65*C@yxt|oxJbogs(x~OEcic?W7#s-0u4=6;dmv?Wrfe_L5s%Ag~e(_HXtc`c22cY zE4RFHEH6#&0`^KXa8=^mq$9@b(c|CawS>RXbLl3LyQaso=iiwQ-!B4z$Wnh+YK5D9 zxp_6*Vt>N;0f7r4{7tIL(S-(8<4rzL=yZ#IZc{7&?3`ZMSkebKm0%sq)qt%zrR+{_ zoMw9YNf0I_6(M^AO}=+;mc09%O+hw;nt&kxZwrQf0C3=QTJ;JJt=Ga!_}(%CzBZ)$L!)d6;HG0~?9`f~x< zxyY$9&4RNYra!BqO4!A!UM%d>0SMMgXih7eypdzmKo(RtF~4&xmUj!_wcbSl@R!5b zu$O<^w!ByJ}OEYpl0HP{8t?>Ors>@bT4%XC3ksf_pd{Ue`39leb&?SE?P z;yKBmuB6AknJ^E5AY;i1d)*=s1V+_%vl%}n}PqV14pC{i~RSJ_uCAI{Bxp6D@%2Q=SV ztDiL#n&l)*RYz{rtklz!&NU!Y=~wo-Ke6QHSNEoKI6@kOF+?~3MP1l|ixp#J-n=d~d0GbUZU{TuVFBSBbF|6q4$Uob2u(8Vth zJ9gnM3;%Rd7~Z0RL@^23JcIIEpXP^_h9z}IEPf-t%&(S!-ao<7II5^3408DtCJ6GS zA;0MxQLyB$UQ{<87H!WgePs?_pWQj|HI;tb5@(lyV-GWQg&BtNRoKNTY8f1WjshB z=@FR9MBbP)u{?cp{v@B!(KB<9`#S$Nxr0h>D6Kj)DHtk8aRdNPrO^P{4)%E=|I*8d z)C}G`CM)TxT0#)ZYfB+CtI;IB%VWF-`yzhqS6Poa5$RLZgby%7FBAK>jlNM#sk~?F8But#Qhk*RzDL6ty)VNWc97pN zc8j5?mAU2VNM#>Vn@Sx0Nb{$dB8>Kt4_j8bJbigYuotWDRQ+KJLO3 zR=t#JEA*yYHEC#YBwN8m#SKX#3x9wff`^R7_&L?N^K>U|%h;&S$|$xjh{ADl!iFh| zD2i-|D6gxedUU4~2NZLCuI1Q!M_zMWEM)oQ%JYX+l)#Yj^B_fkmCdWc)5`9Nh_joN zuJrE@onf5CkPtE^>hBC6gd0lbUAOm_1!; zX`Y0D*N^$@^~)Zq0ftPP!FeSJhCI_roN_lRG-W`44fU;s)7dfM{NzG7#3;a^{R;#L zfO*~{sj4ABm@b-7jUAi5Vbo0p+=n?MAde)tz+{()BqFvrce<|T?>bj9laT*BGw6|S$%_m zzY)&Kb&MwBhTz}Z(@vES&uJiS%k*vBO*^L0&(-+^PLnQcJsDQ=F>Sv(@t@-XumAl3 z2yGfo(|;-<9{k$?rGDh??(dzN(Rfbo<9Eg*0I*CI4b)xcFKRm6w^rb`ne~Pl!0;NP zSP=VT`T7Bci#cuG*Ca0fkq`hsw%vVq@Lc=8Q1cPMs93SGZiD|-s9}_W4*}?d1q5U= zQ`E(j3Uejm;mp}mKmhdi>A4NLzcwd=fS(!EgjY7pzAaY2RS{>SQGq9-fVBK`X}s5y z`g(NPp|-I=%exn`^QlU%Bp$KzNydwS&Sw+>jYr@tUju-xahsgNcnYC`@|M%t`l?j; z1ht!N%RN-48XJND<-PVj=st~e^6dl=qTjob5YVo;52Kt)I{(AWheH2M6eymSW77Ka z#)rUb0Fe9~09we7EC9kEi}Sze=XcREGXVP8*Rk=>m-82IfO=sFKo%Om<8hN1S>0k} zO*2F*DHIS8`u5V0zkS@V(MTc}36NXpyw=^8?)T`nK|5dC^Y$m z??0jE#^Bb;qR)PJ^%?+&_@Q_DWW_7$`8xoy72g72F=5b=`6>;CnSAPwf8lg@&w)qngfae z#ysDzUB6~T5JU3;U}8?s)yjDsduRa&P#f`7U>Wh$FFo$@w~ZlfP5Z5OQc_ZNH>Y!R z2>jx2!46RQ39vKZ4cZT?_)J^xYxmx3O&(zAz>^R5K;Z$AY46$@UA+f9r5Q3#mDTn6 zfgjsK#F|L4LEe{N0C~Q?2i~DWtaI}(0b-J=9O9^G=KrxlB>(pDHQMSe0z=h#`bfJ& zRXO4{0^A%4;H3c;cYOpr;HLfE&Dl2u1Z?&>RU_ao#sDqgXZ(ES-1U*;pI3ooC|y4k za5cA7hd{QD+l1xeyMz#SYw^5|^`}!hBaz^hoY)s&6 z-tCo>lM@!CGwgB4sB_Yk3qYa93_y@_5j#f?=vLFs;;R4*Q`Z>X^$_Il`m~E!bvny! z1VG(qvXowOqsYn2bc_-N{W1aGfG`ts%K$1f2EY~cI5rVni$DggU<3 zXMX@VZ4?4n2$(f!YwHm?g39%&bEiOxy&+;}fJ0uKa57wTb|ybON@f#*y%94j z%0Z~Jj4lzK&ybcZz_}-&Kv>1KkLMgc$<}px%2~#LNMYP0Z(y*hsN-`l4hN*V@18Vrz59Xb`HML@ch zW)y~&?v|32kWxxOP^3F0ML=p~P(YZWV}N(h^ZmVl0N2IrbI#st-Ru6`+uWz|VH)4V zljnUMRoB6{{T3|vbWbsEp8|g~^yoFqM)1X!r8hD`YCiG7&fG0*0Pd{VzC(oIi|q1X z@WoOv`Q8KvbYTuD=u>dRVLN1B!YBDlOcE!ywl!|KI!^G5MwgD1j`5*6Q6zwJ3pfcr zcH>?bz{_Ofbpi9tsSenswz&8onQlg);b7;QpMB>3s}77|buRk1V&X-hG%!Uzr`y92 z!?((KW5Q~7zlkj^aEKMs9iCWZn$_*0{lQubRV-enTJB3U*%)?8aSU^8{_KCJ3ca6J zfbBW@YKN`++)J|8MlA5Up9WbJlkseovJy1wpHU!F4XD)^>PWrdt*Ti zIn>lLdEGv_rlP1lv+nhsl-3i~pe;xnQCDvY%7^Z`-1!?vP$30iM0_&wzINpmUOK zn<%rLswr4UDLGe&gasj#k|vIDel}yt*R>INBA`f!tc&MVcj2aJp%*>?z)ll@s^ohL zX(^sP6Ax*L*cimw(H(O=BRd(aUH>)yxQjmH4|B6d_60leCN(`IE=q+S3xrTV3*@t9BoTZ^EKcg=e>%RCGJO0j*z zm2m$Q_;)3;xZ=4RIPc?Z(41WIpv-HUe8eAt_wsLPrrLKboepdQC`xrr`)c5E!?!+u zR@?MO$sj|}VlSp|BjK=x|JkF{!K>YnTD=Kq9#H5xG2_Rp#b*MCpB3NgwQl0#_Xe<_ zKm0n;mVkON?u-0h3z)S9;Ouv22X^-20vz{}B7wpoDG+%at9Y+|fXf?5`U^m;S@77sG-#FG+j^XN&LRUjuy3CmghZ^Y{M*o8^;! z8u_Ibp8uZ(&_D{mF$N5YF9l{)@aDmGr91$RI~m)yD}Y@#Dqol3K?%aIfCNvvdQH!O z;MoNnk`mGqYaD>XA#o(dIZCSidBOSOh6|V$CvE-F2|hFU3YHp}yjLSx;-U69$6*P( zqlxWa5m-}sRi2tXa z`ptBJg1LgiE}NV#o7^&+Zl+fI6g9B{Q@fYg7Ee2?%l|4ci} z)1jyJz@#RGxZCW2j5sSvW05?LtsXFdq>0u?RU~sfmL6yNCI8mMUxK^BP93pW>HTW0xAR1Ui&(<(6*UpAptrJft=cGSJeG(GH+5u#2 zBtXV|c{km^+1^a&fA881k#T?3|F`11s^f(bD^Fx{kMLj-+c}9eL8+_WsBFhbd~Ei3 z0AKfO4HB=Q72)4)7ysO&Fp-|C7CP1kdQre`F5H2A9)P-&(XA25PQv{GxV`dIWfs6G z(jBerqVoC;M@pIDg>2+0?y-#^hlnQT|F=S}b_Fi}a&S-x=Rd6Z3ZS(NzTdr>$ct1c< zo9%D%@Fd%J-~gzc9n`pjh$#6Lyrm^svhR1)i0!Ez+f%;% zmmLubBB+4B&!7Xay5mwY*XPX)-fj@Np6{v}SR6^=YgV!in9sZPb1T@-?HBiOcy3p8 zcX7w3;RtFOqEVM~WeyZ*#pHhW%N^B?WO6$IUCH>}(;EZ}!$R%3|G=jN`cL%gNTz6H z*2EEYWFa9WgRS5{LoT(&bDTTu2<&-JWkvH-A$06#-=n&a30uw&GDOMcN34CZ4ag7r zIjV&h0K|EIC1ObRU**70Y87CcRg10gs>tlG`U6Q~Kl@YnxC>I81d4Aj)X&n&8*R{` zd2% z$cAHKkvh>%su!=59M58XllsR&IhE@h*{GCKUaoU`YBnBFLMgn?Y9)y5>H8Fkrq158}G_LtBoFSrLY%vcLtN5>~`MyrHC>$*yfdjDvcm0>Q zhBsBe{{U1mFF~f!`vtZ~c0}st01g^0s8vVd_O4r^&Ek_JQm=6oa&+Nmw}2Za#e!e{ z6hQ_aS$z+{utY-cxArnV%j(yT!KDLttv!L;|IG>8>C3*J}f{EEi82aFfepB6m(Cz3f5Aq^LLH&BAo+>S5ovD+MEk~TQWSrU_aubp>e!C#9Hj$d-O zxWRHhvx$~EyAWmD7auddm;QHbzh7v|AC={gRKIFh z)4G{$;E42{JcjI_FGOsvf1=8shq!DH^C3;O9r)|xU7J@UCoE8}KKfN+ycGunT54L< z^JM{xB5BhD@jqFOA|D@r@v32MA2jnqq3KiZ)&J=0=RALv?*5k%1Ay8f%zt_SP6lk^ zUWrsjkRk3KGsA++P6pZh{=EBAVDz~X8AiFY-KPi3v$Rit!lfXNdAxW#?$#=J*Ab7j zb#3x*)9JpwH3-89&1VGq){mu%NX6HK%0ZHQj4VX`SH~zipyG+uINcdw{DhhuPnk76 z7CS~@GXVijzzy%H|1h_y%uVIH(_KA~#FKWnvG>(B?Eev|X5 zeqd&5yYs$FeBq2ar>b+l^=jTZdqhwaR4L(Je^M3$RYRh z6r5QbX_HzGfa44sBgDxNYG6ID79x6n>vnxywZwQuksL+8xNTy{i84i{s@2Pua|VosSb!KA<1?$W>Ts@sX>_ z?|*f>=_9-_JX=aauJ-9TX=x}ORHoYmXG)})qB@q0URTi+Y_Wr?&7Adk(PwsHc)5%Pil~P3p;Qza z^NM_Y8CrE_vCT5}LL&^v3NgGE5L&y(=b%zCf8|TH(49%)3>^f_qV@!vbD~omULGq%X8HDqa8&pVh+11Jogo)T<*(LfqY9fUZ50puvxXuE&$F z5rz^LU_#~LNN}XRgy?2k2KhdwsW3f{+=C;Iz9c`?VwdjDnX?2BKHw5mxRK2f=fJAhkJC0)wx`8i}WA(%p*5d6ks@D3_y>T0G|J$~MbKkEYIWlu-PC3CF zzr>!U$k{FqWzEE0qXnwZ8ATTrym7v{5@+G*UoPn2}4mAkqH_4aKMULOrs_&)jr$7}IOmOCr{ zF97t#jHtGXJ{QdrPEGTSwC?w(@U{`M?wE*rLE_WcMsc`t5Dd;rh|V2~b-?~}&N0L| z3?h2BNJ1zr8et`KY-|rT6tD^qFQ1(c@~gKnb}r1fh{|xkJJ2(F{_)8v=mnB-oHyq* z*TBlnzJSU;W<}4QXd00Msh@Uf+ZAQ`mG>pGYZlA&b)@*n0>vbZc`V+)hBLsmc~)4U1W4g_cFj9};ywCXwo6VN-zF z?*eDvDPZ!LfLE%oTzn+8PAhP**;dAYHY(C3RNRB(tVf>#)~_+TQ}>9h7gAZmjv!n$ z>1$rWbcj})@_M3#kVBKje&Gc}_hlqC_o;u)PavaYVm2r=&A}<(M$N*1o^`2XN#kdw zWl1Hy@|xOe^0@KR%T?h+XH|)O5_=yJ=e3aMVD?^cM?-3PzbhxvRhk|1r84)pab9eC znezrRB?|rMibkKBwJI+R;!@&!YfiGY%Kp>vF(DuQWqr@fZ2OFuE%ke=QAoNEEJ^oO z_^REbwI5m)2@*ZX@bmmg8FRlck)q%8B)}v77VZ9V!9r;mJc@K%R)W9V@&R zGVGimmwtZz@Ji-*mDm|2p0DFO{&whydO0e~EVl;u-;Hst)}bo_do|SznXO(8h(d|h z@9`}PG~;d-#K=Un3%IM!{r(cgWF#R}`r_XzoK(K?&mSP4gt+1Iv7V`0T(HsOyBRjO zun$lU7G7T*A6qQ+>{k9dnP4dY^^Y@+(=&n*ABtRU^G#I+%kl3FO4qr%f$smrz_1xK zirw7!$n?28P-C-eh+lwar9pprdome~4olH8Bb!(8*m&`3&n_59OV`H>@36i3oZ%#~ zm_{u^;z@}t;Aj&VEH|3Vkh$haL~7H9_Q!KXaWCmUgj9YAbNym+()o#mGpaN$zk^Xfh+XXuifQzWFMny3PhHC0rpK=h~>!Ly`721>Sr|()R<*7!vZZZVv=f!jX8(gmX zE?KQC@R|GM7;;?QRI5=h^L&1mluJD7ccsiaG^CE|b zSaR@_ukW63c7790?53oaVu0Fw6kv^G;kGzPZ~JlAMP2ua*`#yJs~?rh%P|UE9Rd9J zm$Nc2Mtcp_`LEu?>kJ~;VL_9V_wpByn^#Q^e&^M#oOS3^QoP70TB!?#;&X>KXagcN z!`)EKh=nF-;H8T04D#_!l<3loe)`I;|k^ZUD+vOt2$dgV4_elbaRP}qvv+1&T*m!^X z`gfj*Tv!hsys9CY!7J;CthhhQ%hc1jOr-uM56(n-3*E+^5w-9FSk%jazBjfs{BbM& zxp9aY&uYUanfBnuW7?p~l!*U;((5ZuYTsjRzk8dU_knF~9Oqagl<6E=^yC zxM2N3{b6$5R%Scae)Tfmn!9T}{iy{ri7*Ka1FHm%-qW5@#S>0rQIc0C7bNZ_otI<=g7B@5*T3#TLLZU@SoD8IIRCLe(VX zY<)?y7zhT!(Y`~yxopvBLfiRpjnPy& zg1o6EFlD<>#1Z7<)U6O3UJ#-?B?p#0Pa7=zL$$2J+;88$ty&C^it0S_#pK_z!9u8- zJodY&q(b)fFQC_038fs({7E}ZUc!avY^9uo-3UA`c&mw+jS*x6k9j(|MAVv}4=>kB zY&B1(xm`AAck!O#)WV@=`ov$QUxpBj_1J5pZ*w~3F#CRNJD7fGpsRwO6 zzdNRE5B^MadNePLVMGS*KRH=79ob`?{KSpr5y}_0oID+aaYIG^z8M_omY^YI>86(> z^5vDIem+w*>e}>Cs(Bw5QsX4hMfHPs-o!9#%Z}kur^j&aY~He(O`_Fw4X8(cawow= z4Kl5IUWI9?9NgTk1Q9Q=vvISM>pk(Hv~Rjg*Y_a@9KrIU<01tQ^TsaJ+d867!)rsS zGh20yzYO^N&E5FiJ}b)61UCG^&+qnRk+HYPFrTPJZZ#h-TB*bgojt<@2L(qLgdN`i z7VccQ_M&_TFGDr);)2h;!jUNKY6VGy&(3APvydbVKhO~yV}5`k1jF= z*wWfk1aFC1x6`#lDbp-v)6JRN60Sl375NKPz&Cc`zFnA>@jyX>c~9gZC9lwIYTBLE zeE%Khmzt+E{dntY95dsVZXG9ex3&nSJ}yQ-5xY+ED_YS=f!)Kzf%3j5)Z-eef?rA! z${iq|oD=T%wwH1eB%;xT$cxlB*9|PI%owPDThJBi$03)+!BFkRW{h2RyV6yQ-uipT zH~j(~9vi5K>+M8Vv9CJ=ZXqAWN_Dn$JeJ+TjIKm@9)tQ>Hh$@48|L7xt(P{)5s4L1 zn{j&|<=@&&0=qmx(lj4cSzf$ANPnlwCvykmR+Ac*j_$y`71&?9)mwSP3of4?o0ZYD zVp<*uu9)Nj%jN7XtmK*uu@2F_paPRv^($Xa#sqJ@H&M4|vV=)~ z31HD*V>WJx_&1Z#?emOkUW#5f^D%xDkqNco+m9D84H&;9GU?+dPg;KQgQKsSxr?e_ z<+TE~FD((H+AuzzAUgJZ{xQqL;ju98YEDRxuf?K{$Jo9{j59eGOV=fMzgUl{7u7wM zLym=C={f4H?uUSIv-J6gU!@(5Z6rG*BnUooH?026b%#%2;^flM1}drrxY^nRFk9oz z#)my#yF)IO^Oq!yWy>ypFFc;T{)K$&;ma^~%=4-D_CUmODM`t*&oEi*OD|iGzJn9V zx($fBi#|w6EaH8qnjd&9X>wcmrLdY1kS!F&?yhCIX|W+zLZY`69+|jhnGQy?aO5Lw z%xYhTv4|2xdKnn%gwXX7Z|$|T>|o|NJXl7s6&ua}YAO3zhBvZy5h|Pt#RD0q|@&!V&dur(&C z$JD{mIlbn5UWr1lgTW&`D&JIO zyrPSk6z(~`7&mDKbFZC6x$cl;oYr%~!)6z&Cb558T6DjE$IsG4`zVi^1&AfJSno(U zu(3*=^|^=o+x0pMYN)rjsQKzX-5-$_sR{g9=F2Du-yNvzcS!t_Y_}8mqOD=1?eL>} zuZ(U3p-YQgO9td}uNN71`F}Sk@xUx^eHv zLeD&sq4xw-Y5@CbFWw2X5v*3fGG7rrhKK|^88phrm-)lrl6Rnr)XaWlx3iYzFMFdx z{q=$S!_`4BKFj~hTOld_-l;H;r)9a$EORJtm?sC*C zdZV3Ed$y=MW3QN0M`c$l#~hn*1(n&4A z{?4+&bFsnt(Gh!I6cxF1bYjPDu7W#V`GAj-r$Li_HsO~J*Jo}3w zgeNK|>rWQ1{Ue+*Vt~N`k<|o+XWU%57rMbTERAk;D$!-q(KVq}k0;6xJx#FUe)965 z2@yw~)Sb|1oY9+RpK+a5<7FXo#K6XE(*&jZfwwgpoysYD&*J$%WBmd&8`3PTUnj6Q zx*y+yGPfN;v7#rI+;tcU&)hF46%lx~dIzSn-y(+*&vk{~vPq&|2y>NYTe@R*?d-uW z3+BZ7&UKwN%n9eAmjo3=YSuQJ^v*cu?#+SS9|{Ochk_pJCbX@u(1vHtRS;idj6{1L z!sxJBoplB=;WbG)+fXiwK7SNM_(M15Ic%47g2W;Ka!cS5BV;Nt&+Lw z?#y@qdwIXkjpE-lk7{jqhM~{eknVm3iBD(SV>m5Bg|&VVHzP;Y^tMQfiKg0+rkd$1;uX81iP_5BYvxMY0xmhqP$6Jnl^A=b< zvE`x8RPNHb;GPp$ExY8t9-vuG4xP0-4vt^>pLyscw7tEoF;WKCju zNc06v>tttF>725(SfC5hv@k{Y1`j$_t?O>}!jsK0PfmqGc(6QzkZJ@L{U&Al8(~Xb ze>wNXJ&Y0cFKSm_Y=c9er|Z@MRQ@BU6#OGj-q>lw^0m%Ect)Wf7mp&2p%z4WsL#Wh zh+cPQD-R>m&KUS$x5yc!>4%Xz!Y?cW%^20Xl-Z0H?=5h86zH)4)0&NgvZuA_NViRxD!=6#kk%e+U z+gj~|V#Kr7g%>-x79X8dM42HMJmkQ{H}}~(8#W*PwQKKsJ*@sidZ`q8{p-*&3z26i zbJ<8NxT%4p)U^d=_0oT&am2Prh5D!A)AhQq|E|n*&=%6`Zw%IavJ8I`eTsO_yYWae zfa+PR?!)#3g;7zSYV(VK&rF`)x?*Sa==0Cu_=S!1i!W%S-&XxFmw$ebfGzFxjXz(g z|0kYU=lep{e*^kjQTUW=N}rjGz1+GZ3ilxQ4Pe4j`)_vdnjmF1G2LAI91Eq(bAVt8 z>E|1n1qdo)1tl`M))sws;|^`mjd!Iuv-lTW&?;@>ON*1v^3Id~3%0txFsGwmJhM|T zgS1-S-D;w8MxN@WqYbE91e)t+^G+VI!NlC;HW%Pl&pgu}MN+CjPLlSgwd9lUwZ16Xy9b$bj!7N}55 zi0d_XQh7q#%%iol4pzsuwZE@@kZW;5Y!;Ld`hcf)D|^pErGDS7`at_beJ!f?<{^fj z-CebZRFvFCx601*v3iN3_`X#mCTEP+W{xI@G-!s119gAd_I8?E&bB6uf{T!m>h`4c z-}!vvTZpxYP#@ARDjUkh7K;+!w*_e~I+6Eb&X`1ANhapM(asqb5EeEq+4>gFczqoy zCe)4n#4V!sbXe}1V!sq{p)}`nkaXV`y7UONVvP=oI`IW8W?$HyCR_iYFcZ`mZIBB4 z8h)KlTcTff}`I8#iz2w`gP8AWT8@09)Q6_QS==qYGYXNUS*ZZaT|B#L$~i)#e&vdQ&Okw(hd z@5`4}dG-++C<3d4-UMP?09vKPFDGyIVN-V7ZM8S2qh-FZj|2HVzDlzXn1V+PAS@pS zh$byDIV%HP-KXNf_M-NlSC0ErDho?WV}B<3Tlv-9n###Q9r;D!SOFa~oi{U8ZJaOE zjB3sj#cMwfP@~rBTYR+RYy5YzgnFZ$6IeP>&qtB6%zvA`oAH17a8&c>%I7}FRW>V7 zLrC&BVR(B{edcx)T*X=BW0=6@G4rFo1S5D7%|1dTWiU$XN&ejj_4hn)^1*4_|Jg~K z`u!~(mDN5oF58m&Sp#LMUT14vz$zxTc(RzhhZ$L7C0nTJO@5Y7eWUmqU*CI}v#f;@ zSsYLp@DEP55k&O_@T4)o)0g#jg0pH|wk!-&?uV$x}xv05(EGYJv? zwDspHElDzext`fouF1lE-#7~KY~5gfI2cL7PevibKA@$pX2O(TZ!%Ge)nQoU&daV3B50#EfL=|$hwDDgUILP?M+WKY2-CI1cm zuacJS8np6eHr*P@tr*f0zMJ55Ns_D?N|n=;%1M{fnrYHGe-dPQdR3HD3#4_!;In5s z-=4ErzOZMTmT?<GR^ zCV95y9TbQc<+#*8T-0q*kL6_Z`ZP0Is(%tv_w@)Q3pmAkD%`a`CfH=6Td-ZU)<|1= zE7Bv(QjShTxt!(PNWLr>i}6F@pO;+d%QjxV-&RtBatSkiJ!%dn&!i^(R^}LRn zCApGo=F!q`UgXj^Q_p1;j68>Ohc5l=Vr6w^*i?z01EVrzP~o1biSx4dS>o~-&Zxa( z9xFZdw7Kg?m|fJ#v!B6j>Q1&6U!Ph*G!|_+Cpobk(o}D1!V4bFpS9ygR42Pd%Pu1> zy>TP8v;*Yw`Kq@lB=gJ9^z(pq6V%kuOhm!Kaq*uzLM8F8xE}L2w*IZbBtYs^?nz)^ z(`bld37j8Fx&)rG=WM^HV1kdLnXq$DzcS+q9-`MA765QsY96GmFy@b;I)^&ojl95vuV1o#K zlcb|EqlIMZ38`QnB#I=;APVrIkn2$?G|)ElOm|nLh4~0^4Yjo~nrR&l3I9Hgd*arY zoDDX=MWtFWJ;-bB*APE!w9R4Z$M#H3Ksp1Z)hD1PjL4%t2yH=<$dU@a`@pva(VQwR zECC2(kLo95%h;^<`?E+`?<+#jA*4XhW0`!dwvu~2Dw~H;^~fj^9g}^X`if%tq=J{M zV>0vkLNPb@DT^yih_z{hNVLTgm?fsaFs%4Q`WhYhcL&82zQR1R9nG~nFA5}CT02@4 z6IpA0{iKoLK{KznOMhFd)iYmjcJr|cV`}*Qv z=BDPkH@~BFhYY5&H{RFypH%F{P0fI6K$e*ilpPo_Z{$3X6YPO>-*@o)jMryach{zqf8~ z`=4Dy0<)#Sn&_#{v!2JCqtZVp`n;qyAC$t`wYBdz{Wlg)!)v#W#Y|}xRf6kp3h5w) zwV?Y`NI{2rhE3>rSO6-@I=W$Th6qotKUr0)13Jv2kO1h(K-Rc}ZzaOL5H;vSp&%<# z3cv4)*HqQ65ro+kdSQo@%(vyex95aa0s|4y?22kY(WZNU$pjqBq5Y%uC#n~0pZP-Z zzW6A4PpC4fn)?!Fot*@P$La6DEqp_Gw*)^-Zaq;8Xoa4E(iAi+pX+O;e-)s+CE5IWdJI*>R9+B4 zRXC1Kfe$km8uDP@m37;Hj1X{6rT^oc3O@m}_umf~LwB4RS{YQ|aB0cL4Dde^QtE_) zQJw@5p1#8xrT+8n(#Tp_Ho|C$5=ej6<5E5FEBZ5^MJPu3To1+XAC04DrAi{oM+Ny@bdf^I6_|OYc>(jK$!?DTkij39r5oAujAwu&1Z?GPA3MyySQ)>4X2Z4N{VHR<;0ap zGEFw<)p!WQIvMA&VAiRMqU?z_YD0JIf$*q58yibjXSzJt63neFhz$!Dh$73?lMtg> zFrd2gLgw>csAF*#)vHzBdZ4QGHp8zQuosl2j#1X13YXD{4hD23xV6<%S!f_eS0y5a zpM3nT-I_blrQ%Q)LMFfM6H}(lw@>-%H1%_{n~0J}rV3L9GcX;F`JF2H;>h~q_!4V{DnyXTJMd4DR% z29<~L!XXZj+XzVxeZAoPhi#P5G=x+wJq6gl;|yLcV1p7`H|F9eoGks;6^oUf?!KVf zmd!D4AYAPye4xHiNgC8RkLNr&BN{32N?DrK#kx9`F9=*>Lg6k%G-&iK%Rt~* zD(;LdZK%}3rJuDK_>+o-a^3TL@51V1Hw*7Yze70qCND6n+!*_&mf8$#W?FqF8H8UbyK4y>cZtl$~WINW8Ce@qXY5c2PL#~YIqBJ}SVa^XW%QTim z)%6rT4zSn?;I}c|b)3GpT4rH7uZ=U{f5F*CcA$=j`e1huAt2kyjq@?g_H@e&ngm?_Ad0`7ypOa(9@~Qw)=DC6Cl}k5eGXv2y&^; z5+}vc-B~9M&!JlU8|gq)#7ma}@-&89xNeIo+8D4qF}-Rzu>+PcccKsPFQ(I)9^Uk{ z6(N{UT^~vq)VP=1hOF^tGJL#oonOjPa=t0`|LMlKIw<^`yx^b8YTI(R|N6-CnaX=nFgQ<|&2yq}3zj)9E=r5EH%hqxg9&^PB*sx{the|T#IUi&OZG$-^jo~lZVFzip z58utb_5Jf%LZX ziUm!HB=%-uN7%UuyFaW(nrUx8yUuQj7V(R?HXD|51a3FRe7jTc2CqanEF!aG^+UEa^{2`JY$vZv}c?BU9ki{uV_48L+!02(r2Rw4TtnH>LmCgpizx99>Ry#KrV7%uv5D_f30>pn;K4=(8!wYG2?DSB_BY z{|_3i=Iz*>17X2)KJ5m86JEy4n1^2h@yfZZ?7kpDw;BMJYTwlat$^hxxuHXWh)oiS zYVN!;=u}XKb!x7Iizfu>wQc|p%CCnF3V#%TbIy{nsk4&flTk4eFx*$j^#|?=VoQe}fC;M!%d=u{{sY@RNg_VGmfb4?1S%6KnAz^%v z#B)=pvE3!^bp$#riqiGY{h-DR=fJTOSuDESI-dJ(0@}rPG1<@`BP-$?jSM|`tr%yO zMsaA`Ux`g*;jA|}-D?+C;l|E=g#}z-!OjCw&n?)oH@=!b$?5d4o4 zmK>Z8Lx+hdqN?pa@LEG^IN}?*I&BKUMD|dc&l+oUlCpCnC30H{k&6D!K5+-A+q6MW zRIa_nk-U}&1CA4pN%UHOGW+wh$Q4gOC`NTm7xYsVK@L~*!Z7Pjtr zC{HV8tvZCP8>_NzeH3cHS@Z+kL{$&kbBX%f<{);K(@=y@RVZVwE}W!0Pj2m0xnnB? z`U83>5ZtX0RTo|`-=*`mY7r*Nl$8d_L1dMjc1&s}*)tG0o(ae=5i$g2?F#(0eZG_> zV#`Iogj3M^ZT@l(*HPb6_xJE1$^$+S@9%slxxSE%-!>sX7VpiQh#XOyCfO5)$=}&gS)I?01_FTxy(a^rZ3?lnv<@ z9{#=jQO3LgN_A*DnAGgQgnoi*PQL<7Vx;byqsPJfZ1Sv0Bj(V-9;vQgCLdF13Pb$5 zbhOP(R>}R-oU^#Pq?2%Ub=mXvt#UAb7q1u*O#vLIItP@iQR{&v zzus4?k@vs7|LO(eN#|!~ks#sL)knYL9r-QMMa%KyR{h*{#bYnnj=kfQm$$Qp1`4dY`>@tz8*4NAIlXK+} z1T^YZ8EEC+c$!yjz!$-cX)$K(^&J`I>$+V;8HiMs|7%&QZUQa{39>83-S;3$wSDtv z|DRh0+$zYMJ#rrf4=DRUv-D!J(h^r3%}3$2PXI_2(0qZYWYfC@c^V2P->@{EKVyZc zgo8s}-L_R7a);uj-E^~sEJbzxxw2`@U?0}|#!|T<&FgHt%F4 zBqiwzlA=%q8m#Uh{!g6hrSlF(Oj%N7e!Z_eB__Jvj>L=mJB-q2RKPUICi_Ywr=Kcz zTi31dZ5(Zi>tF#YCr(Jxx{C;|=NS3U%ho&cgGB(97?TSZVG6lfXYaS((&?I^ZPpi)%rVnzvY zy;>b)%KXD~{*RmKK^wnR^Nb~40GM4w$6UG&mto-D0v@)ls-z`}Y#(1RYk|4(=0v5% z2%XWE*RDj4_;Y=;1v6)$)5WwfX@1=_r;l&(1u$cOu7ka((Q`OW!>^!Tp%Hf~>$v^q z{#0N6KgP@>%Abqam?CUuWSQQ~UoSP!9hdz(vajFU8iIe}z*b+G6V;P_D=jgd=-2+( zB7ZqoTGJzEbV!>=!iy(0H#!U%{%cuuASd1AA2(-o`t*MRAh4V>!hUb{ zlZTk`fQrpY%;MVb)(*|k zw3bYmP+|+VgW|FX8RW2Q{|WVus?FavMp!4hsUz?0@BYJp)eF%@S^XExWy?2#(>S&n z3yLv;*6-*yAvgIQ>h9hINB#2M^n3(Xco+J#>ofQf%F!}8sTE9H{pV$G`>EI-FM z>2%AVlJ-jpVXfxqHANTm5o<96ql#nK_k5Cj(y6bfGC#d9I5TCw8g-xAT3crQ4RzK4 zH!cr1>UQepP`fP=7uHIeQTdxN9*wNVPL@GIu-b)qcNQW>a~BI6{R~m9CtWTGy4(IAx>)YUxsU{E?=pi zSW{3FM1B1e-uxQ+u*+R(r#T}_X9gVZUs*PL7xP8!A4%E`Bui7hi>v);L>*2Yb2q}z zGq6SdsoM2BslGn5YMUFTegMvKCmQu0R-x&S9xN^$s;YIJ3%akU|}xbTQUJt`oPJ%~F+%!Z+gHsy0v zt4eOM{<{lo-;g3toWl&;=1v6&G{um0A+Lqab|dSV+XMm~BjE7dD4&R^Q4py7Ls55g z`vmI_eGf6lk*5$#=vH3B<4a??PeK-2JX?J^_==R{AunuoVRBbDmGc5Xia1>?L)Dq- zA9mX=c9VurOF}5UMwzLcwhoc?lT~7)LVrUMlaXOGws zS*C$glhNc>GVD302*qx-XDu?jE1+ZtJ559qx2}87kRhW_x(`E~3`V$#_p8I%H#hux z7p{fShKTUSReLYBxn-iKo`&AHz#TAW?7%*D;jT5q4cdn`oT}jyDe)1Xn>OtFc;4?k z{n1(#m5zP9cf{`0CBGhpa zey2%{VJYcl8V;IW4qb}EH%=2gIAF>e9V5PF%|3)mij7KnJSDvzhXIN-U(%({MBb1V zm|BGpcYQyAsoW&GL7{xK50kZKr6X87_3A1d`;qQWxX~LO!K-%YaeUBGI2xyYdQlbl zJ_r`*#N@hN**>%RPBvOMUPMUgwIA|O=?}|x_N{Pg>jA2R(Az$TGfy4EY2UY)#*pqq zR%zT>X)BR4p|4{QKTQV72wktQE&h1Txl5Yhi2&s zK&={H0Sz&sw2THs4a=;{E0%Cckqzr|%jq%=*U)7cRdj^j7jY2DqLKR}v|yn4z0=8Y z{a(DD5mc#maJRe7rTauh*jiqx4cz*eS{cgJuX{~Rc8TeCcPD>|SW2vIEU`85Q(M?e z;rG@G1%CQY*Z!4VHRO4Aa}&iOK`woND*OY&X~oT9xfzw)B#{e%ePUf@gvNy%VM^E(!)~Dw%fD4Q=e6YOB+3L^tcXdL( z?hc4@`m2h)2d*n(p>(GRZTX9X0mAT3btR{>@JFq(sB0454@Jp5uK=ny*AZ9~H96&mOTscs?N+E@7E| zyVcWpHg}(a(!kipm>XuDY-{ChL@Q+6ovq7p7ELn_D1ot9Zy#Q~iR*~()wo-+T2;{7 z8f@~@>s5TcSt-XPp4C-KdC*J5hYujPf+2Hzr!w(!@4;X_J8*<%(S@sd<*iDnYjB5h zw0I7v_CiZ!Iv>Epmzy0TzYYwUjm3S4m$#5$9cJaR7Co(l3RO#V6+ptexFf0S#J!&{ zJq+gm`2Vv24459KGdouY)7sNjM+|hkH-z|%iGcfcvIucRtf-}`P&j)J+uk-uwb$~> z4aIWRlTIW>jT8}-NUNHBQ}fl@C>*5f^#<@#8O~o5iI(&XR&XjZzq6QpPV&zxf3jNq zN@f?>>=<2@c$8qg+fz^J;{B<+s5fd5c9&Wl>CPLWRInvox9*N? ze1@98+K*db_|?swQRH1bY*r#7^`bb4iEbpp;+fOCh&5@;!6DdbearW$CV>WWa#X$q zisW5Ow2feG&*Uh$dFM^bsy!QltsCB3mL3!v4N57g?zhCS@)X;V?Wt2nwxf(2lb#Jo z$cKnBm9ClPkMk@&k0a+k0tqu&{`?L1$amFz4{|PRe^J`NJ8z$LJ!oC*Xo_I6m|HyW zgdOV%_-o6X2Z-v3JMRSAp&#|i(%MuYjt9X1n6Qi1=%w5-CRn0e{Id@-A=ov0%a-GI z-A4NKe+YaVYgy@#oL76W=p;G*&IT86BYI5N&ba*S}yLL5}qu}5f7$tZhP2oc9# z$3A9~d8CrE6{5^zmp#hL4#_z7=J$B^dVjvZ%lC5mUM~K~<&ra=&&TuexR2ZYc2(@^ zmhoy!^{VCFK;%qLQF*<*K5AG{(kyKT+%M;NuesQo2^P73Z=AG>{UVehvC@%qS!l9t z43{09R{Hj9w5-!*q(jTE=5DO#T9e6Jyw|tVv&mYXjDm}{wO@>^#hQ%s!sXuGNM+iH z{XP8ade`>TY^`Up+Ye|z1y zQ==b4))vioHcga1Ol-#OL0HU`d5Ng)9L3LMOU=|RvBzK9*k@j>IILLOi&Zy&+z{K` zemytBNjyqk1MVA!ooWW13bh>?jU)O+_rW;~euSAOPsjr-7*AT)Pgn!`zfgM})_k8B z>=Zt(Z#@ipx3^e12fy)}?J!{Z5n|++;i18(yBRo3j&-3~j+UVs^$s^7RRph`FNLXuO)!$LpAO+>BrB!ni)4K!WGy||E#bJ+R5$nC;IkA~ zg5jt=049qJUm-*A(mg9tA|@4Qu(Y5j_Ta1%ZCMpVU_zI3c^!e?Fj2V;C(`rMQeA$X z=wpss0Et*8vK9G+aJLP!$$#8v$3dhzYS7EkJzJZrYMz*|VkS37K-}hb1Z2wXn2W}_8fzq35-9<1Jot$chw|MIG$A*8Xb1Y3k+s;B`oS%&u8C7y?R^K=FxO> zP6RlahpReHdv>)~Ybq=2X{f6VOx}0V{fxLPD>ThZ6nahfGdcWi=mmU4vtO8!jBMCm z=b_~-Z?0FBid77VheNj>Aif|zE$G&YZ>C=@g$Kizr{oV%PkD?!!t3rb32ke@=Js#- z#h&-xgF7XX`mbZ<{u!+8UxXqtcZ$YQioX_kv!;vipflFds}Taox5zjw zS`WRAkV9~wDTiZuLb1&fTzO1XB4($TS`}e}NMO>l)GC1tN9LEK5k69P%2=G!A^!T5 z!enF_eK#L|$fj-k-MA*Vni7I~Ef4s(YN@=xdr_RMh~n7g_x-8;$#xz$2C^gv8cX3$ zmF?!QVSf{y4qLVw?PGsStMe3HUsP%OJz`_#;)H5(x$#BwZq4tUjT2zVIDhh~N9aGl zj1$0tzb2&ut!X*P#4jPTLLU`4)kbSA;@6_pgq?jMjqG1tO81}HarYfbo)Gu5tJ8Ph zH_6d{_UncZI!U4R4(^&4R{|_u1(SVLc!$K34JYu(sjk7Us1@A3C>`Wp=t7^9SQLGD zc>O2+YSJdh!8_~z3Q%;n%W|hXc)MC!dipp!09Z){1>IqW#>HO+ffV5~qT-eS=;KL( zf0uRWeaP2 zs>rS4EZNiScwE}OeyvIjD7!ouVt({I`h5ranQ7sb>Q;cKLYm3y(~h!H*Lh3n8m4S_r(93} zdAEJScE8UK4S!4Z{B^6eagn3siFe(Z%nSHWNYAdK+&TlROyowF$nx{rolVZpadUv^ zzNS168b}xTz=yNtWGVlUL}}Cug-lY37;fOXz_{Imhz^MHXaKk`X`+^LZdlc9v zUB9IN;%WWh`#xsmdm>zu?dL9V7;};y$G!m&hhO6XCw>nA(8dXvGJb|;`&H$%eLIgX z7Op4bzxKf$wb^0z7)6>?KQN-Yv_0w!7uT8l1!XR%vQJ%d4#;ng)&c9eQ}jRyNA6;c zI!(;u4mfm8$9O#j+*`P4nx}qr*c*s~+PBX#0G^jY#+T=e zZ4!*i-9zZFMjU9|w|yb;u$kmSqok0|I`RM@!!00V88l$J7Lbu0PtL~m17osgrd57# zGvH`x{b-+E-~74o?weR#7w+xUX_W(yooe+0^><2%4#7q*k|Iw1oK?O*82y(6H9Zrj zQ~@jwnu@B|b)djPHKN6(G2$yQ5n6r~!tl`ea7+___?5I3*BIJ&bh$7(Q{xPJ;Smk;k(?*PoFe z1~i}K1OL2ah$(s1x`hrJPI1aFtccq42B+C5aKB^D{sC9qeHFv4q=@_ke~r`Gp+RRc z;#X@Ao`8w9e$`iF08K?aGDu+bOqrfg+a(e#ytpzjkRG%r^6*6VfTh*8Wu*U~=Xk=4Wyqo!8a=8s+ zD|ghWN}2=HWVO7RTffGLS31fMfrQl#eDu4Jjfq|Dh{as~w(T*clAD^)%1uIqqT3tU zQLp(-DQW!7Az@O%e|7FdSvvDB;7F%cl{_yjA)aX%iJ9)GQ4HZL*rPjvAd_YtRlw6z z(g(Cdnu^mYQ~H5GJnPc~=b9@ITpCVGnY&6_7tP5^F^JjNK~?iZ277Z0)W$vGuU_Yt zsmNAd5c1LTz|2vSHFD;7QY5+Jmwct6BCxVeRl4vEIF>*o`Agr;kAVepW2tWb?ODPu z_)T&pU?nxv4_MrFEACE)O!jotV)hr_43}6oN{%EQ*F7!0xifFa5Y=^woFd{>Q9ERf zqQFznt#{m{upv~Ns${y_kgzquGBpoe;hxPAP7MQt$9yHh#W)lDhrB^Rht6}AM|gM9 zm#t-$jG8lk@9sX{L6HTA!up~4Pxr^q&6zUAGpj8uO?PqnVE+?3* z`{WUFWu+#$6W~KreM&+nQ@4A4<|!v5VasNi;mL-OPoRk9foK^H_vrx2uFOu5mNlfw z9s%3a&$APZ0C1MYybnwREiAwZC53J^ttlcS^XeTq^*RrnzKD7wwhMrhpy>+?I=eCW z3mb=$_iE@Fg&<3xp}5>{SV3N!1^>aC$gs78F}Lo}a2mpWtTNB$yaxAU9k=d{n&y6= zmeZ@S?R$UHPKC*G-Gh9UwsXY&E>>@Y^47-L6)viCT*8DIp>*JYCk4M3CHrg3xQrSs ziQ0O#Z3uPDA?N<^s{i3GLk9B)$l>$=LQ`@jZFDOIVB62O@-IJ_^8_0x*ZK0B;D9rR z4!G#*uK(FQ3X!+(u5@RcAx&Y@gunbmagwquSSO96Z<#Q3witcV8D9OF_ithsdG4Q$ zYee%`hm}Z}VXYARDyF7veurj%uyo=rxr1kTL*~#-9&}_~) zBlZ}pdUUIic8W}7m&hgoFQxdQV5#w7^1CaRYio8$5>LG>WYf!>5KbK>#Mce{I0vK5 z9&N6|19sIafN0DHrbGa!qQFU`!g>ltylz^*i`@fP&gf?(dF%8#uJ_>%=v7lgOi{r^ zDqrmdjk(APsqQUSa(1`s`zTCQiGEWhfRu~M=#Rc!MhDU3Q@XtjOgq0{jr76igXC|ex|8a2gW4n)X8;LFoEl`AM&9CyeA7*X{mRmN{Z&jOE6LXPSE^h;^b&T zYxl2aq1Stzf{)U^yCg39AP(KRDp}rA^i~1jUd>G*=c+>R_fAKL9vQcC--d$QL}g3Y=t9(jk*Z{BOv#c-QRF=^S@V^~LH> z-%@;BQ)qQ7!1MR{#=8GLF{29na~Uuy8RiHg7n$V04)p{{9Z0U$?bNIegXQH6*?x+e zOv*l#+f4mpIrCbtfUZS2qrkUynu=d#$M>e|dDmb1ItCqU_`h_z=;hZcqKT8l1Xl$T zua~rTzidU{)EPw2>{I(~%Ol+@wvC2MqY3BVQ(}hUmYJLP9nL5p{c<1bpCR)sQ8P+l%aReYrkDZnbz<#y zd2w_30LPCGvX+xJzBv2=^K#l%r~4hD;{%2^gQKei);>sf#%GLiOIKBOGWeA5uC%)B z<(>g467_*n#rxkZ8f20vOaA2TJ^HPuYH#nZMr*oPoNLcqb97NbJ>9Ofj9&)hkyBO5 z6Vahgm}lmIZp^z%$0Y)R)$+V$C+Q`|8x`AVX7lgZK0#_cHN}{j)VG<)e1~jt!+%nm zA@+z>SUFu-yN*a<64(n}+1jnzFKHfsx@TZ4u=p&9ZzNr*0-yCKi=!wt+HR^LbaPxS&DY7Q7}NG1>rd4*6HnDz-N8*K0Q0j3yzF=ESCwBL zI*W}y4fHb|hVbaMwHGOpEy+EZ;QY84`$+!PEcZedZA=1;Bs+Pw?;U5pq5C|N$J z3Pi|0%nKE}BTa+97Q@bh@-6}X+t({xxCsSVzts*f{A$wk0YwknB)C{kw0r zSw&}x1YeVUCC{nGxQ(?`lGZw<>m^s`;~DOgp{sUiYwjl7b$gVZq@v@z#`%FiT=!j9 zA zGlyNz1Mtdp^RruxYcP4tpnbUjes+_ibt82{Feq4PXTW$ac*=XNnKisJkf#&`?do@? z28yjlhXcUeL?^<}C?SgT#$Z(VFI%o}EM89Nw2cDG$VnVE+)NW|&SFSq9e{KbrMg78 zhqWnxO$s4}hQr1X$QEGnZ}xa_j`~QG(<%V|h&C=>CnMDpTZmejtU8^4=zGg8dA27a zwJhl9PO7Y%U5OGD^hQ{W^{@GA>aQa*N!8X}bwJ~gbM?m`rJphwQh{oTYJI3$4zYTA zT=k>VwmlawmStXP2Itb47aSq&(%x;aDAfX4L%;?vvU#$@$5ZRdXkF7BBKG@s8#yJ; zI;%AaGYz{HZj}LB;@+n#3g@+#L1|m>I!I0*Yu|kajy=eL7=Nlo^~w~rbIdrBuH4br zT)`jl6m2J=FOeALBZHVi44yHQE;tMg49s5g@)1@9rg@rNC^qaNqA#f7g}Ao*I;T>? z0%%VXIzDlCDD2!sdSGE^Wk12GxO9&t#!0W0J1(#%>}^~*kUA1*3g2d041P(|Vx2&{ zY{OCT(#R@uC}60>l2Op`1AH?TEu$`9&28}#Kc>%W;t67Z1dA3C>Jk{{30K_aR%M)j zq5-rc;~PrNzM36xi?z>Q@L=zXkix!@$O8E0(FLEJBczDG&{DiWJ6reR&>CA185zS9 zhk{bg(Qnwpuz4D{Uv4zzp<_84Bzf`b7}!fyby?uJDc5lTh;ZZ2NkKc@stn`7IKzf7 zcY;{6!(mrC0&pA^zvqQpB~B`-QT`1~tCxmmuYajStZ19!1O;7FXDnmv&sDDlUkBxJ z6;t3GQLIsvN=P0ESaHQ-dd(<5SOs|BcQ)Vof4XX~#6d8&8A^zSocX~C!yfdQ2)E>q zV9-!`-AlimPkK%$5JH9L&LH&$ztO?#c+N(!c)>pL=)ZZG&_LcB=`Ab0WvM(J8YCSrv`~0K%$3XF3H#-Tdb64+!!ys4%PmZ-_^sa) zSL~Gzj0fLYRZo`(@=*T;FZb~^548x!08{k@IWhZbO}+Viwb&$4#}URf5IJO6SoVd~ z$l3grrVE-utj@ciU979PXM@&sZ1@D9sWF=D@KCKpudiXP1xj8gbR(tAr#ci(D9 zjU{+>IYS*!Tog-p$j*5yDr?0t4WptrAJ2_wYO$k&N+9@xX>W*7Ik*4Cw+sm}4s~sD zevGkZKpR!5`TnKLrLGrs-EaLW#Yl^)Hhg0uG``x%*M3+5iLO7`amMAfhcn|rKP|76 z3YXXPeFcL&`eh*6wRi;@l~O(Sf&2eDIhS-^Nlk^@yeSkra5OBHeMDk4HNUF>j&TNFNkcwT$5!XMn@~>CA0YtM=)EQX9JZu8B}%ZEEl6{d(FH1>32s(YWn9S z=XMiXb#r?Au7YZ$CRB~Q&?grM?I#h+H)$*o7OygadSN+Her7G?VecZ?z%S``uA%D# zA?(M!gwr!L66E4%C~>6(`O}S2%Py+ET4vIb9PK7v-sbJmu%d!6b$?WsZY{yaT{rFW zMZDsO`V$U(flg>V17S>x3q)E5%^Ui1MQo#pCzF69x_%F%oX0|mHsB{}g-PdMllW;7 z(-32Cp!nhHlnxeVX<|JjsP=MyuR~@}^7n0SV)NEdbMxf^)L8jPQb;SaDg)|S1a(J( zPN?j+!`4rD2^ANWAsly|1l8B7`sb9=GSgFGlgjV)nAq<8I_#2&V8nA&S-%%&X^E>? zo38e9jkWe-Jfl%7GS{v`eSTo`;QC{q9M3saKG{3j&UtsG3oE)42yetw3pM09w--d` zg6;4HI)_>*D(q1MyGpK6B{v4t+oo)BhKRk7m0asL`XvThJr-w=iFsxaKjZ3z??&jZ zj?z$WZq7X%H%83ZHTL*F|Na`NY$t6w_;022>amh**F!u9aKK_)5iLR4{T+Qj^~5RJ zo{}u^jQ&mQ?Jwvk>GK0+3U7W$Tm8Tz2x%~!X>O(iRLO+?Ew$2bR? zX|3hho`CruCuF@TjZ$&lv3JXzX3p-uF8<(44h{9{O)9ghj`i;g4M>6wMr!rS?;Etj z=uO&Rs!!jS$xW2mFth2blu*PeNpz(rlxFFV;?Wl_v0|9M z6xMpin&`RPK&OjPzrB53pj%e_+}pn)l)th1j8{DtvcoHAIo_BRU0R%TJI2=Xd2@oS zruPY^Q8;ud$0KR#JY(qF#S%>!z6R`U*#G|;l|r?HUXSwXM|_7b`K zsuBKZ!TYAb9qj<3H3CyH-F57R;MO*jjUAYlipgNdl}>hsNHjPjIbcfS)g;xIR8%co ztvnfw1RPSe@+Lu`N|r>ay4ye-d?fOM{}9cEQ%jhCQ*inRj3$B{COs=8d~RrxO*M=0 zaf%y%*3EIn!`RYa#^Z0SGO;h??^01@mdN4nE#@1baSx1ly?`Y=FI|FcDJVv^F1j|? z3O105GjBJ_!-^Rz+&ZO-Q*})^U}sEOV%6F#1cVns$!SYfChw|q^p?al$>~RaixO2Qr01D*B1E3BaS*}!o8Xn-La}`0W7ldWf?xk z#k}R$|6j@zKwm^qq6N|4(CqtfNFj_GPahb(Rk1-)zQ1s$K9J=T!lNxR2&KYnNOv`M z{*V0kB);*tV^Wzn8Ssy**Voe4OT*89(aGIUkza|MyOWg6}~elfWMn%BFYHyuK+2_ndBGv zS$D^LS|u17R*vngXJE`XLY!~wEpHc0$1OZ(l5Y%Uea3@81Se&^Z1(Q+Y*~hk^CKK) z%dCty_<3{u@@m{_*$5~0z^g35{vpVLBOUTTN#q$A@_lX$^>}W0ru%a8hj0h8g4b}E z2zEaHXOvGXC^?P`D)N4df0!y8(`tS^q~kZWbGdem!)Og&ntGsDuF8RR6+Ot|?}v5a zd{Hvb?D5fC{o7y_$6x070aL0a=^MGUh*TMU?j zI{7k=n@&SLvHGQs6nkWk%4>bPV#p2_F~sl=k~QGB$A9jrrQjV8W(1z*Bg0=rFaF^8PZfE-%>c{3ZCL%960k$H7BZn z?%Mw*f%_Tn7KS6vnBGv6S^QS{PN8sVEkxybvtRd`l|3c{jk8p#L;EudG5UdAzr-RK z(Qs(F=RU+^HkqKx#jf62ge_H?CLWK5dE|$@tf)5Q$T5;hGwJczs(QpK^ zD%LcbgOqa{=~W#bEoAU&h!2;c0OsVUxf)aIqpM)@)#rSV4i>Lspu5?W`)~ z_t(i7i7G2}_=Ti`@)2bhE;?x@+hRb#VGHA#uYVNM z7QkP}!6?ZoQ!jEQtE4k_lhT6u2zrDw&7q4hcT{8j2`56q}Ua}ze6k@~$FMc`DD9|gY&K1^L%rT= z<>?8}1S?T&G$ndW$z#K2xPU0kEmPo0NU+-6FwGTwq2i#6*P5s{# zL4@*k#>qNyXNpvS+x7{*vr)q4=1jM7$aC}Wb5QTQZjkO%n-cDj%Ar3; z;8l1G_j7+!Z-OfYV69FrUy=&fP>PRAN;q{Y&h1Ks3?+1X?;x8BcH5k4ZI~ z)WgKYf>V8jQiSZUc2h8qPTmv6=R~{Aap_5LfiYdlosSW6!&wu*F3@!~TkmxlRh_aF zyOGMVHMf`Cp+F_PUhMAs>2pKMG(lm`(#>^Q^0Pum8^$@Qs?CZk?;+a0_Dk-IDzx1b zbgQR~4=lwAa4Z_|Gr}w@r&qFV#VpdI<6Hxb08i)*!$`efh!@RYEA5RyYtul*O2(0^@1fXaTDZGrR*E6lmq);c z9AC9yMObYe`y@P9>vPFqiE1O^rO)v&aPRKixSHz2no4_B;vHvGO$fQe%58$1XxEIY z$5MBVh&fUo-XA3qQXH*2CmS&R<=g7dVfz((lPt*3*Q8Y~oZ-rja+E~DReQPoVuQNg zVlZ9bV-iR!)*+<(*h%b_Va=SwvnPXm(4KiXpYwlUi%Zu6bDN(GHA())-Q5fLvu0(> zzMeyxtxU}+&NkbtnE9hp$!I2n$(#qUTEwOj3$}!NbOJaPV83+ROLz2%Esuk%KdFA< zoFxm+p0r7rxGK-< zLaV>_^tMt_`r+Q!{f?S7Dhxi zPF|m8Xw_q-W9)@_sVrD*M7GoR@ND-c!%Su%wRV=Qv$ldF*ttf4d3M6?sKd zVssntIV?>FGAH2vey2yy`t2*)ovvzJPM^DVvKS0Z#44C#ISO z^V(bStmDYOk8>~0)H}^8fp-z>_OS*&G||qzCeoyijR^HNWVSs4J2ECyz<#VBrS=X9I)}0!GsJ`&c~Q0OW(C-e?w%(0q-fa z4)O`IYWBBFaRefWik^J{Wlq!b+-C!qaLtTiJ&2!=os9S#lP=DDJ}f=FR{hC2JgYj2 zj#Ib9X|`laqc%I}KB5)T?aQ`SF(#xDM8(H;JE;0*gzRii`%8W~G}p=ptB7-4QPu<( zx%IcZlAERSEv`($>#aLG-DA{kZ0=xeFReJkhE}vhdDNtHCzJD5$6rQ1OlHM}YA=6i}c@H1oV)9#Y zMkG4)VMk4nSBC^t=eD-c;*y*buX7a_a%S0qH~FrJ%>K~X$y`|%%_q&E!cdPJ;uY88 zi3(yfC;5c!(yyXrS5m4dy$;`v#@zi)}I%bTn)glcA0T^`(Cf&PHVH}(nBZVSk8J+w59j`rn1JcYYAw}!zHTjvqwdw{c4&3~i_E_ra zen%wg>~56B8gI4)l?0xtwJF6DnV^nrloqJYVZc$rQ)zTr=rmi1hQz34{yzdJN$U_# zjz}wQyvf;1@9XN$AyH_o)@^JG&ubK~KEPo_T!~2F7ZhT|jGBSg2cn0B40c^J8Y<-; zmH453w!@eE;RbsVf9#Il7p)b-oSXlXhKp-fT;4J%^c+`3=lJVxQ=2KEH%(NXtV1t^ zv+@$0A=(8$Nwa6(pHOGEKA*Q;RX~g@_WeiPyUdXt?r^U7B9qbe-49pOMnD*^Cjymp zHSTOdi6|)PhMpq8k#z+OxZ6QLDe{jI!8psG1$Tf4eeMdb+JfiVJ#^BP@x=l1Mz|Wi z8p9j)AHPV{^6&6rw$J$PHMl~@$r@<_5n6q&zv^V_Z@@5 z9ks~h(e>n|cHomI`^|W#4yZ`iCO2;-gwlaH!yIk|bI`F=iP#pW}BVv(! zd)M+wsrS$0gH>+PO3gof<2N~;RA%F{7^KuWKv(B6)H*1P%v1cYbr8l>{}pH3zlNKP zbD(>u3Nw;3B5h){Nel?0#7uO9>ZA>+A5 z<~;ztY9j<3djatn8&l?gOhiox#Y(t|fysamu^db*1`~rR@iS&A`hD4BT6Hq>zZvq5 z89arXK46XQA0hHXpdGEJjwb#x>&_mS%oH5^Lj#|LKU%+b+3m_jufxq%uiSBP#w0yg z2D7?Pd!TOa)ds`MZxG}=<;8xT1c18Kn(LyV{nuiBF9a^g*4z9q|JSDry!Y0K>19r{ zac{V4uw`Nx*dpcqGyh#JK%NAjRkl_uu$v+ZGI}=xe8me5#r|9Qq#MA)F7M*omI&oK z$YIf(Azu!N`GD;}_^I!wAoC;-(7e>-I_(`x+5jj)F$3*Ir{JDirleL~bDBjgbZ-`9 zA#f}|!^yI9fCB&?l|sUIxu!)IP`ye8x}|ZjJHOQ{Ccr<%LjM~DKDYwqyI=o(@NXJ{ zIT;`)yfq5iQP##5_gg>K^LN^K{k0S51Um8}uh$UImW~cWikf-<5f}vkJx4HUDFK7j zWiSwJ%Y{&0D5xxHU>BdRe*DD=x=cymp-F^m+U+zbacjackU|X zX@nbg0t(=Q0QeF$l(~)nEyr&vfl7H00&Vxre)&(cXw9|oO<=XT9IwVevzldhcpLa~ zaBip-g0GJ>qWmU!NOGA5TJX=ypC`t_K~RIFfh>!eCx8exI7e{cX5Gw`yRxr(j03%!ONolzz-8xVdltBVIk_*`K}oacHS{jN zfH8O#$R0*5!_SWp6KGbwkN4)CfNzh}w-en1Fd@<+`{ zz~g%i0HV&)tL_?uuHzC2A1Z*_r{-b+*gGE`kLl5Z?@h%8-F8uB7l1+-2HvUcw1~-H zjZx2?0Xw%~LBKgHNc!5K_5WXU32qL!lLPpAa0l%oM1?Zl6HP&ib6foSZw59={&nFF z0E`9OvE~C7DAH1{qDcZU86dQ>f=rMzfogOc$N`&zDS$rjEkr(Evv@src~B10YO8|% zEPzdW>)*@))rBC@?|7j+AP*&+2Nh2hvMK6@c|aYA+_5WI5{SzplS2(Ufj@i41a^lF zFd@s%^#PP?3lmnU;|9PX-sGA5)bO7(fhL4Kfqjz;|8GH=(f}62NvTLtC2<>Y0@Yyq zMAlkF(PkWGJ>pXB?(|LLK2xHuX}HOMKxGXG<-1!Kcuz>1bmNxX`T>Bmwm>LZKI+VW z#MjFZ2t!lvhwOjLXz`W>zzJ6XY2OGK1vqI{7JebLfeDMMn9J|8O#BA8=Gd=Pz2C`Ep~hE~1!{k;nPN zV7z7I6t5f@ zdW#_Y^bxqFVlX%?2%Q{vNmB_E;cI&>(zTY5()1DlbSA7`v;J2gj^P0-nK9`Zpvrgufm@zcK{AD7%PPA<%AZgT?XjVt@%a&ky4NyEr{T%BeW+5lKrTuI*2w zeex41X|z854ea3WFTw@nPW4lPS0qybmJ)$TyJx@~2h8dz1p#!V%(@`t!ql=}l>~|y zKVW3(;I|q0isl`#1hhuvfFwB(cE}wBz6JIG!g4}5RMB(XJf%{^ZlKSz0P&RXVch_p zMd*qy^mqt@ErP;rY|!D~9_dNBOL4H6Ut9r=H9r;ZQ88IuJ9Qr@H93xL0ED|P-D)u& zEP7hRg=o?Sc-Dpi3EU9EFydJRpOe&9WDQ>N!_TY6|9`KWK&94$Km}Ka`T}}dPKWzF(`~TLy22bIu zkH9==RNQ|p8v8{KfCz@c8gv5nnc7aUGi=E{YxWj6)Ngtn^Y1AzB)4VshbYt)-xgwU^o-sUXW-Y%Z%HT>_p{16D-btBRU4FQ2EYZc3`nhzKk z{|sDi>mh}4n8yvGuA1v#8|V!X&~v~EWVs%DzhAa8ZQ#7XDAYMlz5n(FNPqu-iy}h# z7tI)0W(o7*zkj(Fnp=jlECKublJXz-^H=7(vcT0Ogem?*tXsp9RCcFOl*>GC{{q{R zs|LGiK<4JOG*k*5IqF4OEj>NP&;^-10iV0~#{Zr#!++RqP$^qp`u);8)}^UoxrDUQD+E!6_;q)>xZZ?)05AUgZx8oTzt;2pl3 zfyl)oy0;qIOTm`C_mDZ`qRXvOtB~VA=g4*?vL<|16}|%h9cz{*NSeJ3VKbg)#~>wm zE8SKtOe#ImSr3-SZ&OWCcL9CbHJnE$a63Rgsok4n^uW)@%{hO5SKz?}jsAnD{0i0Pae6#qu(-ysOp&-o z!FqFo*r0<`{@{!ZQ+5UN`w>u)w}Sw~(B@b%;eU|&)YCn{eG(spf64VzglZjx3D(HA z2zqcqx|coc4lCT#F|(IGV;m-IbF2>8nmah^YYesZmJM zpO1p%rlm`}pdW7w9XO;~(0+%Ujhti?UegibCp#JLecG;m@g_jIz6AWd-H-~i#`P(v2{=s_$8Au7;@R&h2K2uM z|HB#XaOyL>O(2FjM|C=k(u^5wLZ5E}oYOuQ2wj@{8aC4@f5t3&@(uks;1sRF@6cFL zs!l_4*iozKOC!}@7W6-0Q{-=tek#?uVMoan{LRmVZMwV`jY%^=L>Bl@aC#yxvzog1B2bl| zCp*JCgW7E8kGz8z{iLsl;KVoaO+V^7*_ zxcXjHp%Uo1Ffs>Vug$8G)|oTA==AK%T5^?Eg4c8pv@@rMGueP3;XB4r&@KD5lak;2 zTc^T0GQoIp$phZvj==BkzOo|%;YL+ zv$^Xg_Q2=Ggrr{9$x6vfd5cn)atuOEe+AD9hw$XwXn#~zYCl0whB6Z>4?vzYc^)AW z+E#Pqw4(8Tz?{L@^+q0-$!qR!z=i$sK9L+%&qwE@RYdunsgv=F1|=0Q6hF8Q4BG!Y zx92I*9vF8NorbO>rzS@Ci$cPqMsG9PDIqWMqa}N#??8b=aBPc5rHbF=rZUcnwYcEJ z5ZTda@ev%IQt}{LmXA2sJYy01l9mfeh~_o(wU;NqW-s7%rbKD_toULBX>k$AG@X7p zL*YBj5vu?uT_#|0)G}vd%w-xE11T6_t1!%0jHga6?*psQ_@>fGmZkT_UqdT+9H(o- z=5Ob_zB|IMIf6V(LKomN_Z&fekwxM9LRN7^lGm*)gum~Vw=&5RAN1dIaCOuj`p!vJ zf3a#_!=JXk!}+G#(=^BJr7H(GW&cctyDE`P?1#bZ%iZ!1*e{pL2kxL+yo$0W^`6URC}V(v=aPKBQHC*YqVJ;oTSAnVa`q6BN?^5!^NcFOus zoV@uW@af@Yf%;oQ|Eb47{wz(h`Ynuci>7UB%0%;Z`qIY? zdso8X+~aTh``~cyC~qV7ltW;9<93VRvM}jWYs?QV9v!8+!{6eqvh6p4aJ+e%N@tdS zAIQk9XDObB^o%LTsuvx<9Rb4eRduVMkMo0juXeTOS#jzr8K*{4j&RC9oo#Cq88{;n zwa0z8!D@ZBeY9Gx zQ2mg1-vZprZ!Epg>Z2X&IsCocwrC=TL&4j9_AnJpO~~_ehS7@0qJ9nn=_pbbCzIvF zl1h7{;%Ixc^;tvtU7L)#dMG#E!2^YwWUMDy59aKpmLavS>FPpa`rxq3Ol_V0I&C&jNNy16 zPWvL0%vyyr+Kc+zaQf{-R@sk$zbx;TAYT+dUSij=3ihQ+MI+Rc_d=hut~#YvARR0rD<=V9A}X|i@k==fwVLm4>nwYQ@VQY4Z>ay6no=J>%Ni|*&F^uJ z&GjRc$#6*7-I&w7jq#fQ^OaU+7psSuH*85UI4=Yl+I2-%;rP+fBrUEHx26y@j<9V%AulT{^gV5*lQD19FFm( zU{-1vj5@L5Q~PIkc$dOVRD9o^-zyE4H4~Bk<$f1;&6|8{3FId9amCPZC!czh_1ZhW z&v%c920lyH_8aO>a(dY

2ebp9k%WN+jzBcU%KFZcC>tOT^uFKoz0Ogum0Il19*R zhERXB2!Y7LS`n7dACH~L^B9?uj%(`u2ep(UWpL{V;wRwQ8@7<3 z6u_p`7&|*WtI2ygB%U|%lL2$#5Q9=8{qwrU8h+s@#?3*)yHU8&*@vo|lS7vUkzZ?V zyfRpm0n%!QRkbru^jTEIOQDbSw{eoGxc+`tM(4H zy2QE^WAHaW27^D+>bK@ZrWvPv_i!!v)6Se)u%wv)o8 zO{qw6*j&twl^Z8*ADT>p1UAk|aBBWnn79E76KUG!^Pq|b6(&T2K#Ns=BR;djgch4S z+1nSPL_82a?9s&QVOS9UkYQJ7-a0XGT`vYWdSr*^k`u|PjB72lECUmZ^1_cWbb@%bYMv-FN~BBK>D6} z1~BrcNOfdB))YEeE$hFPKzlmw3}xiw(~L2LgOd>@^`#%`&%l}1r4wdTY{p>pdqhR_ zyGzJ2E(7Lqp~4G?J%(^yS~99P$T%rF+kK-Clc8#_ohUhQVUI;-B&6GQr3Ow@P*Nvq z%(uqww>wc-SN2kMrJwmp;e|sFF$N7~$$8y|TR&}Lyw+5T1yWH{wH4F{KLx*Q$|^Qv zHqURWk~Dqm)r3rEn)g!-OH4tX!1^T9b6j>m!@f;|WO2;v?-~bG#Bo`N8Jac+e{=)b zgQ>^G#y-i!s(wWnJz}jZB}X0PuGuTK(749#uZ3A-yzi%hs9Nr+^Q)r*VQqhbIltTL ze;;L){WI_frz0+_mT6!O%6xgqa|FOx`Jync*`mjKXy!wLwEp7YZtWSR2qD~^UZ7i2 zvKNOPN>9shQKh>bx}rM^?a{Go_AAK zl=fx>3V}JDD_tFGO=?3t9s1Y1y4DE&Le|alLnXxfOw9GI_H#UdR7t(@QsS+P=#914 zRv8Fk1jBl$A7Tfn+w1u#KQivb`*L`%r&gfa;@vrQuT`HmK3|x&TYshVu{n6%sIwj^ z_HBDp^@jqubK>ljoJAYpX6UH}hB41TIaRsowGMM!!mlyDhY-EjRgT8#ga1h6tETd1 zY+nT-*$$>u62!Vet-=`#+cw!4GmjGW84-de4S>2#0OZ%jl+UW65=MmbBN~hN$;{w~ z!K6m+8Qj`tOUl9{A@G!t_%ld&8)a0}Y|YuIWz<{pHS(ErrD0m2ej@(yj-a5csI2Vv!-Uaj6p4n(`F!t~|vw&lnBDbI(}WmiGa% z@=>eW8IYeY=6R8N0*0{b6BZp;0$<)*4-0lpkRl3g&~=|{3fYuj9i!fjSyNr&SEsSg z2v+Gr@?3V|QvLoO2mzA~jBQ`M55e63vpel?yL)Rir4a|0%w-g^Y(4y-kMLOQneo;r zNCipNSFbdqr=nR^pAm`8{$tPA>)b9)Iz`ln1#W;ux-2vCs1lAYn&sxL7M&3E&jS9G zFrkP|P;2+RSDIJYcZNGn;IRI@^94$7%m{*6Vb4cCnI$d%Cba&05K!$j*jNZE_53pp zz{}CCmL3nVG{)vm%(9{at{-1~rZDouaEdzQj5sx&1pWen=xOI@r5*q>{@9~vD|##P zVjCMv3rnMovdkTu%A93vBBC$YbtPY~oY}LDugMD`Od{k79!_dB-1WmGqgCY{d=z$f z-rgGGspZk)(NB2(=6=vHw!66z6HFFa_#EAmL9Fjm`ZX^#>h;_Bu9Ec)7~__U8PQEH z{Uy-nTP*MuSs%iXpi)=P{?kNi9RF*(Sjk+_E9m!_BaJ>t-u8wk0M^-F;d3})-?#@oL* zl2W1?A7WbDu&{Gk_>ymSXH3_c+Qgdh^~mpxY&#&9K!o1qrtKW~SD$-$0i-@@TN5pM*|vlcYUhQ zq0NS4r_`+;B+3}d5&RN+@s=^Y;jFa5^aRGTkcqnuXhB0Jr!RoJE%Kwi7WW&m9=Mij zu%P`cY5DvV^%EEqXm=Q31N9yFRC%e+Yf9>F3V}Wd-K3TR))aM2sxcZ03u===E5aN* zu2V`@2g@=0y^nT%_*+kW-2pZ0Lho_zOLK&ff>tv(476T?E0 z$`L#r0jezseqk>QAWY6)`$qeApgHBXx#B>63Mu30oBo27b?(#;emoAc1yU+L%f^cY1s2A{Mr^l}PBa#pE z0>*e<)x3Nr-^m`sap{o9MqpK;`Z}WsW2Z=D#NbHbGPKV1biq1b)HZzfpH?6#SXt3n zB8-2}82c?|>P~SVSl-SEJQ^=S_rYfO+~{>~w7bzVO`ISl8}ZIi*zM4@=DfRD(UL+V zbo`o26{t;Em!HS@A(Rl8?_jkE%v5OInXgR#ZNG!UDT(ks+`3nAl+(j)Pkh&DRui;( z#j5YT{8YJK;Z3{<0R=;L@TpFqEi9YX2RY3nrM5~lNpf_qI9f~|NgY+j!1vNyP6RA; zv=C{o-oXwai|OH!I!8HKkPkvc9JZi7$pNVdNxUPTFByJ7JaGr^RkpbqPjzOqr!ruS zCL73wX5!Fo@vWTF8waFJhcz&z9kmy9A!z=NA+M=iE!%2`fNCtlxoU#Zpn2@Od23np z>SpS}>4T}5P_6Xp4+x)0EO3CocNT4-`F;Vr6UcWXf#r0SG*6j?|A6WLkG;1Fi>i(L zMx|4_I|ZbN5R~o?5g3r}mJkK$?hffzM35AvbLf(g66um25Ex3odwkw+zt=wBhx>$! zW>{<8|NMo7{d)JSepJ~TAp0I?Vo|%*!X)wa$(;PhN`#gW ze=C-E1NuPWmJB5w7utHL8a-2`e~n` z3+}kv{11RmCdxB$cb14Yd$R2v10 zOZl&_gTP~*y%PbsOpfzg|LizDs<_({PfaIroN_hn<#BA|))Z`4@Rs)|*LVA;W9P-c zO+*XP32n{4r4XnTemetE58urkroX5R=n_Be6dy`R`kb01O&KOJ7Lc6@)7Ox+IYU*; zcR$JeP_-F(sOYnotdoBGWHV>gQKA4>;)-Jht61p!_3$gRyfltEi3pB4ES#hv-gIex zW?Mc!#auT4RQ*^$IJ6Pf_Rt5dtF$WO5AiESGm-*N4b+OuTX@Eypr|1Pm- z<(G{Y&`+9h6=P51RkbTC&xq~Z`vY@A;)s69*#tdtKVMu!ss<>{aJRj-Fsj%z@|s}g zC#(!uyV{WRJNfw1WU(Lan3u)qLq{y9>ce|~vnSo2%!2kjPI{>JnJev<0xwY#}R zAY|Y_*trI}&XWFr1Am3071FL?raTGb_i_ zYPt5ooltj$4|AXjrm?jCKE7~pc#;Pwu*gZw*XQRk;%Vr@3XBHaRuy|-%f{4`Ay7q) z1+{i);ZP|bv7@}n9^W@pO7Y3c6WuJTFmTWYxbvf!l)LM>&cr3jX#ArLlG|O%PGV#1 z0pun#T+35cFo-CS2#Y8JThc1xNl)^}7-HYRQTx3IAwtC}CV!89QLY(Z-NhC6k0*WR+uSf=1EqI4xUw*+Qa8ag`eQC3BgGM06tZXjMK( zi24ma+gxV8de|&PWP5VTY=~cgP320(z(_1q>!k;NYCX6;w6-s=l(jE(p=g4;w4 zfWZ|_%_E%%j~uucij7G>z=FDVA6+_J7%=ECCUG_xdg*JJ+wcHAkg}g0C)3Ck_XVi} z&UO4ZPk<5$4_2~r#@AaW-Y@_z@>zO5cFd!7>0&HI=ej=Hd6m=?CuaifgIfyun|c)n zA4)SK7=4&H1-@E*1#74%OG2Sj)|7O*VvGy$9AQQcQ2Cl8r25_x?syqz)R5S3do331 zL?gPWQp}_MwKi)%8hi+QWz~gbDtP?#I4;t8{qDN*`}fhq%>_$a-Xk1_3o@^a8`L?0 zP=OLSP!W~qWB>!c=r)teTBX%4g3m@bNHm8`I}Yc7wkhiFvx@)vlm%== z(1#TM@FmJkN)I%ciAIb+-=<-OHv{IQ+UxWUtiaFNuUeSxj&O*ZeU`9e=@d7Y7k~iO zST74pP)wZ2wo+cRa=353#aY?6ZKcW-#t46Du8BxmyOgvr6NXJmczBr-UxL3SBjJL8 zb{DNUWN<-tnd$N##N1N?$V%)eZ4!BCml8Bfe~1&U0_Po0>l-^LxE`Z@R?hX^9@=`3 z_|99m%OqOh5mN%4FvA~maV?W9oJ1Fn*Cfq)Cf{P=CnpqVY_JsN@MeX8gB_MAU(cU| zYa9LE0)so5A&pUR3_Ynd>Hak7uL*_>skA?N^EaY8y81902V3Pk*gP#J`A=z6f}$i0 zHdn1WAag@wTg(yNo&~SgBZ#|itvn=>16ttMCT?zZ*0-a_fw#Z%zohy_HScksD4Ow| zP9W4g`~AHM8NIq$5UoCRUf)UWl{-y%aHW=L2Q3vX^9Bny)vJ~HxVJyXjUBd(iVsyP zj0+Nehfuy?Q6}-t$XeNCw(e-9)L-%w1Gaf%$zG$1-SY>2QwERX&P;X#cWy zyvO%(()vKkAjGq92Xsq*w*)?V4`TnTr~J%0K%R=yw>kXzk#Hp;T{%IA~{&{U?s)B~6}hR&?hMV3dg-e|{;oOsyy3GmoWB zrfs(rBILam0DHR2XWYN%rbx)AJxI^sbMONfV^%=ji+tJrrAL|R*#hB<+?U4|jBQ4z z5yzY8EO{|kd#XRqqzCLTy5pivz^1^;ylqpA@duox2QQhDxuQ?*IS#pXp3mBMl^=dj zvy>AC;k`>tJJjl5Nw*aXMB4H{ns!S53cL7+Y&0fat%&%)c^{t&l?o#~Be@l`7d0IMWyibCXcDQsq(^v?0ud1*QV80%8MtqFM4H1#sn>mr6^T)Tug;n31|tM1zAd zp($E-R8KaWEq~c6(Bs4WpCIt9J?n_7*UZRW)jeK|>8d{J5)};h6`6IEczD<=if?*3 z9P7^>PSsOzc-7PPWVa}+XE1_`MW?Z+?oJj_95hOLowhUj<<`Wi^R~@CsAC6jAT1~+ zndkHpq4tvblx*MfPfNKu5P@UGC&ODO>A_^Sj5lRxj{Tp{J`($fPq_PTSO*3sSC%=ZzZ0F`^=0d?iBhpL^Q|6Ro25n#hM z!yBzd{y+U>_C0ZvLCcL|+LVC^6Q0fd5BkfA#+!%=Ea^waEc_UE-q!O=qw!Y_Aq z&U9E==di>bCBm7Tkf57wGqOX^@g`zQ@gRT4qd`AUFaXG<4Xc{~5W3NX$7kq+U%mpwlAVEg*zHGNZZ1tBlYQ~nujb)pgrAbgrHZcSpQi2 z^r-wf%UmqSZX>|c4j`r7=JQQ>&;)Cu|1#5wXIrRBJrqmpjBFF&L(Sp{6E!y-W&m9zjGU6Ah- zE9H1dGIo7g){Nc!C-23tUwL*_d)XUNflXhZZVH~V-K1~~_3?S>3Y^~J2UisFfi(P? z2~16R#T=aK|4YIvI6i!FD&y)bruY=*cmKS)a@e;3X&?;VA19Sov>+d~F4Q0Ir=}2U z9Ry(-(JS+j&x$f4sT5I-gkut2w`k_O%~-H9Hd&gMLk+k0oTXx4QER8x}=Ea7O*O~!{z^GSDqerO7K4wlu&$2rHda1U6}>HoXBS7oazf6;bbmCM1N=i`X6* zIKBPy4|Do|HAjJ{Byx%5h%O*1Ko*d_;iERDXKz4wdQ$dknYZ25!YHE9DhT=xYvVo| z{v;+prop1DjLhd>qsg*7ax&{aFc$nL24CO`{T>nn7c&mrUuvZNqts7dbqVwjDq5Uu zHd<0`xaylk#0GlNDe)8##p{V=QcchFJ@oFl^)0uaW%M!FM{8@Uh8xSrD<}j%aXEP$ z;^iU2H($qJ5OnavC`9^_@mXj6QO^*P5ZlZlqq9y}*#dydc zvjpeu@0({od zC5=4NR@nPW->jzct9+`hWBT^3=lp9?U)B#dxqJH1(Ao|pioL=cSeh=rv{xLz`Dc&! z20-uhv|B#@&n^0m?*2m>z#CYP(Uk@O)`7H{2K(2px+6JYX9vvG{Ts4~qMnlgLR@ca zih!v0Cf!wukAuwMmJI$4_m8GZN$@TCNP`A(#=HN)Ku_SDloc*9004s5X8wtORgZUJ za!m*DRthLCxBl0Kyhh(+zFjR=(5e=+?w|kkb(uyLrv3OA8!d1<_PHMXQU2QcT`72k zXpri*8_<1?{}Ky3TNmT+Xz7k6Kqe=@diVQ(ThhWANwW9=!sZo_W&J_p zTs67C45u|ymXOq9CNyOzqW||m7Lu+?B@|8PCre0p0kttJBm`x1gZb+;m=>_uQ!_iX zbFtxc0!)_mJ3+ZNqF^X_>R;CXnpScED0Ga_eM*b`2LQ-Z?OrL%mYWzr==FI21w`8T zj0zRVVs2_Uqrq5@*t|v9eHKSYL+?7dX#gv8MLJ2959zjkRPOu-WDH`K3CuSK0E7zw z`;A0Q8+E2qBBiDr*ouxSPkR3nVk-m8RS^)7mCXDw@B<*Yw|{w~?u{?jto-{&{@-n! z0~hM8Ws+q#uKulT5P*~SJ%u4hcRcoqv_aaYS7|3<#WFvu+^6#;h^m}=r#F$kWva$k zG8GXdL>_%lCYWuCr!Wv8Ai`B9BwLNw0!NO0{;aM|Q?5mw)mt$WqC>GQ07@-E)E~%o z)@1Rt8RvdBMgC|Gk_q!a%DG_H&v_q34(H?>v*ix3qxv_4LmEBRDR^i{Fe;SYf3ze} zKx7^L@@p;rDIitS^=O-uYP`P8ThvDWu3?RPDBFmQq_1)O9Zl+M94mydSD8^Dc?={F zeJY520}twf#gJ}<+>;|Id<&#eiBA2lgXB6)uX&I~N5TWJ@|lEr{|_f;ibSTUyQ}L{ zVfhF)KY)+PQV~dsP)XhW)@udrKkH0UL;18KmHF*DWpC~3kmqZO@YsR<|LC2MP?zMJ z`H`Af9!W=Tf16$a9AVWLz!X^EX&9Zdo)`lg*B@Gv=g(MO4gl%@>soJ=vZUnQ-=fyT z?}7gg3_o%(tY~olA7mde9t%!cmYCXzO1|eRj6|#aR8=qzhv{v;l#j;t{F|zsVCMY~0WPu*(X77w_)JTu=9aanI+I)rw!>kvQi#0igDUyUUdqg@M`t{WmGKsOS;;< z0)ngyFt$R1&0n_m-qH`Y?;*8bF9a8L(f9sWjV~ngfjhvA|A^rza9|Aht^4&abX)rN zU8eCckh?a14md9xs^04IIvf8|2_`y{$^mF9QU5qC=gLx^ zUlUFTilf%901))ycSG~TlwDCk>7NIr8z-a~0lFADF_j9s&+co$?O6ax&>Fdi``S9b z5)%C$xghzPG?~bv^8$IQHKFGqfPWSa0YPd$QTh*pC=mOSWhwaYx7LRvJPq++)8vhG zUD)MvG^|BZa=~=-Wcl}rxp8@;1dl}aNYtH z`3N2YwrWiIYW|ky#v?f%050dt1>0dN9YDGhg{lHq6D&b{Fn~W*#p&@BOh#UgiB!x$#@X z9phg_RQ{X23S@GRLSbJkr}=c`s@*ez1Z=O|Za?acvjqF9!&8etjJ;2yU*DO}vhy8& zo4|JGSHmf2O{1bO1~RQlUy$EQOvANSUZ@#Oahw=lNWmy00t^j{I?#db=M}L^;=)e} ztZUZh4(n2+f2VQMy~t;V7-(@sao`mUyaL|dFiF7eG@N<-SMon)Ds1W76{Y%vBZ|AA zS<|PBHJ!w}H6p0o8I z#GL3YYE@YNOB`0HRK`FucSRl{4rp|McVba>+A{&L8J3ddfllAh^_fxc3S>x-Ouly~ z1VGvEII8X4V@|+Zi&Rela}5V!(90w|s6GSLIQ~oU2c4wR_SkPbHgY^eFPV)A}LAo0{T0ptlg#5oUI*+buaC_D^3C- z!Y7_RfNj7aknU=U@frcjB?U&H0kJSU0QeYE+k3ZX6DvDXfO2`)t}Zg02%hG}x@Sl* z!Cf+VKTzz$UMje16-dpoyOWux2vyYWOQ_dy7%M|$MAFakWb zIBVG{C`V(Z2pFX#(35AB`{Ub0Hp5v4EV&a|xxFR_kbE&9i{@3DG&!Fs{5C1y1pw~HB#sp_BpZ#86^}v@ z1^N>9h(wL?E5Mbtbqlxk8W*<*yS}DTL&GY?bF_MbM|%w*@;&t7`?} zre39f&-OdFVO5GBEC9!1G|l6y-_=K@1)$pcwz*v871Dah2U7j%>Fx&LNES33YJUOp zP&P>U#MVEP=H@L=et=C~X%ZmHI}AxJgDfyPfE0rFDlV62EY~h9IBeQToE!jb$+oTQ z!4($})C|8&sd3u^8norMnI`#7)3nwT+aq|IDKH81DExJ3f)wG{XmLOO;F2i5?U3=S z`V|2YDo;hle@Js3WQ2NX@%i7sy$5KS+K1hQu)iX{FHgcY`J^z)+yy~ZQ}+;!My(416;e(vclo>303 zMCwGH>69M;xaP-2Wd*s*iy=%x20ETyR`2xYGN}#_S|^oZYNyVs`GliDBRr` zd$dBgF`o+_KCZJWogZws4g~)&LmVJ;zC=}){@2&iV_UcnZIpkpaAv+m`aUdRG5RMX z`B@7^vSIbWLBpOg5MkMWVWk>*nWD@<-Tyf%#I_^d{})Es%m8w8cCBLm%ga9Cw@%G2 zYmY4Vxd*VFoE!jru=Z)GKR}v)I1ywoqOTHVyg=o8&e)C)T+QR%4ZhZc)OIka(H}^W zLXJ_^))Jr#Kaj6=8RvJq26&0-j>0n(AYM)(&vFa1ZWws1uEqbI-&JF5X{v#OUbgrR zfEYOWTlbn__0_ZQ4Cb6Hnu@Y1MA+fq)OQuT!V{mqqkXJ`rx~sj0sDp{f?_U*hl9ex z#9Qoys}PGi6~~05NhDkLK|7%1(TlsxqkWsB(LArCY_EyYcITj#_MJR&zvG4d&dbq- zed$UCKm-v&Wg3C3gZjk&H^6Zz(qN14O_wKq*Jjm;1&CrvmJLt{X!Lgk+$bHOB=f5) zZDxx&XEx)kJuLG;(Hn}WPWphz925(X#@0>#7Gv?<&129;N5k0JnXh2G(L(kuxY3XEMci-Q#ovs>w^CNdB6%Ms2k)y zlUPp`1EaaLlz9UpQ(p`tXeAmZz5uld)c38;%XJPy+rQO6vBluiN7N^0Pf!zODu~_J z%V(xH!o1MwLKvd|y-KGB7~!yVH3U#*tv+|5;wj{Olzanh8a%Poa*@dODGeyavo}az z4`I@*KAd6K`S<+P{BEX17=tP%v{taq45kgh?XZKaKTxQ3j*fz4GHn1Br5l8#F#6uO zZ^V9z?-F47XegnwJBwCLji?;3YG1!R!@izS|)wLYkt9g_4-f= zT+CM!!G$Z}VyY#`RK_rhT4tFxG_Mj)uS292xzBC3@Ch`!DdK%gXlutE*Ui0ALd3=` zH5>Zd$&&6tqx`U(b1o&fmP6Eevs>86>cW!d|um2&>*VL-~QPa;q*+2Bk)w`;&?FuTClS?MT(v`T= z4%C_n+l4`YqJOVILpI5`;Yb?SOVI7+LNs61fYI7?;#Q;c-0MNHR|oSBAa4m#Gk@L7 z@BpgC^pHd;O)dxNVy zZ;F`}{xkf*FQ$FBpQF*O z9qaq4!b@wG&OhfhNRxO9{lvqw9jQz4$%NPs6?GqC>OUB@bLukh8S>M(>ul%0HUk_Q z-#tmqnwo)$T&eC15W}?7!-+(Gh-ye*ZIVtVRDR&^{P3$~ciL3@zp*4TsN)}HV)~PlyTh>J9tMD zby*-zGn}nE41kpA33#m0c4PlkhPhVnkT z%Um{zG{j?Fp*68yi{8d2pM|xPo(4-bef8m|`1o@mXy4!*ztV*i7U84j2Yzm#@$k}6 zYFw|s^gSz#Fse)vN{T<9Qs*aMKD!YD)`KlJZJVj;o1p%OP+$IdA^n^%Vemuu#1^Qg z(veiZqt;u3Hy-F}VZqC_ZDOd2!QgqeVbb~BN2o1Rw>PQz7GnS@6kG~gjh~5@A;75f zs{+zT26RP_jIN)J?rBS&&x{I}QTJZ-#@@>$gz6HnrjUI`nlUTP5)|B8a>-^AqMGdW zOR+%N-~UuQ$fnIggEFF4*;iEvRIY$poMnpoj18)a(+ecR8~{cAq70f57o9$iK7pk1 zCDm$F5WZV?BDPi}r;5uCz3I_6G_|D&t8#=8-=~JaOa$CTx<5tU;LW-=)jq3e>eGfd ze6Q*wbm_2y!I zo4bM4m?D(_DuOKL&Hve1xOLLXk^wqm+lxmQ4h?L9{@7 z)ImtuJ^K?oK7p+V(Hn-H*ZM-{0~sM-%$!*7jc3isi>;E#V$z+2b)okL3&GITdcXvc zz5*k*Zvru-9S9PfYxEPRVD>UX$G3*2-uig;YCI%IlvP^-ltk|65cZ8!V{9eB!1-i~ z^||ILPIxzx%gKrlePsKpumbhV7>Tq~vxQOf;CLUS0{_4%?pZWM@^%GAScbB>3Ggwe zIgN);HWvWYm$mVs5cTD}byACJip(t&>-=huX@6+E(~uQJZdHb0PJUG|DFh+g56T$H zr~7q~Scm;nKy<5(HydmMpo}?QYo@?tVwV}UMrLYo9sawPAmNP#-~6IOQ7$sZCR4nv zMcL-s0an+R-GV?1c`W6Et$;(rMnl#^ufZcuJUCR75nPtw)Jiwy%>1w{5>6VV9&mZ2 z#T#(L6~Lygp4cUhr(WeZ7;T8dr*SIAv7m8PraW&3{pEQutXRt==_B%VE{RzEXYwaQ z3#Y0%U?soeHQ>r!G`9ESxMm+h!4Uc97p=5QbQPO)efRBUSsha=9{u~yn^r%?;Cf4M zoH7w1bTY(BDUnnQfI7SXAaLgUNEK%&#FU#B>hbi^EGb0J0o@aiH#XI)px@FjSedK0B^z7Z&vd#csKbArR;nk4|xjKd1^)X;qIkRuSdl>gsTsC01>2 z*$~<%mP2`jk}e~_mgSYxNQV9cITqqd-2p;pUV0_2i;KsJb%wv_EYQ3AAG3=HUJ@jD zS))H+lnfC`SP!jl0AfS5lN#+GEysNF5nQj{iK^eOYDJIWcd924Nt^sdl9n1PQQ@U^ zw=Px5nCB$YgH+gPHat(@XpX&NDRpM+SkxZ6<)-A(g`k1xI|>6oH5iTOlME7+)v@gI z^BZI>bmz6oicC->tUBoMFi4dwl-CavJq+`O=zgKFGh^)Ha12d z5&JJ7y&}B64kqcfh6QhmvEOq=zf^hNQA&{eMul+kRN367VUWgpJ9rU+aVoQzT&ECb zNgD`r?!pZ{PL0@*$nV97@nz;igV?W;*Rl{O&~cyOLTwxG?+~2zVUor(%XEpk`WI_1 zm{D`juPxqdj?B%&ToXB%-jp%8kLp~;LF=@%53OaN>!>MarwaN@k@phvKSqZ7!Mh42?7S*fBQ` z8YVJ5Hu|&M03ov+pGH_^(CV$hv#WX@m(k*W+jB=kqR+K03!*>jl6=SLAKwX;oi@rrkB(G^9dH6p}vN!~~ZbqvBqoLCV;LCYU*# zb4JZvnn$|ogDukE(ZMUR*Kf%3okIvhHlLOeW@fQ zSxC8G;?xiJ*yiWcrWXP=NUb)z)>i=`@wb5|t+KkOUG;_ESxcqZ>Q_h>;m>6v`4jjD z2A-1JJd1-i?^n0Fm>dw#7tXMjzHgz_6b+dScDNN7@#(wkbitq&sR(me$EU3{K8k+- zx<*|ouzbdKVKK!rBmSjLcjZ^`0KG{$ed8)C!*9m_udWb7rN$O^hWDmHtgfm63iEpOclY}=3Lk-}UVEs})`%ZEeuPNSAD)Ag6Q|>d?K3JS}nV?3q-ix%r=qUd72NdiH zKzYJ$q(`AG<2#D8Qhq{9A`h4b+Nglk|~7!TVWU0yQ01c9T(D3`2Hz^<2`#IxQSG zEgh@bW9uYIJ1-U5SvJ|nb?YpFN>is%;ypBxuX0ismi3Z+dmlUId(XzaM0?@JQRPux z2%(%Mf^KMGjiZY_Xc7|Zn=?#Ck!WIcFn(P*u_!nGS zj*a*(IRwqqA_~u|{M}u<-#sO#_mZ?eazNsljVRi|>z6DEFkUMEEZCQXu^`#6O_#B~ z{0LnAV~hnt55o1NP4uCaj7{w90ULMwBa8&hQO%1VA14NT10)6q{WizBW&3^H?bp#W zjF4Gpzd4B2FIkXbDf>q1G8RIy?Ey5v>sGvNVAti#zVTSE-ol5T4iyV{|9Q9_{} ziA5`EMN}kQA{M4^EUOy1WQC^jZAsi~yG*}9JiM(7V-<)cykDcJ!JbX_=4`qCJYOHx zr-~8HgpHrR3QWcXT7V+-L57HIp+0M17tU)%fd_|zHt1!~4MO5|<2dyoOfuPNpF@8T z!H`aTnpd$enzH5H*)*>}DQwTRVf43ZLubZwER*2UU4-EcTmXcIE<9nnG79GJFQ-YjO?MP1yXvB>kV`t=?_tX;-$OvUMt@z9Aj7**USWm zVE8APkvgGRBJ1Sb67_rmzADo}qVN@|okZ?oPEW6A-8)0JD;)u>-oJLAmAeR4@NrPs zP))^;UGfKAsq3k1cmg!h0aD>cfsTxfTC~n66DJuPnhb!p9FIXRe?`m5l#4hH4}PQ+L2OjGbmhwhGC9nq*CqutE&fV9mZB%#7-hmxMHB zG%^v|(4gZwQ*U8#*NIbavUvtG&g!hHD0^sCW&)V}Ude-|@wXB`K; z33W$bFwOb!mK1*x{#4NoWu|4;PVoR&ybg=U*WUWm)KOWQ)?P9kY%KUb#fL<1EGNXH zZ$~LiaLX!P;D5Gc_@436uC-0-`4v6HDn!vhdzRks;~QI>1T1RiB&SC8;Wd~x3GrW( z2k3bSu4xwp6j^J(#1@vnEjo}n1<7zQnvus#Cu@qwhlnl$cZ6h+itK?3W%~CfRN8xT zL%sK%k;8_#eBn4eQD++Ueo;Xef1L|4tRRU=-#mIZEszUn4Z=?la1N* zF~ZBlxSQ#@Cx>dHV=&!MYXc_elRV-5%+FWWQtkoF-vo05 zP12@t??R;A%;OtkVT4eSNy+*c-*MpCV~2w&_`sf31gcTClG3*V=ZMBmFks8hFfo=C zT}!4NWB?rF%`6 zaqM-|llARPSn!vs6`;JZ9IauZ(5%dcx3gmc*j5Fjv{ZV~&SbmQDv#iJCfpUcq!;QH z4yaTkzFRF`7r7aX6NRq-UoC**4pM2O-Is?xU@&(^6G#Zo>W#6Q2Xp`$If^_M)ez4n)5V?+Cz!fZZHJr&hl?)RYfIn#emXLE zm%l>V-wH3+)>-E1OQ~*)B|b`TP1xBP|L?;G!g2U?4cOoFb@}rEw5N|F9y5 zSs@29Htb-eYu}Vb*86WHgby@}-%0PKwMcSmz#fMD#!(}!S(7}nf}L2Py_Xdd8tAiQ z?7>+_#fXQOmTBgSjwNpF^<~xLx|wmlaLHp*nj#wlK!*T2O2&d2xo55|zus54@x|k4 z=zgqQw=|olb8`PdfRENcW2?n9%Ea51j1Y1>i-ORLPD4va3nX*5^Kk>F?K5H+t7e%- z=5?$C(ju`!8w{9O-IdYpc?O_gJ-F%jFf`n8tUo#Gz#V3*jOfolkm8;JEo)TjrGAGA z51wj=q(=2x9lS#y+n-+d1zuIh=$0z=b@F}pvyrRlhv4L;`~oHz!)yuE5)^K*#&xp@ zKaRP_ci|+<^4Fdd_vF*{tG-X+9*VeEn(9Fv(J`U}p}qc*d20cAdD~QFbeDqt-EE@A zVo{{h2_>24oye0(@z!9L&v#mfW3(2G678k%Cy7N+vQR2Ye$E zo-1pQXsMeCzsD+**=$PY?E56S zei}+zMqQ1*s?o+(jkQVW)nz{`>P+%hUUUiiNdh1cE1VO~M{+hDMbHU`v9nIG9MuHI zr-e-=6o_`BDaX6hd=@HA%~2R)G%U{U+(4#3$3mtM!51 zxtKyRma)zkV9a`GkrDNTdyb3nJ`?R3uB)(h32}6l^`cEmi|^gbx!1Bv*xEKW^rVHn zHR#m6jcU17D=}?>E&HuZtQ(3-WBm?w-%rQ2X^)hf5RjzLAyAylthO(Q2)iEmQPb!n6 zxhE4686GTVan_BQ-4hu;b_NtG9n@%YX-xR%M=jPuRlPFh2Co*%EbHVBP_K*mVwne= zh^3)@GA3|J2v-C=^Du*Bh=_!X)uej)^Y<@sRIgs(tv*L!v{6JZB|5H+JOvyr6Na`|u5D`c7m6!8 z7>rj##9w8)jmFL>hjYLbSUa!EN?kFK15=9=?-qfqj^1KC<)Xtwggq%E)~+&Qv_a$) z=U&%m7KNiL#yLxJFN1RTHjA{{*~%{j<1Uz5b^z@c(qQ@d0m-0+$qG`d^;;jXc4QRP z+D)@qc9hjZs~$!3irk{34y#=ZEs{6V81FOT7(x*BA9m*p-P`L~vx{NW8fqLrrkW$l zOkg7)jXHKr zgLgfu{3^?{6#vgEmbAb+b}FiQz1QqgsfJ2p-fI)3TB~fuwjej>ghI}pRVx(p}R8AFh`K zEg2%$j=NdbiGr!|Elc+9(>Rym66;us^y^XLs$LluM}r-hAy{=-y69X3?UC?MKgqK+ z81YrMGe=2r;~UN50K)39T02^OEKiG782Y#3l}nzrC<* zG*DMMvcA0tzcw{Gb3`=mQGux%hO^nxFJB16eyKBCezPo)+{R{w%!X~&gqQkCa^q{n zx!PUzAjp!H2A?jUql05d7E*G#25y`r#lKGFKIaSc(mpxV^+=A5p)vGa;-wEhRuh19 zP0eKIpf1bxz(N4U2u(KRIWe5%17l*jrd%GPm~fCyB^GRBd&wSGai7-HVys7v>1B`2 zrOsBW1nc4jgTE$Qx%)nERbj>g@?sgc})3}K1a!8W@Q@V^e5z!Ibej;2_hRv!e~a5 z-cwZ44U$L<6tB=+tJ62J8}#D?E#*Oj1O*221~X1?56YiNhY`craVb3FYVT>h*Cec8 zJ5p-9U+}Eq!oA#*o|Q2mxjKJeOrpL2Gzy_&5n}t?fXVYnBm@z{)E^*dl=UQ0jFCZ5X?Iv1Z*%`#`Boe0Kda2JGT~#(XIp zK`v^eWQ1Va^9T|NYIf=f3Jk6znqE7!(5N#yfGNFr8JmDr_edx57aBVi9+OI5QPnA# z2)bEBlZ>i5x5qk`6%PCTR2G+h-ETUa4Qe2iAyoAw8WtNj#NZsYo2xNwx!xyu7<3ER zC7Yo_?{8gZ>h6WKt@>?auvh^<)wYB@HZ{LfAwQH zT8vzg*Px6_9byshR2NIg54FHbF!TvZ8MW6v;(M|($=fnYGPNm>DG!*oba=YBj8ZA2 zp*3Y(HpTQ_U5}aO1yEGKf(bpr+w#q&a8X-d#Xu`f*_;r@?hzyVVsxG$fCc=K(|?Y6 z+eK$fx8O0sLOYL~&+b!X>pC!bg7hfK{F^!m3;{<~jhW1^fw57>eiYmzL3SLzN%xk2 zeC#q|RnwFsaJ^yj_J$>zaXZ3y+j#pDOD@$}JYmV2cU0%Knn}XdT^N2+&B`{rlO`d! zk6TPl)Y7wMju6iT3sn^pMjP0}_YWbbPx_DPFYlYE`BJ5@v%>wE(o<5dl=_vbcVW*e(J29{`>%WNH8wwOnW zB_7J_LFTB@jQaeGWb<725%rV3U<`}-5nri8f6yf(Cp2a zxobZ*Bi7?I7vyE}-8@%uxR3;!-r#GC9y_`zf&-^M1n>oa+MQQ4`ksr44i7e1rkdt5 zopg@tjB3mDNz=1>_=MUY8dRmX)X1dY^9EBrom@q0=|%KhSRCp;V;n}uj#xO20d}zr z+lE~KxjA<~VYrZ(`~#0Woi4s)$+ic02)QL$zO{8V9^zRA%>@_q;nKnuWe>Pj#8v+tK z&&-vzk3)k2lq^`(&HcUt8R;o9H!yEGB@(T0tf!W|>38JR%=9V5hR*_Xr)`$NazUg2 z?{&>zCpEAlAMRjzbBV}9{Q7xzJcYcA_t9gS0(HW5zA%ydl!QOWL^1GvJSa%EEc3J$ zE~mQ;gjHD!Hq&&SfSBTT4he@Ta;V4Z>hoC9*hv)Ph{2vaCa%U%=Q5vFF9?7gBV$%! z2x;G6nK3`Q#dSXGKG?8Ud1C$~sd!gNs>)S#(Sfyrj(L_q`*wb`7$J{Z>k+X@Egjfq z#7?xEbD)PF{+calbotTB;6Fg`c$cye>)@iOFVB3hjrvZ|6RoD zucjj6tjAK3pU?V|Cd)~sJrr_+%-P_*=Y7{*0i<8~d0Ls?FwZdKVlO3TP}LPPF=me_ zMR#t$bM1&?Vl^me3Sedxj17%ns`1f5(;W+l^wZO~wUO=?NsK?>NG5hxgG9>xVPV}= z&9L3P#QDk}NH~utyUENNP7m>{vDWT?p(cdvIM3z^F26lS-!68BuY*hqYS`r1U=qxd z;mqBB{h3%<;etemqQ#jOBdq4Jk1VVoEn*(pl;l3tU6-A)k^^c4+>z9>EG3 ztl5%&U-Fd;Q{$w5QL+ar@N!or+9}shJLJ*mR?_QYJb%_qs;T$W(In51TtIh(NtfM7 zDUpurV_BS3#8<)HNYk0`sxzT-zhmsm*QU)fUh1Bzr`M9)$K?8mZhWJaq5e{aoFrSw zKG708ljag8@Qq`-B$FeOwbzy){Wr1oLMHwwB3`prAd)wy1t?Wp$NLtmb8 zs_Bff*QMgWlA6Um>{j~}wEeLcOW>XMPm9=oI-&qnF15`0=Ip;8$uP97|dzuqCQQ>uRkBY{1_aUnnLev{OQj@j)9`- zvo&IUJH@c!3HHZ3TV`c^Op)Yg@4|B}$Ijgs9m^NQo6Kg6b$bG0w7Lv=sTDK0*XYz9 z|Hw*5-+h^svtL$6m}kzLd9Vx<8IUv_l51u3AUvHCL%pSNcqOVkqB$9b(({6LPDv<2 zq^GeA7UHnzrE6%VYxO++-d8VoNp50C=k)zgu@U+uARqU4O(3VaI^NXT^4k!; zQmJSXc10HkztDKKd)e{VOkGFWRr)qIpF9~|43yZn6itg1O`SgRAN@3=3z=dnlCCxb zjW80I0wLn;S?72;rstv`yM4Vu6;vEP0e`aw{m{#;{Vwu8tsMX;R9O5>=J_4M)#7IJ z*rcTvziU05;{r-#(HB~&SMRQM%_hT*Vx1ujnIpdD@x$?XXF2$~_(aMq>Jkp<$?o^K zHGcZi3{UUdbf2P7vYEKnmJAN3EBw{}iomI7Rj2Dc5pV0?{~jjfmee$C30LC5G|5P1 zhdK0|6TNM>p&w7M#W66?Iyw_Je6%b&0giKQ`FYc}{yUflh)H?A@4_e6ZfXm9FJ%n` z(9HFHMj?E>&naD@y;Z(e=c}iv)pQ(L5(Dldz3a?ZFY_j##2&{wit|e|?y9|VWy?Lj zOY7xkpZ6*a%lfp%SkGYQ{$qVBQOELk4?8fa*FBN%S8_h5@=9jrGrSEzNX^u&JO9u7 z9Yzb_hmFwN1GtIZ4QmWa_5+ZCdyAMe6~RRkU$j(RKm2f8t@}NL95b1yFcC%5OXAX~ zc(W#_@B{+23}c+^RxCGxEcY{0P9lY-WFq{xWM|B^Lb;{O23<5>a+^$a4 zHS9IZJolVpDq`&ob3ftXpj|Y#MJc>nED0d_s>cn8r27)T`FWv#OakTO_rl%+o)A)L zN#9qdqTo>iKIioS()EAXd+&Iv|NnnD9D5ymbL_p2UG^rj_sBS8B@)@$d#|jJnZ1Pw zIc7#AQj`uMiYPfmUC&qF@8`PxuIu_63NtW`A^rGE51rKfen z6lHx&je3X10cgFH%IgMnO?62QvE6tRoAGD30q3AGg~7cp6gb$%sL8q+_UFla7E<{R ziZ_=FB1=ivs3wX|B7mExEg1_YnMX9lLnUy}-0sJQz9K8KBVHxxM$=Z(9gVE{Etv&7 ziCYovACf*s$J0gkaE|xhf(>vI6HU()Cm>wcO+Bvv8Zf(|FoyeqK%Kv9Ge3k2AyZ9l zQ{F3ESfafw69~=WwXPnSiIjaP;m%3`Pr~yFw!H(fAsMN^d2a{LA57AbT*jJd<(5-5 z)7cw!E&2r>)2LK@O+|#EnHaN^SyH;%S7w*raGfN1j>o&1S!1sd2h+O?Rif;v1PVRY z@(ou~`h${e4U+orCiRQiG(8p?t02Bhe?(&0fVEX*!IZ+ewK`*M$e1yN(St$@^Z^OI z*Na2nW9s9ADQ9wp(6S5-Gq!8Iyl*r`%|z%4BB&lD$s3>ycNcb#TKW<-}}u;M`nLl2Wz=s zU4wsTR2g(FYZR)XQL%WW!0{AdtnQKGy75I`xx!0U=?;3DF)wP0A`J8_g%PBCp+KP0 z`qjELFQXQVR-x(wjKzJlQqtVOW_hWu@&H-e_4w!GE}6>Sz?WP1e&`J|g%c4NX8CsG zb__og==EabEIu2HDO4vIBV`0FNDV+EQ6!BdIe_bBvBKW*Nr>OxEgznM!FR0hh;FC22l-l6DVqg3 z#L?gpww|H>d>`N$RV*D}Q|$^tV>$c}gjG|+P->;&D$rPk zo{{xsX}vR+gR3s-_2@WP4L2f-AnFjXqyF!2yvGwCV6bi{0zWQhe(p>=#^*4`?b6 zIQSc8*i7SnVd&n3*CLT1mONnwAIxu$hYvviC=lx1MiPLH$8L7GK&1;^>3)ye+zz*JYi>x<^eyBhw5nFO&$C_ayB7z44i!WiD2*6lu&bIfyMRH z@83mY{`W=x_v8BiZ*@J3#X5h$Rsp24;xDTKo^Rh4+}YKXgo*R>lSKbVu4c!HVmMLQXQUs}Jm$6p0 zz>0*O;l;(_-#@Jvz$+vlP-mt9w1N-7-U>xP{rVaA@zes02~bbOAZ2#eNbIM@p$C8Y z!bGhCP+RXb#%(&?i_##MlRkpgEsvNqsatcje=_7TM1oL}T3pI{=YuKODV~A@ANs(y zF=qqcut7Oi8Jd~>+qJYRu3A(oGqVsUk$5_wt$5=E@R|yK%&XjlXpT0(btbLN?$)Eg z6ozV+(3#vLU`dw+pyoCNWQ+(fWPaQPOhMKqDltac{M>gzCvSBU{-G5>U&bga!2h3f zlxw8i-uLWFZAn_`0B}4&0cRX>lqTp&;#S>XDWpN6;w&2lPM1=#E zh=KAJ5Py<090!(Zc4ELjOFxIp2AL2-hHL>N5n!eMkXAc|_F3hRFjC|nAaUFwa}G$? zlV4Xmc_*>sEO)F4@dzoTndAUh{eoiuVmZ0|YV@(!Z8Z>zdWH}90pugP0Rcjqh5Pix zzqgK$lPAj`neu3sE4f@Rmv0LAYKfNv_XM|SeEKXpD|-_l+{Mdu_1+U3oZG#h67u;X z+dS0AfkTV?bP^p|wJu$=bMN!4BQWsD0$%3#CX;HGr-fTZbl3v(sKVj*E%+j{04(fI z9snz~Xp949*aws%;MZ>fCP|&@h6CWd0qkr0Da&DVDL7`#=0c84prEP;2UtNKnopw; zR?smP1kW*}K+t!tQZE7|r>CC*JT)IOMOv&g;z)fKr-iS7qVng6vL1P4)|v(`dW=G+ zkmBwFI$nv5k3~L~7};^~F=U7B449KSc?SaVVcArfq-n^g;Ks$3yodiTG3eglU<}sQ zE(vrz;LF_Egow8X`afFmp)jC z*Z)%U2Z$^V1=cRa{Hr9T0(pV<%DSgBzk%12Zh`$>JsY3)F%rep`|-!ez!B+9%-4>x zKB6#`fBgSwYcv|1Ky$co*GY4jv**C{}d= zjDRRilPYOd06yjzgw4WZ0t7^h%<^~HOqVN2mwh)$y9=zhJ}hqiI{bK3`Lu6dL2qGr z8poT~g|-aXv#6JaAM24$6BBDDP$reD#S%;bLV$4Rt@)nN>U;6wqkGfWBZ#7;n4Ex{ zkxU4%d?WL2GLqL`E_W-i(FUwGpWf#HgL@2I5bc&Zl1TXF_DAKHFds(F@oxdyMh*ZF zBy9Q;jYl|LVCU<3S_o_u?4JV%rMK-3*=m!U^eWEFsUd;jC9vA$`hxpSaPyc|>>o2? za(^-LiUJB&gnN-Upx8-<+l+|R6}{^G0@eqD8|vhZ)kC15U<3%4qTE4kohE=@oo&m& zu|9V?AZ|D+>^F6y0CX=K!kR4l1T+bO5d`MebaMH&1oT`krWfBOmv_v>Wmbt2?c#4h zi5bXg5qQxM6FA$JPxBnDXu614tb4^IrSV+Ld&hsooK3AvW!5^UPD1?E_|Y z^0NQm#e(_=lG%04edak4D6uJI3yT?akMo* zTb++|KXp2HZw~f{O~+l?9{FFoL@8KemQ5NXMM+buN{`_vob3Mb!o>nb85WPgbjdM8 zQ7*2Clso!t5yAE8S5hDWP_FJc33b94@)~{{geXvfp>75$Q`&8!9g|$ywnPd?xiF#6 zQr=XhoMN2JOSi?YZl+0b1;+21v*4HQKe_Yqd3L|B_TD)i#MB3&UtQ?(igja{{XM{% z(}FO$)&BLB;KbVb$0?XB2fWz#fK`C=N&9_6_W0yyw9$F!?w$#MWPi%q(7Sd%K#ZUm zMRvJ^NGmvMfMt9a4I=F~)%AJk$jw*#Z{$BM5o_MfGU;_t+q#Qpo3C9VNKFQD@v5y; zG(H!33d=eGEC1qYK~{8!R0#BEDJe!cde-EB^<*zl2KbG@!xU-0lx!dCKR|h@Cgou` z45$d7zb>x?OIFbq*p^fQJl|E*`~BvNWXPDh3hVYGT3S=H*y7*CbBS-CfBti2B*6{G zQZXyHt}L`cP|OoW;ce@{^6kUO>+Mfo3)+KSzYW;x?%2_d?}G#S`!x_y%9`{*S~l87 z)?5Uzhtm=?$_YVGGI7oLG~g;Y&fMSvF<3RGu%8P$v$*)DsgRYc2)FU>B1y?hh>(-* zu9+gciptf+qp7plmmk;W%evmH-ZoN~+4M(a^cBTR23hqxgC~~KI-+jaQBji5tS?eR zEd~B^n+;auoe$86s1^ z=lf9boQYfPD)au-pIu=9@OOX!{@&Zb)%Hbmn^vBs>X)W(FaAArw zEw_}LoW759E9sR&Kc8o=oLpB8n>Rgf=E=;3$Hu`fVE;*2(QXguquE_Q+kOC3m9)~O zVDp70n@*og{3cB6Bs7|0WCFzQms-p^B7XwwPB-Xo=9ZGK6?P6#Zzulv6f>@r7_^ag zP5AN^VT9S!1$7oslWK_$0Sww1cImu8W~x8Ps?}CE|999nz%SgL&(3TKBntN1>Cv=2 z&V?Sc1zdouSE|1Gi_YeZwKjW<+#w(>*8`}3ka&Z78;WOW*%?6MqqmB{$J-Sx^`TIL(jcD2Im_iWodW7y5|h}vhYJY7p0V{^h) zDbnA&juhE6dQG{GOcR@zSrS!80Dl|X4d-Q{kz``sQaYc)K;k(VG7@CpZouaFE?@~w z@;Ph=jNRq$t@FaK_(t-EQ@hksVwEb&dL+9I;ys)J4H>9Lx(l7D% z8Y*x3O2RgH;9_dY?E-qjOXa{?;vUf8&}pD8Ath1^BG!)h+CK3v7uyxYS|v)aM8khW znv}wS!ka5A+LICqQa32ZK(ZU(P)z)0D7!)Ju}$-T!XUcU#1t5YJ4 zex%%t_iP>7Ug+=Csa$$i7d6iGe}+WX#~Pe7S^y<0btI2hDx!m+EfkWn=br7Lg-nRH zGpeTB@juppQHqFU+dGqEG|HG-Dj=QfHEZ{>iTdo`VuHM?jlAo6Y(B|>K1i4%p?Get z{}WdO>~1X``7u1;3-Cs2a2^CKw4D!lPa18OeGFr!AfwEd8d%)6?WE8Q+i5lMG{1Hg zNYGRQHvg;d<}vdaVnLHzBN6`JfOOf9nNW$w)}~FhAj*T%c;n59bxfWA3`W!%|AbL}%Dxuz2EC7*v0-rvcCcB@0=eB$DwHMk|~qPjby@k%WX!OjY38 z5-*WhwR*vO{F67KFK~AA&$a*EX!B;LsZ@3~oQhzfQ6?nv!Bq(1VGSZ6~)c1WU0A{_ugO&dh1;KQ@xaY1LZ@f?{uvgCu%hGLM(4E#W^^88-={<`QGsf7q+*l=dDErMb*pvp!QK z#oc`&(FDz0xur0d@uy8HxFtq~5+i=@NdEI707&5|xBb`w&dK_C z)3c#l&^h^6@r=k0D)gF1-h@GQDtip{)Z=iBMoflvUuNJY;!_^!Ku?`s4pkLHi*OLp zr$ejRI^*HBk8n)Wx63?-kn6nK>_?*^j1T(_3y%{5VH3OJw6qGCK&M$Xdd?09-u<|K zfXv8V_vxHX17=jwM5-CR_RdYL+?FI;%s=i9;}E{BRSxHPP$Sks*kq#n;77?v%{`K3&O8dSBUg|p?M5r}; zyYPADov`nBsu3!#py{A({IV6C*viV?&)q=DXGU6c{9s-Y{NYc?F;x%Lj-)5QA4-qZ9o(7M%6jyo^vdU|wZN_?z76Mq zA91Bv!8PUp!G~!{r+9PLWvbdsT#@pW-sJ(R`jQXzI&doSsOeo`*Yf}=Y|O~2;nayR zdsGQwZRs5H@Tqp+AI=^`^7-i@#~^Svc_cAOo#V}dObpXeLQ`r=v;;t8S~u3z+}A&A z+GEnNoSQ6{Tt?jS=b%W8vHsS7<6>!Cpq0RT5kLsz8jjeYo|(+{w2qA;=nc?j)}QV| zPmm}YKpACIs-2fNYPT)p8?_6bkxbKU;Tuhx6iOFgc8+;%=lEjACfwSI&_rw6k9MaEHzXdHyl8B|Y2hBwz7B#@cl#ZD zLuV3r$sPj-*J~foZV>wFHH{?8B>!fUcV)wMrq_)L^=nDQKIYgiCiAtI%esm-*Ke^{ zRJhR6dHxC#c*xDr0XJu!$mSAp9-{?rM~O1hw#_}2jevCDv&kwaokNbCEeFf z;y90ycN6K_jF6|?iD_d}xLfUznx$d=SK4-W9>BRwUbY_oe_31hE(H~+N-VW>KL2O6 zBSUqx|5|H|7H+$ofRDMci^*5x9&D*4csgz2y+8&_gr3OPt@w>_^R;UU-2^SN*00*W zXiW)wa|R>jU5~DK7PuwHkfHHj1SXtkX%>^uZsV6G--`QfS`bJR7}_~T=}k*2t4kkq zx=mFC=^M;8lawYCj-Fika}mGv(Pp2ibe}WQI`bz^Q6E^)w%^@d&Utn+16!Ha(f}K{ zAEf>-`|}LhXEW%i5KLSFfY;r&u{@b&(R&lYa7rms@*;x3r}t2mhgXs~LPq@}PIh2X zE&^#)n)#cC-Un_BTTEzL{2F(}7dTjVlRt7Vrr|&ztRiv?FRkTj8kPE2=6eWe71h+v z5ZWQ2RYX*KA5t<=qJ&n_BxB4}B{pY@_{9}o!R888niWEfx|xIy)H=1$084b^i$Y33 z!SON?;DN$?CA*V2aGztA@j~($Uh*229VjQr=p@x>7F3Gt6VWbNIfjhUni+m!V*_1R z_o5dA6<55y!cw(pXm_evm-c7TluR3b>o4xQ4-nAJb44Nq+p1sGWSwNjTOd$vy_k0~ zO_TlX@{Zz1qpbM2G2l!Fn8&7=9$?MD-86}Xth~k=^Y)ZBlYPU&P;0Bd@eCtHWyhDB$yO= zVbrl&Z3n%}FSsUSpRRwo_C?#5O?h8uI0G6xa!#CfM&W`RpMMV%enWWh5tewh+^;d# zNf%JiG7#I=Fg5MA+T`EVlSCbpRJ{*a}-G0|35DYnnVv@=um+Bsh1YFNym8Gp0Vrf&k4^OeF;jvvk% zPJ{N&SagP=&BJTaww!2--V> z1UiRMOCwa^9|ufP;11M!hT~BB{|$zt%%o@fk>KfnP5g^R${u#R)jk;QDH41F}=M z?PCc_Ya2kzRbBuruW#1xRL*y2=~dwJy@+V3)u|jbrF;xk99NuHl8Hu-D9J~SyDpyj zPk}lrRcd)xAIXl!mvztr5j`ODPu{*LVVv@MeX!|-KIf&N!a);KuzJ``=B#j>_iqAC0BY0XC&fj5&w3{#(M94|A=p(EO$S`j~idrMN$xT2Xs0>vK>doMw z&=9}xs0kt3K_V}A{o)lNY%ht5d#2bIw9a~btnBZnbH6R3HlBEAW*Tp1uj|I+$2Pk!ZfCz`Es|P*yhbS?0OwM8b#Ce5-g?@2blUtHC zYO5H1x2tX{7+44uUK2r5FiH$hMxeGrzY6|HB1H~epX>!ywN1~PN}#kJDOktx+H{x6ljoRuZ)CcKtV^freiOg%WB@USs3U?IBm+ ztdRE#sBu}{jOc#i!ngO1r?_~)Oo>9mxBDtQJ)T=1o?QEeKK(Gq_USs|8)o&g2Y*5q z3{aKzO(UxG1^bYEcAt$>QHEa%z--O6QzdRTWqb;Pvbp5=<9j$WVn zC;GsWY#=lCa<0x-tdicmFM#HJga6^LJq(W*73Ur^_nsGDgJvH6io~z))8`BP)s^(u zdc@pk3OV(0SHt#pa~~)GsZ32#Z_U>!sRYcjJ~j{cehKKmn#zp)$-Q8{QHe7z<&Vcd zJICvl&*z2D?>do@4~&9M^VWzoro!=W{bAe(dcWkZ%l-XVQf!5omYcqGFpR4J5XQ4# zeRur+8Z%sD>B-@)q$dnwYh5-QW;y};N9e(ezH`5U{CJKv3oP$ z;Mre4qs_9HZ>8IMr|25@?(A!->S-=8T2)qeH+3!Q0}R_~zcy(=)mH11W03}t_lo{1 zs7Lj}c>>NZKV}qqgYxYex|`a<+Q-{+l_SojL#KUP%lS6J0MHXapjp6zQ`aVM@I^MT z)IaIAb3>WuSmw=iglw@7D|)3}TS(MpLwY%dEF@aWX0C;0XNRC1Upf{}RHw~wgMUu9 zo&E793dNpnvdJ7Y<61RDvz91BQFa;Be)BaUrCZ|E#7ZhO#9}b)NrkfQbV%@GbUTJT z-}Zzt1ouh>YGE6eOs!7Xi8Y3`*4n# zk&?yRrN?=mP37i((z+c~8Ul`4LIRK6cq89#J##;MDG zcR*mUWXV-Vdz!IJ(=sdet|eGiM&9Tz-_VqdowX-ZZ65Xv(Ymn$v85)dg52B)X6w!K zuA7K)#BD?h!WVIM)n4O|D)ZK-H499lWaXrUcG4`{{WX)`aX_b@Titb6?jz{mm+~gR z%a~&+8cC+IEIbW6rJ(goy3w);_}!C0p^B?m&k(o(v6{S8%jIyp*&KD3xD(`yC6?$0>@k|n+5_cB0Sr`fk8cH!Bz7=ekTgPRt$N^#$8l2VI9o?Xk+?-StK zJ$QnX_UaTrN(emRH^s;FZ4B;AR!os?K^kD~$G;i2Hf4a&0Y^wM-kBLbQ^*E7GPa$7 zDLdU;-cb91#!Jxq$7I=K5`bl#8yuoyIumteCnRTHYD)vjSO9sbcr!F7UEbImgxYMB zP@guDoVG+OiK#57;dhFX^C)vndb~R$vL?+CsZYm~T|AXaFiaQt5FV)1QFkI>Xrqtx zTe*JmU6Y*)hM!Z{YJKuu*@jWh=ESNHQ)j92rke%xJ~u)d2os#9Uc0C3 zx$6G_gB0&_rpEr5qR-QyX#b}4(ODsytAyY64nIgax^d>LE$iPGC@ERI4zU3eoav}2 zI1!9Fp5Ai%R-uz)qKInOOL#%O%qRM2XTrx?c09g$k9rZQ;Rd-8O!S*kZ}NsU$AjfJL3}V zQLXXgnTf;eGc2xXFI@+%bdhCiV`T96js9j{1tX?pj`1-tKVTXu@sHua#!6Nz6%<>u zbc`3Vw8RGqPl0zhnYm`~-Mxh?wHHP1Ei>=cEwbWLcVHFb z{zoJPt=a`T-sMN0$W*#k$BpCPsNdiF95IQ8&&KJHT(Hmh3`nBPU}rL)Q2&Txzzuja zl2S!~MtfNG=rETtp`EAQVEK9^$GfqckFaNy%z76u+C${qA=YA)OuN`}FPJf4KKYm>AOIc>z7Zwx3O?(;?(|Gd-ti69JwO7*B{9F#U9c9@wuO~szX zAAb=stQ}<9TGppKXJ}HM$`$z{IA>pkbw2cV-B%mehQKwquW4EHuD1*@I;Gg;s!xY6 zA;h0s2^B{s7!c=?L{}!4Taqv&mG3W2M>DFR@$^!UVvl1g6Ze^X9d$S&x{73~SQGhA z1?H{EHz;aHYDD?S3z!9hE!MDf$zcoV%vv2et4B_pW?R;cYgSc&#Z?@biocjAXr>t( z{1W0UT6#IpR!*wv!{ta%>mYHhX*iwiuh3+U?@&|!Gw>>scEMvvIgB_A1N;$!xX;)9 zS`m&+Tkim?_4cyGNsIrS89gR3aLg8+pAjs;y!D|eB&jBDjIjV^eWsdXmf0(ZZm(3P z_Xh^t&mBIZskrB;wydl(;|-nI*ys!|mi+ZwF+@MFLPPREGe^N_&>>p7cjm+fkY~@R z1g;&$n#SayIih8V(PT%lIcgXAsuyu}uW(e}QFRh75+V}|5yde}_SCziE7UPZ)aCeN z8AEV$*`Es~j(Fn(1b{&lZOwqr;v6?L-o*`}w>t13+*V8Gc@A$c^Lqefuae2=ak^5U zeoL1pEFHcNx;7GH=U{8@1K%@=l+8F6o1|-T!n7mFn*{b@2WeV|1Wu^Enf zEnh|BsEr^#5JunDcr2PiT-1+9^)ULyo2T?)vv{Puc8K2a($}bWKf;z{Zo$lPgrtLT zV3>LAA^%^2r&&ZWk>##!`mMscKX4Rmnyea+hoA+~#f=baFEK7nE{byr2ijD&$eeba@d6R`lzYKFk$1cf>|mvRKe5|=W;`5=~TD6l4&%SK4pe* z4u1u@alF?>+BNUQ_GvgSOTiddegj^=FN{y0lwJh2g?TrT6W`4`pD|&Zpc?yg-Fa$*5Af8M9}}9LwB1^h{q7<5M4CHl*}$<)$j^xqeD4 z*-xD3uub;}^;()Opy9&zzU#P?W=boZ>dF;GaXn2nbCqVzmES?IKk?vsKBt~egEn9p z8%GIvtX*u;9W*Ma-=8S881eDkDB|(rVe#aj8MJN#TFhP0k)MmdW!wc#OpC4tHq0y` zCIsHYwNS+B3Y-!m(;Szd1r-7U*G842_|cRt0X}=UwF2=@M85=LXo+MoIjR+al-L9N z%==0dyGS66`|dTv?K9UuZKj-fmUnm~{jAWUL=2qEoZbwj0yzBNM=J?=nUT4HdNV?K z0c!);vY{s2o>;(>KTiKDI@oOhgaDR#c&xcPzc_8ZSB-@-~azwO;-Io~C3#VK!V$yRV; zpC|}qQYN%VBpbJR)4huiob|pVinAFN5{#c2C4M*}oK+ZJ8`uBoM)cSKQ{ZZ=droN;u=gzJ!WMnN56|Fy(ik{D-A{NaW6f!0=~S z?9*>rAXE8;IlOfrCX{}~NIW@q_@>{4N4}~e6}hBxj+&ia;Ei*DdC1=l{Rl1!0L|JJ z4rt0DSF|3tzzNXC;Rg*slupm{xvH<{R4%4R@6~5o0ei}>slIZmW^1<;tH*^YXN$%_ zuQu3VZ~_D6vhpH>{5lmYoT>>`Kp)v|!xr~-Plprz5SSvYo=t9>#jQptZGx+^=M7|l zYs%w}alR76BqlOd`|@8~1Jx!9Q1Y6YJRY?tsIVKy4Jk?oTOR|FzbT;6-==yFf8h?siY)GTJ4gSYwY zD=;bR*eMHJ<|*tLH$Lu0KSVYgE{nT=WDOnFf z{yha$Ux9|?ByYfz2LMp@63DNjfWAgwV5;hh!(fW)!f!v=F<5B)92|dp1|hV$~@ zeF&BZy*VI1oYu7p21Tg4M?no=_OuAvz}Qre9oR^4KOj0-dwS<4=O?wle=fcIL(|!= zSspw)R6YZc%ssRbr|`rzf7rWJ@ch1weVi}*&#ncxlofl$NMw|w$k5g%IOiNZ@}5m` z@dNejO|ZRmr3c0TVfVba!M51}+QFUp_K+!={11RQKy3_1 zBkykM+5pUiSM9BxrvU&=bPV+GrXfEJw9z-WPcndj1&JSgse58Y`6{{GQRz}_fcq6I zE|4~VoW`zX3!pMwC=E|H&d%v596Z!wwO$8+pzZjMr?LV)6|KrI5dfzMUIa^Y5S0pC zVZ}ZmeE>-9Lpr;Ym{jpH@9t5&Ozddp_bA+#< zHo@dN6+a=UMiJ!^$U0;DYU2Vk<`#f`<%22H zl8eAr&D-V$3yqrqeFC?4T!rvj(Y60JwR;PMq7@tXu6_p4O9G8e9s1XanC5-}o!*zg z&}0jMQ1gJuenV}%ac4!S-;O)g&tDm;x1sl(p1tD10YY;ZWtd7Qw{ZyUZK+u>`Jo-J zd2r~ocksuCfU(~|$t5FU=spAwt2lv{!>57XFi`&bcK{!hNnkP}1VF|QKH)w%|58^l zeF=~*1(3RPleNE_uNyc3{>9zlArLettkjF3AUe~z=?aO>_CW9Becdo26rcvlpyxWA z_InxpY-jXLum9!@^DQ7xE&4SYufy5m9LPh;YS{1Zxn+du(+(>1;}I-uDUKP195+~>(;dy~qq z$iS{y47~v{N5c=pKITF;_jk9lb&4%+NNrIfF9F808$|?dS5|*C)p7cra1G_ZXY)^O zS@(y)cWS>>{)2TblD%DQP_FDCPpHJ8>=lPk?bw7NjTc95XXPDUp`aN>F{#EI+Igqi$c6PVzn-G*khb1tq@I)dr)gV~f zE6~j;f#kG(`yQR?^+RIq4zRfr2&j7qVOT#%&WqK6b0v>3I*3LE5nYLejxOLM|4*$M z%<)I^UT8O8%fi3TaUOmB6ya>LuecS<%vD8lB!n))ZL?d)QaW0Gr=H z4lIr5J!to*U5O^@2iTgl-4`xiK)YF{vEFh*(hHVp8yK7RZ_85vpU@L;@xj*KA#*F zK*WGz+ZKq(SpXmXUiKzjZH9a6gJ<0%K1hb$3<+NI{`B$T`;@3Zh!8+AgUFkcsv8q3 zfBdUg(Ol~=Eg%JES8eoI%Ini>o08m}5f(lFD zkoRp=ZzrVzSWMb~;V1i{Blk;|=p+{CUqBX#pG}E=Wg!q06Rr$}iSw`2_4G7Eo(IHNNWR-}9A6|DgI@=}+B|puKx$IEC#j zzZ3yxCEZ-zS5lvUj~AEjgbg-x8_VNjw}E>1e?LI5gvgvO+WuuGqUsf7Xv#{pKTmj7 zfp8!#w#*Mdfss)||HBFsTGt&^0sd$s7P-E=q$( zHhpWDoP3)Nc?hI~7JjiHclF9Y0|J0!?sg~uI|kVr&QSsiCW8OERj7@|>USahZ*AB{ zUD0=YCK`!x@ti3q;!<@`s{~3c+>5~)3;9-#OiBQ4o!xJ&wMRmuwp{_&Xfr+qFDWDA z>YhMnlERnj+^>w6UDut(0J2bQQl|*0|J_MY~L=npc63& z`m}m$asqScf~_g#I4D?4SS zn+?X?#?sr#*(li3n_h(kBl@YVuT{@fxVd>I!Qm`lq>%SZ^6+YZtYtZ7TC+VL*ayyk#9Nj!Jmg+Q|R`gbumD1N?(llYQsnN7JQCDKd(95V3Wo$Lju?vgZs zF73gev&0o_wR!)X6xjkM9tB;v3wimNN&c^TgUx)ANzqX`Q60k^)3_{B;$vLHwPG28#xp5O zsi))b$VvQ-qej-R9}z{#q*x(1g6G>i;N-KqX=H#ETb$kOaJz&z()#8eE?oq$8~yUeFb+t^HJR7B|aR(hjHzQ+a)56$AO6E~^!DMN$)GvL8Wp zLS=O92!>JYk9i&v-^vRtYx;v>7;5uTsZQg=?|JA#wed$)Vi}yCSBn(_YMp1pnr`&A znhzq%0Sjr)1&BnR8V7#sxH7c&oIblIjrLmz@60C1ISq>Ve!KxiuFs?Ge;yxCS{xRD z4~u&Nm-W$-aMgI+mn@S_+zy#d;){&4C6qrH{)F$jeVfLTT$FS=~-iuFjgpeqf8%PEjRYHHz#ppDca${p4 ztKgVJMSlb+1C&$oAGvaKh1r5z+g zc7uuRp}ND44hlZ()9_ohj{c*?5HG?Fm?s52;CZzN_=QfAa(mw<_Ad#e-CZxfm(yOA+= zVsiazs+-0@S?E8fj+G)?`DFJI$f+Y2Yp`}b$t_@%Ty85vW@b~1!2yLuKr)?9hkYv# zn90=DYzl-Zy&*QnuOl?u?`+-vEZP4ev*8v`)u1%CE)H1jIr$!8y!nIiCst- z75htu0-KmB$X9AHDm~l1LXwo&o)R*u@wmVqZO2rUjOxZ_6o;q?7E7LG=~MDAVaPp?oL7%A(+}$%8@;@kA}lL=Kk=WmphWRuqCnG+-cYX5e0DHu<=V4Ms?fKSQ7s0k=yt$^3<*V{lK2WuxZH*0Wq48)Q z4~_q6l1j=wx75JDxHK9xv~g`&Q8Hh8HnRe0+>!obG$&E)Mz&jnRk5i!awz0(L`qcJ{)?Gn5JmZc%K~)|5`N72L#?Oeo>qw{+x5_Jw;neILA5; z5@dkc2KNZ}~i^ zaE0Pk0rga$p@0vTuBqmDZ$YaH6rW>d%lve5!<^(_7IZjnv(TrH3Le1FBpf{muj!@25XM}B)=W=OmnfK*<+E;XJ+~AGiz1r#@$J=Q4*Rz;X^?kE zF!12Ac_su`nZW^WCSWo7c60@RAN4!))Dj{K3Qj0u1)$`vng|SGE7Gz*@3MTvCfFRc zaYE%IQuKP;4bBpxlKnf8KT5RiQ6qTuaV|+RRAOwXUn3;3r`ELX*^h4&A5Ta=-_&Cd z3<<3psr^Rz!+l)$qv!3;@z-;kX}L$o=nQ~**9qtOZj_6o9`jZju54Wir$LAdQ=Wbsf2#0N7YszGUtg!CjoE zT6twnf!YrCySmLU6GFEpvwOBXPy`%#2H%~PfD04g#}!T~ReP|{{hSJy;%K_nk_h~* z##;zoyd~+kCHL39jmw5aj;G)wrjhhf=!O`W`gqriVr5Wv8+fuwIH~^3Ex+JtE@pFm zK3u$CDnoW-BmND`K}b{JbEDA|*YJr!jaUMfFB3|MA(cuiMBa+thA4`?k2ha|zD>8W z*H=im&*iZ{`6&uRC`^g^I)$DGZA_myBIGY~48L`qjQ)jcDd={+9USwvBE*^BvuH4W zugnMvDFGH-E!2sr+nbK-Px_ucdEs7UyriNmbN%6;aOKcY@2Ag@=p zb5Yb2)uD#AWP(rKWNMIWmYb_yPkCb}b-2@vQ^kk|Ib}CC8+b9OLQGMX2X~Vitfv8P zV+(jTFm$U@bvzXiytSI0(-8wQx#(WelALk7c3G6%<7R^QbTSbP<5@F92U+_s4E~wTYt+$V z#k71_I_;*#%@wv!8?DL5mc$EBBZgw<=-NKG*k{LTRSRGQR*dC zo4&BMsYj?sSg7GEXOWu4J(s{}Uby=KX!72@2l!rvGQ`oMQ*YUZN;j?M`UTvHEvC^sv-uPd~(^(#RYQ-vAe}ij~HMNUu+1{W(cQaH5KegwN zg>yQjlK4_qH|&6SjfM!`Je}+2UKJ_eG#8;x{6*KS$ zC7mAFM*zo|bgqXELpG@#+XefK!c>2liNl+!GcRk~GukCS^^3WxX=6rUOv$ICw)4VG z79t;WMjpLn#md9lwv`ujH#UXv?`IIUZQt+t5b|?lE~IgNsPQVUMDc&1l1Mnv>~z*h zLMA-J_hrVWxkjdOm0B7~*P4k-f1=!nv|{b1 zPYIs)CutS9+gq+iMskM|4~kYOOs!pwUbfR|mM@0IKVKUt08E@EY@|KXJs{@JV#fl+!?*W>v5FEIK;YsnqaPm6=kTid zK363$rxLikdIEHH{kTy|1TO9DZp9(2?ALeZl#0XLnMl!;o8Hoou0g(zFNkd0I3KIp zxW!d`mV1=Q2KR@5>0RCi$;rV(_;Ht-cFbe|CG=W#_YqZf0pgn?XNCWF(lAi1=1L?_ z>c3UkF`aM*luRV9mq=eR?tL2-Z>?&bRWUQJ7xyaT|6uMdoT7Z+@X?Qmv~(;;cXvxD zT@r$nbgCe=AiJbA$kIp)A}An8qm;w~3rI;wNXP=xQVUXxoM*ph&Y3yCzu?R`I?lYj zJns|tbJulUNxWI4{9RKf(2IlDUndw`$TjfurHKf1NNJ@zoNf61@$*qkkkYnULP%n6mDgwnR+x-WskJ+VXYRy)6O^~K|G}) zS?#%q**Dp47H#(WW}Qk{P)ogfp*e@7zL8j7g_L2Tg;smu4G@N`JeMb^(0^+e5Mc1= zqITKwYl*z$c}X>t1=B4P05X=0by2rUT9Lz=N>W}rt~#l_W zt3+ClZ0rD0L{uNIq@Ph$Aght$Rw+4Siqg>lP=APGtKtTRf{>#6u@-B41gQs|(4^J| zz=1Y-xL(3{a2e7d<0C~}nZ^T0p?m+n+8j5#5DDcsL3%c;EP|<`l4=1@^IC%LR*oT& z*j1+Me%fafE_|<_X4=Uvc-EVImznOmZU{WB%A?h1?z;RCmLT)J3#2U^B&hMb#n_*5 z@(V^LKsM<#Y&EF5@0qlt1UYkR2qGgK+}?JF(RyDoZHwO>9me>%jpc3>JdEHM z>TJE-x(Iv}Aomv%y4MpG=er3-JFOxQCSvLBwUH^=&|s0B(Vl(Xx{wXcqQ3P$6raTbPuOV{09oQdZ z2!B6rRmhzcB>N(Dp#96SeeqlM^id_M7MY8t+GMOz#;Q-q7&`cf;ZRCr4O=q?+TtsX z)t&8&G+q%g*a;rzpiRTHZG%j2x^*r*41V;MjQPTG(^Z{vR!=8Jes8(K?CxT!RL3*09uoF&#Q% zcf6qnchI)u9xg~D(?${9jJ8|mvk^^*)|~YFheg2|w?Ez}5o0sAt~knSlO<}1%_yY;D}8kG{rO*Ag3=f1 zYilD=a9b3?zuYd;?Ysu%ine|VYWkMOA8Ne4zd>vju#4yQu&<_s;P zBlgoI78=Lm`CQ`F@C{V>^w0}iA(!NcIP87)vKytMi+TD}m>`()n^Je8D1FP>V$EPp zEc2_MepeW3)pe02e_68OI-ExRSZ9;J#g(C|*4t(Rjs&8qz|WQgPngdi9JI?m{|5%s zr~QEOcavWdTr}K+L9D%0Z19P}Pw}I6WXqfFFJ=gB3_<;(%(^9XE9-=xb_p6}w^I=6 zN=v_{vnMze(xuT=Z6aUp&i|6gBE7Axxhe@J>xHHG)R~UC2i4Y!xrnbV^ zeM_HMft#do2NO37LtXWuNY2}~1lc76ecPBW2ot=z4&qwwtGZ!z1VB_LrM-K_d!HVE zX}x=w;s(by7WdB)Ajg%1}-D>AN=q-S@N+=+fZD)sYK#$+wef6r1-fXgP`#pgkC=X(#a)wWCt zU8n1PPIptS=2Qv8Z@V*|wpter_t8GL2Y0hyDa7rq()NZWB`MA@|8Md5X9Ba6z?sYb z6ww+Ej+`e`1lU6D>d~g+A#Wm)s}X6^k01drAR-?{s{BlzkMl_D6H)cd1^21(T%-Q9 z#Cbp#k-4eVNU4xLl^Q@gbDBJ-i5uA{+1RNAgw!bA7Md zxRa0f(4It!%|92N9TG8YqG#0Z*C~l(Yxn9V6U`iqN&E5zw+j^yqs(4PSvjg`eos}d zTN8=lpU6(V>+?BSc!588_N#}IFuH30!==iY=nO@mvu))cPv`i~F&O;)msHz5Th!>m zMPk?=I3ovwB9?yDGh0F+8zPw;m%KK7k^90yB<^*A#atOpNw0@+Q83r*B%X^lmQriG zyHz_utSsyFux$!t=d$PCTEK=~9Q_RKA(a2w&qkNJz*oeOZ^W2HBdOTsxFg9lw2m*Sa?o{iaT2DhBe~Q9 z-6*<)zNN0kN04Xnxk_8|NS3GM*M+U_x~W#@>a>IJ@RbS9CQfB+w+(5%jV~ISzPH#( z$&W9a)CG@ci)FyO=j0H(-{33V+ILEO=v?>IVk!A+3|;oN5*@yY^Ejukkoq?~*9uF7 z)v*ryuxWqflz`SA+8R|qnX+~}ZSBS2YJY%V#O#T-#`D~>h_r*(+_%=1XhTF7{`2iF zr$4%Rr@5QqK1b)EN{r~A$h+e)OQ}#`LzSc!R;Ln*KwauU7+=SZXEupcA?DfOew5P7 zkyOMoRo|ssw#n;1NCSDU8lgnL)m1|X=jC7R3(D5g@XfKpM0EWRJXI7|4e#CQ7uE1B zgTWmsRMR9Jb}CUH!?&Hkexq$#Y1u&~;0ki{w1Y*1;)pS!7nYYFILt@nSg5S_9}ia3 ze@MG`o^;O?vv^jxdRkZkVr|iw(|b%Sz*~t zWh?PL!EGr|xRVbiPz`buHmWlkEG}%QFhv^C6Tmu3Rw!|JlR*-xsoZ-@5|1MbvI#Xb zxbL&}D7Qvd6c13gsHIk(*>nkj3Irs2cC6 zF*>A8VsM!CJG4&>l=iq@+H#!v4bP;c~Mv=k^jiyvA zOjHw~3T|&#+ThT<;tV6d?-{r#6JIp-Ws)_mWenwpTDXDl{{*2I+rvB1bwm0OHS(6Q%X#bdUTN5}OQ$ow(y&58k?aF;dT z5z-*Tly!6Msf=*}hyHJty<@Gbb=Hp=A~?gQ`x>l;z{&{4;#P)~i=Z|-=fxXgUPzC+ z`rN4+4=i`q%3=)JL#&CgeXbLf_||0Xoxeq0GaN7&UyxVQ2&U?(N2T2#9J#9;2FZl- zB(RPBK4u+j`+i&Q#E|(G%QMY;`quT zqbsXTsmexMyQ4@@B@Qh>qpAd7PLx)EM*cOQnn5EmmCce15);F|?$Gm=uM=z9*6!7) zEq1|L!(hjM97DV4F8sYExERR2*!phsz;>1!h?akwwNj96K zCXJzdlVMV=Y?+!?_FZ6*zn_V5Wb9W6<^dbjSz~M5k<#xMd(;R|88;VRe?xv^M{@Mk z`fFc~5*jY5U6)M#WW)us_1!2fjz&D~tG=1=ql#1zzM-gY(^M;FDkPnrP)9p!>~Q!9)nq>rLMIrx0H zd54=K@xFYy7;T)1=y}5>qK?)1^-Q58FeV4P4&s&uI}8_+=TwrQp~Zbn-~{y29QQQ_ z8DaqMjD3)@owv?ti^g06qgsTalm*=cp;vsoZ7D^;kK#aQ!-o&=PU{FgXKi>pw$VCp z?I<#M*ITkM7!fGHQ=`K~FxP+%=w9}xW~#qMCBTr=aSgbFPT&qj83~P-pqn~?i~jtn zO5{m%*CW-qRkQF@g_2zNmkVwKss!t1i@x*cg<-GGbfu@y!*8$f6=_^go~G6jspa26 zb*QY3HLJhl;y78v!AZiy1Xq&_*6tDH1RlF$W-PVPV`{>KXTYl{EJf&$ODe7{zet6; zl>VA3Iq3$7x*x8zd%rQJlO^XSl74rd*rm8O3ZTCfu)^yKwNad5ep%($U*fyMmCPz1 z64BxTCtW<@Ui4n0{P~9%stIINao?zYB4zPlE0is^d(vrqQ3bSF&?lnl6Q~{wH(Kwa zz6?Z-bCP}NOAbwJKBAA-2nI8mb|ljN!Wt&jDDV4G6S++9qLr9h*D~KI{_HXARwGf= zQ}o<-WM4OD+6XrM1< z78!Wg=MJ$89BORJ&>itEmS^jsu zHY9&#+#qnIZ5lWnkMw+UpZ2(ezHGXQoyMDOC zNy{|%D%&pBjjP#btWz~ajwX)}?^)DA=Y7q+iQTVW{^KGvk&5L)KienWYx5sNUwt{q zV_q*jc>5G>HGXLYDZXj|?TcSyA{IGKg4z11M&~$Z%t)OZQ=U7^5Vg{4JRx0ekCFRc zrE~wFQ9Rxn_UWWW0BQIb*7Lwgvn)-lJmzo;d8b0t5Ygse98PF8Q^bSpu!fk1E+L|S ziXi{F6*->8sL-PI7` z<8PuRV}lU1LlT0@I<#IguVE0v9adB_xOqCfai8qukfAV2MCZ3ktdVZYc`)4_Y7yBv zYuJA$z}TcINVrPN-D+8KlG?SAL<*7L+w{2kJddxE7Vq(W;3<82@~*H1=P4z0G4-az z#A$K^-L!E$?iZ``Nq6J`mB&KjC0_LXSDE*TG`$e?+R1?vcxYNXo=^_irsewo-2ZFk zwb^N-Vr?O?j+PAQd6558O_)7bDN$2sfl*nB+6ls^#e20r+UD#pl{T_S zx#-8~S$SCW{o;gz?RO>AM4r$o#*2TUFQYn{=}1)iPeiGJSCz{jT-GF}2tNY@jb0}~>xNs5{h`ggxAd(8@Qd4De!i&{)iJ7tDW9PJ~v@apYBPW*9 zT-q%9m-;rI=<0zgSxs@MD1E;;AbF&SD31rhC4v zTyVwsOUJpCTvEOr?`|E6GDhlV>@YfJnli6e>K|2W+StcCUZ$(EC*_L=);ky~HEQBd zwn5){A1buBwTeRYbnap-VRxDiq+31OyHwR!?+WOI6O|8)O(lSfRO9++yi_w`h;w5BSwG0*Xx#9QqhRgB%Q$w{E9I7>u2>(s7qbq;q^fcDN&6A2D zh+Vp^%GHV8Ah^7oh62XW+O>~m?G^i;hCMpCFmnwUwlnk`YxhH7C zR9Os~6Qi5A-Ez_=QO&ZX0ZiHAXF+!jsWZf<2Q3zgddlxue3jjI6wv(}zKSo_@;p-M zl7ZC*#d52NuRf_eLI`W5Fy$dU`Ewslv!>QB-~K0vm8aSDFz>E13qn0DWJ3!6lKXW` zLv0NDPhss?6;E57OMVC2}b~Dt1Z5^I& z$L(L*Ng7BSOHVv}$6N7AF){eNGi9#Gk-79A`x(wm!K21wANeb**%a~VW3osoM9l2r z8I5RJCt=H+awv?2I4Qo}(j7w}Ji@Y$U=M5Mi>K@yFrDr}*8j21nKX?F`lGg0yT2dP zKjvs;+(a5tnQ{+~wFzY@s*l#h7S#XUo&#xxvEk=<*9Xy_fIJ*ZYSfc_a-gV4E&CU6wfq&QI!)E~ zkeB;|T(~mTe!c3>V0{^C-#Zp2|LBw2vJw%9Ls^fve+J_s_ERSGGh9Sy$Q1F3S{3n* zsC!eZIz(2N9zh(%eP_2-YH96_Nt~mryPcA1kHHcpaVaB`Om5JnN$giY=tK{+7|qHs z-;AK3;DhN*pLOcQC)vC_Ni`ne{qb9ZBcha9cIzo};4dEo!ODG3vlb)RV|M-(d|=O7 zP736zwO^1aC)8h6VpVt7>e9F$dGX3qB*fS4UidPXEGP$2x>avHKlwZFecG_cSiPhLAZmPg&<}!cUw|0Oo*I&cwdKT@f(fZRKV>XEBkcpRIr^nna>(NC*B}h z3hl4MQDj?&=g#fIj9K!vLRzV|gSwz%3p<*h3|<@pT9+1A`e(pc)hW;qA*(_)QK$PeZ)t356qEJ{z{>*Q9JTm7hUKIP! zo`N_t*5ZdKelqTQb2hH@5GBDGUXra=BapEv;KqB*69&;DTd>vgiX4^yD<}GM@biaw zmLisCjY<2y+xzIA?+>c7)*>D`$@WUsj9fRNo8H+?kIbBRSTyrqlU+icr1*l;i4(Ge z{7Y}UZUxPdolJ=}YhAmr8MFr`kPAWkn~g>t-^}u%R(D=3j-CGGUvXzE{aa(&JND1= z(TLizC0GIl`j*x`P+45r;54od2EJj}ASXDl8!k6K5#mB*H3vcvFz^1i8DEolrhwPj zT1qKT2|j}yzv`nsE|#|8UkGRbre9tL_cocN9~gS@k_s;4-mwcN;T88#gUUWN2L#yp z&KlcSS>QLh>;Eiw7Xa@b;Q$YyA2Zlwb4VL1B&IXJv$2>8n3j$t;il4d^!Hp!V(PRwg9_Oorhej$&SuanX$UGgRdKl-^eDJTn!WR@8``i{aGr$e<3_^se5wIA%j>v_fC^R+3WA9cN(SG!Bh(DWCh>`$G(rNr z4nb9M`dAJ8sL0cj!uMb#vbY?3zNxOVF_;J&U+|4q_T6A?DNma$i4eHoiO~7_iyAi) z-8(13U?jB)xQQsx+l~MevHa?POF{z9;s3cLXR81G_uut3L)?YW|05&5`{q`=&5x7M za<^W!sk1lVrHwOgy!W1qz7qV`Z8@GpN=o7d3*CMzUDTM(=UZTynkK0NV=~r zxA|ydJnFDKRbuehn3+3mgdh0tw}j1rLL!~B z!AZ|@G4Ol~7dx6S(DWU7!b^aq{onK5|9^Y_6Ma}t13;m5M!5gn4?DdZE*;LT9H^C4 zw=pY+`8MpFr$0JB+$4xD%awW$mC1h@UF1gd;~dO<5Zw)8)vwPKWgt|P5qOkc{HBj; z)(y=nV~XseY$SgZB;XJ^I)Z;-i?k6f~mCDG~b}R zLA6PT&y78P^>m6I42eVotPh5-(h5ldjY4AqhxM71+Kq4=T+Q)dxf>109*6)e)EgrG z#MlxvwB@U&ddWLCNsu#2a@&B`Xh*>A{`c@7=&|37WtF+6{(*u4$X_(*$`c(1Trw)C zKKjR1=PaACQf^6y;G;+phd(`P0!$JWY(ozF@+Y{6RUdShXEerxS)z~l@f2*)AVS~hHHH-c_dG9ry6OF^tNasb+qqQyGSI1f%MYdV;4*b?!Ovjg#;PfBj} zQX;lENmQRAK;_sR20kJbfxQ9{KZG<2Fq3gJ13hDv>Ge^woiTGot@T6DpGyJowlDf2 z_AvCV{a(=ZwTq`q6;mXwh3ZqvN=a+_rUEofQd3NjMmsZas=s4!Q-@_PCj?tnffsDm z{*01p+*Z}YgSdcwdbeqWI&5GS3gF8z0Iti61-pcT#!%idCt%vDhfHdi5-5r+!^413 zL4q2oD#Ca3dodcoSoR>R0BW$}42Y{lLJ{$gVM(;02s*~#>_VP9B+<_-WgtDS!I9(( z+UNbeMWgScEX~ky5-UHT0M_&Z;R|Sx$wM#y-k$etNDBVQ;q?Z#l5lkPbnlOr`;Y*b zO@%j*0DXK7b~Z5lHD}f-QZTcBIoQ!<=g>tmbex@|K{!Aeu>O?!ceWY__;@4{Qls){ zAZp@!11Drau1Oy@0Rbeg&TOVz5Mp=rH88V9HU|P94@{pTObsM7Apr3hYBx9%ePHc^ zF8c`~KH-#`1m=N(8+%XhlPx&lPqTBU<-zJI;tWDr?G`%~CD@A6U~q4=vf&O5(QNCy za0KPFrY5)DDKDDO1MyY&tU?yG#EErD0@J_#TabPu@`qt4HPZep*^SG<}Zvs(CiZ#D!O=6ec)6 z9{lwU*mFxNJxoPJ@%OU95VxpRScLJw%HE~{uNJS*Bh)#oNGRVXuhJz}eMCj)#D(EU zmr6To#$cJwpxlXbbR=IO-4!&KaJB<3bTokO5d+5jy0)p)t;sSt!+5UE<~b;P z_5dX9n$|F=z(Ig!f?{VNLT(8r%F4i)5nZ%CRv2AWE%6#R;cx|Oq>-JQ+SMj8uqi>2 ztBm5&E#Ri9>*+i3KN07RL&Q26W??^miEsk_p*&`d)H%geA#L6XDJq>wxrGOh_p0-M{xYJ^UovK>`6?7`#Uej`zFlDi)8bx&j_|YhWpQ+A*Vd0N%=q zVc8rIKEKrApm>dv4+q(pjx09Va>M~Rwqh}#e&|gVa&%arcuulv`8Lsfw02)*cEhjwMKMCeRYLt=f=78futa!A)~TY6v!D9IH!---k18vVa)?4C7) z5AVVzD3Vct;Z=n@+CxWVZl(Z9*&xOlAU`C8gB9-ablb2aS-=JF2paPPsH)p8tk-tB z80)B$tDZ&=0_hR{s`jIAjsW9IVIOj+*$m<_TQarp?EtKOrM!;QC7KEHayW8Kieo>G z20cr&F^zz)TtK+C3s3Y(2LJ#@T+;32AN>TabOqT3VCR`1HvRhijBmW%XocX8CuLdQvyW#$jYY=V(m=W<^GWBlHjN+_yDmfJp7->_lg=gW zp)_hh?+6j09sV-Y@#=3arK0-o?Q3dz{ougO0A2cuP8FFvR_vFt24okBFfYMvl zpm{(mA}mk{P%0fM{L{SuxnookfULKz^dW~RAX&!Q+(U44yaEcu`h@PJb&2!donN6i zt^1M~?mj^KU~fIvL>0Ju!-yty1wh@H0K(^@c~o(y7Z6`uUPV5CI+A}zBVs0PJ@7=j zrfC*vRXssFbS&L9G=HfINr}0Q0_8UOy#AbU6^%pWXVp2~Sm>WL##1qGzm6_3IVyq`TqV8aKLM4P4_ z9)cF~KM&N8gWo@To4D@!$JxdyfoBDWd2N*VnZoW18vl==4dzoC>_+%!CszNrTgV=D zPb!+N!{D&i-ewh0j4rEDJK@`_i2wX{$6oaMxJA9F_&v*J5m z#(aC_J2?3}PWT>7#eaax1lS+sf~zuBBTzvcOlcfRc@@z;s3_LSmvj1P0q{X!mm;1Z zmSZg=XRS(6&D>b&KomFDKZ(!dA<}3DvW|YT4lV#zH?`3J)8tTwiMNV}8OfvZPmjAZ z2vZ0Vjeb41#p7u~ffCy2lO%sKW!jG;a9#`8}jy_b6+I+uE!2474I!9IxDO z23IBD$}6F2h{&)gpb0>FIjr6;HGlITf>Kh1TOt|Qg>kjbh5S$Hse_7Nj9baQqaK-C z_1$`>=h1obXx>{0K?wuP%E~Y-ydcF2Q0aT0%oEr=@-E4s$eO>qkDqq7XOxcNTmyZa zVb72e09Kw1<0B335Y7Q_Kg}yQE^Wor6s!O zLQEFYqJg;AT+4j#5M-jN07;1?WWk0|EbCau@xddC&q+k4TQGbrjRo$ zfQ2>ocC)^y1xDXJNWXgrhyf+wWQu}yb^#pJ%hL_6?GyIdH>o$ree`ChmZdK%1Zp!b zKWRTb08r;>M+(>_rCzoS`Cnxqm!qVSkUAKSU1bXkp|9~9Ww!s;G99#bc62h#Ea7c6 zQxz~;_;XagO3m5^pqsHvb;FWFFiSj-vWLzEHwnmm(C(k-+%VtU^e*3@QVWe*w0>~x zyQ-iUTQ3N0@lsR?Wu2X#Lj~|){j=GOp$cb69N?yS@9T8)E;m%K=T1MFDJQj{ zAw!>5T+t(4Cx<g+joYtDTOs8E&o-%!sJ$u zC_0Gr%(QP{6}j_1EGUG23QV3j4?3E|&vjUe=th1C`Gg}@DSTPbWZPk9S!709`fw3I z4wdQ}h-XyRt0a8L2G?;uul@W&(OFeanWOmqMmIIM3WeE_lu37Gl0+bPRw0m^3Q^Aa zo*l6Daeq>bCLJ%K0?;7R%8!97no;;yt_6Q(P_3*v_uI8my1J<=UXO=^Z+1|x=UD*i zG;s9Kak;Yy@R7^JAB2+RGzeeo7B$L3*UwJjf^@`#--1u(zIE(Y$(UP+7GoD$OKOo)6-f_ zz%>EN4Ehzmu-E%T?KvAUV?{BGEofqoC2f~lABkUDJfCsA(J{oMY3tc=oWFTrys?EI zk7x%C7$hqN+>WK^Cw827xz-8*1pJ0$WYYs4#T>I|Rqp{?ft;lZk9(ZX(}YHCG+7yU zyp@`XJ*I%>#jwclArW8yw@M0lEFMjnQqUXD)y{T)OKcjf8r;=`a7xW`#pxMAdk&vc zg^yGFg<&ZteX#OWx0cKhr7VP!=hu_zrTwE0DHXvHWdP~Dx0!W@~yQ-ig)HX=Q6Vi>qs?f|Y9|4y|%Wm!vA`24xJ5f6hV z)w?@SkByK`?ZLDaVNTA_r=hm{}0J|_pUIHg$ilwAu94CKv&i8Rk|6y%qFTE9UV2w+4{ zQ07>GHw}@K>frRcS|fhl&n;NRL>&^H?8`(zSLj+?Jo;T&({QMqTwu<>{aDB#1eWd%)^CRF^ zYYD4oT9qqqEm2zpP*USM68fT0mDTkKx(6DsK;oqa8p%UJt=p5WF%cqK{?q*1tJMlg zoq=uOCD@q}!p!JPwjt;&a66TBj&4JzFKwa}EFRSet0i^T7}ub3U$b`SnUz}z zq&fW?Q4*Z0%z>8K1-^OdhxLC`JPKU2?}^0znrGedNO{ouZ2R>YEx|%6hx)s#3A;3i z7R@~WeZzXR88t&xb2|sE1GlLx?ae=Dt0{qLQCDY9QI4k|1cwbiL_kS1HYx8-yyzuD zNPX*b-t{&7eAt!BRUyseAtEf2=Wg78_&aoP$V0DQ6zF`~Y0;&1s~Ibz%$BrKI!inn z`0Fv-+98ZpQ_JBaGBdDu&-FU<7)%IB*CFQXPuHNR^JS9jso6NF=~J?CqKK`K$|)2x zgUOkM7)4D^EaZN@Bg#~etkf4n<}Mh-@ycZT9CtYRUFyY{MYtu?rQ-3wn^mvCtS>y7 z{lDKIeGUjv5L^cLM6Nv9#W}Wpm^q1da#vyOoD1O4E>T=KvFfm>RoJt;UyFO8x;FBf zJXz&@fw%s`M53~C!Gi#Mja3P$t^p;Ev}s30a#kXHK(-TuVYi^^KT?@-{aSBmwKL2( z_I`(PoqNY^k17^$cKzfagDO=~CF+PK9Uu?Zefv2E7caBwztIILJqGH{R|ipO-5Aky#mAPqjo&j3C|JMOb;k5H z-;Xi*3i3i1hY+XVNugA3z^JjxRUdHbcD9y54}TM^BA`~)Z^>85ZjPxXFC3_scj>=t zucHX*9lXoNn?DS8N@^oza~&&jT0V|V<6*FtFuGG`hKT6WSIM+Pl&tjehB)j z-fcISL3eKSJnTy<%G^iD%x6ubDc*&b+zF+x@;8BF;2$P?uaFYwY@=}LH8Y1 ztm!J|Wsck+cZLSOJausPs6hDR@|+;5Je~8E@HJl`NM@SQ!%s( zt^-!iWn5tN5@+RBhO>iy+TikyL#FP)&~3Wnw+dnpG=2Zhxd-+z>@$=7a;eHHLB3s? zfg+UJjm0C+)6=alGz(|qZ%04F1!SU>j`&<|rKn$7e0@n$(YKYDAFe7!2T=WWB8D$Y zu|L-zv&kl0(t{L&eANs;%JHY1*cg#0;`P53iFBT|u2H|fX5*KR&va`Sqc|9A5N&3j zQFmD>v!oSQd?V$vnJ!my??0j5*q;C5-MFC4#fW@rUpS|lV5mtKMrlcDw2WZTTQOHO z{1Vc$w($zt%GYhm@VLgW{`Rtli zRa|pw4iIG69IMN7O4l4?y5O4|fB<99PB^Q5_Ew_*zjkFQUQ}YL>$&fO)a@HQ`Yt8c zS>KhckDf7j9E3Qx7sL^C9?Ok|$Wn?^Wmk9-UN*kDI?U?9q{2T?ygwBl)r2g4p7+`* zjE2=q%v2o0mmA{yDOSD%(bzQEA6EwU+}Nq4%4!xK>{N^Zln)nI8Ysogbt&nc2@LyJ zlsv%s5QFyu6oC)1r1@3|Nr9pz*mTKwu#VaWiDxR2t*3G5$A%=UB&&4e?y-z+@f(lj zDU-wmA?D_b{e28}n6CeNDA#6!5H?FChbpKU5bdm@vUy(o#qxKI|3fQAM=UoT#;IaC zmP~oP<9V`QsKWsh%Yprs{_S@Ej&>#IftA~lanx#$T4o+>^Hm)rvx`x1M)Hg$jj3E# zjghXFgvh?LViK}(Brt_Z)l`5(l4C4Ut$#*_G;Y~U{kvh~*)v~Wj-)7#fYox4EJv6M zl9c)$k?|d|aj11MrV;fJSBtY-P~zR(d5wY!Y&?uDNGvcIW}MNa?GuWDc4xxxry`?n zd$ZzykPZ*?yNrVt`Jxy*3aR3VO1^nG8^Go)>wOu*C|iY}2Jm{Iy1$a7$nCMMoklQj zL+#%2qLzCacC@|e4{z4`cPe2+yKgbT{xOi*YltBx^lnGx z;5VAnFT`-R=bdp9S@g10`mM)8m7QaLgK&9UOOhir`j_j7SSVVbZ$qU1paIaVD!3z~ zasIl21YTBj7YB-+HtiRQGwUdt337jW3A7k*T)Zo>Vfb}V5yXbJ?&VVPAjRy2XG;|(-ae2Kl@o=&+jaE&;CEYk zapHX~J2Hfel6DnsL--)R$O@j?4GwW^o}E|pAyW-oGT)*2J3>3*HK}gWSe49Ikb)HZ zHDZCDJ0*Ds8GiH2P0{_DTks>ZDQ%=#5J`pBK>}l;>%lb0n_|a7; z=7+=y3UeUerW{8TD?{u}%uzYrtU{J3jIHIhq#z>GbrE2bT5U{kITY7lN5qj+;LA*_ z%nztW$uTCpYhdEg2(&6>OTWoADgv??tk%*0Me-Xj<*4`a7ET6CsI(gvX-2ah&D0iR zemeA88z_WP`sz;GZDIOhjxp@942-9#W>iU+BpePzcI_5nN;7C!v{QOIw3F+;?2oqD^rB^yH4f(n8HUaU}`a25ea{tas|LTd;W{|;r$Z3217?)D02d3XY z$&oeLjg<$x=Eyr0SI7>G3}5+99@x<_9N+2Cge62harPq;0|R9e$gl@uG972s$gpSz z4%XBXf}TvtA#R5chr=Gxw>9o03p{5wg7kw*+CmO*DVZ2owf1j81p47>;tclPSkp6N zRlWkB$fCqraXn=A->;1n@U#3{&Tj})U84O|Ki~hnj*CK+*B*2FimbllWJvl0lcP+J z=E(m+Ih(9>q+|5C4FuEjY<^>upQDRMLNdop6fL&Z)4-Y2#idU-5MNAN7>J!Ugh3jV zP>F$}fQrjfRr<5(hk^p#Xcae^cFhBWd602z&r~SZ#-$cF%0BXHnJKb)6n3n%#4<1! zdGt=S%RKI<`@7eUf*?$(z^lRDx z4J2%)X(*;|uoh!~^)an=sU8L9yqa8lrTw@)-dnIjOtl@oi= zmnKFNqR^|3R}wD;+AG-Yx*e)hm7C%UW-9A#lyAZ+hF^385qz(61lYx)X3v(wRVIzW^F1 zPH77+kGE)WSdU+$?c%C9GO0U*R;w$9iVsg%bMp06*sjz;u}U}gYMBT z5xl+s=%MeTUQygs_OU_HL43FIDkkR3<0b{Y{*z-VkNDu59<<}65%p7vNq^!kh+K$d zt5gw50D87K!#kf#xF-ZFKV+i+eLJ>*fj3^KHj)aD64S3B|M{Ul=7H+L3-y;;5FU-B zf9y6Av08UFK+2!LY5P)6A1InzQiS}olpCgjBKjFP#g;x*Sty-YntHn1@8w(Vln0Hq zg?lh@oxYurNWUYNl=cmt4TQIA?r^D!bI3}iM61(`&d;x>1@m(9IIrEMKAm>Mm32Zx_WQcpF#1t`!TTI~@s{L=^jgY#l-6EZuN0N6sd-umNw* zf0PEL#PoaA!os{cafY?_i%~IO@JT00{t5-v+LvN2hp1?UtN3#qvjcLIPg_3CGweFB z@^`DDA{ke@zqJtRhuslI4BJ0eNp(O4*4lrFjM>B2KU(k&*<$0x;**{qU~=nNgy%d# zSw})IaB$?;0dT*%_3ja3WQpiwX?KTOPG=Cw4lDO-+a84;Wm3FOH&vX^X=Tdcp5fofdr)@t{g(A+TP)Azmv1CLtz}J3e_*jljWQhat>*L zn~`%#SR}URdLp@@6jCj`JF$@;ya%#KTSZ2w_tWw>_QH9C2=!Ii4B~HNf4Ol!>)~_q z8g@p`;9rZ?YdtU{!CrT56kG@AxbJ=C{Y-$V#RNDzd} zR+x<%xSo58z!6qh0z%xVN5sdMa;M>YS0MzLB}mB?JMC^9#dTN)Y9KUvpX(`t_um;L zSNq?-T#YVU2{A96jdS#NcUS>&R#k^HB7_{mbuH$SG#*xt86&|wxy8lAv;hdbBCpD+ z>_GDDx-;qe3&^#L;mWG6yi*jwL$#rP5_=Fe{;ZIPbf}yj(o9bNG9Px!Z-sYbsCrSWSZlARN zk7_Nm_v9$?h9|$>ems=2;zWRwUhUKQbV|xL(+dO=rS>Zv(9rI5dcyU}_B!uzQ1Xri zEhF<*R+~|zg7E@X_R(w@N#gYrH0OU#TYwdl5in#vkdsy*S5F8?wlt$~uO944*Cf;z zWmy7U30*cwMhU6P!xpKU3bdWe?iXF*3tP>QPhrpS^w zt7T6~aE{2eC@vamzxVN#{}!w;*MnH~y+-T5K&coue5i(wN*mFaEzDrhZ||jzYsB%K z7V}C!`5q~DAa!Z~qTCuvfaFw~%6;&J%Xx~Uj%L<&_>z=O*ha?&#{QaHN!`53Qydc)~uHmty zz#_R7IdAPdMJqSA>m_*Jc zlix{A#{QsdK!RNUF(ehbK4XecH_eH}-{JX0LwcJO_QrDHU5)qOZkpTb;*8`zvKGOo zv}jWKf=RLA_hbqbogb_vofwYK$d(fxE}ii(0Mvv8R?3|?@ox3tdG5_ZQ?{X$u^|t6 zjKiNg!*jADQb}+R(Wj71b5iWFz-*SE_XN4Htc*wWlgD~630*oKaG3kCjz)A_|IB$j zEg7qL$yLumE9HLNcNEq<(~rvqu=+n`RUq`t$fi3r>UCO==$Kkv>{Eb%RskL98X40w zG$zKK0b^#QA1kxd$XfFwXmsvdC?p#tFNl6O%5^IBFQuaFb#mH?b&z-E1`HGvj3-o- zwYt2#{@)^HYx*kRYUTrYe!J>g9Y;}8%u7V(WfbXbHnY)By@wV@VL4Qr@aW+u$IQa( zB9tQ{i_j00`-MOFsYVXhKcN+rc87eXtG^zFV(jASk;}TTMqVcI7Q#ng9d+k%{DFH9 zT>mGg=PpI~JLy#3Jq?P@OsS}>adcWy;|`B)nGOz1#r2$)F5KT@5{QsP430ajjPe&K z6cNJJCoLsxDSM^}( zxU^41>@9Fu>%phLQv?e=H_( zLs$QPql>4fr(Z=e#4--b>+TxFaFO;f_QBR+-+Zrsi-r64kg79BF-BFrVT!~b6hRx% zQBwcT`|^XINVrfnk3qk&w&V$%?cMKjh%4LD%cAI_wDB(dQF^BhC-O+{*X5SpzekY0 z@8G%e?xfkwn$gaS>7^MTz03wn>M1#mD;#w3oIhHoAwILrgK|du3foTZcKB<-$~u&Y z^pBZ#_)Sex%+Vz=R1*(6RJT?{jE{c!`M@Rgad|drF_G#z`ZuPFd1k(u9K(w)vvX=X z%{Va4Px}kHNvJ_R|52o3$7^8A8`G;tML<&n3=d8hCcaeOtRBN9nCMXMC_J7H{JG=I z7g~sWnfD61&XDq2{AxrRzZHcFys)yOR7u2 zHy+G*fU7F#XYx(^taPiu?mLi~3Xs*^{`=ZG?~Y(X?j3leOHbH8EyRl6hR9KH0`ZTx zl$_keNv+z$}MH(akgj^O_S%Cp+-#y{)o zWGlkli=D=)ZMF5Cc8dNyoHp~C$F&tHpUQ{{>&x#y%AF-Fjt`FH;XQrzVwiM)66R)@ zqZ+NC)F50cz2cg=Cw3yfp#-Y@9FMM5z(4Nql{McsfNAm1DoEZT2RX9VsQPLSFh7>K z`6247eWt=18DVh~Wh0p>(JK!7l$V^n@cEnVA?PFV>k(pn0-4Bs%1cW?LDJ54khvU6 zCOt%m{{8v9Y3XC~qhmjT-BQ`_hb{vv9FYgk=eY$U2*;Q-k3p`I--dovw+nZpAZAPQ z4Ev9hGoKKrl>BCTW&T6ZdWmUc=zNxM#N_<&>$!|`EGa9id4wRfqBh(I)_n^dD=s!9 zLvwhqj>Kp$&0LVTRuvN+h?|8C*5{d-Uj*;6P(&MRkm5h+O?cs1Ws6GIrY#JKmpq7Z zwJw649L#;;UKl?*K2tQ>$+#rzr*e9gLl(HF?YWUFE(UHq;#&=R@#9C-N&fp4aSA*4 zkw(fOovV#Bx~gg{Bl6Dzx5Q}rr0?SWdxLL+h1=3aL06G6KHiAV-oh*2JjXR%GhMs~ zD`wOhVnEHdkaC8MSL~C%pE)(X>GN07ZNGn{4F7k0w2}Fe_mJ;(-+G>hXbr7D;H)LU zoDt`H)+g{O@m6@~kZ72E1lj08EHQCg968H&!siMgXBBC^jGD9sMy&e8(-V`%Rb0Aks}9D%_t|UeGk`@%gpfdgR&?uQg0l{M+BpkE3pQ z%MjyTRF(^vVtjX^p*?z}m+m}&Nbh%;XVc#|WFNFj; z+>5z)MX?R$Z*sB*&gLlL`tcoSyS-LxG|7VpkE^LpAs+0y0d8d1p#?e(g+KKka5RCK z*ygr(h$)-L?q{#03pEL)Bu5(h-I4y8BLPg0 zF8)%(zVugQPQ`Mn->phn%$cVQaq)?+a9d=UrqXowo?KLJ3U#*YX)+9mx$b<9d1y0G z+AW-bL`9f;mTA9*Y7?J~;&`&~=7y#cE^mpb>LaNRQR?+8VxuBTCz)C<}PD$>I2aqe>GF3@sC3f-uU<2cISi_JVMenYpb9zA6=~Qj2-LA^w8cfxo;A1 z0KB*O|&qQ6=T8a$UegX%*;8 zr8&lFzWU_BPu)r1H6}HB`(|>Gpc*z*p*?g=6wPL(*}6(+^J3DA8!x_g52#eJvCL9pouq%0|$^a0N0!vLj z#^;I`Pdu40$xt8<1SY9i|1{WQ7o8+`9lOqTN8o3WiF1$(W$=v3U4O$co4J13uU+Me$nnUDUHQc(48bEG-3O)Y@WIv6peuU=q6a z!qblyUDscr5G+p(XWsVdQz$^YUI?X%sS)wI^eFLLRia+ra8(<$LY*)v8tIfh<(QiB;fmZ9vmgsbZw~P|~40VFR&KvEjoO zofC3d*F%DJ^7Bf_SHWXmb02BBPlXoIGeuZyaD5E`Urzt1d2hYpOZ5b%5x`}%P@64` zmpcVCQe7V)gvyq9))GF#Y)7h6=m-X5@~&4eji+JqkeSgQoSQ*Zn)S}LkeyC_5V-{t z!c8isYcC@l=Qa5|Br-5Z;_|UlF^LW)dLIHNGuYayI+HJwVT-IitZ)X(?3uV@zpy~j zYsEs+zw@hmnPAc`4?G;@En(D*Rql&ifw8a?g{NUbBNak%Ng2)q_jd!VP#Ve0-ZY9f*!8UEeKF~Fh=^IaTb`Z zhR)7{^4H9z!|?Q&xHO<#S7i^`CgixI)Wsy+HV=NwD5o&w{^#CFy{%9Hv%gt|7c ziP4eV%G8Htc#=KawXGVM@)&T>S6o>Rd;HUn__&hZI(X^2f~6|1yX^V*^`<${K#zqh zR(G8w8^TdG_+yyoa6_BKN)Djq4EeD+wu_#6ORtbEHO#E=k|P87g`Co?#` zjQ-P`BfMd_>4tU_*8lnCQ%Vqj6xp?=ft&R|elhZ8uo?k0N>b9lQU2$b|DXBj4;)d4 zpgh9@jj767rqk>;M*&QlSTFNmE(mm_bWgoD><4(FVlu8K$FQ>BrEG;?NO(!)japq|@F0(ahX4U{php zGr>XRdortApQ^MJDm_2#cUBvpaI)LdoLmX{Z(hRc2F`D>--5Eb7j0P;zLZ8t z>@Yo^Z3ViHwlHpw-V48(1!1Kk{~4shhAH)VW(Xf-njjml9h;zhngAioPPdWt0-}E8 z`ueOX1|Sc~7-*5M#u?5bw>NkLgc)Jr{m(43KO2gLz+aKojZjU$joZgp18E}y(ax;R zJp*pqpEYaw1#32_^?Rd0mM;I(lQIzXJrfv^?O?l0uf`Ct!>x|n|5fDY&QOHx-js-r z*vLh(y(&Lp7i7ByTPYno{&PNA|1^-m+;!s_F!=z53IV&9=XgBBTG0vo(jQ^z+CYd9rSvJRf$Ss$@-R@x6_{wa?vFM1ukDQ#ZXzN0F3K|u7Orw&=n4F?=3F@9 zUV=;e?|=p))r$GC9L9NxUxt?NN8)k9Dsvpi&G9Q`eai4@0NSk-cqH_^*do1`6Mq<~ z3b;>UZQd{!?jcWo3yKxH_*bvAhk56#$)6d+NyK&Z_)$`qrq7o6uKsC1iZ=y|ujEJ18|{plNaLKxUpVoH(BX?Vqbc^q?G%jeWcWcN0%qY6q% z|8ctAJwYHD9yq&Ip12x&nN(2doB2?6xb&kJZ`Ot3ob#8wX45;p1aVKoZfr~i46oG2o@um zUZdbO4%nsMYzg96e3n&^f02N7-Q)WzR##Gu+oYDuDFh?0q5BcyH5_r$GE}<=^_cCm z(9-Y{$6eCkY9!GLgUJhA<=`7&3mEn7Q1sk)$R~(kdYCR^C))-Ec!fpknj(=heak7m;dW@uvAsF%FQJZims_tY>Nhzwh8(WcJvLeU{PPns5={ zxBMVna|KCGLmhSYee^S z0+VdYvv>$hljPmez+dc`gUsSIG8+hSF0#Ba4ivR1R~FXr&Z-|6S7eMF7SuDS9{v|M zB$kQ`+kmCW5uYMuA{4VJ=nKzTJHq0sQg}9I{GNEq-~{_W<_#d4{PKl@agqWYKRzn~y9Nvu=8(O}^BxP@J^cj!0cHMO_024=_P zKBpKJgf*&c3rE3p2sKfX2ZsDNn1nZh{~ih6z|rTH&XP#X-gT0cRr{cSD@N;>x5-VZ zHF(tJ7J=etEy~2V73e!BOL#IGv<{{#z-5(|JB% zLZr6+taRLK+awAy&@0)HFBcL}2GZsc)K1{edV=UhjqrZ-xm@(SGLEZH5Wr>5t&5KG zoj)Hx$U2V9ZE_o5xaqE$v%9NP954LLa4&)((8qzU_`gmey}Q;!!+(+xfWK7tg{mU< zL9O;0X2N$Ms;JBeOtf5TjPOkR-RFXAl!3e-^&x-77f~y&()B@XC zWH~4Uo=Vtf(*xOmFzH%X?WhG$CWpz_uRFK#j$`n*m7*Ym$E4{E(A5RuNH zQhPzzrz>k8xgQ`z&ay8{O@AWass`*{>AuEJ1b+|@ZUSn#br-FN4rHPPbD(vjjUc8m z@ghRM^})qfU<@$%F=XjiHjHJ>Xjv&8zg!#w+Rk;Jqo(95?sT>GR2yF?OBZIR~c=3b_$3lCW#}ye{#VIqozuXeypsD{Um1hyK;L}Zm1v`4SdoEnYG86?`>6rWuR$xhCh zz2WOGla0C+0raNP*J0PE+pC|?_pdQI5ImC=Rn6rb0e2jlPbX$reX5_x5p1_jvT7&H zaz4-Te6LBDFOLymYAU%xG5gZCq15siy<@i9C*V{sXV=8+_I_G_fdr?%R}y{+`vFx&5v898CoqGuW5GPRIm@5w;_G-}%I~ z*J?iK-0<@4wHyDc50MPwG*WW3^E$`C_Q_#Sz{lcOg#&T8pIDqof=F`9`=T>?@Nh!j zrsbG;`GLh277l)mHxVKaY@NupJR#BYizLJo=)6_o@HA2tWLCL^au~*U(25(>YI0=f z>Z`vq+58hS$4=t&<1NivC@!V(@*;EdJqg&8`)(kPdgvcE>|yz0Z5wnk>SZg7&eTt0 z{FP*559oa>HOg-;9c@DR7H^$r)`8fjzB2*VmwM3Sghv- zL-jOmdS6IVJ3;G=Ea!EoY%Fxm79oA)6b+U^$*QTX(i-XBjR`7=)jEd77bulK+Klguo?=H_fw@wc4(~uMz9|s&J(`0~t)qVBY0q_f z@#`cL%XH*+{lY!G^x=`5lAwTXJeDx#2nwJjvmr$KHqxZvmGg{py~6?-+{p61$fwej z^98_VvhwJzcW^*&0(S7lqz^=Vn+3xIf8w4W2A(Z4# z>xT-$dg!ilH(=V`>xy4jKq^?z`Q=4z9!dgThnzL{LBzQThhH&Quh;UdyN{6j4+EFe zw#O4IMA}H4Gw0H!Yv1Cc$X@&N5y;odjK4S63d_s5n#l$8i?h5w<7su>-UacdDZ!G@ z-$VnQ0augg`y~~*-7zYRByA*CQgE<%K*n(syj+Vf?SBB3>E&)laZ=+_jkVJA+{ZgY z!TVOmZvKMRozjxW_flS}$&`&soECC{BHJ^bL8gpTGA{CAn(i#LpQ9ojFkLVaz|)F~ z&wu^bq&s90ufr&<#>Yh4`k~RAqH*wRBW7z1)UIoL#F)GfSFfG5=dcW+F+0hr|*u?1JVL1${z!v_d3mUYTj|I!hY~TaDlN-p_x$ z0JuKu$ekgOAED&L>4T(h;q&X{4UeUgqEWWPJ0P;%gek<%5gR8wR9I_o;J`<%tJ1_U zXf?iqK|UX(@88H@njqKa9b^JJ+Q{_TBCW>c19^SvRZ+*V4G{o2#`q2`;bf)D_dy|P zeNdyZP4Y(a!J|i~k%ABIa15+(h;Di#>{zHDwwMojpLR1}J{JgT9U9+52i`o^!-QjX zK)JNyBo7p3yx$aCbUK>`Kz}l5oZvj|%;U)Myg=*nY*1zxsBFY$6TkJd$mdRnceavg zpw5KMX^a34(`(^fztE9bcqeik>xSL9G|CBP}}cQ@gx`0c8fl#-v}98 z+&rSsZR$_>!2uUnh`Ysjm@wzq@Hx$Sd@kZp^zEk6$1Tk^{TlZ}&fzkty~_Fh)`M8# zIrx5#&?TcdxY5wvQ_r`mHFt)-{OveJ;3+mT2oo)V29mmrIs* z0q5{`evg$3Pb<4j8rTU_0Ty*6_xdjGW(2PY=A;A5+hXYj8PFZyfzVl?1A5e!4$Gnorr*G$m`Nb}r}&Fh z@IU~7)9N|EuX=F*Poy0M?ZT{aLUCUhY6rkFW45@2O{h6Lj-SA*PY8CCb?7D~Ag3B8 z7kK-WLJd>w3MKmJnLt4I?(ah_{TdRYp0GSbB8vDPY(fa`u1}f?ZvhJnI)pmps5=7X zihm{l@kJP@GAM#jv<;u@QH z(!sY?8Uc_3YDC?xH6*ZcVjiUp*cl{a!oe=W@>PCiQsvqN`qmJSuA%HT5dXy?dc8WD zw}o+>AJ>sG$_;e3drih0ohORvDv zL2g|6e))^9ez&;@jOzVS^^LXTQPPFmG_J{Si2;rOhh) ziXDt@{B8X(FSckO(&dAOQu^@)8Zv$osV|I_BAeCJQl7;kWo$aPP*f}+yZ zSg#$(C0;|^d}go9g3EMj>^gAT@Anhug|_pmqG;4$Gk_l<0Y}1y(HLs z^JPr;-vb`Y)?a#~<7L=ECz2V=NJn0Q(X6j555umQ*iW9HyfGId#8j|n#(Yy{aP7uU zuvp00w3&VEXWs{>LA;xpe}DRe`eiUr_>CakL+mmVuWAP%J{sOgN{SX$+XMp4BA$Sx zbS22NOL+x{=>zw%8n0RDeM~p3#6qi<2qqoo0J|Zpo4C%isB@aSRL$7Am(bTyFhMgn zIz9Lcd*4X0>KE5_IRHdz1qx)R zYVeQS!6CTu@H2*${3!LadD{<;sYC;*t-gd6TBhO`C!-2<>F#vzs`_yHmZe$wU8@>^ zA9@`S>i@bRnt#da7DS+Xi!a}j-?Mnl67YO}bBMc5s_J)or|<%?c(^H2xe2G0DA26h zNWv{n1u%}0_(~%o<6gLMC;h95e#kJ_Rzo*Co?NdbXo*XxN0NPI9b{?`B`Y@pQIoh` z2WoHUtMi+hewl5Z&)Frh_4Xt&2e$3uzFt+D966#^F#aw>vKRrhG@{(NbrY-Z8K}!q zO!H}YdcI@~YKidjgEG{Io}1jB<;GP4`T3L5I|IsdXN+{nOwDY=_Tgl3T$|Lwg>+BT zbn)Epv&Kmsr%>q21NPq`$RM!=QV9B95B79d@*jKa4E!(mKiLqZ@vKe06>l)k5<0_c z?kp(8Y=(oi!3wo%y}&2@d@60PL*N$P<$X9{LX3|SLU!O?{Z<`uWwC`vg~TMYA@%ag z*~M@XB0eIn^y9R>8^r9{VGxLA&5C109O%cpwvJ0qA}%C1s$20|-Aiyq(qA_Or`i}V zgb>JKF%){JGz|%}tp%IZ!ECW2Te;pzj`TYHYFYv8&%KW%$mr(kdp4~yAF1mv9ITyE zUb9BDc@iK7Zc`%UI|GAviv_`2;pBF%yI}dXZEwP~{;oRn0{2(z8T&)_d(uDbjlYHX ziu*A@HPSEWI@@BCtGz0nLEZ2-clS(!xS$s@WD8+=RT1v4et}EAAQou+rxv1sv)TQ+ z7^#_i0Smdw`U>*a<4m7G!|XQLl|+nRCNaW9^l1A_gVb?CU$;&CzDSp}FOuajN9`uB zeFC$JB~;R_+>gSLw5OH(ZWxtgwvdfr*@*rzL!40novsPS=i1~)wKjcdgG!fycZ%Ok z!W6-rLX3Z&f}!A71@+6T0bn`^w3|jR!Z$j@#CpC{z@X7N=ZP(|lRf%pk?}pQ0At|y z(4Fgj!HtNjTie<#UB43al@kKnsK4sl-?l-p1)M;`%nw)Lle*|K-fGj|I{!p?u2oAU z0+v*m5zkzeD~}^a^ys;3dNq)}aS3AyOh&XrR6@(pedm}d%1ceFSwHkq#g}y4cku~q z^#NlELX(jmNpT;l)tk6BvHROlhMv@ZOvj6P(q40_(>h=`mGEpDO3I9%ac{?#-cKok za|?rg6drLl#@9^33E#1)q8!KUw<*oj9)23cQ^+QD$S($h26L6@?&aU%jxioCd*01t z7{U3bNV~H(l3Q%hO#7sxyB{3$8Wba#bkLa3+M5Y1Z4wT&9CkVW;Wfr;?krU66iuY0 zcV}b9m7OSUy+2J!zI>$4bPSu?miZjh@$w}>RcI}n*q#?(-=hwxJ@EoIJ8IjQNZF?6 z7uNZAKmtIj1a*{$M_J*Yn5)8wz>U1Ze!rVEF$z{;4D9%GD0@j+!F#^5o9)VK?jC7^ z*_&y1MS`L|JnGZ(PCkF+EcHasS@QD}Zh1YRAREMbG~O2V?p#>cWjMPua;FkekV6Kn zaDdQ$UV2m|LYgzI$k^YePd7crk7NzdFC`B=C4Ob8`p-}HL96G@cia+xcu`gxPJ#Zd zu@S&2CgS1ZzGZpDLT3byhzGSuvXNHRhqUcnk++ZwUIOe_0SoyP*HlVLJM!jW9s~_z zjzx-xYk)rO@IqQD+hc7Cg`$%@{F{keePVXMbJpNd<>tFyx!bQ$jAgP%y+_DXUr@(-k9fn2%s_oWqhi~ruU zJ`)H3zD+kn>QDZj-~m_RE~mJZ3%|A>xrCZSFE7{y-;iJXbHS43Z;<1IE3f!mRs+j- z2DyX~{RfQ*OW}VP?|&EXe|O&h?PF`fb!aoS!RdmuWnaxVcwX{*OJaR49NTY^utL)vuz)#)~_;sTI5;7eLAK-7wpfsnqF_Y) zRWpY~)i$WI=sN-XY>@+;)d&p+O>OJY?K<6F=@@lpc?oPXf^z1d>qL*B{2NeMNiCNY zyAX!N0Fo@}qj`ZrYmyZ#mgJ37xe*fo* zK2P{nPqyTQkx0-FP&v(N_AK+4|?_W@+IyZF2w z%;?%cyoL_9^L}|UIUxm|b#t^slX2_Bmac23Lg#(}Vq!Jh8rlwdIo){}X;uw6N(5p| zXL)a2wL|@kxrwlU`gd=SX0zxu)a%GCKI^y=+V!=N9%+05Ws7x zt|7{W^Xy<_g>R2HxY&x%Km-e#>@S1Z2-*UU>QTr`<=wOgml}cOLVMFllj^sQmX4ka z((c*d5VRX;_%Vp{5idXqh))I~($tA4-iB^>V`7ZL4piq62oa!N6W&KYFKfbdkRC+_ zqeV0&U<`K~xObgHa^usoI|G8ySn@^@1K@RA0_xeRXks#yKnfrpGKe7r-9e;-7tR!o zSrH6L&aPU7{U*m)xe5tbABAFi!)%}zd>;d7@A%jeRNqvH1y%Kc65;p2k*cJ;3h-4E zp8AX>BT45@?_AvcquV* z(MVO~J@a~o;v)Vw;+LpG5nu2>=6tq~{YS^EpR2d}22}E# z@;&<`E3WC;<6NDBm6&5q)*^2iin@N zEwms)pVHTv-#j?GcY4hnI-P0##uHVj{&EJnJobqnnb?|ZcKPvDsGp_`f6@XK##^|N zwWj|v_|U<2DG7ao#CRPi^z&vE;y@Yk#mLWhTPEt z9`kkZSi}iBy4YIrJEWa*Q?&taGE~h&xxc7p6l5aI@6vv7Lo&3Ao5^%I`?|eq4Kenb z)_BfZKByEeEUUdypn#@^o=(AM#i+ED`pK@#!-RH%$ULIKtv4!VM-;!EvB37As31Kw zvG*lP{#@W07y!O&*u;77ZT`s5tGXW{YPcaTKVr?$;cnd1FTvMZI)P(I9=Nej1==3} z<7ahaH0HkcIm!%gq(9BJ=6A4ox&r)UN|hc{<0mwa+Tz&+Zx7adijxd7YSZuI(3t*n z3t*lyfqYn&uoptNktPsT-e+>af=5*(U}MexX^5T0&EC{-y41L@P^!iIsBV<~ghnMD zAEoZsg?mBvoNjVN>pR6=)s>!>Qn`zMeTGkzWjELZR2>vBXo!+?0AG>rLvXS^)ZqB$hAl12Y$%C{=_d+D%ZoKdkUo3Nk$ zU*JVb9pO@IR0MNr*tyY|%rnIOPd&;}~7f=X##oJ?d2^O{Z+|K8-r{in?W@Y+{YS!uY z7Qdx>c}|InH^Sh7(hRDM`zki6Rb|Ykp|&X`i=edCUFxizZu<|0U;7cJiuTh95|dkO z#w+sn5n@r<8TSEuWX}eiS?-%wwlB%()CsepG|HZJ!*eF*bYylV-%7bGl3j20k77&PIKbcITWA=p)qSOtPR!mJt{954({WPf`_slpzu7#$&)BD^ z@u13o$!s?EnKHPUxLHlu1oWHV#jyNhn<6sZiC3r* zHy!odestpf5fG#dMeU_BgV7add=VUK4t&YT-tCupnsxbGM$erG$5$t%iwNM#vdApD z)Xfs_S;ew7mo8);Nvq8IoQ;ZiMs9;epWIgX>bQ4I!w$*T;~OxWbIn{8XC0p8+JXqB zOm!Jd_i<0?`>2+^Jy!(4veQU%L2AgJFOTf)uODgS>j-E!M_UL9W-tZV!PkB^{)W^u za(B*c%-9kEkKShF;R{waJu6P|8ClgJc&FV(#37@WuD*aLz&y&XnZNn=={=haY+^Qk zI7Sl)@s)01ff-48+0*b2py77{>*ezGO{V#_d>vLDE27`a?aqJPiBI5n=X{b&ur2&V zutuAm$d}w>ji$E;xG&*Eq=+_IYb7QUOj^cL4hacg2za;}HPqX#s#qLRG zCA(s~jyszgAw>2S!?|F?w+|j1tzisk#-)qbUD+|6Z{z)aC^WtCN$6L;-xsVVLyzZ< z*l5MksL&H#i806JUhsj8gEt%eP;b%G@vGS+BrivZC6DwD@ga2kK40L-zBwdf`q2b9 zPVUEh-^T$~oA}dkR(afT&^9pecG#Zc=HT$kerMNK8{Y1u|QM=7! zQSpEWGNQ&9t-7M0TRV3kQ>yQx@Ce(CZowPUihW}6iSAaC!^Nw<8m(0HQDPqV?#G2m zG>mEUVI$-O7kiB|%fKggm_p0mlk(l%BBXY4=Sn}fowXA6@x5q4<7&U+Q}HNUyS#yy zgtV{q`Gr2fhSQ_P=DU<|E7%VONy^87p#8SrXo6#bJRQOY*TBPaNu=>~aHPp^Y_Vu@2t=t9N*6-TF=W<)jbgukLI;e5}vLD?7 zX_bVEVu!8_no6G~_UGoI(n?~NKFj>GKOpj5)J4QUMwqhscUNBT0neKpp@a=lGuPIW zm-oLSb~=`v8Q}s49LkiT1SAxJPpZb8en6Y2;Pd4Yu0vNkB3UIN6hmu*z%5IP6M)P= z-v-UUUaXtxE$g@aV{}!UHV-h;1ymygf=)bQop&Q}@Jc?@yKOE)_S8>Q!XuAwho^*| zQ9^&H#pJ!8YdC+1*xHHTi{?C>JsIJHvj_&+ydJV5CMv=mggrE$J(R^>T>qrY#UsI@ zz^FD%9ymU>g;H(uMR}-*LgQURz$4BlBr@cfirsqC^NzS`9t;V3cCf4dDK)c@bJ9=9 zscurvOX$nS_De0NCv;h~k?b9BTO<8dU&n-#>esY+n>^5rB~9HqrJ;C)<447OUoGdg ze>$%J&ZaLxeOz>ds*|-wqzBF4Aay*2M_{ynW&XL6~~uH#I$KByF$q13qLl0897AwOR;yJZ~~s>EXgXSK6e+en!1 zAf|5p+FMesUvyS_x`x)E&y4mle!@>`t`&Q9!31SFAJ23cx2r!+Vs*ZWL|KIhH{yEs zGFhNR?sw)o5~V58Asq_!^gWs}PM14tNqc0HPI?kp%=>o@KeB|(KRNVk;yUA6G@f-mc`l~xf9L{T+7>&ksIxjUueeQj!s%m@n4BG3jji5Ll+4SP5#~3=b}Xz zl`UVBxI8t2>;I6;{|6o6L0KD#9nq^%`+dah_S^dv^vhK0a4Jc^4Gl()Qw@erXhm8P zo0hw!uRe|O9NLg>@yOWfnZ$Biu$5~~Z|-&94;c*WI%%b39?&nND1MXi;>mvUeLASP zGU#Puf@#@ea(orj6&}n6(3_C;o(#Fr-r$K|dMBvq14##qfCE9qGuIsIFdZ@+h+Fq=^JxJdR0ya3CbdY4%%Y;g8wUZ)`EE zQ}~8=UrS4s=ApCxnz8+6u85qf8BoXHsAdiV4|+^_ra&aiB1JY3nM#{E`H5|K8 z9fZ}lI+EHZ7%2Qm8|(Dx##iN}qZhzjG^DF|zbHV!k9opG&h#Y-H#5Mz3@NE1ZX`*( zc)uz5rF=tKz5&nq*pbhBI%xb^DaPKr!LpK@$Yx?}-|ve!DL!=h3%3lQA=JlizMp}w zUDOaOD?BnLviaVY4G4}ltjUhyq-Q#!f!Np;B#AFRSv=q|ZX3}cgfYBWG6Fkj-UT-6 z8kNGnxfnLW75xLVuk2%%hBYgZpZ-a2%i*KNBFGVj7AAZcau~?c}*f8#BEXktP&yfplsTmEEk#L7Bw^i#pHy2 zI2q;_me-qn_=~uFl8jx6R5(nXDBr;6 zUw+1y|G(Xl6&M~(wpV?NluRx4v_;E8&xQU}cw!8rVx&sxQP>!%tsQG;1$&by5K1m| zdi~g4UR->a*~WS}tEw=rSG(+(Q}|+rBPHy*ZsbLuM>Y~^&mDAFObi~=SDh`rkSHD2 zDMOmpCWEz29htbi&vaZegvt36^it?l_k&rS?+@^(phl0N)4t-IEqNDdn^S_(eV0UA zM|yc@;5+g_QX6xM$PnN@-6hn^*;IM6TVddo}S9ENb{QHsqL3;cbC*Yf17NX=L|jA?VKff3j7Q9bfFExwSJTk9I_>m{*Wt z9f9$DyB|MBMaK831zf(7hE9$nmV>HFP-UR4kj0nQ=FX>+iZc&khQ#~iem!7D<)YAq z=o9DiQMb;?7;Zf8zP>WqM=$TnoJVvxYT?XBNoht|t}9{mCQi%kBBp)&grXf3#H(iV zz3o80U9`I4Q*%X3;PF;xZ1W5<>!?^7Y3awPup922?Q5BCggUdX2Xk zxmkY9UQ>4Zgq-hrUjbJ5&_0c3VY2oiW*Rl7x%FdZ*l46Ih03FE%7bqq>3}MLd4*`5 zo0}p@B)Xy8d!1XBBdA8e-qd~3UDyv4-2XW!C6gw~hrb`3ANl~?>}gcH0rOOt^=qzH zInOT!l-wXySRt_3+@P|ZA#IN!x!oRXaEhq;dW97>V%y+2fqsaEWT4;x%=cA{yqoHX z#cw}1h(#uJH)*BNal9g6K22{t&(oIB)TI<>6J~nWRvz7lE@!2AeCHe3WE*>BLj-T3 zszjZFilHL<3Nnx~nw*tUZN>I9si^rY7@Se*kDo|_vd^{DLE(%f5!du(0Tb!a@CmWN z8Lb?OZykfwN{r|1gRtN#s2i3?&0#@ zVM=GB>9rw7Cdn+k0srCRPcO}HWwui2n4!|NInoyut7U{+ldAD}is*SWJOo2U)74d~ z?})TEF@>aRo<8@9OeY0(E|Z+M)mc!n?bmZlMf3XvSEo)=2(I^6j`JYP z_M9FVIoq^p2(F{$JGm`mpEBG^HzU`HIb}IU&WztD4zTXUsk)+%^RGnc*_Ci97~6n< za)!6JyCGA7to(+*8L;s~Qx8t3-H=Wni|a4)<-{etwDoiL5FBm>yL#l{WVqI*UR^Eh zXlJF=s?9kTA?qwAln|EKDr)0W#R+TeF=$+}B%5w5c-`NhuvU%tG7TdWa!(es>OZl_+ zT1OvBd=%bfvC~Q{!ijdC7MQnzm*ptZM5hlj#p>9Kx?ktTg!Y#aNWQ#kR#tvZ!s7it z0wEtbrjJdTuEDLdp})}=xmW)xox?ydKHtXHCc|D>NFsfoR!3v&fOrqDtkc9Th+-ax zQ1S-RdKA*8%-#XKkt;A}=a_jVN9Sm^}@eahAT$otF0hVQ?z z-v3Q|1?Dx$3AUEDhZd;&GG4SH@nZ>2qBoKIXR zN%a$5yb$}@j;2#ArlPlQ-6+=U1)==4n?i?J=NY5m+EXI9FjkjFlgOs|$%6SoHBU~o zXQfs@M9lPC3+HU#ZvEJAS_55ID|yhtV$|wR&e1{u9|!-!wZ*p6ZGz=wN~GRqL)T9+LZd=W~B{4RY)Y#w&*t5iC$q_Ac|7)qm7jbe!R>OCan?OPLwN z4f%eGr`({TkuYK>2TwMkd)(s-6p8Vnh|`OGOFX{;LljzoYA4>3Y61 z>m^$;V<6C0tATc5j<_Gsq4KaP!*)ed(R;?%Q6K#RNGAU2dd&e`^47OsJ^sbDhbsS` zwdVtv?S}rrre*)W#N!-~L)JipiN-x3+IfT4|Czqx@rb!v>5{*UUgEg#}V%}U}A)j-@d`1_n)cT(500;h|*f;M3 zxO1C`XE0#%F{&?`O@px1QS=uxCf<|#tiyR{^@!N;#2}Mw@su~1*ZH+Aj>Z?DO`TNs znM{*?Px-0JxQhwtKZ=!PY!!L)_+8;!ib;|9CTs9ZG!o!i{U__(B{L{CrEGfSSqy3C zI>$h-L6ymnR=ySZ5hx1Yim*uyl<6%zZ2a~yG-8T$?Mkjp?jA;lbDl`k_iFbwPoM?+ zpwZtWkdBYdNXGT>tW@hV@t5asr6#%`nz>KgXueQCb?YtlZM!`9`Xqb*rU>xWJ*)er zo=?w2c)ZJ~D{%Ns2e zre-B_laGz*HBmTXPM9T<=CDqi?DnaOm8mQ`r(~OLXf~OBq;_}Tb6h0qZ3WQ_j;rh(B+H5T->csHg)@8)=mWS~5c+*S z%>@Yk~D11(?_)rkUR`j4v|&WTwfc|da~Ei2MhqOQpn@NG zj4Tx+`)sO5JF$4ZqAFVsRLp%8=ZtKltXBYVg5BhwN67PweFR5)!_N}s(!76^|K9kHNFAzxc~9ut{+faFLwtEooyGqBa~1F zgD2ut+K3qXQ46oP-qr`lK2Qu3C6sg*ihHY#o*Tl$7`jrm#98!Bz^A0BXG{;@WnU4)%}08J+0c{-yfYfu{4)TTdX$x*uLPyG zE7tS#Gc(P|k;a$42{x|t<=Pas+j3`zd=S0pk)%Sgq6YJK5rU>Mb4NN#(Pd5k%Y87$ z2fP>l5IO#BoYI5)6rhs~byM-7Bq0LZyTE3B;;P6U9$8xpl;oq>x#Zxm)>y{imez)1 zYLzf{`s|`&( zV?C-XvAvtC6J|Xam)6sLG92FJ_7a=aWK^TBda-*S@^)kDNDK5WCIwY-AALgak;U=I zp3f{sI;A7);xRMq{3l&J1fEdx#|FcG6~;YW z&g0qicMNZNXVr6-KYc;0PIi{Q zj?BV~?#SYv=1{$ifYgYF25E}r%6VDAJ^62zD2zBPa8pG7g4qb3{g0Uw@`I-E+E7`M zX()8ox|!e|B(u+Wfw4Nm)fzxPfZvZ{U-&mB$j@DGllqOy#^@-t^>@qcX#MRzHEO|? z{MMy7+2eqtuuDCFMr}YgpW*vFRRj{2YQVN5AS03XG!1bikauZn5%YvMrAKD0saG!- z{Ha9#n<(O0W(yGB(Fn!Z6+Gs{kvibf{lWs%Jh=c%r%bT=z^Qss2;1lM+s1IDQ+>z) z*Z!+ZDd?Y@C4%s4@Kke@aVJt*U`pdst{gJZ2$^GVp=JAQP>m2FZJ?pU*#_Q~WIoe$ z2rJuQR17})5reR)kQT-e!-87}bL3d8bZ|n@GX_l)+O*5n$-fp`wj|rdK;mtX zeaZ1;i{yQ=JX=D0tJTk+$D|5NrcJcm!rlDC2Gf{T7(id^C=J-x3XsC1H)ddH<%A+W zodeWXUM1RIdVg=2%7bIDTqhrVNkG<)6{nW>aMo=XQqxHwT-qW1Y|jRd#Xd4}aX8Bn z#Vxxbv#x{4M^V;PbBnJw$xV!j@Jan%d_;^xd1RC@W`V4*tx zGE8k08{Ghe-RbfR?^@Ufn#+>$<{n2Xyhp&3`Sf8E$hfb6bS5W}T8!>M()2O=>ZM(3 zV9eb!iv4>V?`rN3GXLiFljC03-EWoXbcNRb9^zV1gJ`4MK0XF#{*52sp>HvF2#o3L zh`B7H-Xde>s7fJEwwyhaO*U%f`}wRt^lU;zaB!nuOYXrYQ0vA*^W6n0C=rDd(}gom zS`lwa?{du0v8czzsBAI>9lAx5U_76WBIH_9HWTwJXO{;HBXoiK!F24J?)~Fgx2e-m z_d)lm;d;~`bVRXyE-qUc7i#kcq>WbdD9wUf#+f~34{FG!H$!=UHaYWv5`+~Cl)-&? z*gbFH)UqGW25f=sZvxaD&ahD%cULFJ5D_E-1~BX3DSiPSz>|FChzS5S=<~@U6;36W z+{|HCVcbVfx^KYtxdgm;BJA<@eIQdjLA*wjGCu;7sWqtO?s0?~-^21qs`trNO`;&+ zX^+FI)6sKCuT(3eWFoHtYLg=iU8nOm#s&QW`?>-6%M(~1IoxOhHm!@29ht>(2{=7K zui+q{-_{uiDh?5rTtKly3^NbIgI)v8R~f07Z$LYVz|dF86}0daJ!(0~O@n}?fCR4x zZ-x!vv9ichr~mfawMmc$cE>&eUe)ADN=wpw zVb(4aE?q`9#C>5EQV3D<`y})-<9^>(uEFRn^!Q1?XCSegTgq@it+kCRzzX^}vg!8&mUg}&?bIEexpSwwy zdG}YpF^GPUc!@ersVhvf`9`LLW6~~~Pgq>--tdjdc79QZn@v}M$FH?udpW~j&H-uM zF0^&{h(E53mj3{IHMzkpyCFZ@H?qrT*VE<>4*g7OES>cENno{z^e3j!!F<{b|3e4b zegRszQ~giZre8Sb8gY5#^La3_yvjV{S8913-h1+SuWZRDX>TAtIpmHN*|^lZ2ZG%3 z?Nkl7$F1@hFFhLW*91V;KP6rNM2`WWSR2zX+#^RGW4-MM!jxq@w9e~M1AhroQbfn6%8ii^tAKltwD>HxtC{{OH0SxYaqBYt)f{#R zLn{-oJwilZpgGTrVZcOz8^al@{L_o%3gbv8K`F9jA1ZhM=uJC5( zv%Bf>!xD4ME`-DVbTF~uNV9=bIWMcjd)zMB7@=*!s$SW#rvTtdZPSXWo{V9xbE$84 zRjXIQ{We|w6rNwv?Zb;F^JrSIlH0(ylfDFgOTv3QZ}^*c-jlu{ly#gSX9vrup)}0M z^FO?%LL}31`lFq=sQSPJ!$8vBjd1pRZzsP68hyi4rPJY0*Z0rj;MtMo^3XcpI#J0z zE@nWbUjoptti&zQp?@egEY$jT&v$c5}hA*XXDc| zaGxVm1x%sc_61|d(qga&Ib~J~jGf<@-C4vm=s2q}n1(r7;?yFNp-rCV#pNh#tZatI zLR5~n-l-Xdm)iQH6_@HZ628DOgb}Q*J8t?kSUkjcNwgBDwa4wu%6~Q}YHM%4_u1gh zize(*$AC%)6$qTrAlQc_!6#DUk&P>Hu5npqSAryIBy+l+#&mr@C{evY(%5E_e=uDZwffPkFptC1potfa%<(Y|Ett zubEucHhus1qs@OdJT)TJQu`~^`ha^~ai3|u7hrvIHQ`qix9X<7vBxi7H$dN>H((+V z4Hu*5f@s`8^qF};P?U@;4vuw=rI>CtGsA|5xnGtS9z+M+{>@2(B=}Pp85Dru<6O{ z%4B2R3me!UN#+;mey5t?{DY9q-

Gl3!ppg;1 z_`@k zT~YMZNOTnPpe_Z}6_W<;z)^C6nv{9;hCq?y^m9+XTgWKeQmuL@Hu38>sHRwd$>BE9 zQ=WEvo;KhNlgBc0oR~cTeYoAb7Q!HgXu@*0{_Xwo01R-r#J|#&s=qp)nY*5u=F#;o zk`dl~y#p8_nuM%dc{UyvHGmHmwcOgpn3n9C21-n2Ioc~vE@RfMFixsP334Mv5h>^A zo)D3nIs!G&-32mZhTQ0%GL%E>HVK)umAQf0BW8EO220e%>bsSnQ4LtHFB@*%fSLT! zo4;8?N7?lgeb%E=nWCUV{e~)V2eMK;?RT!{xvbN-?WvZl#CkXS6gwIA0I_>A46z^t z^n?I43K8wbGGpU5H8r^e{OFy4wU)?QG>ENx2=pTv&?t(>T?25eCj15(N%Axils}-y zuk5}CT&M82Un7^Hf`mOe_F@p4ATT35d?;O=+hWT==z0YxX0jMK z(U3WMG}KFu9#3*_5X!lR`L)s!t;k}qMt6Y4iY^R8`ZWs${~3_Q@QudDVoaZZ)jZ&Q z2weox>M3AY^vr!=8IYAO=u}G?ifHO4wRZlxQ$JL=IZwNFg@8PrXF3{V0L;#lKu*!{ zngiUW(NGUQdjB$I8jR*9p^5WWfe9%=Od6wac~A1bco*%TdEK5xLxj0#h+H2{Maclx zu%m=>Ye2*kYGtQQgOi%t`gwuQx8ap12=F|Y4ouQ4a~bx=X%B|D*U*k$FhxZx5uBZv z9Cd7Dr-@)na~IwCQ2zDlf>Nh0%9{mBg)jalEfYslUmLs5JonBNnm3IIa?U5Rh2%43 z%hznKM{Qm;)0h*`ZPIm)%0|rGzqaNcAZ=(j!+#L$n0u(`xe@Pa{;lEUENbuH&8&S^ zJ(i*kZnCsmz`^UUSKHyL;rlpn|22OV=+Fye9-0#PNHL1PT)ylS{4o$(v>$2oyt7rd z(gWe^qFwDg0Kq6(bwA_VVEvyZcYc1qABy7jRqkJs0OhTo7=R}#u6_>Sl@R7uP?rh2 zLxOB23?TvzSO(P)@%=fpUUdUmboSn2#YEd=+dNPYi$w$4x2CT!h^hm^pQhEU>(wwqU6{F@{2 zuI~0A$z<$@uHrXhdN2GZjx0@mX(hO)BNz+}(;kX|Dp;C3fPhnqnEv$s_qzh5GIQVK zjj_@8L*az`+7K|CJ!o^=qL9)k8-fOTd=p>{12|f!^X0eDm`R-D$fwYa;KgQ8dR4E8 zhFp10FFlSoc2@VNPxwT4rt9mfXzKy&(s={`bB57?OW?efQKaaDpB0R-?&;WBO;GSj zTZtUYKTx1WRrnat@`gIFvSUR*7ppM(Vig-cFG_UzlcW1|qRe89?#hMH05y??S<=V! zBJsilhHA?oCiC{ZbtlksJ&<8RwJ_Y$ww@6yjW}m1TvUWuz_rGtn^SK<4|MLm#a{gl zOF?0U1rfc@Ah17KToP+Ssvb{c3kQm|*0LH@UZ7jF6|hf-O!%Kif{?$xO07Vw5hcuJUw zj1B$chIaAccJmtD&H_hwr2X0n;Svs}3J+&FG#^kaC zbMp=acjC?J<%N0TNk<86SLFCVbZu0Ihbk{2>Rr34bQRJ_Ji>K&3UW0%KuOKNdoo;Q zzvsc{nb&RqRS$YG{0IjBW8DRtyr?D7&?4ITG{GX`_ZjDr6L6I`E^M>yd$(W&SaMIC zpCH>ARkbFSQx9WKK>&XKDZ<}y2ogImneW+PTBp`tw#INu#Auj*fO}pci!~iXAB-SYVHq0xFWb} z-aCEXSgB}3KQ7(H#P~za3wOX9@AEcR^gN8#Y9&Y@wGu&VWEG9sW|&v2bT_tr)zs~a z(zW#or)%D1cn9q)%o*dmt4<}T>Xhv7Szq`uo%KWr$+YghEBc&v=TN`i?QA#8Ys+Yu ze@hQF^!efSJg!N4Y+Hs1izz%OHZ&<|1fJ*7dNzV%+Xq@#Pkt8NNgu9pFDx? z%(D}-N-Pviqw9&XNz04LgxHC1pXv(ydR_=_{`j6~Q`0H5CNR)de{ap;y6>(nTnuki z{Qa{#4$uX?I-chFMZ^Y@T2(mFH=b(hCN~TGO0{ho=3yjG8T1e?kZrD3iT#L- z%XJdSO3cv2by6>pcfZ>ufOR{I#5abZq9ui7trcyUO<^{igr8t|{?tQHh!r=deJtVD zAvhc9^L{~STW{CQ5_8-#8*^!GL|M@a7V^zsA|=A&);6@~EaEKEE+;L45Pfij9;E>5i^Qdkenu*De+m2X($O zh~6MDD%@bVtYKg+!rQ;SY|wk2=aKK)XR(T_JNt(-g^&H?lsTn|izrsPxrlc7*+=w- z>93@WkMiIAmy&{_BPHsV0hdAB=&u-+UVJ77>QXcP7c3OR%xICa+r|-|^{QHESRlD= zK<3C&w{FMJoS;5^atWMc$LajiWtl!~c=cCYsev)xVJS85S4Bk2lyNXrg=+rRd?L~i zkYu_v(-FlUQ{dit`sy{hijhxYDhunSaO?9Mj3yQxG&Bt&4pgUUldQCwxgk07WOda- zBnCH=+F{_suAEmww_tKcPr+bi%cE8ze0Te}gKX|^d$iJKgKAE+6T4HHz$D zoBkrO^ch`>>DHVXGh3spBq>c@3LO@?8@MWO@^F}yO{ihM`8wt9*ckAV7U)M?J?*y( zy=rm#6n#Jz@*tXu)_#`P>vPR=%FwFscPT z0|r)iP;AfJIA+rrUf*%0Ae4+bKY#U|z7#Qx5F#{*jt^04q<)XOxlQud9K!SowJ@>< zWrL&?Iu%_amxGCa(mj=O!jt#$kn?jOx+_7ahJl)cM7f^asAlc9k=ua+mq+B4k)wxw z6FNz6-eZGE+XrULez&m;=Yg_GV5~(IPyP#0HAOTYLw<1HX|cxw@l?Z3HhwoDoRnFg zfDUudNk#v8=pa8(4La))tv&m8y_b;iDDA_LtcDj_#M*|3O?CYtVSf0fs52>QWj(LF zfzy|7t$axwH8ZHs`|FvXIN=TII_15#Cdr+Vp=kcbyu^h+8@u-WGaPjQmvxeq`WO&= zIi{?t>q8E;(iV0N8NaGyzN%p75;BE7-`o~}-ErC%M~vSQyLEDZPI_kWDsmZ!PZ70= zlpD0|1RNU1KU(&`67G`OkdjO2D9v=k>F~8U#$HBCHY;=Fu$cuU{G*7fb^cmF(r1#q z-2SZoqCKWDu3dHyyXDbCvq?14OfS3ZdBc&*hxR>!*3J<=FAEIPFyy_-Z;2dhJg8U# zu7}(zQeB3ap9wV7t7MrhCJ+tNlgi&9n2?+P<98_syyaC^*pQ5 z>koZtjK*d^txmb>+?SpK>ig(ru?1ih!{o~ong7aHxITbipi_vY+(ERAx#h%Ah$Tg3 z=ZVC?^b3K~u4o-O#J=OQ5&(oR_3p6Mt-uv6Cp=qjE%|f*aO|`MxTHbPgH-13zbgrAzM(| z$JgZ%MEM((e*YM{8Npp35Gega#?wkes_^%4$&> zXf&pg%Pk@cUV>+pc!q0 zqqD*KuqNf2NAZU&B?EbzHKRZkNE4Rq;Q=EovHSV`UtYBR4sBEd;j%&ai%HAR#rM9l zoNqU4lbolo6rUSotABd@DfqQSKY_#CE`N0CkYB1z$g*D z_%530m-gYwdsG}J@&&Z$y1RzHTA@zKu^&BExn@S3CxU`S|9%qfBt~3!f2JTeWA3fP zv+EN3(xfrsYQ>#H$$Xbt=3L-Od3CA1;-G36oqgO1t` zFCQLPzrQ;w!+P0yuRX+BwIim7bc;(>c|WOKo149h7z_*H^;U;p{v@%GXf*+u9Xk9H ze9SiMv|l>^LV%WUqb8W>e1`t%8v?e&Fk13BU|-*VE{q@rK^5UhZSC^FCk+1(qJ4WD zXo%v(+OFkJQ&Hqj7yejZ{^X#V^>>C)E)b3~d^X>2(Ds5Ajb&(9Vu|xs^>#qQuX))O zhC5U1Sgl)lM2;qECE>kH)9Bk=wIStAM?^4veoDYV5nWMn*Cf=ot9lvK*%lDeI#q`? zP8z_p3|3pCSn4U>lm7`AXk9r5O zb|yV>p6B^JExuRp z%l3q2Itg*ZF_g|WPJP`@5Jh~??h>bY(@OKT*BD4EW;ko97EMQ#wm2OPmwbuQ+NXe@ z{L*(}fb@9Iko0nac4P#Ut-sGe)|JrQSDXLr-e2e{h|PkyB-3a-X~{4yN(eC)tMN^P z(~9AENqDTsN#cFY*=uuUcCK>oI_T6%-dwpG%LcKNVJZUkI31C^-ZI50-Wh?(suSmC zS~a73Zbq-$S*OW#KlqZ(#tbhvGc(dG)bqe_a=-<@qf{CXqqGN;NAaEODDUYMGwIp! zdfE0as^d(5^lY|lZy9dnPti}_dP6EIuD_|=4I#$^dSh7BvfYP$)LBtQZH3E1OO@M$ zr%y-cRH+&+-Ekkp66wk6?Wk2|{eFLSmyIhoHOMOA{`z4ePmgsrw33jl?wo6x?y9_O zK^2B`K@{+BNuppI{Zh>5l);NdVmnH0t0OMx^YVppQ1q?o#w52++^Hb2q`yT=*Txi} zaNGQfF!uTri)PU0F6SbgDamP-?}%V~8F$WN9bSu1P(sindEkKWq-n5;J2}vLOv}I! zw0PM=F;29)DW2naBF>206|9Z2OKvCQ($!#=$9=Jg=*t!XV9vX6yXQ^1XA$ zwjuMgSbTi^ylw1alC6{cA&I=K=K~yknHT;dg8kcM zq)zQ7u{?fapK20$SOn&R4Dj&>Jk9i;QlS5{6<|GEd~Myd_-Vh4VsO=1Vp?Z%#&N^( z`Uq&;Mo;Vrh1Xvh6*y}$>*~wZttoY`+d#G8e?Kp@>b^J+5O({NyqZb)IE7>bhqX3t zgiNND;^a{rTA6T#ToF9@A(1hH-7NRB?@GP90V}s@o9r`yOl_$RfX1jb?e+rR?5~gOk*84-gz~+L?$d zA2$7fD^?wGXFG#p&d2uCi0WJbP2U$-nhl zu&gvS4Q^6276Sj!6?NY-(^~jBlP-$h;moZb%Q2CZ;N$SAs2f4Bix?;U_ll$3c`G50 z;koYGr^kH$-Ah)UYI?m1Q+nJi4R0%m$(m?YX8ei+wtW`=p-lU9(RJ~vAXwY=b~M&v zU#}-cWBGoP!4^Xr^a-4IpJteZCdt6*{ChC}HebPAHfrrT!;8~00%0l`IBtCE6*ciqu+ZplPqe#bZTAuzs;a9%eK?blV!(sRJ|CuNnq#-h`E9Ujy-@pANhu+ zCR1{?|5MM?3tg{*>s0iz#Oi~wfpavE@zVLI$0*`cMqPtHjZ4@*LeGB9A9z`$EuG^* zS%3z6Ex!Lur9;5U$Sq~E@}%=>YOp{ud{P?P84T%ovFL6W@^p|Rr!;GwS6!q2-{vgW zxoY;WcpJu0lb+#8r6d}UASk{ahUdQ~iB(M)8v|tcVC%s5I41)eJ%=?5w(1Z?CHYQy)SljP>m~(xO?3mUdj0Wj;*3T6Mei_wIn9*!n zHMF$e`VD(-?=oD5lvD~YK}pNRW?79FqDN2Ft{>@fRA?mNJ`P9Oeo&qK!f(=(CPp3c z7_oZVS^5qQ<>>Ecvu{5HP%}CtwwvOP4h-~?IP3?f6n5@Gvzpj!c;xLLD^DOQ&BVmy zO*crT@h7^TynYFDF?xQukh>p^^DgPVhp>?4Rk{GnhhCf@sg{Qh6pP7r7fm=1XRnAmGzGnlF6h^#1v8>94$i(i(naEj~9b)&11)Rw=b7RudmQFi4d4 zec+W*i;4R8NS?C8L?fHdc@l8E7oov;VN6kZ{a>#l8^ zO?Fd`L3PQ(f{0zMa8?bdfY;8G&fK6Xm9%x^-G-Kf*I^HRbVs~(e)nV9!u4C%5q8Pg z+*Qmv_00!jv7!GA+d(P#i)xM>r-~30oFA9k0Xhw|0VN-hO>@R>R7@>V^BhmPoq<2*HYi=4?IT8 zeGo{V57ipP;^EVoZFDiAb^#0&27$NO z=X3X7j&3PmH;Fr(V6IOaS{dkbL}8Ru^*=u!byhHW{*BCpucybkk23Wr>>mkx=eL$$ z*CS+R>Md9%bB8<+W0+kMykgSR5jEyc>8XZ)TCDgzy#CO9^Z3>hl`To7I@aA?WoXdT z1AG7NwjH2g$InHkcPKbLu0uZpGnr8XHVXe5B|~%3m}~(F!_8UT;$}q;y8kawV~i*` zIYe|gzto$K;A#)F;i96x#U+pmNrobDql8g=mEtZR2Y>iW6@fWT5Yn^Bb5-|&o*0c4 z+Bv`?GI<_`SRA~?d`CzhM8*>qFB&-JJQlN3iN-jDTdXe7vWnxvm!+Zhep;=pl04}4 z(&b`LP~Ci&q6G7Eh)UZxotSSMPQJn?RViukcZ{K7AI7i^Zg+d)TrTQ~Jlm!(QrbN7 zS&beK@a@v>^Jbk7?`((cWD2KH+P3?=RayT1F^}zL%Fx zrIk^8a^H`!>gS_)1Z?UF_v1FlGL^GDLB%k948 zXB2P9<;bmX@!O zm0=US8XnXRU+TwuV>@)lgVuPehY%mCAnQb zZ`gW6{Kp^(tmws$q5mq>5tsHS9s#cyhRWcG7*&_d3!hilb9DDuWykD?)*QaB&1GKg zn=h!CKZ##2QIGM^A?ZWY3x3MOIjr;#)Oh&cQUXhRLZY-TF#qU8Lz4kRreQcc9ILBh z$SzfGbZ$AJG;1;kaAG?QX_e{H1sdYBI4xa@f^34g zk{5bE_o5VtZvq06`Ut1C;-{SF#W`}hX17b)DnI}dULX{I@?Ka7&mfm(uA`hak$y|y z=Xi19P~o6+WR=q%OO};R3d^|jJ$tSePBR&DJbXNjIkg+p^#1+&?bKxW0rr+U+_G(q zJ?Og%ro0A5kqrr%__mn?&Ep0;8hi?wDq8f0F`Cw}>u1zqX=5cff7=xAPjeAIy4g

gK`ysGgjP3X8X^`hnqCpr5*AZ4e z{zg*6DsOtcw{fPN;?}yE8iwv0|Gx&wgT**HWLP>bprAWM`JtcRv@WS_Fz~pKHx>qz zqjO7{5+D3%?%X5YcxIm7ObqD|3BpQ((^hYY*ZyGc?JaE^ejQ9VS{kh#?_3<)YOJdz zIGWLOXllIsd&uWSgD4HtaP@}4wKHDgAHIFs-z(ushc_gbXz%-@RP5rBrB1DBd2F$t zbRWjC#!zm5jhCr>JRG{VAE9UJpPp<2A|2Q)h!Mzy#d&`1>i-O)*nuY?VG%C`yYm!= z2Vsh87-L(^{1X()ZJKX_$Ht;HTz&36e{ldx+Niy@VCjJ2W4gHuXA5dzXY5D9K8r$V z^TSs9EM8JlFu!?{elB1et0!H+!;(r!NIf`SGd{at&$3)0-=-TGO#8jFt{B-F^?w0IyW0?Xgs|;w9h$Wbg32DuT^~x1ZK*;v$o+H*^LozaDTl&>SoB&mSPC@ z9}*7D6=%&d*CC9>eXaSR25+af^k{1X;YyCP-#B$kUVOoZMJSuDi+@nr6qC*6=t=B{ zZ+*|ezEe76%pVv3?;}4^Ubz>Kdc*5B{aPi8edvyLl_NT(bMi`>;eMk8wzyj{$Xb(zVN~Mw>P6M#{(oxBeKqyx zQ&*7(I!mrRORf7X6N(NNANjv@&`ZMpV5nT%HA!5IbU`qqc7%~(7ziNzd>no|u46yy zqzgjJ^k_VcR)oFmYx#Y>^2jn4ONV$#sBHZdwRrWYsaNoKE~eI5n)wKN^#@y|w{Lij z3Pb`s&wsJexr=B@p#_d=*>X|ZI4-Gvnj~0VQ-=Vd8U|jR2q$%DM{T!g5*slolrM;C z^&6jA5@k9Sg!Z205b%8(V0k@J`C3s1+Z4@Z z7Izz${#;Z<)H{=QbkyJ{es=7E!lO*at48!{BS8Yf$2fzODQE)iQ>H&7BX8KC2XWNc z*ZZ>vjbfZTCstXR1qgaCBlB?Fv(2%Rh+mXU=Lzoa#TAJtwj@!phZBjSbHjth+Rv15 z8T+p@jwVKv?oHpn_H6gZYoF2+EBJ#m_A^n`DJ<0eUBe=& zrcCSYu&qF!5xv6)?(NCaN8XLs^FASF4i3koJPtltCXZ>CaCx8u+&k4}hlyeK!tI{sF>L|F?uYP=OEj zs|iK1kmd&-=@&GKc1n+>KE6pW7AhTC&?)w$fUAE#(H6shP4i$^jzf3iZH7w-1u-l{ ztg}zFK=|a>&A*+cR*_KojbA$1M$OpP5jlVUiB1X*@(#}&VP}o;>Exc9RFhR|syriS zdJUsEHzjB)-Ep=iUHf@7R6MMva-I4l$w zk7wxj)MRp1z4l($W_dj|U9edj&odBmvUv8+Zu=0YNwZXxT4@4g4h=5KXBgH09jNxW z^6hqNTJpy|QvKsh^=eHGE&#*2$ng~$xJO@H?oOP25aDOe5lHkw>T>5nDjsWRb#E6a zU!sM{N8D3j+Ui;%NWwi~U~yXSuCH!p*0evoZQecCAis>qrtM4MAQDtMN=Mi@%`k2==fD#B6>;YWFK%9Q2%X%hjagr0 z1qSOcG>GDgrZd}@XB59(^YP+T@=B@jH%!&xFTaS&cwBU+3E5}BzEWr{o~$= z5Ajy2Z6GEjwD^SbsbU{K&^;A=dA}ZCY6B))0=Wo3Pe=?AXqlXL_XFS+A-`VbQ?5*FWxZV8X%Kum z*RjX4A#ZQeBGR5B1kHK7Hd3~^3dOMSe*r-PP1m(#i2e5Sike+xJrp17-8M65>FGeL*rZAkFwlt z_eq0HumYPyrRUAuioR{V!jjGjHBi7pt(1#NAcnYMtraY;Qz72u=+@5)<%T};1!3?U zDZ1e+Gtz>?{8*h$p@=)mzU!!qqWWe>%#1wU>%u zjH?`+i|N)ab*xNRnxw)L$VK})9ptJUvr3*Pi{-LG93tnU+Gk9DoHuqEg8!rIR%FP- zA71chF>blnPdBCpBWfO0+`x*?a0ju4^=Lm6{WwZg=YV%P?T^gnfDmm7Rn>BLcMCy? zES7C?o$X8+u|ZUwhO54AQ;mkA3&(@{Dq;pLEUes}nTkF8N_~|Pd%UA;WPu{4{uyE^ zZG(mb3%z1?tLBZ#I4QQ;2;8LIM-KQunGSyLcxWY{--R1z?E6~96+~G7=vrj%Gik7_ zsc?T|ij%C0cbK4`5XNp8czS~Aw*@l4T(R(3` ztA?Lr4L`3jn0j+7E#iW|Iuoa;T`mdDE&D`@;s-jV3rz|h`KKnCmL0`C&=hdSu1}+o zF-85^<;FCRn!-i-%7<(062u{9Kgu$F{~p1l{6b#@gfy2>c0bAaFLNinBNv|lsznV^ zfL7V9`ea!^r5JPyHED8KXxPlA7>N?N0+!WL=^Hfu(Q)ZL@xv>z&5|ymfIR#izb1W1 z)3DPcyBVsz!}30ew*=F;axIDamSB&r$`5NO>4i6*q|whQwMv^$T|%abH2Wg%YKr)g zYw~|(SwgeiqZbKJ9|wtGItp%kv`WzTnP40D;R#pbXFgysJ_8fNcsWR~$-)+h@8|;g7yFjf0pzMtUtUxlU6gqRn?ZeM zo^<=lzJK6y%|>5`J+h5Uybh43PlmfLp;?#iRO8eI2YHhn(Qjm;9yqDZC zL9}10hfcG|d~4vnFkUwQ{5_%7jCIQUwnv2?aXhd!ybzJT&72iveExCu|9j$v$@9p_ zhc+Z87*CQQQ52?eUA_MTKbY`^bZ`^ue6zwc&uL!Z+l{uQ z7*-Y`Bp*>HJ;sx7kcZ9fjPvbdHinh1@CLn`ogeFV|82GRqm)XmoBak@-ig9>W1_`H z-c5<|!$|5`jg-f;rjU#v1nH~Zph{FL&;6bs&iqzq9nE9pY@cI%NJr8Ij(y05M6n9+GsR5yxw5m_QM?Qzg!x8q^UCkDm|BBVE5#oC%oPPb6(fJ+v z7n6b?qkV|WB0|JRSGINgz6tVidV3rr6a&b-x>cF}j4z3|`FvP*oS~2F1w|Hk|MP;C zTmvL-+gG7&@~8HqA0aXnkIuQx-We^EOL87tv&ALQ0`16iD< zAyaI=e^&ELWQbsYk{e_COyz0zXT8F>Kl$7&-Ck#=CTR)oOE_XXQ^olRjwqK2%~o=i zmu#-fsi@Fb-8#;4;&V^f)=!Ib`}t89Us&9>FWZ6zwun#;*b`Ut8&fJv%apPD9c;^K zYu7T?G!4oDY|vKMfj=+7X>V%!kANaOcwhwc_-EdQK#7>G1PLYFOUFs0AosM_*2t=! z9*~=zO7UIo_y2!0&N@JbrHN7kOS9|2;iuFlYlg%to}sL8O-f{fTSgFy+?>J!3iZIg z*R$cMyA*}roB>|FZG}ccrwT;`1n!YlLs1^zkx|n`9_)|W(p$P8esc}D{rIDNH|$;8 zH@P9f-w)_wW3@#dBYUGYiX@9ODq7{&(t}ipIqnktyUdPRcMMmvu9Soz&&Z8{9z2C% zoRuk5nB<ZND^K1^6b~Ze;`Tk z<{I7IPzXj4)bdYwK7hWH7;#$^KQ%FP!?FmbA`WUkjMH zQH!T^B|Ap>--G%Y=^soyRPV;6d2l?b?l|T(>T-Hg%8g$cueDsqz|5=tKNu>C>4lm{ zla`FitePX=S5RXTSs&>udF04e|{BpN5GRaaov$+^yYyO`&s4`${N$eQ1(J6V>uJ!uW~_# zyZ&iu`}KUgYLHV5{4Yy_4~)Pt`hTY2cf4AGS{I`%F2BtWKMt~FbpG{Tp^kiE<$i5f zJ-&$>GE1?J4?;-pOn>${;H22tXd?M@uqnq$<8*CVGBxXs|4Wa1%!WdH3c=!07Q z){i0Zv!(s(CHEiC74o%>--{MY*<1adNNqa^<;-K93b^VED6uJd*!K$=$0lX- zqh8^ql@-`b3e2r z2btNFL-%s#uAbNb@~V!H$aKjOm=5ay`Udc&Qz6Dx0167q?QJNIWcvTv?PjMHy@Bu> zaapU1i7Zz%ss^6R=xRh%P?Qk!aMUVXsuK$5jmm|7W7~LVMl5FeoZCuHjxgSPL{I(p zq|7HefC7PCjo8bqY4^`GwH$MO`Lb1;Z}`GyN`I1Zm5Y*&2J*jrG*BUv&)|h9Zi|{w zS;_zL>A+Nkk4wT(@&4zhkQ5b@fJmmCJo+6*#Q*q!KO_C0e*?bx|Mkz#&;x-5V)Eu< zn={}klh-7Ywt0uSf=56Vq+Z+q$83HxAGF_IkF6mD%az5xq&o=T?qynkNxN!p`LXSq zi5%9`)OR6~8~8s?kaO1vhx%U~Dv%30W&43h{hRf=A56$H-*u~D-E^pW0 zUuw1AnXNoM8k0~z;_&P}T=DU#9^tPx?GDpR=ePgjm7#`Q%)lD(HpS??K|JJzFx^4G z&Dd=vd);golL~yLOV)Lpok{K20|MYJYN~ZuY>JC0e0(==5vc$9ZxWGT2{KCfOfZJo zrtnm><@t2Sw#HZrq3?5b_68z2I;wJD2AN>FKJH8N-5rJR%&<^QQKmEJD3YF{;xge8 zvA&Wzx4+|^I{Mc6)|u)4rP{f7dyfK*;N98|{ zyTkI6vwZv{cQ-a~QaMjkBE}22ck{0wkONYX?2*xJy$pHU>nWR zROnNb@JKEk;|D$J#=R~IDYEwS4f1C2gB_51Q8AFn+zgv&bg>%4?;#Eb9d$=|6=dUc zy?;F)<$nfeVYXictCEeR_-}vz@*;=uCI`&TV*Zd6R8qVs7P~oH=eOvG1*mQiV3!MR zz`H+h8oP|}opz#A&pI^kT7qG=vNx;*X&KE`mu4uD1pdP}5%)crddcs7{HFPI&NjAz zztsm(NA%KgqW;lB!0omB(TG5Ega;73KLLrZL(r%Qh5s*$5O|EFh4_3WeB=RXqNOpV zYcA`7bttca3A;QGnpzR}yO{m1%qhnA?Fq*Ysm)76=TKS=yZS|U_u~nLdC{Zc&w|uH z-C~u>P*Cga0HKqJlk~<>Zku0^%iy{NayZOOHoyg)bFE_GT&6)|p+@dMzinjtI7GUQ z?C@d|j6e&A0KsU|gBSB9JyAp+nyqkfgzPTxI|KX94%h9=TTiR_?dN767kn}Ugz&Z2 zW@<2&ZI2>06j$PEJWk@q)93W3)$U3#Zk>6&*tBKZ>y)p5UTY~QKY{$F@0ez6p;dg= zGG|kK?h^ix(zjXOfqnKggr#!X|MF3}PylGF?wrH%fVAdRl=bMmk7 z9bWFYlN$(y;gDM*`5q31;5dw%Fd%=E9t>jg=cW?a9akVl6jKY72=2Cu2CP`R14CI4 z@R``4Q+Z~WUu1$%tm)#HmKinAv6i>2 zN!=cNd^1y)QJ^;bCFO}Ww2Jv1OJ_<9m6&(K>^lUkt7k2m^OdWe`p2gODaQU@>?RcDyqlC=VKegbdt%R$m~Z-^5J)oW4uQ- z-2=nt>lN&RqQ0_%n}!8x-vEi_N8SgrGtJ5EDgrAU~BJ3o)b*|JjH8oJlZ2XeQ= zn@fxVwK&vNg7Br`=~4rdCYAkY$teO8gGn5L1rtyqGv*qI$TrnT$qKt67A6%~bPx(0 z$)Dti^?Je{Qf_6+_M|1u$gVdVQ4z#iePIQpQ-vg8@-ncz;KJr3pd;7jcEVugwpZZIDrq<%Z8hwj2Qd8&i1+2*pAMoECmV z?V>xXc?zgz)14Y}uWZ{>jE2V~g1z>K=AINcm>pq@9w8Hq4%9KQ!1V^|iaNQtujg~A z&nAHvlF*`%4kR~j-mOa)18I1f&SAQEdA;8V3gLawF6f^n*GX0X#s2s7_Vf1jCvbM; z5E}o#4I%L6z`NmZZnUpKAI}im`rhAIJSL><7*mvF3O?URS1;S%bWa3D6Hy_Z4pjq8 zr(4D47rqf)P%47+uhK5X-#iHZ#8HXOg+_vn97l@-Jn4=2ub?M~wIPxCQkzc!g${$` z*iTXEaEB3Vk^G~{FN`ZN6rwbGA#BHZ|@gIm4vi=>(SQ;QoHTa`b?v@ z1B_G{)o|2Q2doz#G4hn(HQ*PQ)_p|p=#j}a22DQYK67_L=Rd)tARpw6te7s|5P)8O5OCsZ%4k%U2W4|xtl$wDJgMgGOW-x_#oB)O~ zhf@wR<@-V-M>9l7+`QE5(VnP6zcaq$ssvrfeZG8TQAtorEv5Jd6jF|}wUUlO zC2vSKe~K72B$6@J@Vi!Ml^kZERF~}er3UyfZ95M984f`L*8~?yD=QLsu>W^%B_0s? zT^v^eBtROb>k^6?BU}nAm{)&N>I;Q2r5L&E(O(yYME-LgYflV`^rgEn0K}+?B*v(g zy}RdVs~|TZgCG>M?k%hVU&<8pk*)#4y*aOxBK4K`*K!t@MF1ylWx}uzr!2v!Pr>d} zgJEjx=zU{S&)hNipmYLXX^w2bpLyj|TK6_@+z6jZ^IahwATCWLr4hwEk-d{x(MU?W zBrpvsq2TU?26X}U>IkHh)3YvF2at8gKidI&BJ94y5)bI{S$^_+lrC9p{{j1fn$FLa zot%*Z&K;=ex}YZMQ|gc7fR;X@sHP%mNg7OmNmsk@j+Xe53wvrz$OICdDVkS;uzd49 zHuF>Kkb3t=z_xAx1=|5dzpA;#vF@iOzuhcXx@}ow5*7ENgQoRs2Fa%JOfjE1AX0Br zIFeRUq1|k*zSk!+T9ctSCo{(Uu75S=K?sRG5T-M}EYQ;aOMnYICKSMqyk?{^%`Itg zyMru+ICngbIRq1WA7cEF>}h}7?TwEC0C@{6wv%5}JJU@)ofko&Kbt?9K4nO0UAO3N zK*9t>!*tNh<~_VZcL>l)swv%PP2Gps>t~U2Q_o5y?wMl9k;E~J#5&k3j6^4a?H$wdt*jExpLV3V|vS|t?jwcKcq zD&8^CN&kyYW)UDk>^t6}^3U5<6A>}NMF1-gZax04EbbA8OYH!P5S%>y3|Ni?*_sD| zl-Ez2UJo_I-l7<#?R=d*xp8*corq*Y4pY{tN#*X8q#;j zd(HbLFUw!aw?u&A665u3L^d;^Sb`Ma=oq|7>%@DD04LSjEd}wBC?VBQTAhE2-o*Hw5CD!6r}c~CP1mIvu{0_4%-%X z1~tBka!U}lfs!#<(w{J)O|{edL!p?Nk@+72OeBv?E(r{9HsIzDSPYA(m!U{qT+$xw-Fwyc)wVb+?|O$t?B0g*#IK(_>)5zZ z<)Y`K#y-2XwWVJl1%ui0u54-ovGD?lVu%VJ+yT3bJ7DJ@Q`{4QCpvBDqKe3V)9*(a z4D>+%3S@@|8C)?7*0P4&MfZ98J@)sGHLtY{0I>T>U}D<~YDcde>YUCyN^DOS#G}^^ zlPt|n%O(s@y)<5IyZWb$>;@s0uv1gzX|WtM*A^TSIkzV<^$NNb367@smTIjW@}O$c z0p7-#;PuS}TJdja$s3B!gS7+dt+)ewD3VJp5YS~FfqeBjKg8l6JuHkJfw#XDag%jGQ{?gV@OPy5&AehRBSsi&+01&uL4JY#_w&$q;ta z93QegtqF;=g#7pow*4y}SadQaA_n9cI6-2GI=@KnUL|h%SwOpvAdtNVi5PR_G?X>z zZa|5o%8AbkJ_kc>ewTA$)*UpR4hu+2iTpJQIqQ-y9QC=TcbryaQ@s1$mk(_s^G<{9$8N~k;C|j)QWLt zbf3H;AT_~&;ffx-Q-yPNA_l-gTxgSyagx&SiK7(p&Th|7eKiO458_*uc|kfku^B9& zB(C)U1HmQ)E%4Pda$#_MFX7B>52$1<)@cr2U!Lw7jf zx!xDje0MgOBIK1#lQhxY)i6*=`>{(O5Cip?0SLWV=$FYmw9g1OVglb=&mLGn-fV&a z*Bs+zpGHfbfbXA+`Q?1Pvn5R-ulS5|qVBFNkt7|ag9b+P^aGf%JY;h7ZkYJsEpISe zOZcPSVbr*SG3#GoXk5x=dxP%_Gut#&!CuH_+aRw#mTAZ#uV{F|&v;~tgTQFWklczs zaMSfpi^oI`Cghb4Ik=-Q493jv2Q9p&(+~sSzfdmG&f2Q3UeJLLU6g@{M){V1o3>Eo0!y;6M@rdmwT!<3l!|!5w90Fy{Ptlv-nG z5mI5_+Qgc=14Bh&;Hqfzf%e@M4^m0wt$Csg7V|xr|5JRp+MUc!kwgyW@9Wy<;7&dg z|9w|@H^&K?LK6MSai|R)@|D^$G&rVf0k@Y~hb!Tn@k-Q_ssb5@a3GvL&97`lwS4o1hVW3ONMzWok8H10b_2e)`b(98rDqj;*0 z=8!O!S|XEb6|h78d}-{b&F5m;cD_-K)U}ZqcqCN*;j@b*dJpj)y4;sKSIu)-*4F%?McAF%*>2ayZC&8xMaubTrJ4WRrVcwoETex%t%A-6kE4!J@n9p+}U6{hY0 zT+RX=dRLyUxTcTN<5MY8%7c1D{I>Zi)IX35=LalG{9gYo5W{D{=CbpR#Q16EdF9BK zt6@&BIpp~-ptRNl;jlDUWBr+(lJOy_F8x^t&jZPpaJ;Y;8RM>(3f)jR7SPo{0*v!o zoYSNhm{9cF{jV2%m}46c0K2+0JOi4G9dLaD>dpP`wFjnYG7iao{ZCkS z8J@t?_GL5_6sGO6l2t(r7sZ60qu*P>#<8#!)O~Ax!;i{Iu{pk z%%%Cd2z9QqQ<*N=T13JYGXziq zAkN7do3F@CO;Q1LLq}EvN=wDr12vl8c}>>CFVkgIXcsK4dnY8AB|6z1n`0wVC!E-0qbRvV0PN^S?{dV8SHo_ZzxHar8>kajYC_9_9QpTpVOZ@*o-+y@mc zYG0@5$-Cs>tsj-vFa6t30w{Q7w(1n%n~N!B&ej^7M?5{ie#DMo_`o%RQ78;-8QI^E z{9!Hp9SWFQM27@s9;1o|&wV`BI?rdmpz}go)GRQ%HUs6}4gkR;?cRXtY_EM;8LkI} z#{TE|<)p@7b6%mFshCL8eE7?vhLR$>RwPl}6MP@13RwB)>bnmXD$V;;E-F1oY__#X ztBT@C#y}^n*GMH|49!e5hKz4;b|WcNBSnG)hVk_tuI!grt=>Pu3>|=1Lct!j)WjZ7 z?64zh**=`gyAUhKZY+nBM9%_m-vD@bb{H1*!6Y`2vZRqiD-Fr*Mv7l@uncNSy91NzrFej0rMY{YJ{Jw$n38z+Apjax zz|Y$$_MEFI%X1BrgmL!E=g)$uXO$xu`vjsSvpRv7x5Nf^tChN3Z-%%@;LDykIF!Oa z#EhssS5OUrQQx)~&Y{*Qnniauo`qDlc7a*UU#o7*(5VxRkI}Q&o#pJcS8wbd;KCXq zl4sK2|MTe3KmzDtLE226ltP$$=K~*uqnl!qX=jQsd%0(`N>AN5ppK!xdI;3nwhb{L&+uiN81k@G zgP7lMNYkS~@_SSl#aQs^trC7+IOFboJz_tJhM%u0Z!nVzB%6kJ1`p3yU7yY~ccy=- zBAQ%83Tz5gFBL*O`{VaW%Bgt)vG!K!;JVu13gpTH$|Kf&czILxoZ?^?C<{F>8HF#( z9W8cRiTm}QfrwLql>ufYC&os96ki&nDh`|^BqDn)0Ox0}( zYWr6>ulJsx;skA#HLOLY*$t$*$9{CbZyvaB=B4s9HmeNS?G=$g?pU{o;SLv&L#(j~ z-mQlF>Kb{Fy!}py?+CdUvqNfkqze>}p%0ev@8G1tq=biUb75S&Im);1V^@I2cOou3 zOuGa&`>;D47m-~O0kLy5OIJ8hd^~UHdAVC3JN#sUmDZGFLm zc6pTtuT0uvgLhS-uxn60bv+mAr^gDsLHe7Ir?jA3Sx8yDZ(gf`1S4dsG+3r|Hz=0L z?kgREZGD+%1SR7RKot(t?JVME{ia9ORPwOu-&ho%HJQMy$vrQXBrfTdu+#yhJ`Z{% zG4V!=+L~@spX~e5={p+-QjW91-Tp&Y><~Q+0@#WI z7i(Sj2n)~#XFz_DHhydJIWnTON$waa(F)bKX zP#Bsj73dK_9J#OC0&aouBmjESu&5iYMDIH7bGi>KH!3zgNUzpI@iUWsnJOi_rp+&5 zhb5n_fgrYHr_0y_Y}r1MS^cz8vW`G!`R7uS+=GdQIU9~&-&uC~hW*c0(UJBLu}NIB zZLoCvkB!g8zvk}~ViM$^G>(EJ*pGC0?mXIl2p}7sh6}Wckm+6L14`BdsCo~L`N{j% zU8FVB$aO-luImA7L!geb#DVgz1vV=7_XeerMBe%35a3ZM}7e%h}Vtd3RKIg0xSG|15|w+34M}| zHEdMJw|WxV(F6p7H;PJ2_?qF0(5UT(OzrUcmGd*P-`wxj~1j! zF)i3t4$R@y2wJ}a98#_hvn2%D;}r5P>( zC!sBaF)z+F)mz6@i0i7jv*Q-PBHz$6;1)rg;R9@UDu^Ml>|1?E<@c;5GQ)jtfU8Jv zvyW!-`6tOwSkN2`4&ZhXb70}Q8Gt{JcE*aTKfCoO`vb|vmU4?g-T1 z_OXCojZav<)GvYq$9(+`I?=_5`xU7DEI?tma2#NWlP=OQ7Z(K*JG1tS%b!tVO*)?) zii>Kp4QK!5aa??+e_#JP0+6bN)Q~W@nhKEo?uAfbNoeG(Z=|A@yz@!vIKS!9Mrm%((*8((KQac{Aixzm;b6QYVA@{XZ|P9$83 z?_JZRdgosyY0*dqPiY3)I@u~3DB_J({ykG3d>EXzD3L$?@f-VX^rf zbj`5gY%Mx5lvw}GmZ7fA*L>fFACra{ukgj&(~ek6Yc$_3y3JghbA+rE4Wt-2q`B_S zP5qIEG5Pv-=6pwAzXHXJ56?Rg+Atlz(Ku>}r9bzM(#~O;bj-A|Z}6)!7(AA|3Ifny zH0_-uto*fR07`qB2B2gjz>8u(fr68mAolq7#Wg!pV@ty~;r#RvDT=`7j6CoyCczcU zhQbxFO>W40DK4i$Upff*%aD!z0!FEmv0I%+vddt8vxr!70gj)*!uX+LkrvaLYypPF zTJCUco)ktC;ecHNgrCC!4J1S|aUkij*3~B^K604^Z!;D;B^n)C{fW$#roC&KYoU1e zum;*^5zz@DC~?nb!}sOY^S53pBVt`>p~0%s51tG3reRf&3ftm~GJROl);MCbrkB<^ zC~}rETMZ>qlaD2vS}{a=tI1D>?@@IIvz!AX>l0y;HBayJ->F7VO*sxQsAL8JDekXl z?elKj!|mw>FpL&6O~R<-H`tFJ+i5*L{qA7=m15AD%D!6>d;a4Gm=!H%+3lWCCGa9NIf3b&H9S@@s%k)Ar_aMs z>mp`C6q))T86`TF@RoT701$M@-RIE>R8-BwMp)X);~)9TBqUqXc5rA;kkE&K6-J`E z{4_x=x6sZ`+*VAT0W~3A|!$K4BuBhRbr8H2zX*2cw0i(5g*} zaQ+f_e|=&?U=7@usJAWZN89+@apN8LWRkYmJOBJZoAJPw==qp%PJ@|7cD4U3k*QI) zldo~++qUg>nm^+WMlOYc(3&BJ6j7qUlIJ>gOHpg-GBp)tvUn1+#`LE*GIqllNP~VJf7> zII%Ll{AT!c_jj`;($td@eVo`|JBgo+z5cK)K5E*m-Va%;6k;jQ@%fWtyu&y1-4dYo z&w0qGpg}2MGZJFirn-Y4=Flg!Dm~OI3*`4QTjy!TvK@C#erE_0o=f~Ke#F|#FE8KE zvDVAj$;MrU^@w`{X2tht3_3?qMf`NuQqV~}6PvUMsvcROtUKCK!?nBh&}>Nj_^Ao4KR zC=SVv=-{_35_$0|(sR$3+s95}qUsnN7^z7AYdus$jXgHf_hINw9Iy)DZoc&6SP_yk z=>q2LFI8Kw{x0dYs4w#bcmVeO)djV(GRL{++AHgfXBbU3En$%kGcAaWaC43Hi*N4_ z08l@cwEP>~TXg{yx{`M6iTWt#3oy<^-u~(j7)F~5WRZ8$Z_AQq!pjP1l$Pd@kPQyJ zhkmiJ+8i^>gYoX%JY~DQ<)0D)JEyeqmHek61cnQxV;=9QAibV>EMJQs@WmcBJayjt zwbcA$#(7Bz&XNot1*F`MG*OrI7a87ZAEuCG7!w_b|8RAx|D7r>zs3WTeApdAJyHAC z#Bkza3$`x1k=vAZfhC!BAx=#?(QBR$Chx*(-lM&I$*luiYnjJ?C%UW6kN20CaeT|< zex#8?&{N)OJnY>EwuH^xUcixW1kACQoQzFFi5429VI6{9u)+Q4HgRsvHGo<)*+{Y!-5y1{lTv zM|7klV<@H$?)e=;^9Zy|CZcDWkt{?@BAv*S>QU^w2Qi0u982$ZytBF$`$O(1GbP+csBA)|qRc{8k?fJJZu6F1l(M6Vj50$uA!KCl>~+hY{a=^Q?|U5I@Avor z|Bs`?J4bJ>^FGh(I?va5J|EYFt0Lh^7vrZtL*!^-7Kz*VXH>0F*A{9`AGSgl?|WU$cZOmr-fiuEIq2QkHy&lAt#ziS4Nws0dFjYV zJWAC3y88TvqDvm@jKF&E>cUJU=}dBPxYnHx*c=j&Li@7~4~T+cH@-=jKrgft*G7wn zuH?^K#E$4pFcJs<|JURj*Z%h68znUwb>!BG{GpWudv>>D4wAfA?R={F0d=93^TC`6)Bh_GJ)L#bzhF zIt_J&gZz|_1-0sD-)EGejitYa!F-x+fwtw1Lz~5dO45McChW?4r5#SB zvl=lxsSh5?TUL6-oDb6RfIgBhr+~Kl6(8jAB8ALzA%nS>1erXoSIUCH z>7Gd8jx<{BgKhJZ%Vj@^BO)|3sw8yrf&aE^AorQVUZfMmbcGY?w&8fEdCUlN=$#<# zcTE1}xWdj&J@UK*V?MwhQ=|8(3Du_?P@x$xP!U5`&9#eo0H`m?E8D86k{Rz;`!=~My&HD?W&P>XGR$OqPld6%kH@j3N1|_q zx5eBakNyJ@&@(lf^I5vJsJ@y6&5Xha{FW44COr{~-bcH$>?hfTgO2#QP;XGQ#oWAS z{*3bft7=%`WaY0=cakJ$hl7y{2JtZlw#urk;7X$T&vFm(VqU6H_e^%FSOcx zhq@tcUm+ z#mqw-k-V(C)9Aw-4BhIF#6Q1uDlNb5DDrJl*P;lCYwxIeSe%s-3iquk7OGDKm4mb0 zMPc#cFz+XQ=xx!rj|H{sM?1g3=2rjf|`GzGx`&xZ7Y*<;e}gq+(E{SV{5pcdqy`P1`YVzZDO^6lkRSh;`xe_ZP06N-XPb z#KrexU4j#^G{4*IpUg z$WV)#qV5ooKhKg!Eipa_MMFVouK#@mncg|rhh-LSa9UjqPr>GEyvm{%G+Oz(XI!fu zkhh4G1ALImmzmGaWh7_Ao+OXIl{l_>p@13D8OgRtnCyPzC6a3py8cCYAw(s_B3d^j zgT-+7y|;_piK*~j$4M#ld>R!!3+FDW2Gy)1bfNXqx^+UP(H~F66cbeg}b@(0k#YqjaB-*0h zt6W6iceZ&vml1Yw&V_;Orkn(e{cwb9IH!sbEmgl|Ey|%^#LGH1kghmB?rvUx^lhI| zqEJaI8d{=afpK0PYB#I{S)}gX%kyj@UvBP5#`F)?nJ9bhL5YSfnX-+x%2%r|d`^9f zMx9Mvlm(s4Xc1V}iGJQl!9Ygsb)PqQGWXJ%+rhdONi0^or2aLfYR=Y?>$2NzN^d%G z+h|(JId!w-8zTc-!Q;&w^&(knVKk{lL1p!F!tP*bP}f+6%&!OO2N#V7efo~n6|9xh zO=?l0WJ)V8`c2tS`YBlO2hQSKQ}RJV|1!0x$f)?J93`)zP3-e`x;!Mz~=OLDj5wKHR@AVqFT9>Vx*vvXnv8hPd8>b=&x%C`OU84h*u}BGSETFD=mNT z?lq{lvegSl&NsFZqXxxj0wZV_76Y{0vt&abrX zVu7XJAS6&a-Qo?keG|{i!#<%=X@}i|XQE+07tVZy$Uu}p>yC1KwW}aE) z(ad4PERM%I{uQt)7JNs&XUlkMIV|k!-gNR>yg{>8f?VT&s}Yg<14|wYDSPlTs&sn{ zio}k|)}2{GjN#ZP7Px|Hw<@}uIXaPvpsVbZ(nPnpnK&;u3TH&_RQL0BjzM8-k%TV0 zp5?4b>){-!JUd>p+Sy_5sKW3mM&(W^w*&_|u$1&8qeL?1>t~3bnW-(4gkL^{k#TMZ zSLYrfRb$D?^M2RQTSr$raOOQ_>5uIY4tg{Uinbph+ZzA8ieWit*VCh%f@4c=-6!KN z?cvvsz4>f!y5#)bM4tOKP_|v%4&VILV1r6Rn!eP7&xyxSd;QDzKV8E8U>oUex)HvJ^j4>- zz@YC5X#cs@HwGb9*~jLYLak;SrV77cf+B$D|jtfXsI6+(;6(2iIfRjV)H z(V2m+<;tu?yG9JxG8eT=1bVYw$t&-qMc}%5#C!VLTZ%O{FO4k8w~cB=cpr;7i+|?* zjI!jFF#OGU=DU|+TH)qhIzyA2 ze-m*68)xC(b^-xK&z+c03P_$CO9+Kq^2kS2UGPrzx?9#;kTFHI4s+vAOJ~;bVM2l2 zr06P7uyc`%1$7WrQ+=N0l>EAxgcYw@+J4))uMATtexK*pWY!p6h6mSFOF`N=vQq`J zzlvpfs~rB5`RA|+76t|q&6t~%oIR9vj<@Gkt%huA;nm$d0Vt7r`=n|eJDk$2mGi`d z8X1PjGbjOrz%s5F;tH`w;u!QK|DTt1=c@^cVjVcXK3~Mtd!?1TaC+3v@$U%UQ4=CD z-1b{Z);v$fQo287+NrY?p<3~2*?+LZFGAzugL@K+>*w$5I_-r@M28U+XuGbmHeEe3 z8Bab?d}T0FN;`Uj<*%1^EX9j8RVCNE!Fh<>AVjjUx~R`=(Tu@<1uX9>+H?dN$!+99Nz-%cOdC7tfRi}aL8bFIy!LO(Ki@IMhH908Mwq&{v+ zu}sri<$T+n8T2S)n@WYK>pstR`OfWHb%6{U#@@fiPd5ad>-WJYd(f_IeU#Hk0AVvE*H%B8sdBla`=qs-V4 z;2v_T#md}LB<=X6!*4+RhLP*s?({EO48J4kfGQak#niiv8v%D(4`m%Pe`4;5P;n8l z%;4LdJefSK#CR}tJ@hA{B+#^bM4#=&C*1auc%pksPUk$JJ(RJiMLC^vi#V?}l=p4r z0=JqMvkqzf#li_5MG5BqQMwZEYY)ZZ`LsJV7V3f?@!h0nAdCrTzU%jEQ3t2Adh15L z8j2AmI3vwj+3#u{`BbE}=OLewOWGRgQw*e>{8SHOIg2qrVvRQTpT z|BQ-DzVt(%YQcrB`z;N^O_@(fJ*u_pFj?1s>C~YAeDS(2T^dm^AvLHA+r2^=_huO= zi2UweBvjTOx_tiEIfasIkjCAdHTph$fr^sH*n?s>noyB@`O`t>xUcVH=~xzZ96In%WFOcXON_x=E9KvjIQ(#O8}xX8AjjN) zroPgxBpQEylgF2GlEIAHOFBO#>_{cpuI1NonerI3Rh`XS3WgqTl8qB5%){577(F#& zX}Yj%Z)>u~WtHmEYGf^jZyUxSPNH#(u;Rj}we*#qB0Y=1t;rCqCK-vuohz6t5{1sE9)qCf9rSdI zn2T9_v+fB*lNhUDs!L~abo_n|@@&}B#xKjnDs#|-JPfnIsWdZjbVbpB3+iVqpzZ$T zbiPF;-lCQ8^gTl}?|U6DBt@~Oa(yK3Cq@i&)8Teb;bS=F#S2Tush>DE@2Dp7BnWjx zzO<9KHM9yQAB>nK8s)JjREf`W@cWnIH5p!YEok-qS(+6r!P64dkfynI+W6&qj!6Hw z$4=x7yXqSyNWy`1$HWyrJ~~b~o}#*#gfL*jHS?Mvmc8-=UtLTuaY7BYU#=nWrU_58r66B-wk3@&A_;T$qo?6KzJig)zQ2hoOu zmkE?2wIws;K^^NDxz<+9HhB5f1Frb6UzeOUB^`NoI=8*-IG~fga+xm5m8$jFZjy1U z*35rTbcRPD+i5J>jh?WttI-KJ(jUw zD)}U#usEKXGuM3s$Ka)pVJ@Z7eal9>&~CsI={uvErs!|-L3o3+E|Fgbb*H5{osQ@$ zUut1)Xu-=Z6WnlO(C|x9QC)P+Rmz&!)v?eS@zu4;`efeC1>P`@$svu&;qZI1Gp{7e zxUkP8*HrmPw2A`8&b@QEUmYZAq8al&!hwwBpOIetWgJyBx87W=nh58E(tSXNs`-36o;yOT*`pbPdVa zI0z@gK8G->Q%an$kGsTFkk95+b2zG+Uz7ahiZ3hGbxMt(5M>u819MSS_!o5LXln|{ zCmkSn6v<83NM;PMXOF=L4GmTM*L}(&w(7rK;^<%@I9j_!MLED-p?4(mN5%9b!zQ)w zylAQ09n7dhXSxcLfvl)C)>d7#@cg;o=TLKBWB0m;Sg4pNhjbzXD7na)>bWUT2&JeP zC2SWrmXq=~1=dXWbv5RfjMjcCE0IFQe>KRx?VXl2ARgy{sXB`~v3a3Z3dQl&+{C!w zW_7si)GP`A2x1zb9H2eR@?uoHGwa63yJ(@asPPS-4on?YfP>7}ZqbCbOWzW2@LtOb z*`X=!P~7+8wm!!^-YxKN=f7A0m(xfyd&fIW$VSaLW@%0!Fr5frQR zd@=ueTkFB12Q{mEq}!C&m6tkGS0X&9v67;w`Hc(2HNR9l^(gO1`UQlEsxZkb_-U_? z)NLE(Tr>$(wo~CKUPiPeT%!0!;BfYinv{em1u}_i}r7Db*V9_20a=A*!AD zyvD69(3vqi8v+ifvL{27-A1=ag6z~fzNjv=(3@XqgC&QyL7X+Jx(43|9-ZK`^zD_eX9z7(R_BXf6{zihu!3wpY+Q8ON-TQK zOVKJ)64OFwie8XAl{(9F^C;{>m+S`VdZ5d-$9V0ODZg~i9%?LZuR(JuO|Ij7+HU=J zQgC?u#j6dVj*UkWnnLs6{}F%}lYxK~AM~307u;Vt#4qhAu@s`#Jo&MFozzJ?b>Gfg zznNg;o8OPw&L^%W4o82hroJ7yVw>d*ZVggdCrSybqx(5c7u40J5hH(WXqW0I)$Pw@ zVoQ!2&RmUGq+2FE61<{4qPC+$uN++bJAGbp(nh`;-ZwVfG38M<`)>2X_x80aD3Q8N z@oyik4SPAP*!1pQXeTzPs(Zm+7a>?rjxqQ~XKR`C`qtw}lbL1Ei7she;F8AC@Rnvq zxbC`PB_c~m+)jwU@#E*%^5Wpl%74Zssp*vhEeJV}VOufj$|lz)nY{CpiS699Z~-&F z=)RqXOP^!XWH+>p{WM#w-tb=VxdA(RGK359&x==%4LavkL19XmNiy{TOh_xYqj-aP zgG2&ehx$wBd(YIwER}v@86ZJm(fzyZ45&YpxH!(tN`sN_tR)ClA$eSeCgI^9%) za)o(~vO*Mgk33gAd_)Nw$S};RE!QXuPMoc0!WZDQLV0vPmQ}YLAYMXMQ#C>&P|^9! z5O6U^KCUNUXnn*{Or8HW^wHc0Q>aecm8|Ny(S49b3e~xqyoiMWaD^JRvD2K^mZPeb<&ZuhjJ>(^O7c%~tc$QGYSQVpRcQvrA z+Td)lRtP~>6=#Tp>TAk`m7g;scmInz@ZSyhw!A2#%k|Eu0#2yVf00KE80njWN6I0H zTE^Ka=vgq6^8`6-Z#<(V)zKC1oQ=XTU&L1KMX|fm)kEt2$G$Xw}Xn z7XcF%oM=v8V|D9GGNu1K(7nKzVoVL|tfwaa0a0^7FqGi}dk$h!cw(VaExBd0UZ-$1 zk37;Qu)uM%X7*P&}3^sFv-i2Vw!_3Vf&!sTi*WB1R$VIX0R`%BKGb)T}H zYSJ=4g#Z4UtyYh45!Z#hX}S-+Wl!)e@?`U-)Vx8}PfdeJ>1sKL-pENa+xg00P5iKY zbZhxidMXk~ydTxOgLHaT3HXEbJtSq(*A%X1CUGC;T%o_lMN}56lJcmo8!hzdQf%`* zhNura$>8p{N|9*RdXFOuIiV3BklNay^xbUlTQPb4DsIPwIyc;nM{&0AbUSg^&1Z+i zHPK6=Sw+_Sp);zA1%n4k0q>j`4+>@WZxk_MeZUX(Ce`Pb1EhPxl-Ql1lwJ?@r)Q(6 zBT2Yi628aM#T{R}>+`Mit(i{Voh4;UtqtFk9eOJP_JAEOtKQKRo;G&NU@02h)vi@v z-bk=JL%4~UF)Fw72TwsNe;RxCQn=CD6&3C0Q&K)3`whwwXQtyf1L|W@#Ygxj?MoTVuv&~l@DH_|=|e~bQtU)P zXTrMhY)fTuxa(HVe;(ijr0gp>>|qE`$z$-0tlbduIp(iCr5_J7Igrqd49|%f)~27P zKLr+=k?b>fk?zHr8$3sM-;KXaSf%X!A6^%@L4=esrQ88KHHcSlA3P9;J^{n`TzJ%D zsV%3XWQ#%WJxC;R-zDupQc(L3!!M+I#HaHZ#@ur^?b9X1KL>xc>-ac|jGteb;s>bj z`$Gc3A1c}Hqi$h2o8nn~f{3J4D{h|p-`xp%p%xY6?c)%r1;(#yd{q7o)ZzbbDA(4* zdv@|*wCd>sgit8hK74sMpkVCrzmdEjT~>h`ez~`c3PInk&&43P8G9XJU%<*D`~FSg zc@CHlS0o3ZX(>NegY~e0KDfsy>|;4H;7-Jz^Z#kkonBO;A`1M_Na}f?@&=Vp%NH6t z+ta8?ziO$dJ`NYfHV9pg%HQS-YhD!8jt?qaL9#Fks1Lu@={kS>+iT%wZ?3A#Jkj_l z9Xe;-WnN$EaNVdc03j&^N)K@7jV zTryVdRJGeE@}-)Pvj2)muC+)`exTFC=gM(r+?W50_F>Bfm;E{QKKnR47XxihbMYMs z5+k}JFmO2R;3ph|b3u$!+{pH@A9Stuh?778nxly2a{%h14wm^K0$3Og9x zR6@9cc<>F(944^{?8O0*qI&;@e$W7d@jxO|gwMfb>aMGn94o84t{DjaGbufC_}ud_ z-*GgW3CErvk4x}m6p!pexpSaCoME|{KgOLl97Hk0P&DC8^4}UbZ1jt^l>ksmoiMw! zZL9-8el&RWs3gIIDURx|t@i@O`Oq=|C5o@7To|}~D#aQR{36bY62}8*P9XIP#31?I z^M~NeQqoHY6=p_yZ#e0N&ZwAfBOVeQ+zO!|<`j_o!gc~~OKVvI3`$EXqLaWzJsh%z zA=92I3TG3Q0ML!)IE__pLH4?P7l-M~Q%H(Gq9V*Pp=t^G2Kgk>;DRN9-Lf0*|dvUoGU|5R%}O zO_2$@CR^xFM6?K~i>+RZ;7}wlH0@ps#gQ;I5{mp-Dp3OE=Q1=bw~QFzZ!ricCcsQy zHtF>zJy7TpO!;+XU4B$ceySg|$(B&?&Q}6v(c-8|**WuU($lX82J2VKPja zc+*Wkvm1U0{=2ESU*IMwM4Vg~i_aJ%+ko8*m?8V}=-!EvQJ>c0QJ;tV;5r>ScQ}_B zv(-B>CmQE#qfS*JL?bs&fGCcZK_2r3R<0>ZhMb`+Yoq)(nK^pB6euY6uvT9hnC`TJ zsl|K*-uTxkLY6^eL7Nv|xi#2oV0@2pZ1O9#e#=U9bBn&;UJ$W(bK6IC6BY(eCW+&d zi4RW;&*TwSAVC)Y1U@vS{3DcVeS)Fc12_#ugr)PZ9E^C54FK874%*J4n@a=TdAv*^ z2I05n=^1bG65^;pYWIE_P*tEbEh*!10Rc^aVXN4p%H8#B#twBQi`IORwT*~MKH_w-<^@q-16k11L?_O6~oY;1d`At&(Qtsepwhy@ZqeoIF> z5XC(=uX;bihRX9`hVifa@4Yl#f(Pmg2%krR>tSb&H(w058si1ewxhj|C zy&t-CCbtcR9!6{-~);%0pJ<>6e*&J_2{O{8oKTa3sZsLb0R?5w4R zr)NC$qO}NI=0Mn6q-_EZPjE5YqH38$sx_c# z^pJqIom;;}7%SF%pj`so$YV6UpY%l>W^GmTnpTAhlMy-5cQ@Sn2}8n&h2tZ`6`35d zuJRaUU_EFU=KLe3QF<~eFNNesgAcZV+Oli6TIw?Ol!9Tcye5cr-|nxr)XzUFi!5|G zXBW}f+k(m!jDh8&V1ibZ#Z#z$eF%4(E(~5L*%w@jWpw_qP46ZvcMAXU*2|7;(S;Jz zbb5|2TrM5~JDiSpk4CQj6$3YK*RnH;q}Y0VUfD-S{VC!jM({CrH$~k`{44@1w-tOg zoUv9bWwS4M4aky5^Lz+#ZU}g+68iI8s4q};TYU$lrc#Vb@>9ev|7g_PhMy-VIK0vG z8+?)KW`te#KSpLlFpI-C#~$bMk+ACORG+QRyFT-8&30Nu^6bHq_D>r63gY!t!^7nG zzM_YQ;+fFbJn}3*r~iOYhl|jD%m~(G3AD_fDiiPsukW1pQanj%{lk-iuW{~IrfDY& zY|1uU2<<00T*w{yqTDSA*Zg06G316k48yDeS!NKN>*oJqq*(J;N2U1x4Waq0mRJgg zvJsX#^E;q`Qy7^8yOD9v6A`_4P|ddEfpbc!AM`YKX#sm5okr&R^3vY_vlh_=hy+Yc z0pix_uf@*N^>IRw8cM2Lu=pwCXRi?{7+?EEK0PPgum!G&~*{|_lNr~dajtbiw9(ie56!Qj|3WtRzj;M z^__TV%p2ODH0S@IlK|BKp<2d2_;TX47zB^a!Eg=j{7*W_x$?o`c-_AXYx9Vi zxHK>dL4;RzCU=XBM|*s89S(7s+ll7;Bqz)TwWYt0!I!j*EAV4lh|kYm?$gQR-yw4R zJKBNQ5*?Gvk)>uI*;J38sPq3<3AmCW@iV!uFudbUwB?sk4EbcP31k=mF_}{&XE?2_ zM+b}Y5jk2YG#HQ1-c97}EkArEqKy_O#9|LOt(u}mABboI#RQta9YJ@+AZu_ny-|{+ zcbk>`Ph+CFV(gjt-0R&>C2BlxQ3ZrI)r0!j78qm(k>NsH$kGRP-$Skf4!*bxHyObP z4rnAZU-@2jD3%}HIY88BejQ~%Oy$0%-hxqZD)L%gf(@3`y(?-l>q-;#q zwJeliph0Lo-4BAx<)5-+U%`*s+CWkt-gtjn?%%<>!}f@03AjRTt!I!Qf@Oxh(;fWH z%0kEi9;hF3th#kT2IBaSR~>PMRz*$0;Zy=r>wya%3)UU->ybqdy5EieETyrS%=yw@ z2>Qpm?MbJiUW2Od0kG{q=QW5;RzwIUha1lk&)mwnU|UaU@Awo`%M0}{=*{gw3~+qp zBb6Rv>SA}qk|?v7^Tkf_WFXWp$Qdq;&+#LfDH5|943~q#4>lkYWCPk_P&%SxLF5k| ziUeaI=Wm9pbI6-ol*kVgPdI$MJGJX^;AHrTMA%6%UtPIbz-yAES5D@~O?PIfnSS8M zoGswTCju|(VEg+&5880V-`GGF_ya*21ss!98ku^Uav{>zRQklor$Y;q)1xN-i5k|0 zFIEy62H|v*5+4y4Jz}@@hX_Un*dPN4b|ONw*r+DK>~l?bV8S$_nafkjk^{umF#2lA z3rJ2{G{-?JwCP?}AI39U5A*AkbwY62M#93fCH{0V9P<4T(>Yu{TM+!d-4s-%Ir`ZK z#Iw4)xAlEJpJ8-;{ok=gC|4r$lBbaU=s56^f_T`Lq=N8eq39ICO#^C74T6>clz9r` zoGk_s;5U`tPPYhlAD0jc-}lQbWz{gU)%MfB#@S8F5IeA_`Nb zcl8EH`;qT1!FO3>tSkge*{7<}Y>TIKu3^|N6l@5~oHEsC$>l_lnJO_~C0B znPQ6_hR?#xy2_%1m7fm9bO3}6~b-{g}$n2+~?)36T|&VL58WP<`Xaw=O3^5Di0 z97rh3X@+%LD%nVxV#oZ$On^!Ket#qF!8^S+am~jO%0+b5LAcy2WDada@`8wJW5{p) zC;J9T?UNa{KvOR7KXKsh)5{?9t$*A3OB$~7y78JJtV|a22=8hChg{hs54X*LJ1-ZS zjQ-!z_V;DDKfhj+9qH)~6N7l{U%z2Y&68La0y^-iVQyT z#VVd?l#KzKNz>G4y!OG#3Lc=eL_hXYuJ;nLe(MeAo;=xERa4+)Q2H2U-Cf)h(XCZK z-28Z30LVQskld~0g=6;#D=bzV^>FEx1JL{pA};UJFWg|!MJP$yo3C*o*km!jh>smV zfD@fv<5gGi{chPaMUp5Sqr{^nBj#~3GGE9E`fpwponFT#(#a>vdeI`pu7(KMSGvV2 zH*St~sdTo6Rz#iYem(dt8we_kQoTn17xxkdB{j}ic9Q=|kP$;()!rvguW_tQpNZS< z=pLLZ4?j^}d{YlGxH%jW@7NuzMf$-6Hgd1B^Jt&1q*s$qrwE0fO{4`IKfWgermc$f zYunFu?E7D^=R8C}Txk))HQw-zZ%uW9{?PcDY~W>{Sw!n=qHbegHAG9-d)<*+Ls%V# z!Yesn5tGqCfk{OJKpKnifq)V^Z%pEBl?GTwr=e`M@~(#^;p(vNE^)eh2Z$#oS#@_*HP?}6l}pJKRw5S3 zF6RsME+0QCvd9KJmcW8yF$F9_iUMraReX`k$iaUbpp5dRVkXL3_jbE*ziY*{_|d#a&lKSl3U8hXXF}BFS4)ha+3Ch5Ln`0bL~zCvp-_ ze7LEOWc{CEB*Hg53-1MylCY8x(ngn-a7hsA5{i?vT1?L~jJ%aZvFz(6kH7Mc^Tm+9 zQ9czX_ELM@4BE^8U&HUYX84Ks_`}4)`5S1vfrW$R6=coAQqCjSk5eK<<#~Hfuor+9 z{{uAo-%Eb~VQju}EWb&8tQ`o1nK`MRNee&Gg5b(tVHY1AUxgC>p+{BxV>I$zr#$lD zHQg|TVonxxr3`D-anj?avN)paY-TUL^ zDaY4DkRnB$S3dy-ei00oBvIe; z@l}|RYlfKJDuK*XafYx*%1MuSsUfhnMP^NSQF`W{j= zYpeFSc)X_)!#~(%ywuY~zWeXxk}f0@R(NsdGP2MM2=1hO{>r&y+zAD8c%h>dkq1u- z=II{E=b__sOcc2$Wo6cd;~?b!=ShLi4(y=+SZM=x3|b`)H@b&FD`q1P35P4)hsHAW zEOthKSULPJzRz3{1D09wxH{vy{QM>OxA;Qjk<0zwtv>z_&;mkay&RY z8nln7UJ*6v;_uv~3L(Bk`ZWHb=GU`xSo;01^>1$XJaFDaV4H0cc`fIeUH8v0UpOr4 zw~RPGHbIH-+oU<{j$iXOSW>e= zsgVSWn}TF{Hk_Ykj+dGZ>6-~pL16cny8`l@O2daxzbS^8g7=)&931~8NKT2Qr29}j z8Yb}jces7{ihH;$l!&AbDS{pDZGPd8ma8><+Ktxi8aMrUPd#%HY6~Dd^8Ja}q%&bc zw(cNoHhY9TeNASd9vG92c8(ei3nWSvq0#bxwZ1{sV+jh9Yvu0qT@6U42I_I^%Q_~5)5&xTLi?Ska@#gy_Mj$y1h?=V zV5$h5J@utVHjmT6=R|jX^hh zm8t3!ogY$r`!{L~4k@Gt8!AiXZ%k<>OfPh0)idQZtSM4Y8ThL|f4rv8kNd#pPqNox z)@GGPy?>tBN9w%@q*=7=&$ExYJ+4a8!#47Q=+pRgIVj-{dl3v?alu9vmy$u=h8kvMCsm`qZFMY$%3~JilB^J`nnGp(L>{wXcX@XS29yX`S zyy}M{$`2HNn(QD|ZG`6tQ6Bfpwh$oceN)Cj!e`Bt#N#QZ4Vt}%aCqdp!9&ga4ak-6 z+UvS*zxnc}-wmw4`!6^q`#1nt$#pgB1*)T;hMa+|8f+Dx=~Gd8p?`p(eKqiX%~<>C&6xZ%Nhk{*ZzzgChsao({KVul^JbxmCSTV^RvUqu zOx}s)9NGnLAd{$2T~uS!B!RNmgUIX`D3f3fU7Suf4lffxXe^R|R z7Un`zhj`A7LD2GioOi;FH!JTNFr*EBm!Sh+2>~(CtG2T45eHDBqR;M0l3n|Oc<9_R z00Mj&o%9Ut^d?RaUR5n8Bo1UtfqI3@svPM8WYL##nl}fY&ac1+6pel&FLXwX`Xh3@ z^6B`ax;7HiJ-kHoqox2&xG8Ew? zdYNBD^@T@XXkneWz-k==*bL;E7$U&E%Ab5H^1Cfy&u+X~PJnD0Nuf@h{E=Q`!lS5y zUa$vVC?H36;7)Y_i!ff1L*6SN5HBQThg#;Qg|E?`_Yj;jn6bwcZrGhRBVA{^Aq-HI zH|rh<%kh6D{aSYt$D70_QA4L|TaqV^+5&tF-GdXX5RR&S+4PH6`Ku(G<|02}OL0Il zm=QUFqPw?!_aA=ii|FH0JL{F)51HC33Co&FKZP=dkwe(&cCqblIc75D%bf_}Z$Rot z>{}~Hkns+ZNSZujc_W#!o}vs86n>d(=RP7t)9tUnhbTIC8`U3(Q$EC)jt<2~PEnn`LchbwbK%u$_ zqW_^8U^m~tCnWr~09B`+3|TioOzpr}6fa>wC|4R+gZ1BVs@!}*%=~COAfv-wIvSW0 z)=vjOa2WX{-r5CzAWoC(fki|J!UsF+hIM}lIZ6~|P*zuhmf`KYmTmm`lWPs0Y&Gte zgCd1>uW}i|UbN_xf}Af3$X+D%oAWr3n<2C2)h58|Yt)ld4HMDGRrhz*{HpTz9Wz5v z>)wMuX0onr5Jnu-wJ2?lzv7-Qy03Ne!R>e4bfs~*a&TbgkvddBF;ihj@U)tTbSezf zYc-$q2|+<*6FR&4QaNBBcV5u^3~Br#+Qf+Jxujk@71HOGI(~TR@x1|?2PM~P)d$t} z;cXSlw;}4wy-&R6d2FTo{qX;IA$X?97x`q?4sjw2mkcw5{Rqp^{;b8S$ zg4v+{Ys$ASEee$P>u1d`_5)gO;iJKin^I6O=^ZW>8hn$>QC6AV2agV*5iQ=_{Nr)= zII^@vl$HfQaxjG{guIr`vAipl)uT&r>J494*wmvH2s77ldZQvuD1kztoxadKjWi<4 zNo62&k2?3;SvQo?Fw%f#xHcPGmv#C15^%aPsBq{a{8nWmUww~s4XtkQ$ixY`gfJRl zECuof(_a`wPxYVOmU6>Z=hv^4!s|fZw}(L7GBT^#{CYzFQ>a9&P?8)oKAjII;KG*3 z5XoAh;aW-i$w3TrH_h+w-*n$pI8WdA)bi@~PUf0>@@i|6X7jCu8wa%uYqQoeg| zj~XBt{}C_Vah^9+T;%mb<`I;6Y80BXcp#20tQrn}Pm^WG35_-`icF!Ut#KIUv7$1r0oe=IoY6$JcT#hmN%fsS zaFCs=pvG75&QMyB6gbupGfUUMLfXwu##(omGy=}C?BGuKnR=#>iVN0JjEa%D;ux)& zCw-|3!WF4MdiY;wuzsnSvmxKk9R*mG@c&>{kXZBd-|{@{7VCrMk7y9z=yAW5~fj$3EGM9embnY0|&DA4IE&f0CGG%?Yn!K?HsJQ){|8 zkIx*k`hDMDxkyWP^|4i?6mzqeD%%aY-$N!mb(F@J5pN3j4diU>@NcA>nnkLdK*3{~ z*9&(jgc`4J<1<$jamsIap+LV~ory-4xoi97NwkI_ zR2ox@UA42DHp}4QGbRfb7x=HYl2Dlfub`>+3}QJU#Dq>CfH`Gx?_AU!{Tz75^Tj}4PglV83R0fUS4O~M zyJHp&{Yq1AW{=?&6qhj?&!Hi7iah)CWxTvrVBEs2-lr25=ak4b4CoC)p2wc6Z9;i) zoU_xTk8Bh`HEis;TsIQDuBy{!4dLAl)lgC=28V752g0@x9x@>aiY@8ob)NBD-!(CU zro;36pEay~tpGf5dY2Ds98_UrzFb*$QuG(^M&c2cf>0K7EHg>VH*F%eAdfc^>lsSl4D#1;2kG>cs zU@@dhxQe#^MO@*NNQpy}IpPd?-#v}EL;aIjJ#ReYd3_oB=4&DNeEKX~a9A7BTbg@~ zpyOE9$#QURE{dILxG12Bd1gp>F$-E`ho#lo^t#p~RIA%Q>YG3&MVQpB%f3Y>$OosO z7k-IRA>QGQsi}$C%CD?3`~x-=M`MNiy>V`Ko67d3pjmnj;K|{k;0)EIzfla4Y;vgW zN6ZAO80~F=8rxQ-lH{{*3_<(I(n@wxyaHdVr_1*{}^ki6dL7FZB8!BULw*)XZ*{{zB>Yb-;N?-7nz(%&Z8DV!Uf2&)?}&DFov zM6^4Ja1ps*8%onhNI>lCJFaqaGO!en^LqIhKMGy+=i{3yNxNJhHvHQ2_?P3F)D;;K zK#mSMq$U4HW`M??IfYPAP9DfLKA}A^JxoTqzc?aCJHY)6aWNS#Wpfw@^=6v#u50|w zdwkehsa)J|N$J_zB=(Z*hvj7LuZrjTv}c(D$5MyBP;Wk1MYEaBf|PT4}pNOHW|egw7R5p&ydOxE9F<_-;mIkec`j^vkKf+ zrF!u?iSldMlNVpZT<9lTygIM$Nk8`MK0-S+1Y#y?#G&k>)iKIY{P0?QL`GQiT zgfbL^@5+*BT^;D*9-m$j3RvPl8SKB2noC$fFqOJQNX^<~G2I{KT*zQ<6eknEB!FYt z)u2e8_OW<(Y{-t)@B?xL$-WQly zh6^qsEdFzSChOdxM$$BOFNm0*VZ=0Y^QLtdjSy5*2_%ZRrG zA;3wMbcW`is~_SBr1IIkEX7Xviv&%o_Y7+qyR|RRTibrVsW+xe znRZU%Q-2|S|Kd?pp(}|@vJE!nUP1%$5OIL2sz+1u#IEt@J%Q34`F~bSCF7JEo z{q+?2P2mId8AQLs`~`un^H1AW^`i$GT$;S7hcx7gs99}XX9<_nZvfzyncrXRfy@5p z;_QR#>kW)8mwFJC-ZH$4G$oS#<)u^zCmvDPQTg zNa1j*ips_MZ^tY!Hqf3y4kFei;iT(W8O-PdqD>0I7$;hbBhrF5JWWO#1vZVZid|3g z{|85OiGb1Izk%f529uw`>S7C2V&wx{>Cn&h{49u%xHb01&Y0eNY|azXt0yZYVHtg4 zL3r#|np1qeot3*M5ZXc#-@dqghWzuOBL;4ge|LSifAb9(AG-K8W#e`T^)NxCfmiOI z)C)FgyX%_yL6(mz4vONVI7@kD=?`n}8NBouJ9%}69|=$C85!%lui*(MS>2WEOMd;^ zG{#f;KGMIjy`g95S@fOyQt#5>{&~Oc^^KlAg+=3tjGH1goWVDB6hbNLRLjgF-(O9b z4(>E}y2u|Y@pZ*nAv12m?dV`vVSifFctPJV*TnNa*}co2pYFEGoE|EgB-A1@&TcZ5 z<$oPlCjT|g_SVp~3)COiW&)<^4eS=LzSr;N+I-M$OKY_nareNIeX;1x=ycx0j>2}+ zgOajpqtCfi3e0BP<$XmuQeI0}(pUx?X)G>bTA|R9KD*O9Q;VRo-LN`%Ai$b%5v?n- zO9jqI9uePx$JkdqW=w+l2%PN281n-PJ?~DF4-Q(QbBBLlem9{L8onCIBZ(HT|1MG0 zw^^4MzhG$3^th?Mm2Rn9%G7Mto8G|sv^PLc^3dxWan*c_VcvYbpS}wg7A5Xmt+bVy zwAw7|JuY-Lp)4|CC**bJK=EmFkoLy267r%6FJ2$d3LzJn5%-&*9D1mV@>4qU{JCST zWi;*g_j%f|$oEM1(4XZ2a@%l{KkQh*bfB;_53ja9Jz-@;XWLTrIByGE@~bY^atUM3 z(K2~iXTRf{St^@>ejn#&k1@OkzTMV!^dcB}vS=&VHSkp)_sX_r8IyIL>s&=vsRX7U zjXK;h%d?+E#npW24b?rR_S_tc-EZ=2agN$&Nh&mupJMIk-v?A?`HSx3nVvnvD45Aa zJIe4)jyY0kDiC{}r*trRE7H3YbM!N$q+(ih-)BCNvR))r1w~+{zBoh3pL*)qSqb$H zk1(I@B+4nDsiNZF?#Akj;w7nL-Okr&=<*BnfcXWU&x8LZcgk?kNB+BRE{;^eRl34OTfg{0shMcNm%si|)r@pQD8i2)2Wgg=_8^KdlFp?7D+q z$Kwx9rP)me99EH(ikZRVJo@7S8j(krMQrzpgR~n7H99d@Sf(PMTj23$c7B&s6&b4R z_?#O!obp|jw8AImKlMW-!xG=_YHGd;CJyb8i_ff#?z^MTsX&lhUt<`|HOd0w)6MZ7fl}w zjTi2|ko|7<*xQoMl25zs#~(Mc;Ofr|I84Ff4a_nv7G}?i$C^X_|wNl*IFE-7_~jUy`WRIOh3u)oI9J6UNxN&i4@6v z!K0|z_i*v4zDwi4Fq+-t=a{B(yTtGrDP{Hmc@-}WIVGj9l$4WwTP0iW zyj_)m_*>Y`i(0obo86KB7ceqQ#<2L|j|ohmF%Q~e@~nKr1}{pL2PLCzJ_HlEy`?*L ztAIFlGsmU~`%runf3n|sn?|2Q4a;l&5Ide^@b_?Nw~j+zD5p#EDatD+KGpxnW=niMJJ#4tH2(E3j=LEqjHlFGAPs9rp3|4tFPrRmT6Od?#;w zQE;JgBayO-J5Pf7Ol!mFdfLAO1&KRfU*XTKbQ|P^n^-F0rfP^X3QsJz4jxYW;SHP> zGAucX*0HBzW^K-{zrb&sD|*UzwB-Cs^o2}1mL9P zkyl)AJUa2Yi*HM+uT(=KpE$%EO`j-hTEGF-XnGE@a=n5n*hh ztVOa+vLuShzVAzxj4fdbgJg?rF(}!asj-WQ44IIfEMptqXZ)_;d%eH!_x-m_gLyDJ2$B3WbNI;)w@A_-c$xa4#Ux{1SL_m^QwHd zthDk`qX4cLX5`%-AR;e2lw?HUM-vZC<2G=e^Tp#k0ukaVY4!sb5~I*uRR_IvGdz*V zEIPTg5_VR}`ER{+a7JqU*=}uBrkR&}r7g3X;GcsEQGJiH{rXw^Q&YPXVj?sLvJ)#R zNB!F`_cCk_`*ChtSPwp^mh`p_i%C#U3So%^UoikIR!&jLi&K2TRKz0GRdaHtk>><>+k8jSd%j4>Kg&8NA;O_+Y@Pi3XM?)(M zT#;_wO0=9vIb9*Tpx3coe05~;fkx<@Lf-ceCCx^VB?rw*M4(oNt2?7ExX*TxLzUGP z%kF-9f)>4*Grx6OHrk!L(!XlH-8VN)bgmQL3>gKhxNQ&Iv0|FD<*M9p*8)*p<5AV> z;;P39f&Ia@!JP1J zI6nlE$Lwq6;O1?4ngbj07EY@{{H(}hqcZ2M`GZX`T5fHB*4{0fiB_iU5hFKde|<}q zKFd4Oa;I47V;L@Ytyf#_7D(IM9x<%_S+>fpp%(G*ZiIEeL7Sz62vz5gf$$Zc%_of> z8#ABgG=8l6N9Bw1B2_ZobjbuE!=8sHYT8PWqGi;2w}el$jE-?0IVZa!DE>Q<(@awqn$sud51C;$i9D!SwU*=Tjua>| zre~31a_F1_v~PE8Z|!4xLC3so<$gNdDcNesUA$dgiVAG+i;3TRT{6^$o)&+xO)^Q2 zBs<-) z0uuByl<`oq1uqf!Mdw^7nn{E>em4mBN&Nj$;&nT^DTR?^;b;WOfV^wl4+r0gQcCqS zj?YOEa^ffgoyM@JZNBJtlgTB0Tno(tlH-;*s{*d2;Ou|NC`-;q)$}j@DrUa~3hBBs zifqc4CUDMCyYuiG&rw&QXW^zAZrDZ7Kz z<-ahg@kUyu`+$f0gTT-&r*cg4k_NYQy{(~pazzsa@s3Q@l~lkYTm_S7fc2Kmge`Mz z-RpJVIh6NL3M5zO#yh7gJ6z?u*`Z3xRZIe^SVnl}`Jtj|*$}_(+9&r_@sW~9AdYAq z0x+C?T#*-iT<~lZhZkrH)J|uOs)9z!i{-US#+TGH29+}sx6LK?-<2mH)r3u(Y!`-z zclzw(@3hE|r0kU;)5MOU1D3O0zTotxmZWbtF=mauy`n6r}1Tk_NFmbXos<&3n z&>TB-<@A5^SFS{VBXju|wfZ8*HzU(!@Z}QuKgHHO=o^+V#SipI1(42f21S2KvfJ$bKNOZM%xlQW?JMrs+P_DDOR;8`VFKZs@b!kG zu9^qdnqnpF-z!l}!pTN`cT_VL0Ei;*Mm37^riF|q*||Fj!fJU2=BO>r3Gw0{^OHOswcRnIQ0C*K$MbP#7vK05HI_pDeQm%f~VyS6#-ZT!(l zxLLBiIu^3--#FvEbtrD{1e~#td%1lR-=jv64UeJ&TZx{jtf(K!r!=Fzra&|2`+M651i#ijGk3;bZe^>wtkG2VQCoRF@d&9938%%JeWZ;sDdFee>d z^{)5iws+v^&0zw31`gUq7@GTE3tqs3E|NHozX`(z=V3y}cCx**G5$XY%ktKhE}fp_ zLL@P{#xD3%{4i`$ys&k`u{^Se4I>JIfAI~TL~nXdjS#k*nBV#hS~Q$j^0eXJYC;yZ zO#1fiJ%$&m_4QZ%q?S>Pt~etT(9SeI8*T+d*Ux;n0^>Iw`fWq!b*=CtFsgmpvy#Mj zd5Jqc6MxB!?gbvyrA_SB@@)oVomcKOV3?#Y**-yCAU350)ib%av(jPD3fXOiK7Ue# zIu+%aIWS~nCTn&i%`;x!dhcL-bmvCM;ut%ohgoFeH`8OjgE3st0rIW#ShDs6{`ZuG zr)yO+cTjhR)YYMfL92Sos@dk1`l#l9mY5cZyYilm*hA!a}mJ;l05RM#aVp;Wuej(ocL zYY>{4(~(P%F7=78(%nyx?De+-e;BD&hveMHR;Tc^TIa6#NxR=~oPhO1BvN9MZ@0T} zk)I1|^e{PncHplU>oifM#p%Zuy(}+vzcrZ)8Y=#9J5^w zdzXvs`K~AYK*aa5Wv6l6Z_KNgZbdW`0=$A5g6pRF`Ukn#WIExfB0L`=9e;0Qe6$7A z&5siAJOocUP43!ch7X4)Hr=$f;bG7P8y|~U{4o1&faQiO6$=@1X9TQLLuPvy1Dd*Zsw)0yRBU)6;& z`^Ku#M-9t`LHCYAhvNJmF%V4^PscvT^s!3P^@VIC-ur>glw^&`wD(yo$uE@WTjy%m zgfg2(u~9wvimh=ri~(z;XGP~{EuOt^YN|a$U1&H#C)#zlT1%53w>PLqUsoe!nO`Bq z{Jtnau2SvoTEOQAQE_e}F)t8(bcT#R@>s}JeTKc0G-Pu(*ccnrmQy+FSB9EqR_*Ed}Thgi)<=Qvj0U$_AJJ#!xVL+(w4<$@(B>UF|U>2HWEt8b> zE%#RaTPJ`+p|3Ho>n^D{cL(1-rcWGs z_)tnhT1#)(*)56hn6O>7lh@Gxc~{uMgQK-f=Q-cedR&P^hGjTNk~6;63@OhY`k+Gg za67%Nt+bBa*adLI{vN*Mr)%|S8HbIw!WRBjzKUF z72&mB-Q6z<(^ZJxocWX(3_JMVAN1jA*d4b-!i~20iv0ZC?;5TC3+vqC#(Gtm-LZnc z7lk}*fP(%DHgeCo-Z_-kQMKeS-Lk|J9HfCG4{MP-N3$~F3pIX2JvjKZ?=_Fqo%H;+ z51pv#ZH-jN=FR5;#0yhn)X1Zr`3}HNXP=7`pLTJ-Jd@Uy{dBa$v*$^BFG{rXd#w&C z{f`BfTP!dz@ONJ40SY#!bE1pk3M}oxJ_@#Fu`$H!(7W^90;Ud^qn@<>M=)iWJRAAZ$;+xl?s6Pd86o0t6S+V zZc(tbXzC!5>=5a~USL0W>|*2W@okYa(CZ#qNQJl^yrFu^dx`e#3=lw{zI z1K+vurQH=#yyhd zT-mC;%LijCD{+h#!(E?#6w_QqM6uIFe>>yI8+6Xk3vkWKM&1Rq31}waEAYqJO?Ju1 z9$u#|Qo!IU8=Rek>Qe6$c6*iIP6=#09`!@Ze&=$H`k!pTZy;adH%oAVfzrO&$Y}Oo zFAqR2uB*TEnEBP2UP+WEOc%^)CX|pHuIJhQd6wmKYQz<4fL3Dpf|o-ZQv6)wrvN&c zE|?c|5<;Bi7<(<--X`Y1umEI~EN*$rOMUhc_(a&|atJfO+MJTbx3+i)L=bZ1U!t&u zc=C`y<{ixo%}8u<-H%>tRX6fBj_KX+b5y}QMrFzuu*5S27P`!Zbx zYYFu(Pz?{Nn07;6;KS~C=Nyok8Z1Hk;kUp37K(N9r&>E z_WlPK7M2!tfuxAeIfk1<1~&CZd9ee6i6(SiA01lO?i^o9DX0}RWEKnrAj~Ubfc7zd z8nSqX0%w{T+HrJr$#OxoW<@3!L}2TYtabLCATC&O%2Um~upb9E*BI1=Fgxj_lz{T5 z7H8jPTEE!p%r2P0g*8>1)SDc61$6|H@msYjRvSz~vvpB}Mw}HJUcpcroKr>5gx7%c ziVw0y&MxPxo?Bd7^*x)(`OB+N-LbGyzF~A{$QrcN%!X%i_#1iAhaAwPH`N^b6EhVW z_pn>Bs@?J+NkZ%qBX#}lSGbJ248YLQig-tsKIk!U{E|N_tQjMJ9;D^iJbg*n6;bj~ z2HNis=JL+9D3?6V02|Y?MX){6bgDal{PhpBZg5i8T@=&!(zW{o4-9dxeLe)4(Yq__ z%&ad--3NMmY$BQW|EvMSfuLZ$h?z1OilXp~XcLKHgf%A6!Cy&m=v|PdWt7n!Jj9W1 zrtRMB1wpYO?WdMMLm_r8q+5A>t0xM0T$>=V)7i$g{h=?kSs3C5fVMat)W94ig8sz3 zy4-z$C5vuU_Fm~>4dtoY+uL&{Ax9g(9eDB)z-C(u(ct0+e7u1D#gmtZC9T8@{F3*{Xu%7P)Ao-jF&pCmYoe?(A^S!78xnh zz#~}AXs?w+FHvCuXFQ7>*cguypr4oZh`Xs6&Np>td9kuBEB)i2jK}XjJ#o5vs3GbW zAKo5NcTC`YtV-BKlMFM|>qx#f89i*&al&@e;3l_+wYg3`G;7^*0b}x6qe&a;c{H8* zJX1-*!xC5=1x%_u#_reh%u&lE0x6Df8=&#;1Z^4DE>6@!YE@zK=jlf#LzEN&h+*!X zLAmyH1zWtV=Qp&=J_jWtzQ7N#8|4~!4YdXVA<3SSXYxIdf(Ti$qkpFG4?=)shRSP< zk&>hJou}XVO)T9NyEb9aL2xIRZe{SG9#1a**{)L~p+l*$!$4a3sEwF$HyljVY6cOr zO>0zp=xWZ3r{jHqy*HMihs&F_V4>J~)47gk^!uwvI19LoqCD{L%}T8Je&a=z}7SRF>o}(Pom+LD!jX@Q29br#zql&{E8q!h6&^K zncq?8XsZOa5^xqi`cZt+PVewu!~14Q(PNfKWh&y+E1u(X)b{OgOVOc+<(8bPNZQys zgfK@U4ULfW<~$#gI+QTT{N4BXy6Nl5s&{OLSaKq<=qzi zmidz4Td{bb_S9}R>SHF_9y;%P;9lVD)+@6fyrlrR0wwW+sU-L;xC7Xu?=udTq+1+G z-lz!N-2H$kw~0w8XY2N-SYb{6jxJAAqZcQ95NRu(x6ECxTJU+M3SuD&1B{fHSIHrX+0A#t<49lyFf=K;h++22seRwDK9h>z}iYJf!>%o70a{?o3k@`yqMDgbS z*4i*29n{#lh2ax%rF8s%wrVYa)6>s#{y~yLctHb{xJjpKdJxsN!<-63V+s&-wi4_1$?ewGuElp HvWxmJV_bSH literal 0 HcmV?d00001 diff --git a/torch/ao/sparsity/experimental/pruner/images/prune_3.png b/torch/ao/sparsity/experimental/pruner/images/prune_3.png new file mode 100644 index 0000000000000000000000000000000000000000..1af2c3cb4ed0825c6f2258b575e21fc81eaaf83b GIT binary patch literal 87843 zcmeFYbzGF~)&@#9GJ+B!E!`m9CDI}SqcqapAtfLU(j^jtlptLqozf``-CfcR=N{kv z?Qd`QyT9}A`Qz}LdFElBx@)a$gwQR1 z>+0@n0>S8E8wrwt7~Ucs!mne`d=?6X%;&ZOw0d9KWFT8z z`fnk{&lg$l&*rz0Fipd3V~GcR&$tkb9z@djAc-H6kCtPdy^NAQD>y`b?7<>=skpi;!m}?=?FS$)UAMixN3dg?Y5g*;FuLD2s1vyd$7NDjw^{>%MS|RL9eiCMpur7be%c#49 z{k1R79%ga*(NLH zNfgneytT!yb36MlXZwog-rAW=%5@wmd|b-+@fvWXk?&xjvShqvMOH+b5T71?IgWXo z|D**G?}s=c1c|=VME{BWZ7_;1fjHF}0X0<=UO)B*#eF}dyj!k3b)8gklUp;1+wh0K z?kx&KP#qS0WNulyGOCS!p;)xt{_6LSz+37IP!hP=J4T+geZV^^UBDBH7|HS#QNfUy zGmcmehkkPDi5ul^q-?7thm}nY-mXwoC!a@)nk4hW-37r<5q2J#ZbjQc46?qwKPZZd zWI9eOo2)5w>B;@j#a(`cbKBF*zw;0mvl7O5bMo%hdqEpxb*Px#Evx_DZs6qmMM()p zDTd~I-ziKj1kK*uDPyp__VRjBYJxMC!p(n><$^}8lpRBXrAOcFJ zZ_rutW}DhYIs{#FJoe@%&pc~`SlYuMG&ezFX_L8SF%F3qiFXnQzLa_{jC!tgEGfn) z$A*+kpR@3K3Sqp|#+-8}?&)jqyN|H){mJp!o1Sf-Aj;muIj6&Ua=fUc<6|E05g9L} z`3!;S8|yizD$V)VNTRs|Y-`L$s7z!AN$lOnQu z38o~30v5k5ZM2-XB>Hz4(6aSOptxhcO8U$&%#*j*8FwE0EzLi!L16RWnrG7%*_o$Z zN7|EM_oICD?mY)Ystu>~$-}#RZRnWNEm3BV$mL{)qgkZ=7@|mi2!Fo!(zj5mic0^CJgExPQB$Ih2mB)x>}nN}>_+Qy=%2s9yVIiSjm$ zavKRhwyPgAKfX%KIj~~F24cOSy*A7J*faGjs@VV^+Cj_?6j?tCUj<^RAJmFe_XxT% zrZ9sra_4m|=iOhPyV)`j02gLWiDYsXa-RIu5+RTft)`sr6YW zdp!zk=AU(|-Pc$O>zJX-p}R}>JhqaKUQtjfIj1>iBBwwx@`FnfXD?D;n@K_M`{a84 zjdEC_wuZpgqk^7knDDT+K|_08d);`;7v+{*)#OPU*$mqcn=+m386JY>`(^skas#T6indUfF9BQu# zu4%S@4a~*W6TRyFRm_&g_Kj^yd$)G8_PtZ;*5v#}M_+Tqp2mVpU+OOTuIN<#>~&>O zR?y(`dy}G)3BV8U{vRRGUjoGbw+l%${wsV#ffh~n!xcjL)Q|p&Q z13m0>wvBUoH_A5xH{Y+)BTLAr9+q0SEg^&<=r$$1trFMJe>1<@R7H76sr$@$FiXEj zU*gxvys4Fk3$F*?rFyVJFw63@AL#EXOzCnN?_oz^uwq(cj^HX_X%U@JPE!jr*9q1! zw{RQ2kyd*|#l_^{zN}P)nKv_>x8t-d~ zUX#&;v zL!B@1lG$C`LH!^oZZPgXqxI{+$Yg<4$uFv3_$x&t;u1bqiG202g28^ip6Yae&rI(6 zx^Cwze`!F8<3p6t!Vl@MA$1|>!vnN$IxfQ5M1vXwZ=XY0ptS1{ zGz?9QbYnE5q92}V_oFJg3Z(8mVj^%36G61EY>O8eD24~Y%k-izCVnPOdMa^YL2`J!xCy&2@OK^MZJ$I zq85|;FzE}oI@j^y;~?{}91&(5_A6sS(NwW|_rUXT0dDesqk(BI?BsaP0{5zw?ep@f zFzQg3{!jf~$%mKPiQkx48&VRviVYgNt}@RfeL~OjYoX;(4NphkTR$c2{eZ;)*05=! zuR2GS($5?kMzbplj}8rQ77xX~-)P@B&c*c7vMSUk*l0h|rhDn~+I{dKmDx$-UXfd! z!)W)UdZ9XP!R*(xnhWh@*@+yUQJzp9>y@FEW6Q5&Y+9G@sb(|Bp;4jrTM^^zuWk+e z^;_d3nn^8rldYI$j?By54!^SA)v{o=X6mWJuO_tes%3SbO4jrA&Y!fYsCT+dB#R-l z7Fu#mD?BLrGHG0AY0mTNYu+wQ#O8EGv7V{tP?(p zUjok@2QTI?q{5|WE1&LEOcn{M>S%dccx^322c?i+k>0spy_$OeP{WyUn@HN(dq;L&%;2Ycsa_pI zo^R>_ciIR_lPJt8l#&_SQy(22*YhC09Yts&da0+k z7ue9iKBZq$jcE3vP|%+#0Am0rMJj&%qOOkf`a0R`~^0xH-;1aDD9(tqwhL1aKc{{1~jRD#SAQ2ud_GWdl5 zMS(Z`n!i4gV*(M-!GHMR?UsS`KWC#@Wg!30KH^(&3_(IoN=^=Zs=anFHnwpzvvtxC zt`!AeVA?%>>4<$~}Md3*3L)`~}nrs-!6R+SZ!Qz{u9nn9a@F4t^g5VK+gr zYi;agK}W#cI5CPMxD3_-9D|C*he^7koDmLk+prDv2v2q z)y9$LubceOeWZ;YUptuFIhotqP{QwPU})>?BtlIMf6zbw{%WVOoB97f$;R;?-2xqC zhaX|*WaD7}=e@zD!tieepP9QETfLMvw+3Sdo*~N3|5*6<`Tud`f1mg-S3>{yN={C0 zu7ADsUk?3wsk)=FgOsf`c&3x+|IXJxF8~hJS*0Rgj{_4m|>bID(wC#0xjX?Ns!dM>0Qe z@C4RqLuxW8@nsxzGmYffN`l!`xmZ*W@M+6q47#X)QN8q2&=!}-e23b_;zt?dPnkE@ zc)R9$wK*vyB(l-poBHka>sqpqqiJnHfseY5&hd#rE?t*!atucB+rN8>cS4e+{ggUK z1F)!(s44&EB_Zw?@xbrzu6>K)M2v{ZOQu1^^#1RUKP^!ug&^Mjo0sn$8p`IJRwn88 zyMKEe+}8s6Z`(sXXn^tv<^&DMy-wgyJxNjEk-wizI9oZ9cKcyh` z;82>snHX>!-)a=?Z#ycT(Tef3WTOO2)&B2~gGrtI+XeCePwKx+@c(BfHA4#Hs`2*p zcAMh%EXcZMA@F2S3U8wE=D7HS0`JD>*vV(ZQ~lcWxt&ULUnQAxo%7m>mL=afZx}|Oq_yVh;lkvN7e7AW z!{n_;$(xb+!w558qR6ABJZk9`z1-%VFp5*Lan`SaqR_f{!>k zt-xMyl$-YEi^%D;lE9DZ%Ul)_wGjq=(qA>v(;qu@lIKZOc|m^>w0#X;!lkmej^~=QjtLl zu{BJYC781qQ@zh^yB`^M47&Nf{$jJ>WFt3y%&Ed-_gg1h!kUROyTfKiu>B%iZq=`@Y$I z?rW*0t~1VwO5A;qT^2*=Cx0!pT6d6`Za+fN(Ih1H`PGcx>~?jw@3Ig;H2Jl9_JKkf zh4FBJ-p7rUBYdUNF(eT}SiTu3seN^Jl)*?kOew@s>>U{?`zw>?W+6Ki<*Ya!Sg>g6~ zCU2iP-%p2VeYJ=vZ=yd9koT@xfWB0kis<=g>#7Nplbg%EDPVWo!8Gqn&R0{4l}vx$ z;vE8aBBZf!adX;e%aY!(5J)-+{0g~m5PhoCus1qz#~xo^fDQi^!ykq~{VZO*9F^3t zphR*siSsoz@h))UHbD;^f+L7IskO7@vEvfbmG`n6w{8zmFgC*(s`>@D-dT>`5!%)_ zP4{y7y zxj3^kOyP1Ra+MR4u9&=ER|=;8BOaI_DQ@5HNBX=|=##<|F)HulO(FY-!+diZDyYEbekVmWiRs574K3?R=!97_DpB}rDmtfv zjAz`x=&XWa$H3Q2!b5=Tw0#gRxiEi%;vBFh>up%+NrB#VH@W+AG^C~Bw0=*-D& zQ)CfJ5IT6m@(5Oe{_X$reKk@M6k@X)I?g|X^~z`8Y)wRZA7w#oA^s&5FCDsQcOreR zvdBEPOYa7uGT${U)n}wuNco~pIL7UBdpWnGHFuHV1qL{fLF%ok^aGUguT_)X2|6w> zJ$5R`4OjAr^gKB=VlJBTyded0>SB^>)>ObwoT>k-u>dDos&3PG?J{MOVvSQx)|T8( zsMV{X=jGgrgwpds`;*<2`_Yi>PN&#SX&=wHw#%rhY(Tq`k9JJ01?^}l;uudbmx}Pd zG)KVF)!~3G^VfsgWS2Y&7GD$rr_IzNul zwE}L4=vPI-f2^ih_=44Tz9Vup7+mb9U-E-G3}z5SW_J<179*Z>BGyp7E2X}NgmXIptom*%8ehdmPD+t%_$M2&+bc(!z`>WMsrSc} zfX&2EtvjXA#-gMw@(%UtJ%T-s%{jpT(=v4-W;y!)ZsNiZ4M)OB`TEgt9~gl+ZrYZg zY!FD&Qbwou!f@uJ`)V?;#b{nhivVypRx9zE{kE9{v?zJS>X?8T)8wE_gD@-H-Tsym0}q-B|9#rh1R- z*V}^4u3hzz!s{qHf?Pz9Mcm|rIh_#j_7VKd^6e-kuPy*EvdYg+C<8!iyy1Gsb_zre z%gcV(2JUFeKz%92Pg_Ob`vdVP$o=|%MAPihi=Brki5!o%qi-~UA*slH4mLsM@BbsW zQ^$O)P$Kb=vZ~wbbjFm_w$2^{sV!Gvzm1?(DOX!!*5~%x?x(={a%^kmT8si)2r-Mf z)`4^erCxWkN}It#d(6+eM1}1EZ*!j;`u()I#*}IQ`TZtjJZY8((*qb|;YkwP>B}|= zP`lk4l~RjQV6lX3<~6ycC~o*x_^%IE;)$b)iYu1iT!^jcT=aWCj1-21=!f3N`>4>P zUTe#G3!-Ph^S$b5{SlAAZizqr_y@_iaxUBn$I6Qc`_JvHsR{c42qj4%liB5KD`{R_ z>Situm%9PxPc9KGQ2OZ6{7q}`bLeiqh41 zP2JM1;zrTg(#>S=iw*1c`qx){XBT&ZRG`_t)t(LkY z@Yzc7!6u5Xl?QNlFaQlm_5l@hYiHI!+2J$dWJ`^QH8?N9a`Aq4keF}Ke9CO5zNPGB~(qU zlHG84kIhmdUL!;me(z-oCH^*mR42b$@q&`w2u8(XjI38_R9i>O65WsCEtM$Q8 zyge-C@80@j_&)U~{mIv`9|55}NO%Z+LwQ1plnP9vYW0KVlfpz`67&h*JJh!#_+~i_ zYgx*BxkOr2MiBWprKIoHP`;P}>xSx!w%bBL6^LXB#yH5t_)CHD-yvV1lBqS4(B)r3 z1+vY=%nuI?FkRZJ*3vxs`}vo`p&wF-rNY=7+#=f*1!+%Y#VP1wbJh&+DiAN&kk!5J!jnl5<_vAfAY#lH#g<*X3E3_ANkh zRfEVmzMpG?7pW*08icqZass@;UK)iR`j0AF^O=82JwOx(W6 z<>9WO(rQn*?9G7arPXSYl=SFH$45h+Mjf$yQV`K9rsxr z?U^@Ez(j>P(z0*Pv={bjMs#}`7pLuYVW+_XCwHV&oY(c19$p9AuHv)w&61rOiWQ9Yr&u}j-d#GiVmTixF^2HFW> zhvmc)7`s?zK`iuF_bs`qshRFs=^q7&T~Va~C@NL*(q+Q994t2aO;m}%Ifx~s0c3>b z5OaK**?cvWvAF=wWZ1rh!mBbhDd*Jn;OP4{;p!-blTk1L;Svre>({bOgP0&?D&Q z9l;H<=&g^A{g{@IT+OktAGO6~a=lh}+Ai%hu?jQp2CyU^p0ZNaQkJ-dCqrQHLVucHa(!x^Jd|*lWSQ-)+DPsJ>%hAkcBkeHyzB7*l!8VdHP5^sad3BY51Kb z9ug`hRmbbGcL8@KA(E2c-kFsy$0!75__|4ZXJ^WMyS~^I>c^wPs^-Rf@NvfTu#f6_ zJTv#4!OMykEarD%4=x`%kE+^xkDhz%)hDt7uw=jM8A<%&1KyN7WWkQ~kg{z8#2*jo z(&uQ~0Aa?-pI!rWf4QkhOZe|0c|8~910&3>X}+X{WV=+oBZ5?u0Y-U;a(%uZv{Hj( zK(DOdG!}gQSr9&arTVRAy!h{cdxs6F^(kP#tZ@!{3GcYJ0mS;95pB7NYZN^C~ zxAi_up@M0Hg=qXWOy9TQo)tei8+2KVO|FDdHv6(6R)56HIR`)N-~I2A8hjQ?OsIBy zs~e8LmN+oddvla_#u%UdTA)Je1(;<5EZ!NKZb~}JGRhW66!CtJw&)9VB(xSB8&%*u zUe>(azpfnDCEd4V=*dpheZ>H?W6`r`JfW}QMDz-KFVo43juunhv9mKH}moKhzl!{_{1X79Na8n7oRccgZ{__MoL0d?|on>YUKK zIY=Zg3uXWmgft3UkxYrfeVR6Gy_i~his=Hh@9`@5R-uKG{DjTX6Y2A<#dQbrve>+F zM4VXB%$U5{NCe>%cd+RTBlX|T`=z{eV9mM%=_=9xLIE;4OUe&dryh`T5+#wQ%nK;2Qoyjer0p+UCF-AryI&4GX``1T4zb z7b|hNR3@JS2t&D+4p7+p1vmo*2o=SyH&j~XF_}Mn-)r#jJCrODSa?T3{p1;|AMPqDF-hoL0MB>eZ0NBV!u1_N zXgI~-b)9uzLupZAB6l}5h$XVp?>)_Cj8`g+!uD&ILuB!EZdrtZJwIjta8RWQRa@}iEv)Z zP4ttiv0jJ79IFM)6ISJYCg|jD89Sw&^tQ(Bd9=+lYc_QPfiE-SS*&zuJEE%Y2q ztqPv<$q?z>JJ)sW<8fJzmi-V$0`doj%Ptjf!-fay8@sE15$D|+tIQe`+Kzq3K4+~I zy~sx&*m*>Us!|VR47dFfBOVG23q#A?A5!{HA>#?_M~dCle1jT`T}}*G-JGwn_aq&j z{H)%NlDfl5G~9~}nF6gd1oYlH%5n50nzI`Kpef&_+rOi~OT$24p`L?7S2qtUDxsA) zwO*O*AFi2Xn^%4Z!#27;*T-f=>7EC2sRCw)uYfF&9}rXZACZ$60-S;g2JbySrD@7Q=!d@1ChKjg^S`4FKm!I%VHU*0+=cGtW&mX${21Uhthe z#B4SJu&Ih~)~(gEG8f`H{i*}P{Un^&z=o5cC8AhY9r*ee;bcN%!s5l}W#b(;sVp$N zn_r)2g`-MlT&Ao~KthzHHT+K8|7|5$Voct>(DPGr{h27w{O{H|7>W5A7ngN+E-WwtYKW91tN&5GT*5kyY$Y%vTe0yc6Qg^cwKT@SC2TPtDz!klX(7G%hUB z-luJ%7j&IO8&hp?sy)YFtO69qmp+_ol5Pcs6xp`gP>Z-9|Xw-qpt=A zZ@q+!yKqOSyW)dG^^}%_rBOAN)FvHtdrC~9nrXKteWA6Fj=f70Z7$o50AU174PY@B z>XWj?0P5akIWqB4*wYiX^>LsV2?Og0lKX`A0DlmcIYADG=JVp zI;du%{g&E8_w&Do?o)Q0VZAWWSz@N1potVVByV!_b7Kn@tRbM< zhA;;a3$+#J8zm1%ei1<_28r1OFils|$7avk$Ry9?Oa1cDrO6hjAR}gg+I@faSSGw1 zled8yT4NgT_~}bxw&DLv#^m%tzZLU2onQCj>JjX9ni$oz`zq zD;cDSo6bPWg+~=gIb!!j+n2@=*XCKtVRl_#;#L*tzBkfPn6R5FMIB_wrgnpF3XmX@ zX07*en?JFqOW#SEP!8 z-u5gR+-lLM4M93(L`6)*k0i0k?p+k1zbRmH^r`p!PzodfxVJILKLl~8UH(O3-aE6y zR*D;SiLcxTe;i_| z>vgPtxYB~#L&ov}4n6a@53;-gNs)tzlK!S&-A_rR;=!E8e-&YJv3N1q;&wnIaXCSj zXoI1Er=4uq(@c$q{ZPZ1iO7|1Co&tOIj9t3@X}sGq{Z1HJ6@wK)pb@Np~>XdD0SpT z4O!Yt!*^4}3bjjtyf)J{7^}h{f0f>TAkC(o6SJap6PHJ!Jp?hISM;ts^rOFQO1OH- z3*w+{lgV zHNGM@PrtcHBW|Ym3Y2qr&4gcmpna~*xN>OmuA0S)X~=rk7T@Ya=W~Jo$j^&0oND!3 zyJ$Hn>kr9dZ(h;gm^U8j?%i0;&8Zp2&I-)EA};sxPN!DRFB%+C}E@?ncf5G zeqye}e`q^_m`?usBfq-LJH4fTQ^e2z8$}GH?JvO<66g>hRHw;<4SBy{=)V(l;;2Yn z@mYk`kysN>*8s%THejK*VLkrBK0Zl_1;{w< zOWc3CKgh0=;8{@S?UK#YkMyK(5WgAXN(b4`3AvK8L`$|LL=>%_Wx5T90 z-d|I6OEswJwFh|0*LWgFQSbnMvSaJx(WDWCl7gsptmC|hZ#Ai=12(NMW&es}dH&*| zubVO|6GD&;ap{hbMRp~~f-CdkomZ63D88IM91xp>o1f6l*^)JgP-%Id$Sc@DcFNBA zeklx^lza29MAQC4Y7-LnoEFKmK-Z}az9uwwR(EPm;OS|WVdtAJ~-_#oeE0{IKpe)q(pr+1%+W= zFD3XYw198^DOB*_>t}Q~T>-383gRN)$p^zwMniqQxOPcwRwG#ouZpkVcWXh)K)K-o zZ)Bc0kW&D$Uu<3ZQ{z5r%Bac!$knQP*uMYP5&*5`958)*Nstixx+6Tqn|$9m;iXB) zxTc<$FW;S2!bE z_pgks#iJ-XB#=_rVemo+exW4#dPS350R`F$5aSi7gwTvRG54=`vwmx=zO|T?lXWMP zmQQ}^T`XoE0dF42(~HC1s}J-CEge7H#U9|Q%mF7;Edp{)`(T09BpQ@-`=t&zhhQ}a zCJn9zl>;2!2^1%h6tH7`zLJQi>OEncxN~|9T9%oRyC)O^O!s6o@Q-#9SpjwBp%xIJ zw&4tHvj2`@9;LIlAJ<5;AvsJ~7-a4Rd1M?Pi0o@_hczo>@rAtF?O$UcuUh?yB-MkMF zItJ&^;sB+Fc}2U@8Y-JRh(Vq2$w>DtAXY$_;+S$phUMV;O;h##wWuU&{uY7j0U=L!+S#xC-g?W7KnWTB(?kRT@k@(^O^V6O z<2Pag_auXKNPYNL>wB17k=cDfDsJ5j3}KDcHC%%9gx^PqMKw-QE*`)`PV~82qJVRl zlR$yGN0@`+H~TtL#Ny8GM=?w{QJ`#|09qT?N#SF)*Klc5h|*yZ&Rkqqwgb{c`|vB# z3vD4pX_C$Db|SqBu*#Vi+3JB>urS;yq%gV%OzQ-gV7=bAL%EXt`iLVF1yC6ADc9>S zocBQ)n9)cylN5Y1&48?q$%}yrg?)4ymUDp4I9w^};%fu~!8X8DN?cOX;vSO#4JeGnh z&V^Zj72k~hYjMiaU0;xwDB5A_){Q6&1zh53z^gd)F}N`Vh;*EyfeE_1tR|aIekmw= zpuy|qRBJ6}RgRkCgaq}i1A)k8+P-sQeJHCH7ps6Y;aTV8lY9XbI{_(-r2wHhR3dGC z0Bfup@hvH7n{xzqvR|>`TYb4Z{Et{(;z3kB__xRw4Sd4`33G0E1J-HaE832% z`2$)rf&|?jn;+^OBxD6|+mdFqYbK~=rNxpctT&@|!>yzN5bjn$;;sgz00Vd1l77~# zCcS=-*v_|a)-JT;GF@X|1>%$Rpn-;<5IB+TY<1e()|Rg-sSB`3fJl@pH0Q zr@#`kp5MA=PZNwEr45u$yPnS-e{(^~o}S>|UrE)x5zG@=>z@Gd(Q+#jNX9!62>n2a zw6%hj7%ZeY0v2NuUKXLI#H0REu|a8d%F=S6!e>SK*_tf5dmE_!^13|eVW$xt1d`Z@ zV~l%;XYz%A8gA{jKUg#`lKwqB3^FyNsTrdc7S{Pj3aGTur8+%N0cnF3fJWi8N+gP< zWCiiRv>sH9L7l8M0uhe8wt)B(np=xwcJ33(m{USxlGL07083i^Jf36ep2YAVI{$(E zPAV^+D@+;ao4lalK&j*%`&7%v#OHgMgVjx>s{XDw^xmk}UZ64y`{an*PKBAqSxb+m zIp=ie^Yc4L(S6#qp(jt83>L>ff+)zlS9drGu%#uOlRYAsCNM8}v>S5NharG$&Uh?; zth!}8y`~I6(!B8X(Mad;Ec!z`@zE7fV>jEHbV;#v2kLrwt?Z}tl{L$CO0HysOl*hs zH{&p9goN{ggr6Lc2OY8b&SNh-*&44L;JK_664p5<^!jWmawmXDw*;V&L7@1!U&6@( zoIJQIn6fDNYUPVc7>r41(hI_?WfBK>-ey(Ei`as!h3%y9mNY~bMFU`F)l(q2;}_d{ zE|FGsJg|fT`ODDZEr|d@sIAwQ7H3qv+aU3qn%pip5P7KL0XSW;Nx(c3QBMFi%S`NQ z0nZv_(s!s9*nQ3wJXO4w+sixNiX`2V;8ltzdPWIb!k6fc0!$=d!n2%u zA5c{`G>ds7FE*m&6@ij6R&@-5c5vYIA*g?#nAr0^{caY9(hgSkfOjbAr-sP>kBA-PDz!?a=6A%W{GsDBvhrLRavH|zC$9(AtP*-MX| ztirY_lvnkB3)!Z)@#YO&J2SQ!ivG3&1$AnX#%!-T_BhrKi@lEv;PnyR(~C5Y1s|*c zebBx8eNR-?DhlB`MC-kA1C!#aN-uYMo=kd+>s1Gj#7s?aEU2^5od7igAK}$7fB-h$ zO&C@a!zFJ}adEV)>7Ci)s(#WdY-fEm;q^Sq`^~c=^=iZJX(UcZg#k16+*FrWN1q}- za-B*NVRE}Rhj@I{f*G?-n(%=dwTc-}p=`Yz+!0rJr4cSgd9;JemZpd}1bbF+#R;V**&&$2GhGEIV>*C}t^_Cin$eZlur0VZJ+6J8 zsqw3*CVdp$gT|{B_VPW?H6f~pU`}Y0JD(;6(;H5N(5EH!0e8+he^`T=jBW*FBeFwj zt-+$-T4#EwDLFl@qibMqdd#UJ{_q{EwbFZKz(Pdm{1kszD}-add;S_F2|5SX7*$)1 zhgyZX-XtF^_u+=A+cyu>Rh5yfdK%c7+DXVJ!_G+89*DPELirTd3Hq%f#{e#x2fwpn zC7|8`5bkA_o(s{K6&T!Z;W71Xe~iKx$q3`Tr!@eSKPKi*fPy@Q8zo}cgTi@>ke|wF z7~Kj0;$*Flfo5vw45_CDz&nhF{NW;Sngit+{T~%NC5i8&tbed9V)P7Mv8Us54GHxn z)F3C1D`iGk;i0zKFbU?^-HiTPyZL1wB`vpxyf}+RCGFzk3oVI?M`?~j`YoheUZygE zu!h2@-w5$;2Xd$hFboeU^Y5yi-ZR$pnuZ$Q_dw5pSh+EEJNvC?yr4jY847R%%Blp# z$S=%!7F-|{&7bVh0MY{~MFkB4#Zmb!2J5MR3=2>dpWlj$g^6>3@*h(GpmenlALc%G zuPfNj%X*KsK*UV?axNbK!^s8!-`ZN45T2QorK z$FN2Db0l(ls>7xPk})m}W8rpIe0_-3@Y_fUwq4axuWxeF8}>)HKA25mykRl~HcaF+ z*(LSmKq=EO;y68Hng9vsTzqW6E&h6nN`>V#kJtSqJ01gS#YTTOjW}V5eU0vO6>`2u zg_)sMJIjFis;F|QQBGOGm(eA+<3}N8*|g&>GUD-fW=x2zQFyZ*4*oefwIlIAygMn0Tm{0f#h85VjJ^ zT$E&{cA#{Pcx`8)C38CAiI%UEb4j}1iA9!!Mn|_?!z5M!Pc-UEqr0O%pd0v z^k4}Fi69aNLR#W00FK^=AZ}xJzbZ7WI?2_> zAge(u1Q9fu-BPP^H?%*Cy)hT}BJ|nNjoF?eF)@s&Qfe~zjEj=iil9n}i(*;@>w);$(v4hWPKjy@FXeh*i9BCQz*3D5r}-FM-aipaSRxso@9!U5_?$a-kv z(0xcFlCPhyR(9Jy@qC%XedcaR6rU0_Tb&P5UMq` z5Ol=Ap{vB}vqx2%Y+RETnm{nTTA+&9)fQZKPE1gV|}w4N?79 z069t5Q1D_*-gz>yu@wI1Ns?bdq9lZu^4{iG3lHiol`=wz?%L_5_(-3A%C9kfUBbYgtbd)J^3)!id z#sZ@}N5=(PwOA1zaoGbE)j4U=no?mvPz~|=fJ&Hm846;pOlrX#G9d@eX(8-Iw$K-p zzXsFoQn~UwR!~oIiyF0T>v^OXGq1Y6fW1A8U-go`2E6~LNGPZ}$V$a$4{*_R>T^UZ z{QPvTTjU;MOQOxL0Otwm9%6Twza{YyqNd^`ZNbP;kf!{$%6HL&A2h9B?nnFI1YA-} zB6*RF`5f`gj8Z)FZnNE$4M%7ChD47BJ)}P3+T{n0%$i%BToHZulO6A_H^_L<3gFoS zRR$G=re;Jm^m%uB!)s6oaW|H|JuWN!2-RvN928rS+$&fE#m|B?)_?uHX7C9Tj`V~3 zR>bQZrABtKiV-0@s*mrQ4z^?@VT$>yr5$i0BCeBE6=w{^HSPl_j z6OrkZkFA5lU(#J>O-=^mrXG~gxnc1LIYK0)X6(BzOt`(1P*!!k_W1{QQ5W7~GOC!7 zz%-e}2(%M;jMG0!1!*|p`7*0ul;Vg#qG@6?eAr1nXjz#XILg!6`Xcp$d?Vq`DA{qp z!|u@NrxX2>XoAo-)Gdo6so5o8CMxWj4Pw3AbS`h*Z*Bi(Dw$a{a5c7HTUx|wV? z`0+2v&-0897L6bu_^kc9;wShy(AP{Evd$H9`8n}9 zevmGkR8Ko73}8mh0fbGip&N{T)Cy%7X1j}C`k7gNS?T*H3R3~Mi?l~KQyd}26XZFjTm+>b>V}Mg!q;Uq*NKG#Ts|aYCyQuHO#CWYJFYqjAER~8;O`~IYHBHELM)3(&yj_uaB)r8G~*XEkPXz^x~AsitaCe3IP zF(>GbbaNQqD@m>Kd&wO86{&|N>TDWE-PO&)K2>Y4emJg~yS>7%`=Bp%&7D7`?~l9K zOF|h~zKsv`x$V;krLpfis*H*xH{k(9-tCA;^$-H7905nmW$v?PYNX z3HoaCpTFIz1@%F!%!Gn#rqdznHRR_-+KV4Pr*dKDDhTN1YN`Em;{4==3 zhIS7T%`^gw7f11zZ)ecQW=!88lCyibXT-hpvIF7?zJw~D-SAVi2OC{C<`_Y1(eON>O4hv~N>;(l>MT?C4v3D!< zF32vvX1?QZ<%#5IuZ+-oaINxn+rSm%qhliS4lGd|2HoQwGBr9&{VW zcBVcjSv&hj4a47vIEPN#QSz{h(y8jh4O;8{tK>uu5s)s7s+_>BZs74Z7Yi>o>7+sv~ zpr)JY>5h-KT*>gi6KcI72GI~$bNGL#ddsM&{x|GbLPDf}lyrAXx0G}V3_XB=bV-+V zgGh&Th#(~~G&6KdBaC!6L-(`!pXZ#l&P!gfX0N@!amRIi?m2347xRrb6}mNp)X5L5 zSRow!-@wUbQxea8bq$m9;^zyRhX#Y^lkJC^Ro1u3QXJSBntZ|x)ov)XU_QK$`Mq6u z$*yQxH;#kVlU=cn_Q3HiOy99`%R8@D5N*;8{%LGc*ZU%)>?`Ong?v73v6{QnaxE{6 zk3-md4U_^7W}H$Wcl<8|>&P4f(?1U_!QmghW1luF*j=I(`W8P9exu?MYks`5cJSDR1{Ml-QItLD1RKFSN zAvX5{%&gH$nucTK9rDrn^Qt4@U~(GQ4DApiV4tOUK zg1u<*y&r`jEWdC+eEqcM<*8-UKhOQ$qU^QR+5eXXuva#pD?w^`e_$gO%Zoa9Md<~f zaOY;{M4F>{;EF4oA(q#wkIv;XR|jbRW2aMXizV;~ z$^Q+QdwP0a@CPuRofIEKe=!?;Oc_RTB5-(=jkrbykxgvjQ@GNyq=Oa2EE-6ecI#!+ zQcFw_`Xr3&1Tm$sy4X&f4u97-^G`=5 zmov>ZoJzptQ(oju7q(ejq%qXez(@sJ$EprP&6BvitL**EoBe_fhV#V%1KE69rjloC z{2uX9p;#<>Box{E)ww`KL1lR1tYRXdQv-!X?0@|N+c4pp=)<@i4q(RYisGoxEc}oN z7zC+jh04eq` zmf-Gv4QnC4OGi4G-(}CxW)F4e8d2@}ZWRJ|Aw8&jg-!TPk2#?q%-PN&Thep6TTYI@ z*Idi;zwd%WPUMjBl?h1Ab68hgnj6Pm=L4&nw_>2ID)#&?7Di}4K+Pv|Imasz>gJusBYKV}1{C&WR(x|SHl8M9~s)@D;@gQP^&=b?V0sAJ}qb);2 zTbSRo9cjxGzHCeepHa7f=DR4M>wvT&aZpX;rya1PpinI`Ak>IWrE|4uX<%^pAUp{f|+N26MCxzuCF=pdv11$8v zwLiM-iXEzlkE+ytC}?XsTGkc@sjUej=#};^TE)25se-p+RfmOgl7RGJ=XDMcFMKvI z;Sb)D-`2b>47J45s!IN}@pdrsJ|?06>a*mis42Zwr4{uM2Dy{m>l*J6c^hT)_x*oW zty70&19(|1T|W2VD}IP8He=264P3t`9r;c7D%5JMLA#Kv*%h2S7#17->F2ic-9`Vh zb}OshQ^}m!#M6Ps9lcb^0!G(QAbs)8A7#eh+RAoL5Hw!UIF;(BiDE{6l`OBfP>{GH zgpfHf&mWd++*v4i-hdl}VWjw_fO73cYASf1xQVE0SzV(?gyubimG#RUHw|oaER{5& zJ_Pd7>m`Gh|I+T&DOajB{NFrsr(c_fj0O!D2i|PrAAIwC&CfJhg4uX3k9yWKip0*s z^bSrB9<&F-ZL(n}OCPP2fify+g#Ub`sK!P1cy7BLJ`!%6IB~K5EBzVCfFUl_Yhw ziz@2&7($GJ@Ya6HFUuM1DG@mA73e-SPAdYP)IF*zBUHnx2R9h2W5VUdvvJ7-dvPu1 z!^dCgQrW(!3de!n=sik6REC zNd&rrK_;7P^H8((=D^$AK!$U+|0#sPZK5!NFee7-#qf3s$ujTt&(EtGUk$>8wEJP$ znfGLxr)&KEo1p8#DX7r8H&M*wejXsu-`Iz&C&V&x7;qvjg9@je0iDX}$MV$e7b-(t zgd8J}&6v7wwL2G>erOcQ?rUf;=?o+~LI$*h8G<{v-vyPZ;YF3Nx6lmN`jqd=zHe7I z-FtkkdKt*=>)K9-$0r;qb%AVIVu8$K2i%y3^xm9=Qn>_tnWI-ay9@b&(U~DEA)mTF zlJE+sfqk}FuYuCuhtqN)t6z#WO0}pzF$L@S^+L?MhQ4*I_a}0r&C+(dw8x#N7Vwmr z0oPjhONQmy8`TGzfi90{8%PTN>X%em?SC$L=5!-nf}KH+Z*QJgN7S=;9-58$;;OmL z2s_7n;tDT2q-?g{12vnLY9z%zbh(^_BlkzAausj=mG8@J|MpEOcjGGlTvf|mnUe*T zr{i+5P7gO_RRb&5X2%uejL!Z#wY_0HhG^;#F#r5NZI^c7k}T)GrA^b`dMfO>yl9at zY1=ujNy`ra8-%73so(;wfJd^!Nc(8V|8HH?e+SNjxZqmTqQi;mtwplKlop#x*{^=L z=nLyW=j1KaQ{G0i0^IHDqO~QuLR#N{fAMWR58)!U^{DuoV+c1J!w=*xZCflF*MK z!vdCWYBuWVP(n?F0`FGuml&M9RB4{M&ZCz7pPqthBB!Pw-`NZ#yN$%-+hGt4yuj*o z&qb#H_+g*N&-Hqz9<$9G@Y?n>Fkla9D=uZX#a?bIKz06ui;!mUe`V&9nHa5oOMPj> zr}9EYRR}*DJUzsho>ox(6iKadgWm>OB0cCz%I2LzC4B7!LAB?&vUr!k67?j&)K{bs z9}`S4r~9a|G`IaXy1(p~0NZ?VF#cg$S(q^jhV{Aa#;{nj+=5V*f^qsZ;{&r3+3T-= z(MWJOj3B1$4StA9#hL<*67CY}?PxEpB_Ako|0%hZF)s+fCna8;dEHWw0(;?*n$`y} z36iZ5!JG5CkdLPC03{JA9Z}x@jk)SEm=Ha+=Bi(;7(&E2%u_Nf5Q5^P0T_V$GQ!8-YrtuheCDf-cfE=eTWdZ z2dn^ZMxRtoG&L|P;?^M@?7bIkY)BVMM&yDHpg2p6lkZjhf$RJMXt*p85=8zNZbcMKRRYz5)Xja zGfLz?ONAq!oS=sNc;LqR|E(Sv`Wvl?t}h_zeDNs}p6Fs(MM|45e-;zs6VR_HRjs4B zc?KDhd&D%X03&qeM=XpqKC0^pq|VF3H2dG8zH&+f za8tOi`Bqw+cfZI|qFBW7st|uXK1ao>e7Q0-izh z0bOYSzAyKHf*6baZ6owm7^$FqsE@E_KM*5M(e0$;)8I3kE9J?PZ@nq3h9W}m1e0I( zc&)MU@)iZuQqG}iv^F!Xn3d<1B>*1A;ybK0Dvx-ubS4Y;jhD_^?ody%^v|5(XXnzV zboiC+o%nR2>8Is;g)bA($3Iy1OpWHMAtl^g?5U9MVpAyESQ~iA&(@+cDjW}&-kY@F zrVpoKHW)QqyKwX&&-YUrSa#7O{9Tn#AD6M;H1&hPI>jw|-(5L2za;CN_0V^7Ag#FL z^D;NiGA@kiE2hZXkBB5U=H4wZZ(jEr@|q@jI00HsWQD@GO6DUT3T9myA*bhkWL}I4f&U;`X`Q5D2tC7se*43jPl6i#!dN$4Y_^hL64_Y?tP2CBHQSTIC}F)1xmNOR3q3Xd19P$+G(gd4;+Y z+&$BA-LY`bqRV~qf7ko;dLcrRG&)9@^|prTvo&%?o$HCN_PM3p9(Q{^P5}GQn6W=E z3C;-0aArRZPt%_m`>IKiyD!7{n0CgBTp?K+U88yRSw_bhSdrLfMX3pY4?AyO^!nF1 zIpO2wNHoSZ+b1_;sg1TlW}7dKKg8NEF)MGe|LL)zF3#$A;h~0P;bM$3=WnG6Zq?e2 zpz8#WGuLg^=8Z605bbBwxwYRE5Bs}0P@T`*4}#A-ODOY}Nx(W(mC3f_iydBO> z#(LZ5a~C#d%<(n|*~uwhJIkv7VO#)3j3jC_oX;HzYFra%AML(o3ia3sb#WJxIm7(b z{MFGZnz%AR8~;M?pSu)$OOp%5M=<5{HBmaBLncM)#kcyKh9-o(*l#!vXo&Bp(P z5$^jkXM-L!Ww`9=1l!YskTm79QYkn5L3kbOk+W>X>sml*!ag)~U2C|uq(Hu&yLGBQ z^705$rn{)oIj8WYag)wEj-SeRht@r5#`uE(2`rx1L?YU=OE|**5(vrTMc0=U9!IDl z%m^tZIy z@z&J)4&^TE*yAKjqW=>8m=|X?*J#)yms_QI+({1a47*S3{AG%6`AgI`S{4shq}(wu zY==(zS`NXG_G2AD(Ku`Hv?{1$Gv<~#Wt>?2-2bi&A_60Zb;h04mu!2R%|ph08YW?A z3Dfu*$p$jWX@MwCtda>zBme2Wf~KMq8g}j~#Nmvr`n=QE2qEF3MYSxEV949j%TtuF za_Db#>AlJ-`&yq}*cGrjv=&98M1sR>AG=aAnMWmeUd7hiIexF3wKMN_tqJVH zjuMnJh(Ywne`AiVCo+~_nc&1^lKd1~76mE*geU+Er3a{TZF-GmlPuQuu*!hSsVzQF zTQClF)YW9x$p2ZY{Q-TI?irVjH5&wYZtqbKDIY=K%o(xVD<65rt4rc*x#z~;*jM7~ zwI67FCvI5x$NY;{!J7MyU!7KvP%o#vZ4zBa?4dI<5TE{%PfGACp@Hc{Ux&(BNeZH1)IEc1y8JS6@1e9G3p89=0o9yZ@S$5u#(Q@raufuV%plvSRh>OG;+$hIomn3S-?IO z#8AglF?J-?oX%PqDE{vX_i|faoNa#A@2u(~<6j%4>Zb}IQ<}W#&b^h@}Fxat~&jaQa?aq2NdB(X=wIEVM9e~e^ ze_w9j6AjzwoE?H$Gx`HU zIZQxM@%E^&;ZlIWK{;CRDriBQ|0TI{`hpp!W5ew4a>W`M7KNySU%Uz>YUxZv3gpEa z)+4EXRsPGItgh-0NXNUazT?3U&wz_xPmVx}7Q6ozQ&4I;5pSF3Mm0RA!_y9f{DzKRuH| zo0?;Peqbt1eSInh(bZ-UqQ*XyKAv1R%Qj!bN%#S+vIh@PjmZ{Ly2h9E*z`z;=`XxF zOt`lvqSKaqc%W1V(g?HR`c)u5nlAO(8hQrBX5~Vz(5dWkP355YS>XI6RiQWK2y&ex`KNun*nB@>DNnjc6dc;Gy%My8r z5R%>jNqtakuJgS0vgHdM^r@?XR#JB~FnGh>#D;t{=b!y26SG&9CXNmhbfG@BkX`~r zn}b)I+G}z9y6oT>7`4F)L~4DrL5 zvppEyJltv(iOHMXj_nKIW}4#}x?( z9PGl&oZ(^`SJ0VnGKmg0Q5HtJO$hdX0$S zBjkvyP6QOtr_EA#nTB;fTjajAJNz)7vLBb>;>V=Hq zUJ>m;%Cv9?zg5o)0w1)2Gx7F74jOTpbc?b;Sg2w+5q%lai@jVH_;xw>dgLl9nyN;0 zf%#ChL1n4G74kkv9j*#D7MYb+aZ#R=oMYok05S9B+6bsnsZaY>ueX#XUWSve70_F7 zS9RIKk<1nhJ976{0mV0!VZ1Xyws7?f=&&)ZDM(&l+bk#1z-mS!{F-rWr}i0#3MI)HNo2;8i7rLKOS}S?vGi>?Wsux!S)dL zS8q~|zft>YqEw~ZQ39V45dA+gLfH7zcllOkZ{Q_G({jD{J zXu@Rp=8%U?-B3nx6WCKj-ShvMV+sMa@E(T9Xo)nW*L6m{HQJe?4?^>N1D_OLXY1v# z&yl0BR5_Zp)1T>fTAV16ldTaub$nBSq#(rfP)efh(S>GBJpVExv(aH#Agkx1(lLRB z*Y>3;E7K6lPg7hFs_0hemsF6wI48pxQmCJWBW*79yJXBhS5UxdraehIl^l8qj6fx3 z-xWxA2LQ}VnM0+~-xVCyfEvfZrw<9i=VLczcTC3kxvS!N+@r(oFNW1BxmK0hi)&E; z1WGOD9Pe!BG~qho(iwNhdhf+Ld%7GI%6C6IOZacw*qSRMcUS;sS{x@HbSLOjbxoO> zGG?GLH$>RnHa46iSYubG=CY?QE`K~u2QLvnx_sMLJBYwxMfXS-H__Tz<-wy6EZYpJ zo?ybi!7v_~|LVD0aVY;M5Lh*g>l2gqnNQ7~ggM!f!$3A{UoUkO1d67APs3KDiSF3) z`N`V-sOKIW_ci>-UPWx5+fEome%^U1o+bXvEt_eitKXBS!gu^nMaTJEq{(>1!IIh6 zKBxiA5u|s&)tvhZfm>oKq`tCg{9~&?vxOZ8?mo&4$YP3lYl|e|hJhxH-AbfcCE!Mq z8Z`4cW{Ec|doO#r-cm#&t<{RmBCY5$RtjNm`zc6NPbbh}V^r95X;mR}(u?awZ(VSk z&XFPEO-b8z4uMax_)UW2q?axEQo9a6NsEPn81)LN%HHi%St-{(Hap^Dw85S=Q1z|- z%(H3l8G^Lk!}p5=>7aDHHta|1QP0iqHaZW5f4470(wskvKz)Z{uKq5>!z#Bgc+!kK z8>A4GWsi0uPm`isK?zBF$?o+kc@Guz2acJmlQhx@&+XESQN`BdysayKR(TeM0(d>GQ*TffP|CKNK6QO_q(=?~b zNofg8CsUjJ3Mtd4lGU9ANOQj>B^uFY7@76*t!~ItHPru3@d^rz+y#JLWLKhJ(AqkmRXsqgLR}W{7j3KF`IfAYZaPvUZuFm!A6dNX z1kH-Mh7jfXK#{1Tl_ZP_o|Zavs7M5}>MFgQhrcc7xiwoAHQRf~A?Z+OH7Z?k4js{j zQUbIc!UpzBT56J4qny1X@-N+8=fp{Co5@pdU}Rf*qY@rp5mYS4HK&m^TNOK*u^bX} zym0M2mBp~kSknoqvjF9Tu{n`wr2R=0Wcq%iYG38$pEB zX6Qb?A_YF7q^P>6*O1~`>7L4-LM!;ifyVks9{BS0yFC%6`+vPd@JLCgAMlS4(Vxa^ z_XaEAjpgkw$z_56h#1D=MHL;~DrzCYQ`W-iZui5Ri;Ks2dU&w=!H=)M=6SDSok(qE8A-8I%Q8Q zXfN83-$uX9u#?Zzs-pOZFjd3(u@2#=Zf+Lr=ALmWj%jIfFc7%5nHh8;3WDkHS#Hab3uP0?@ZNEd>Q?u z86GI#zo)U>jmA{3AxTmqUOl~C=lJXHx1-%$n-I(>1p7auJiut2>pGn9mBayVMK3Dv z^ww#QI8A_JUd?=+l~x1mjKsb99dD)=|6gy>tt#@`=xi}ThWS4RF8p3+w)D78q6{6D zlS8qX*MLBjRF8{gHm1X4HGfVK6oS6`C%_L@{o>XX=Do+)X~a$|_$>2oY>^`RtaJ2fs@&AqD8lVdhl|6mCav(F zOUF?RKC*eqdL6Z6yyw_x9AE;uOoclDZTeVd>!`SyszEoaHyF#Vlpb|}Sm)cMUx{eJ z_Yd~$H*J%RyY1RbD5pXhb3>GpB@|N~yX{ft3*7A_jzXFp)!}{?sY`^}hL>uKr2$iZ zw}JT`bHi0pMs~-$l)1Kx=ce52PQ(t7ewRclsX%`am3eSaMg)3@`0JiYOUWkn?c7oC zZcD-5hrCGnL*Hghscujs6wu>2Hn(C4{K zP-YHxrw1Jimt^L(At6%kO6Ya<5rDA^mrO|+<>aUX zT$s#ja(Gd3T?W@CYk9lL_ui{_F^NpP7@*S^oxZ{;Ob9m_WH^_MsKl0kDkdFXd0)lY|m#;Y5}`U1;) zlMq^Dz!>80Vb1O!JM5UgT(-k+46-B5^UCPTZaH$KGu!*tcKTbx)zo-oC%SW8=v!3L zMra90xcAN9Z~D{;|*EL6n2%0j-rk%-N(YSO;SnC!+D^Fy*YdJKlzI$#%KH2*`J+l z;vnCvoq94tYvr4HBc*nR;2s=$!j_nykB;Ga3rWH_>#lu;DTxsIXA?_icsPfP05>$X}qPldi6u6~F$*ZIYF(<+fVJ2q0q#bHCDweA;X7Xhfpqr<` zs98V@J^-0!Jxwtrk~4BMSppxxCb0%8M@d_9u8`5_(Uc29id~6F;oeLR+V%^2M_{4m zj7pAR;7)60`1%HlZxHZoeZwq>CgQ+A?ei>&&Tek)=y}DBTK92n>8IRxf`d^@G&#*$ zw>3-|0R!SB8FT>L^&Z+`{I?=CxrH{LfsS0|iuPo4QMutEHZU?%>faEE5j6?Rwa;U> zjyrI@_w--C$R2IOuflus2ddm@ei>6fbs-P211uME9N_$#@E7Qg3Qok_B2ilad=`6au2Ntjq+T4MxsTQUYiZ8z~|*Xcl{ z%RXWEE_q6wLhIi}w_YxH<4VC@BHguU*V>AGRr+b)GjO>q*Ry*|4pE~}f zI4uu6aVg02fRKe+UC5lgJCNViBf*E2lX0sG)d;a--0AWKj%+&CG1j{U*3~ zA>6okowHLcY$psw_mWlZGfMqrf39+l+i~Dw{Dd0AC~UebF!z&>#q8Ovj2CQy{BMV# zcRrz(>q@OjIsp;|cmBzEB-Ug45x&k|(Q5Qa6su2=nil=rFp|ZaiO4XpiN~g>VlNYH~Uo5gOh2Zke5zRcS~mg<7(a@97m|7*0Biz_bMuWDaJ81w9#rN8}dUnQ?u z?R8~^bdNR*)ga}o$}`z+|D}>);Z~N^Xs?3??P8f7K$9sQE>r89yxx3?KFK$GcB=`3 z{?{jI5NBv&;|O9L(%$@a5<$}MiqG$^^|N%pAj+}f*w(%d@CYh(KEs)$m+^NLu*mx; zK&alv{V}s`n|%~u)+} zL><9d_&hlzgXga}fJl)IWBLArf4BSXf9XZf$c@4&2f3QllCIy+;8E50_zrMUMyBbK z5OgbZ#D0ZA_;5nCDv=Z8&Eb5XKK+n5i))Ws^ZBHT?Aw7>gH5c_4h&MrFbzd&DIF9o zDIx$r4UZ9S$7pQs0{D(zh6RW;8tHsg;cWPzDylO4XYq-39Dw&{-5~Zd(ajSOM4N+Q zODHq|&kmx{&`Lr${hCNnNrFq z`48YQ4X?kUL&I}*zs7m2FDfMIq!+~N#NZA0mICl8L0c+s`vgErE+c1EoBTdn%ss|- zb6EIwNtZgq0Lgs-@K6;Ncs(!X9dYD;0I1On`tU~R)R&r|Rsy$!6svw;av;Mk3Q()s z-?M4q0yAqQSPXAvLex}F8wJk*?QODxS%Ppb8qY2xsJc~14bL4lI7Eq=7zc#va>l6^ zk0D5r-Sr8B7{pk-&WE|)%aBZ>ho2MLs?s|;WzxvU@eYwJBT3%{<_vrP#l;=qT=ss< z$e@KkOP1!-+%48(bYCBrlz*r(aBjb#+u`w{nZ2GIT`h|nm>$3;ivZ=)%!5Z^EBTqx zXR;Xi{yjys^KOhtNNCT_RnOO&HN;R~4ATpygm~czaM}lwo+25%a)z=0 z9wJDUzS{2n{ELZ1SR0G<<^aRfWK6qq2T~*RHjEIZ9gwSX)1wi-PGuupF$7e^nMr{} z&)dF(;g4g8+8b)U*i`+uODleTY@pwQq6t6CIv}`duLOIFKT;GjAb)@NjF7Q=_v}z> z+F(gVC1=`L?~5OP#tmj*SoRIdr{;0gjRJIJd?)HCG1Yt(9r_S#p-hG^JiG5SS9)tP z+hSNkiExlYTy0J>L|h5wfSy3agU%Y-@F?aLH9W57hxBW}_s)MMlc4lAINmjM&3HcT z>^-vj5OX_o(if$-LZ+zX#zuHSfSopX#BwQ&osd{LC3Af)`I~%fzkc*gHoz(EdKrO% z4jS^V%)!_b2YMAQ-B6;kb9u1M!V5SarS*4@*%|^s_bNk5cElyQF$bs~Qd4SR@4U<3 zzGu6|rY+Wmem6Lb6~q)8DZ@wFx5b?%AvUa^I{%0>%r7TXGK30dhL-ZtKd}7|oT8M! zhzgh!;$ra~V*37@>9Bzh$TGo0hT}7{4X9H{`XJY5D^aP+WLE<2#(zf;Y}LU^E8187 zqz45;5W$SQ=dxo6j^8o-SGn<@)z4g#wQrAi=z(AGZ@&RN{@rvn+`;Yv$Ye4Sn8y2$ zkjC`{F+<6Dm!Yzx{O*I{Q2fR!E!+-{ z)@G~N=95_$8k4d9f2VZ9)mSnUrhBZoi`q|;*D*99s27e3YvQcr!P6K#7($7IAD>;8 zE)^G~8{Hk9w=>mrJa6x`6wG8s;S7Euf~NMlC{|*(I3xi?pS|-xK*n@{=wly)hu9sT z_Q?L30K^f}^#IW!ydvtW2Z%6kaaM0wnb(3UE(!@Q<9Q{1yr z`yN2r9Bl~AdDoT`?!roECiLE$yLWa zy%IFR8YEc~G77lY@!9`q>UlNsc^O2L*Y?j~BH0XlrKNR9aU0yud^*t6;1%&2CO#{f zIRwsN`-S+_+m5vu_IIxr05|x%A3ycDP{}h|JW^|=P_S{O9e3LhR@Ikeef2~MYPo>X z0B`6=P!q)~*CInQrp2 z8T@7nV-c7ZAiMKZFX)1zLdsEv5{T|J1b(^$6tEp<)E2dxRjLE0jV-Rr)0(eO`)aE1 z$y8cP9OBCM0#0!rc&%8*z4eJ0T0H^5<2^Sr44EIgWYMOS+8`o$U<*PX+c!qICv~E7e zQcSBlAN}Ta+dXUy{)TBpVz^mOi>uT2qNjtF3Crn&6Lzu>I{v8vMqunP$#&FVmaU(F z8OsAz%X-%;4qhGMg^kF+7xE|LvQ_wksbaRxL{pKSA%ArVb9 zcn5}s-ymc9IV6BFLH?NHsW6r+MpyDl)ue8dh<-lz0(!L@PddPuwbz^?Pg{RX9xbdk@LQIg>b z%!t49ACL!4QQp(seb-6?NccZKb{fKjqr&hZUrdy zBl6hpNL>JdVEK8p1o4wb{qLZOsPIu>Gg>^)hq(Fn&)id)%tRV<>hDLbu2zj(SRG71 z5`aXF6-~qXxI?Liu_{?79}i;7S+>JsiH;#E=D{KCq}xIkJs4u=e`GE5>hoez2_bT; zd+%;|M0&r$EdPd)qd9Sa9KdRB&)iq6cbg}w=ag2kH=$=;Sr{Gd0{EBNo;mE+HPa%; z&oCO?*yvYDcgR@eW=cc;wihlc?ghR6y*y_`J+KA(Cfk|BXc4E6Ak}6w50GZ6y_YWk z8TLRT!n}8Fcqo#b;KIbdP+jZHL=`iZXs?RwBkpUp0A2&bcpdJobBUuf=jF@N&)yw6`bf|KvRijaxk<7DWHQAYI_!QPn#e zQeO}YK9iwAEyJ+V@SXPG+CY2`Ab`P~E3S5YiH8J81t5zIns&*g5#tZhjKBYuvdmbT zcTc@QSYWz_6q%QVlwkG4vS7}VgcK2nwKXhtHMHEhbsFr?1z3Ix;ErUzcvdV`qKXZI4YHKI6`b+I@V%IBy+PH@i`HUZ|Ucdn8c&jb`FHpb3 z_*@i@ZApE!9T#1bwz_nnQ~9&{Gw$}|_UT8vH@KILOb}d)27rmR^lkd_7XuAew%+7} z^U{xDWfSH64>jh<`dO1<%K@gAumOg-_0}E+q2_~>Mea?VXAi|+PB-8kGfhKY)>gVE zb#;wmr2hacyFyMmC0tsM|1ki~jeS^n&o5`#>w0a28*C^l35`?Fv%4%x!p9Ymy+F~f z7*R|7J5th#ia@EaR&4<3bF6>N7=7&;rH*bGPt%v^voi5^CEGP+GMBk`s6Uu%r7CP{ zfBuJpGiC#+Mtm4L%4Q93Qg zrY>c%eoeK^T4*;e9GaN$dOm9YT%opT)hMmY=`C(Rvy29(8D@C|SIuWYW~slITtP(w zy#X~uA=s~n+TU=NU1n!Ui*)$UQ3FuQ%juubzzC;7Ej))P!u5)|5C8G+FhE3_?G-_c zJop|$@gpX|Kh@g+_fLj6UCj#h$mnS3bp%NBZMn^)VjStc8$L++B{^Bw^P2Wz4zrMOnq$v57NAEzXvG<#)}my=8kUMT%g|w{sTz0lOS=pU#jEU91+B#% zI0+^hU;F_l&GmT%068MFw6dPiOoqJ&9gSI>7Ep-O3QhnxeiHxkZ1F{fv7~?jl%$W- zKeQ%@!RjKmvb%#MA?Q!^(-9@gq3ceyTK-BICmbGDsiE`CT2gciv6f2V5=Kn*oDNYz@}`wGKXc-f&bFmt)i2 z*ZUo^`13lN8=RIV+uy@37? zzHkDD1D(0f#R|D>UmZMW#Jrh1GqjR7MP%T2obTlEW)2q5}RZdh1u#` znfNAa^V^YPpekgXOzu40dgpqD@%0$&4`_lnQU3uAyiCVf$&(dyDQ{N*B)R?Vnzh~j z%GoH7S=8KJ*kb}SyBq3b<;+W}*%k1K1D$t88+^mM>K6+8#6!v+k+uhd5g5^IqwfL{ z=rGxVx%jrg%6oLAXVhHTn(~BVR+xLaf9D-Sdz>ZT9)mKJkguner4u2BliR!3@42?S z9_ki{$M?sl`aYrvo-+Y%u#8SRPw`23kwkLEGAt?09k8p?Gz8lGR>epOaK8e-DNFW5 zksVhe1}d&4Qeq0W0bIy-1|`IeYYPLd%(_i;LDB%K#-e7k7aatq0x}(qpMDDD} zN^NtYFmL&Nvqk)>h5~u7*l3<3Qa5o1e+T0;Y65bw8k57`q8T6hJ{jUo=GPJPI^Z^_ zQc*ET_U|~WNbyQ5=RP)KG^cN$Yd`W})tI1EMf=u1$hP`caC5{8F|$5G|H9IV?7BZL zo=v;DJld^_)xzjT)>-ka*<|#)`g|o?|99=(RyoRi+3nExqG@;yG^lR+85v}sWWE-s z--4YMr}7@pi4zpvncs+Uy|k@M4c8Pj0?9qk{b>o;NG7H(BS!J;!<3g???vbEGJfSo z<8@q1uf?%HKdf6EMlzj=`mTc-(m6IfAP`|6!xT_zLef=TRljWjW_JjY5A;(|4 z+IrSJ>`D4%!gxL}yI1>3=WJZqrpJf*mio+M{NemF=RMKcS0mo*--Ol=-uHgX8L9SX(KlzIcB#?%;INz{wm09-rfSEZd#59_6god> z68*So_E-{rp%Z)9lzp2<()!<^D|KImikMv&5}vb<&Me84>Jk$?b}~_-+AV^-mwf_ zZo;e7Wj<`Ro+n)oG}2rj=?5Xm7z`!L0zT=l{t?T=ru`vtpoWnw8Dr#S~e!> zZnpA>R4IB;hkQPozl;!Sqjin5`femlHRpEa=7B1TOc|^u)8)m_QLNKJTl8GAy;~wW zV;Ks};Olo9+7=@vBw=4Jk&HohcGC$lzn&ZWB5rUf zke3W@Q62=O&x-_Jt=k>fz1(AzpAwTf=Dngto3JUSgTB{@uzbX2G1-8%Uo9=NTm+&m z)4hfj{|;ZyhzOJBqan4^YfUT6FG|P)cv%Zk>FE^^G`+BP(6t68|s&B zn0pY`ejYJWdjjw#nasTcCrusa3que5E#UzwkB%~x*2pHQLUes zVLSHg$5TWRps-ZZ?=Ujxr6%|ku1m@Kxc0Zu;L$;c3wJ5@8X?oZQyA;!=VX*LUQ&|V zi>s1y**8B%xyvM>Xp6ZgCpKX^%}jv%S17&IMjT5FbwN?;cEW+)yy5rEWsiZCCQd=Z z7}mOIOXD&gF?f{tsMcNQn;`F9QgNo}In_soZzHa8xT@!@#fcq-9be5Fk>(N_bjN+2 z=A|yPx06rA9aepSf<>eHd`{S#W2*z;{1C(rUVk^=)&4PFv_-E@fzm{5M3WD#nVqBG z2qe_>;I#kC|6>4Q6Vnll#9aM-QYcs32`nWV_jg?XsR_?zQ4y^z%udXM={5}WxX4Ze zJ=}xY0q50U_o>6M(Pr<;hoDM9P;3f9_z^<7GU`pb!n)NR3~isc7A6`2)%{LYAjP!S7*_!tZlIrYY26AihbKdyZgn zKL86rbp2;3^zw92YcGwFndS}F*Fk1HX05)0LJbd z6E|L%UZEDNT^B07%Rf>pVV1Z3t!-v1_DU{#x?uVwV^vkWfeDrp}K zX4L0kR)xtMA;>Lt9pywS+PZ~ZWEaY*!>|i^(>55@!=IQeKmX^Wj|=RHR~vU?V!$>` z=yJoLV1`*Y6Hq&GEJJk-=RX_`Ms1iUmv7h;=5WMXmft;`hxnFKdnJBR`j|YrVI?|f zZ+ezB?dX{|=_~borQkcC-(ohd z4FPe0tYNQ$iK0tntT9;{FmT6Xs*TrbISMvrC%LZhKU9Wrtl}5jH)~%P4t|!fx79G{ zm?rpT-od8fQQ3x*NYt6IcEl7|vPniQM=i}%rf=*jZ-^O(Ss!E8@2gL_?@P&U=25w? zqi8!)!k@t1{(X|+Ey$v8FDh04nq+8GbG=^h@Co)K;coN^uYdI>a7w~ZmGZCTG_h3WA zsUXNNo?QomTSz$Y2Grv{VOuUPl1a|I;P@bUoH$P0h*LJ4KRI`wW}2VatMLlKqoKyZ*dm9vdt4_{d=!syYV@E^TGHG+XC<3=VOpR z+B>zK_MPt!WIdDx-IgsfrW?D}gp0_OR|o~Hr&jx0wE`nbCL(x}(aRr+uZO7%z)uk{13WNV%j+D1}mfNC62)h)miG(i%el{M<5Eus!S zTil`Zf>{?;RNHhsKLpbM9)kS_cnC?F=K;dTb%aJ5+aHaaDTL<<1pJfH+VD7uyyBV3`cpV$sfIxE~KYO0W zUy#G{fQ=iBy#jiduNWVeMShqR4!JpiBGjdECWIkp2@g|v%E`mzk-KkvU~6S$QY8_YK2YFD9K+AVB|V^ufHMC*@QTUmf| zvMkc~`Tog#!+GX-Cv5a5{&)3iLcuA>c?D%~0KK&4*>B5f0GpZ~cL$Wj;ZzShiq-7~ z#r}6SK|8Y4Lw`xEb7sYY9=C%YYjbGly{xYO4_{va6;<1=jfgUUbV#Q(NDmDHA|28x z4bluLT|>8Y3sM5ojnvSsba%XTcl~?b_xslWo%5gbuQh8gbitmzpXcuTey$5iBfMP< z2RvEV(%EQu@Yr9X1#4@QBbK#u#O z%PUMgUdKFLPu1J=VUR@dF17$Nt1)ipDw~fB7NnXuaznZd^Xr&*-KnG7F1<2i^VB{J zs2veckBQ7zu6t8|@SP|H9^T7Lks^yZWvkCoh#De+ic6MgixcP(OJ>;wM>%vqd;sum zZw@$O+~t`n@ngj26XzZ@xrHL~ZbqN(M|((@OKJ96kcIyeJBW+%GJT8xlQ+yha%bi> zwWpwlAb8%oz6W5^U7uZIM84M$0zwu!>rq+#duqpB9nnMbXM6;gps2RH=C(mH_NR5x z$M3U2y?MP7CeINMn(<7xE0$~Qt?6xCmqlF9x{6Uq4ip6MJ1FiuE+g8mBO0T48_!cm zz0B|63SdrR)m(c^C$d3>X&K`&@qNs|h$6yG)tZ%zzw;XIyd#`XxyD}h0Exx&lwCus z?*@j(-fi15T`=5?{B(hvkwbg_4$y?u>-9(Gwo~Q<=GHalukO6kcUWb*ls1om=h76& zTi;w#VHgQ^uV3c&cU-JExY5S&SH}g9t;dADD^|cwBR}>-&Ol4h8<#TK=Q1seY?`+5m4^00?$ganlS@~vSG4ZOp;AKB{RXJe z#jUK{iNdmS0h78Ypd}*Db+UnWqbbC!V&rxuQ*>)+p1qw@<8^%cZ$l=XvzCaEfJq zst(vvgLM^R>&Vl?@{=&VBlh;F=hB&DqoTZjJK_VZ_}? zTJ*g)8hO#C=-bkyNa(&|Jpphsq`Yh7vgJvF(+|5e^(5U zkc=q}y5!HO&4pf9nt#DYyrBB!;#M9y+abxI6`@>A0NAItmQZcKAg?RHImjc`bf)C% zK(VSpRFIQM3n7~MO}qKr>&hA{2%vgwu=6k2TtN}a#;-X7Qx4|5k5I*2AwUoBEv(oC z8#X}=E$9X)I`r|5NtYiEKj_bue`&owd`5MwY8vtk89zpu3TqTe;O&*SiY(*$T`KLR=}Om7B`qk= z>>3d6*&{o|fId%)O|kgw_xiAKpfjgQYQ5d$E6AMIqEWmK9OUCLss;_FcPrg@DQ)o4 zj?le0jjY3+fweWho!n(4Ad z@>-7Ob>(|s>{is*96u^~-g+Lv1&bx1s~>U$7l_OI5n(fD35qe_eiAWPR+x?@_7wLl zuW@yBxA4_M_4U!y;}L6QRLAPFBLv_ZVKug#psqtu(l#CMK~TpRiBqYgE(zK^4W3%M zwV2;G#)QX~R8)rS;Ljnc=Q^mor`zA(`f||3Qp8FG6tor4R0JXTL)xGUC`v}c)uGxy zyPs2Bx4)t8^p#~Q*JX>qg`g+_afA$NPbIezw)qevOY?N+Ar`}CB9?4Fh&f-91-rk} zEV9r8!$3PCr9;6ny4~%l0z5`B`e~;Cv1{=;SnYX%~S3U~;mNMvi?%+e8s} zBNj*>t#C|CS=NM-5b1k5E880%ai{W#c#~eR!RG8}+B`EW8hv&7LYV>U22%4(1J>;C z5YGY$#C1n&Bl`t4`(p%r&-I{|jyiLZD1ifStU0GB@fbEnNS2O+n`#d}dZ@P9QV=g> zc(}fe3GAJ^#oM`?s&C~9Bd)ds5=qftAlzdaM8bY7kw5~Y=xkIQ7JL~of}Lvplulc= z8Fiic7W>5`#=w4rDcRBlSjPc%Qk^0b!fmjC@vInRe5sSeoQXs*vATko!p)=XFeba? zY45RsY|}tDKY_}~kU~y+%+aQdWlaUf=)4`07?1MROx%b^H;?$?COwR&b zuv1^@r3VmtL|7>O1R_LurRp*ap~FHHik{OYuX(*8?a$@K&g!#*>_a)8f$Pi?-vmq+ z#aG;{u|2Lyma=8D25~c3E_E1(m<5>8o6~mwFnJqA8OhkYaHl7wLCGm(f1`Xo-R~SL zB|otZ-I0ZF zL4K2OBbJ`agfCRIbm~+1W%4<72-aHK&T~nJTODPUU$&~Y4Cni*O|O=*Bq>wVV|OMA zGmq+BFD^eUcE9%Oc)c-(M}V0`HTt5#9a-`>A&WQ{1E zwc&h@{>YqN!%c5) z)B`ZSceu|1*a(k^*=_>aNz)z)G1=`dSO3t>S3BIic)1G&(=ErnUll*ut_~VqU1y%Y zMpQiv?dov5#_BuJkl+~16yRfe<#geULjt?#;S<&3i#f~~o6)zcrUE3KRupDuDM|2*3jn=}VC5GumPQrHHaP@^cjVX_c zgMJfbuMcDS=w9y)c)XJP=8Vc;R{q1Qe7qKGt{OeqMZpt@M8<1~Fj(962U~5%J{Dc) z*c5Ws6id_saWD)A7gMvEEF0#a&pJ8J}Mo{n>x)w*28iGNoVZHqvq+ zjSu}K@LfaH)o9&8z>9z5jP$~8%ueREau{j9A)i$QU~?I*d`_o2e-5KAz0$e%lF>7n zHN2a$f0%Ms7P3>V2(f4A++hKGz1kEsELWVsr~l)c^$Nzd=Y6kugJw+U_M%Eomtnk7 zF2(Cw_%tqN$Tsr;$rKj~d5AgVz86h6D7AO_Jim549Pdupv$O3f_3q)b>fIu(tRSaI z@CzvctU8I@%z`qYwK;o>%E?X^@6>aYep$fV!Z=?yrq;Uektgb8O8VxVPjkRZzfY;? zyd|VqVKM~=z=ZX=ICN_h%%rrQ7V|LIjo$1(0Jtdn7D;){gnrX6pC;AUyl%VhzKIGm zD_~8!Q;hClpQwkYNUcF;_J`CK%nsW<_E(eIESQRsQO&zZ1njItfu+jn0k3-X2=6BA zM_dn0scyK)GosP)Bd42pj+|H<1P0EYCuac-IR`m#^M8Ni1Nsm#6ToZH&DUzSX{2Gz)TbLO)(GtqEEVLH`LZ#;@EYfpWwiQ5NHs~7RjF@SmvEa#PjY-E^0g;R_ zNmR%qk9fJa`@Bt2kFxzWUe)jTwVFc$WgX)8jD^n`N^%AvKyobkZ*6a$p(2`gAX-|PYidv6BLB`< z+NJQ~9j@hq412U~@s!JC6R>}Et#uom4WN{3V--D@5KXAw$oiq|txfWjWPg+7JU}8? zmtRh1sU6k%R^G%1>g zzFyza)?|eZ)iJh?JW-`TuG^;uZRu6Pm~sNPSNmpv_+U5xVG-{bfv7jJI2phG zrq^TUCFHavuah=;Hb~kWG|b6xARK*}YmhZpIBv94Q85;M?&gye=|9(@cRkZ9l4SDz znx$7O05tj5cb(UV=I#t&%eb{M@v_hku+*;1_rE<;dYDkk=LxyO40P>RN*(5{X@L*c zl-SB`12AhN96^C*%iRh@n30}r?)`FORS7h9CZ~E!`n&{Q^Yz}k0`!*$)y^q42qq>0 zt@dFh`IePG@78R*NNm4-Tb_R}CQNLvH5awK?mLnywEU*Mk{N0#4cmzLMBn2{f4qfKw8g1v~Ac~GuoqoKY#C_tCZz)8yiDeCC zc?0Bz7iCUk>TS|#GvfS-7IaT|GgPj-J7h;+de;C*njvt9OlebIES6uK>-?q*lfjFH z>+E&V<{!@FVv8$QiJPUPhb6C1ciV(syg9hwW-;q4$mC=(h-AH3AE#DT&L=8=?ntj< zoRoTUrpQ3$zLv=Ldh~IBROJSsKAEppkS#pi8HJaFPl=XQ^1fLtyiP#*xH@n51AlGN ztcU-e|J9HXSe9xJN|m z*vZU7vWwWuaKDL$sMBrmR1;S|)DD7bauna%j~*`z%Hk611Il5!biKH^5^JT@&MxuK_1fC0Rd{`u+HnY3)zyq*u!;&%}Z-8DDFXdh`U z0&}NlhZ*(MZReOi!N#dDe%qR?pa|^?YA)@ZL}OjZdA*iTTo=hc!z%s?K&EoFd1!F^ zT->5GuSEADW{e&~^h0w;3obb5vV(N8n+Zf>ivqEepHMm=uV3Wt_Vnhua6bs>jl-GeM=VEct%-_0sbxj0i5NA;8ie;i zg*b#P0n^_qk{hGCiVPpM*9#(OynNLV*!1bEDE%WkJmI5sl- za;QG+29#2>(Ptqec;DXnQwV$2@jjA7ACY)ye4s{jr5Rqb9aKKPJu1oU<5Iqug zfGSF)2_*_w(iC(wEK;fR=C1AOuKyZJDT||OFKD5xs7?B6Jn5*4Gv|FJj8RNhNEX9L zv6Y33(a0@@?d0jw{s|mpjd;WqRI=)+;cdr1_~U8WD;v?e0KZk$%sI+oU@izp%u)#V z4St6sQ>ap0p^Y-7)mGgaOkVi)Nx@@1S7CD2?Ypx6Jvv-4-ocqkJ=FnOzm!oLj8vm6 zeYJp+q29IBTb+beXs<{#Rcz8c`=ZohbKhf;`4#rFG2$0uNXqk*qpTXF@w=Eqp38ue!8r3YSr%4^C)y3pdN$<`b0TgWIAD@_$;|^)9&yk zM)TdiKqDHZUe_MBY!2|u$h0n(rdaRb`@DYp!WfsTqp4In0fV)XO#G-F z^5Bx9`Nd(Q)T;+^`6s5=%JZ9+iE^w@Am$zUsMOtri%0><(w%Jye$(r>?#CifWqO<7 zKAS0bdK+*_tB2h`J)BA@zu)zmu;5rzDF#F;$pIUj9L+RU|8Hgo!e;T`%%(hL-xXWr zo4BI6AD?L1i$=?#80WGG53th~AyN(EfMi`#Jx5WTO9>J>IB{hs6AX2C$_C>Cr1{x$ zBbTEZ5v1M|(I(eElU!smLb+5O{h_;o@G6c=V&711#q1()>7bRXPMWY_XBKGaeWsX0 zUC=I4F5U{@v~Esf{(Quui=`{U5;9s$e|7UqjGBOmCfeehqTp&Qw+}s$>uMNg-PtFX z7xUD6^z~&GhUL4Q4JWg*HjFdZgl&gBEvnmH^aX%xEC~62yra}f8PkJ3+DSGdv9AsO*c7E6^nlQ{w?39k5Eb9t+# z%byD(+XfT3iNA;Mown<$|06724&g*;LzXDIYAwS&8 zoRmM9J)J8(o!cC-gkGon_3KsWg-c?b?*0L4u>2w>39mW8Z`4<|g(vB|PR>O2_ydBN zt*hWAN$r?*H%+gdSJ}x*BiT1xIi^+;OKs^u?`rW|?OnsJwK^1?ndkw{eLyt4rZC{r zf*QtWB`;Prg=2K^(Z_4L=4xhtP|upQ{R4WKvbNBtPfbYXN*fj7VkxXNUk*#1D6m#!&F&~s`>p~K7!N% zZjuwscM@}f(Y{Yn7WFI6>eeIsm%{UIO@sBjwWbi$MBfbwn;=0V`;^0v*`3HE1JU0$ zxa4%xi0PTi@yujX9DBfjSypb4M;eY2JJ?hzk7{@Z(VlAPgmQK{*r5OVF1~kDEO;c& zVnDW@cTWT5&(AD58|b{AK}qMJE_pEU{uy3zLN>k~RUg{Qdosisssu_awC_&%VHqyM?5>!9j@$~A!jguvc&*7 zRW)89Z_AFt50>^U@d2^iP2$IFR6I1g)w)D8oyUq0VHSL?dHHJgSw%=vmhs7*w3)3IZC2fma zf4!cHUccMzk8IrxoI^Zyj;={z8vm%TN?=RaFIQQMhu+V}qsUO*I%-+hunV;M9;u8L zzk5Nd!n-(pqBu_5oEhEwSfYbhUob;SeyuPlIdz=f2DAFyv>I-<4=6ox)b(|`jw)n-rcmC3bY>!nqpWasx#|7tm zNQf$U#b|Gt{1T{sZe*gD9DLo{t$L(OHg_i{^Yp>_B*=nvoENp)XkY-rWLaui=EBG_ zZ#%>&^n{Phdr|T02D&9>MAV+s^_Ne#nH5CTzks3SirHTRd;EugC+5Ax&jNZsp^yZG z(NYXW4S$+7Pv+z#@c8xpei#cCMZ~-bI1IYtfF=9e>_Ge-8S?Z#PItyllCGUhwdvkj zIRL=;0&xuhot6+Ax&bJP6A?36y#d0Kk-N>3C5|^X*CugOO~^iG`w7pLq?*kmpWC2h zB(a2jk7js~-&{PSS!vs#Ws`aJAc`L=kd%!5_2xz8L>IVd#8$9`d>ct?M@I933>g;O zuH?@a$BslnH#4?W4$^VJcQtQ-j!TmHoJ7b1xw6Mr> z-N^;4s4Aqv1Q1C|)IHk$Y?Js9?V>K zVOLfTBrUcMtBQHD!g{`JdQv^Opr1xy@dS4$`BH3a-Xtc=2e4SI{w2sun?01MO1zPz z*|Mw7&)=1cXDP^Ir$(FwX#;}pK67%wWv5`{T7c<*Mb@y6bq14BqF>B6~9fY|3!9RR6KQ@0C&MlEuK{PxHVr6Be4>%iHUG-qQ}=ITI}ud%Nxcn3pG{o`&; ze&&3_LQrk!%vpnFvRlc7xK$IcP;AH#Em1QBUx;*-XNh7!(KFsQ?^mfIT>FZBK@Y5g z1vuGiknSS)xBLE%<^HX?Z}oDpk=^cS_v;f9YUoc`t>s0fYHYOIUSKHAMUQ)x#rCdE zAvflkh9=Q)@^FB%X`qFb`ByXXs8j1Z0GyRNgn4C!T|NgVU_Bh`aQC+CZ9t!J4s zSZ6eO%hQA+$uAd#fjTZf6ub`9=E2{yJl_5jw}y%N8CC&ttAltSoUX(vDK;A!mBFar zC4llpc&@qzDUy6EDMMBQ7DQ|0xWs7c7m_0W%7PMk$aIP0M!Ke?`E)Ov`H+TlKxPA3_i$owCj(` zm&2noO>;GiaDIP>43`hb#mp$qUmj+aPyy{U|et+;jox^;h zLGNL?3t^Dh#~Mp$u0&Y;V29Blune6w`Q~& zS3^v7lx&m7y-p%eg}Y50mf4byPGf9-N!=NOn7nCRL7NRm7dRG zeoh*Q1*ulz1Wu3vn$38arM1H&S@6`$F;=rkgbgoJkm1tsPQ|SA6uEg9NY@9rl;AC# zVzMiCK1mdI-Y>BSe!4=SkYy;WO&Y*rNhCY;x@8}fnyA=d9}j(1{VK}kWj4<)qLJYb zyum!G*0EsQ)x#&FqNW_uGn%d@S+XHUB>X2nEbwi_CCn;}^F7sf2P#ZM`HF4%6!93) zMqtm#ySZbzz3zvokfXsGw>gLffE+mg*zKnuU$}j>;mHz%CQ9O&KvPGZ1x8L$NBhQf9C+mgd zC%&TB%@UP|m0jrk3Nq(JC>GoHWMu1gYAh~rtSYOhoq;H{`@b@e8# z8Fmz~kv~IEwd!2ojz0Ch;qv(^$P#6If?_hQg_DL5aKdbGGeC-#)ZK7fWuNQ8X*YE> zCfI9oO8ES0@*}XlK$`jVq~xj#Z{@<HmH z5sMB<17-}YtVk0C0V)x-o2=$fx$|(Ig9l*k@od5)riMtfA=y z)x$*F0BE0Wj$fSvUfNNDIYR@hdH$IWNQpIxCL=x@>zhuj5UX@90aU2~U(rZ+h+wN3 zrq-kn^vARbAOUJAEUF-P)(q4-$}$}+FsoN}jdJLy`?SELL>}lxYxjr_t5+`frk^LD ztDRV0jTY>Wu`4?y*n-biQAilK;5y8j&(y}~;g7NIY}W@*&aiTFF?zK63$Kmb$Bb{(r%bdiYbr!$i;qsYfVrKj>mZ7+1aC}@ zvE+tvt|&Wou77@p&vxi&xtfxoN;bdXmR=GV1_DQ+Km(PQc3iH>{Cx#g`V`QPElpF7mY!*I(1iC;j@aF0+W$EJ-L*MRWs~7 zb}w>Lt_ivF_+M3cJ>4xY!>r0*oP2d5ijHxzfs8f^5uHVC9SINZUp_vVFo{Ze4`08;TozguY4~frRiB+s zjt~uo%Gnk&XsBmWZ~`NgdyBe8KmO;)YDn$!_ER(@_N7qNi%L(;%cGPUS-oD9o9kJ# zM@;LE1f_t#tWWW9UknOOt?v`~7xj_<-NgC3=MDxze^;}ir89*(4?R|=7+D$yhfDff zQ@uCmmO2?Cm=9Ma9dR7~+AQ*lg+LrzA>Kkz?xc|eBg?4Uu&@}bz)@H*;yHnY`NV?B zunC!{b?iVdJJ~dgHOe!d%=er$yb^VcVvn22G^$B2ylC1dFdx!O&_a-@(P#sL{Hf`{)^ z&hBCwmc!OgdRZ!5Gm5&FvI&+OLL7+kB|Z2;`YYZA6D?g?e9YtLk~-u8(|!=a=EmAR z$%ntx`pD#2FXHEWul(xhUf6(5lPmcfSz{f!@nV)}c7pP`W)3NmpM2^fF1^MbDzqAM zW5mdo9fPIb4Aw-`U+Jd1>+b_ZapbtQ?ji$ke~SdX8mMFf<H)^P6|yj?`ODl%AWIR8~gIKMOG%q{Vsi^@WJnP{XppzO=$qPoE@t)Wziw>p#}ZTV}$1QghVyZ{x*x3BvjU`s}HX!-FKP z10@c?k?d)-UVm2vY4cHq0LA(E;Bt&`JOk$S6?eH@2C)Q93>QYZB{qctqU};VM%j{U z39-S(h71A~Kep&hJqGpD78x=Ag!*IW^QVf^0j}u6jCH;;?beHB zY1Xi{1?(i%oHYu1Siv#$`LD#5V^R~H&GK8K>8QIyZN*T&_W)HGSYxaEJAw+s1Udzw z3AHMzd*wvHxa|O$xEO_7%cIE<#z36bw%`B!8GmSch!WpYk zRcKUG1@!o1jE5T2xe;01`;2p7-dwqhaJ!&~SdOztOe9VY@3To_zIuY(jSd~qD&I^Hx|9n zWGs-4?6IzAy@7A+-M&s$*thN2rCPHy`l#<15|L=Q(@E$9yZmorDgmsV_3bZ1Q2I#L z$_zOJo!#%a;KAIstP`NS?IZ$eV0ieFj;GB9vYwK+Oce+%LlzqRQu&WaTS5UsY8j5#ibWSTT=|GeUWy%PlE8idE+ z;fHjGUXH{;SPo>K-Uq37=X^P4gywC$4xhx#E%bS`@HgAWd*qqemcf`4r2lL4H8Q_S z%5)s3uRwyqIuvZseAs}gDblbn-z}AVzS}53JZiIJyG@z&DueE*M^hR;OdGWO99e8; zpLehez|KvO60eE?g?EZIvTbt6ZhSq9EPIA_gw755~H# z_gcu{ccOc!)dOgw6q?y67;awyEx3>g+JEq48*Q~oTS{zy}AGz+`5wY3Anp<{RMt!hX7qPiaPJbd1tz{O`iR`%d z?cR{Q)V8%EjO-ADZPm;9Mw^w+9$MXR#=mt{kb9D!qkQ5WJ4WZ3B5X^{%>Jn*;w9>< z)#*l7?_@$3X%C;A7sMrS*7ossoMDT4)+7IbrC6E&W$J~{uC4`XTvo#xAV&ZYl1!nGheNp&#D)mmvup3m$Y@{lV^@7{HL?^|+uzlbMvt($s zk)plH6g%17$6Vu7F{XNnP>?bQ^QGT&{C6o7vxvJXkiR>3{C}2TrD>4hayUHjMcRMxs=V@TW&_((io675l)y zwSbF_*D9vN!pJWrz4U(nGTy?L$J`P3>{v4{I)b0=!(0P-vmb5cG0=wxqf}LZW~m!j z6mq`3=~7hIRw_f^%RAj_a#eh`k)lf`-(7Ou`vAfewT&0Yn*z-)@rQs#f|Vk>M3NHn z)*Wrh&GY%D&(?@aCZ6Ztc185KNZ{EuiS$E<9;nbSVI>=aYm;dmm#rBe$ zURh^PeVBxKI!TJ!Z;XNgwz9`)?oBVfC$_KiTp7RbraIC}*P8qJ`PZgDU>Mva8l)nX z+ndMjs~*jw%q!09Lw{Qx0*4iN_%7A{6=fJog0R3KqPtmID@pA}15W6ho3m<~NNtAz z?5bcxnJ?mXDF>Wq@_tNI#W*G0AoIRV2jY|@L31|ks2O~k`|>=*z1bGf$BVLcXO;yr z6TuAvx~nk3{KSfw1ytNyb2kPTjv&E~DLcc8*Lgt+evrw;Cu|(oZR1RbuP5bosD%#l z8Av949r6pS;@470(6&TYxoX&8H6FWQALXn2Bh9N2+i{lxWJJX_Q5o4etdLcH`C{)m z+EB?zNH9<8<&R7f3#`w5OD9?%N9AZT61So)cO{4uxTpL>!JZ^(4x0jq2 z$~sw-MDx{YtUo>74M(LOpBzMNM)JQ@6PgJ=%6{Ww#EL!*XhbP~@{kd-)Rar9?e5Lu z%G}@GfU5=|4gGI369jv*Gp{0a5q~QeQ6DG7HpU;TiYGYVGl>|<5??$WXFl9N4Yi^2zA2g8-w@cG&a!Ufvdqp!D6R=s~1|_{k}?yV5!>OAU`Kjx3PssQRyU z52)0|bmjoxey2kTJ1LiIamlT@tTo#gH&}tP=!^;AB?y+YJpMi1hVu-uhYkx2%|&(j zgs4Qe_qb4*RbnEn+1t3SczXdu!x%v;7Z_Pp9i4={{gkiT+xUh4z++XT7giXp`XyBN zr0{UM)`+hz+N_2u$HEDOiIVALG z;lzI@huDbe;xna14Z^8^|4#+v@6$OYHe@L2fvgZEZ(MBPXv^vZ{CElS_G8k&{|v`p zafSs&EoSPUQXdHTCn?J@FcGWe<+)oT_6-+Y(g7({1HuMe@duxOtq>^_{IHfy?b;k(N5H2o8sSsSTZ61xT}x3tvo`SJsD;NkYks~@{oY2df- z0K^@Bis--GzHuc0Z4RJ0OVRx_8poLsd^RG;w*JaZPU|#LSm$5QfrQv&pc}Fb3?4M< z4wEv}8TrhOXHCHmYyg27MxM9c5p-CjeJ_Tb82*~1MgKZ1_3xpxuOK`lr$v|P+U%Ru z?EcxM#-C!${Zc|}*niI zgSVUKBR-wyx;|V?yP>6P*SPuuVOn!ejmkzC?EvJgAA+1t{mOZAvgD0Zh+#NCb54 zZfwi-Y`=~J_U7x$y)oGqcAu7VYyr-@1pwM-OObCCsea?D+~abHN8;;eGmqP%PmskX z7bmvX>s(#;&BVo(jzIl3pN9n%Io3qYWdUuZuheknu||Nmy{XY@yU8qBf9m0SJx0dm zr;jnxBUK082MM-+zI-nRp4N-GjI0E=!B;2qvb174Is*QRnV zZO`eScjQLGLfsn8J_0y{iSEc1$MfcNLy>LgY9M3+&JJ=cmG5gu&!rXt;L#s~f|=Sv zhyea>r%r$Zgl~`OA7mtctamjiZI|Xw#N>QpML&i~=+hr3KK#87>WHh&$=h=nt@i6N zb8~xV`ylCx{^)p}Nj3mnuzMBYJK)Q3!T1%h06r{361b3# zG=+mg0bt1g_)N$Gkb6HvC%z#8cV!%*1}+#D`jYT3{?CsUEoDc6#(OJeM5H$}cox{e zC8WBH8V(HdpPyPB2B@K`sv`BKPl=06m0j#WYMVq>VW}Wvd@%6J)ynYq*46}|x&9>_ z|NP*ugYt%T=JztD=>1CT-J#$WfUM9Y`gq-$1(Z3Sn(#IC{?uAc4zRYIkJU!Yp>yK; zov^jtueILpGMk`nw|nF#YB;j$);SNdRLiUTLx+SDN9o~Tr}6*$y8DN#WxKVmFuQ>S z#>2vFAID{{v#2|OckAE+I3Bb;B6uV;ohK8o)0_Wt2W3UFGffykf5O0i4%xpuYU3RV zHPvr+b=l7HEDHnYgk|6F7E&+mRBeFcEh?N2xQ4=<(-crjxw z#|s*@tIZqplXM0$eQtNke4if6AnQHA?85{~lLe6{SmyxQ%&7Di9Jt*)Uh zNe)iM-i?0+?1d@9_xgWC_Wy3zw;A|%9V3YGUGJ5*Ei5!TE%wFHECSia7BSf7ubwE9 z?IyP)yH$biEDKtFwlA~oHvsUZ8$kY^2*B*sK-vKFqMt3|El*jJ=OuOuhy?}!<@i5K z52a;CFxtyMFFVYyk5>MBf>sT^@+Te$NTtTJz z8$)p4Ff0_1R0m$?zm)5zbgc1@i!a7&;df5{-&X0HasYZ_L{B7<34R0vzxUZNZ!!1Aq22#EGpAH6QaYk-UV&2cD-b7&VbVj+ncHUu4^1dTeX)LsUrgjk1=rhpKXdVsN*Ha>a0b!i|1iu z;id-vDin7Cu-c0Q%;h%gR2lS8$)If=i;pxB9<^%zlC;F(%z?)QI7VDhjP;A?2* zgxlUj9k*FJ=vvc)JOM@)2b;<2|6 zfNR*qa{Am^Ib$5b&-5X&IVBeN7)l)9W)3K74}R9e-IW`SR5Z7LBu^AqFjtJD2;BT# z{IBLuTBkxRwqxA{kb65A6veLPVyfDq=&gQJ16+k=y?syn>Fsv%lU?Cjk}2Tg2+T?s zL(_FMi#?tfJ7hYK$s7V5Y=}Y{cO|q_io(0*ZY2|*kxdRxL$;8n&e=&+a z0O&s;>CFn)FC4a2{d9Cg?S}v@xSN5`P5#4T zfda{NsWum{@1qwlWbHk!wW83;`}&g}lIEA_#2XHYW0r7jwx;@DwRtQ6y-^*|VY{$k zpbnrpSQ$W%PCX2_xjHqUSCjNzwM!h%|N0)2X3+=OlYK?u3$?R`A6?MK+PC7-=vvkM zPFoG=ybHfqEdyWUHrjyr1&`aGlLaUMd=aeNQMI6enN*9c9l!uYRJ{_5H;iAW`(fgQ z0(L z@-Xv~DY46gF!PImGQ$UebYPyp3mOHUV~Ckmd;C%QI+ySh7f=-|+Kd7BG_6#=hriKES4LHqE-H= z4NQ0m_=5;t0P=A$M)dKsyCs$>oX4lei0=(OIhT1PEp5-;)|_>@3*ZkGcK)cV3+epz z{ON-pg6tLlQBb(dj>zcNz4P|uv}ft-NZv(fRjJc00Q+fN-)Weq%xQbX4nVFf1+#1^ z@cfX8r)$g-_H3%1(izP3zMS2PiZ8RRjW_Y#fH8~+B%c{4%iXC2DCn?+wJ~lT&OhZO>@JD z+KEA4dh>?k5jYt@dIKpEmt=Gp2&%W49XnlJQU9t)3NoQ)`3r#etNq1)?(c?@n25y= zqR(w}IMhCY|w^>s=sq279qW4n)n_) zX+iJ*7>2T~9z_*9)ru zABlw^%AkTnslH3a)MYS6k?^`cfBC8c^<(Pk7#5BT;P6sE2hxLRi{TC`on~G9-IuLY z3;EKH5+oykYcC){xSRnD2pLL*OjE>PIImw~VPWwud0**E@e~6gaav#JonAz%fb0HW zmop|Y#b2jS&Tx@4l|zOs0kkv?aOQN2$Ax3EfsgwfR$)cmm$#Xa9-=M(PA-01EnJaa z+})F23f`VyTf&5@f2=zKUeL9!Qb`8=aE^SgRIK5B(*4>F@PUVqcSrRiBmCPL#kKuz zw>UPZv?+@>I5t)5c^dzZdvieSC>rH`6YkSz-~=FgHX>7a{Vo=Rw-413Xq`~r!#J>XHww$Zr zHjYJ+y%c>ok4LenU-oI%HBU8@ZxEBFd4u{E9zdb6mXd6CoP@5vTY=r7MU-aj1{5ae3^Q7H&_-iT(hXBs5D(ZeRTA zJ%A{KhOO(p$a}_9a{fPUj~8s#1N;E3r&rBxamfcjpEb|BaFyOK?RAQ*3$d-e2h3GQ z=6a_W^^uv%fxu8$<@JC}T)7Iq@Xba%m||($i(G`1`BYqCV=z+clUAq~LrhDyU1|O) z`T8ZI{6^=)}@sn&H;1;$sy*&XVo`n+fo1kru&9u{O3eygd z@G`(iw9y}5!U5&jTklMxK8nJ1Ia(4v{lq;W%vA6cBYOI1cp2+5LMFXO>fx&?rpro8L?(p<1p$D4>aJW4L}Iw`$9NU`Jf>nm>3_{ zXAGg^cO<<4u~OWWE~xUEm@)ACyhfmt01-|U`JAI$FLST@C65yKsrI1|cgZ21@stzd zQ+IzG9}rl?Kk!J!u^~ ziMR`&MOP>`oyc$U_3TG496el=FJ!N2`JR9W;IStc&+>Adx0;5PbkV;7pY$bHmThI? zQeYy2miWIQ9RSc_bSquuxc0iJ>I*Y1WZ#L?~CBdOD&JQ)$~>x0?p$>@l>#SR3ws{oH_#)q3$Ad_pg zE{uc2&i=jg+Xg$N&P;9U^%4;QOBQKF{}@-}SpL|B(K~-fQ3M zzVEeG_}v=&2O)hR3YSvIZ@Rj$73gC^g#!1+Y?zdb4}5lanXwN+*Lj9>9U1CNIZ;ej zUjV~qJ0*l%`&6hp_saSK>y0ZGs7F&MO7VKY>~PVk&iW`XoKZ|cSO++Basuj8yk*XaFVdod%%u@L~!vspZpMT2Z-iODo{CVEbw(AVb zlsp<9?*QCs@uL9@(D3yF?d(lTT=FHvAK%&S%>XUj5d=8A0N?(g%I4Gd8h~xe*t#Y3 zlK{Lje4$<7a@A@wwbc6VOd2>n)8m*h{;5|MF3U^iKOa3@((0!)Wx84B8_!IrVv-ab zH6nPO$lxyL1hWSU$r1z>I8Kdw}&TUWNr`F7jFxDW^4ub zJb7{E;$S|V<2lIfZ~xj@k#gmbBsxc^m7hS;^4QxK+XWiVpiN7^Z=C^Lww6KVn2;a( zsXHxv)STE(RAu|=7pXH|;7)@>>nN-=*u-r=D9s(XH7j#C2ZwVJQjmp4xtt2I2e`91 z?8{q)ny9V1JClo_3Ul|Lsb4l|JIj*aGI|Wi8&4Le>RUBJFD@8JH+Fl!(8_*@!B5|O zBG{I3__^Pgp^j5A9EzVIWDW$J4>#3o&qdNhILt47J49XQ8$U=8-3ORLihXL{PtS(T z25~gkj!&^4pmQw63w)RTt+4cugy%H*5g;N)f8l}ADa)EERS6FQ+4A9Bqv4fijf`iE zW>?613gcy%Yn&#A-3p=ON+q^}Nc2NWX?HgN&*7C9=q8@5CwBSv8`s(6JR4NC3S^5#-$rfcyGv2p~X%>9~ffdWKX;=zlar}L#Kf$EMi*)nxB?lRj}oPrb

  • =Te9fEE&6_f3@u= z9E898P7j&2iW(;q(puG#5(6{w6Ub_eKO^eBF-?HJuj$_EEwr0DSgk1bJPW0{(FM8c zt(_^%Te#7BL%mX+*EAsTGEb>rZsB#ef90R&xTmRCmE1;y!kWlmb9!}y6Q4jVO%_D3 z*&oY-v@PvB+S5$U#l)#5b(^>DY8#)ZW<-FuxBCb#r%MO^sMW?Sr?QgYoM!Uy`3KLd z`3WNmuWES|IgPO4%{B4pW+UAezuO z+L^ZN7~QWl^rmaQf7hf`yKVS>X#P!sG%B^VxXDvRcqG*#nM!r{KNwISxZD{vuAeRl za;`H`4(c=*7lm@Ba=f(hq_))R+`F-dr^)mCFMxR9p&KM~QV)^>G}28C3<7`ikrDh> zh(wUYr&mUTM~(oFRr`rX%E?rBLs;{=zlC_u-ZY8eye-X*MDn4t`ot!BlW3An#r5;j zO5w|!%22%qn^@lSDjf0|R@B|%Z$UY$6rF@`Jc4naj?oHn10xwY*Oj;i(n_yJ-+eL5 z*twR?cjRI`T=U!JfyZCBjuL)nDs`5q8pA2XO2Ehfkca%JF=KxHk8>hY?kpT|97Kjt zdgQ^+d=ePXS~rZ5v<&;_5ADK%+ZO^)@Sdj?fWA84Uq4^0P7Yp+Br(uj8mY*Sq5}IU zeY-$64Z-M&445_;si&J>e^hgjwe{z?`l3?e|K=Wdwc^K zF4!x95XXe7>J|HNlybHmxGgz@uR4s3d|PTeU@~lGn*Ab@A}HuyJ7lLHaN?R^kOnO4k4n zT){Uw_vtgdxi>b{- z34NrX92!3F?ZES7{Kav8Ig&fhMdV>`rQ0{Dl}bg?Ph^6u=5{?0@}7|Ik3VZMh#4kl z7>r1!j{9tEe=Ww))&+Hik<8XEP!Ur(b;*NvDuQDRDQcbE7Y(7$oS=?f_aI)lH`kgD zthRg9scT9pAPqRN(yh1x+6Kk0f>2wz{#1NxL_M|G zXiE~^AF#Ii)0@O-a{ys=*Xmy0Y>0BMMOwG|hWwW&OC>L^tkYr$g;yT|ZMp}+k#kW@ zY0a@R3*V32rf({Zy1Ua#63^$I>`T5bpzv|yHdk4n(bk*igOR8?*i7BEXBmLOb-ecf zWbZY!DlJ>&T{y9;gi#WK6_vPisCU#fKY)ReUh{Gk5VKqavey&}i5G$$|W6u0%9BJWFIb(gJ z`^}X4thFC11OdeybC5odCNCLTXmGi9j1Jvi4CUTok}>&`H|s|%n9(AD8WM`iXSB&4 z7n@?NSjL&KiIo^oZoh|Q<6I0Y{>>XCh!9b&>M=NGKyh*(xJ_DRzi)rOS3firBQoP* z!ICG^C!WJ7%dc-TRq?u6b3voBTZwOMpLdk!qkd!5|A57(%1tgsN~neYLif+MXilJl zN!+~n|Drbke=2SqhLr_n?gM$!_u|>4%P#3UUC_;qf;920f8`_|gIWySii*BJ$dlSX z?+xPNH;|XE&i7^8L=_X(hoiQBHY?B(yI~nw|B4ckas}ki&x!rk5}wLR>I<(*h)3IN zA%*}gNBwB|4#b{E?ngP3$irH&YU@Cu?8vB?pPGjt8>>N1k_Ehr=LnGKck-70Bhg&a zj|zux4V+xZmEc$Ju3Fa^dg;qufM(A#ga zQ0lRYA=V8oP2d(?)wB#4%=>39F*nz7^@qrbtS&`(jX{e z&M{Ztb?sfRF=?ZiwiE%jX7MLGwiE7kGsgBMMUwzexJzrTzUAP+@=>bU|Ho-`9y52tL_5pHkl1(YwzY;+ujz`2E^o~5k zbFl%5t8%{O)oErST6V7^RJi{gBmXX6?T}cp;D2tD|6YqJ2%z=yzT;qy`@er(t@0Dw zeiS5$!--%5j9>xu zJSe)ZP53!uG&yAC@d>PNd7Q>{G?b-2fa!H?nhYc1LWg)-_Wcj0Dpu=d{YJRU(IXzY zUjzRfnU6I3dc6lbrCwi0`m3nz;M4MZKT(8Vrj`QkL&>#Nfdk!1mYq%$!ngECq+ z4J5pExnP1l0cz}V`E={(iWAqi{sB>guf-JnvOU_B>Abw3>^-aFHSP;J?r}fZjKMx% z^0dt5{rmkD5Sfz$YjM8=mwlk|-aqen5C}2LV2#|RmEK8km}>BOJv-k(a2)ub941VL)T8VJ~r!4M(SR2Ti_vtLhl0o^}B z2Zto_ZxSZ#JV7`cG4p*r0X!?c%Ko5_YQHz^#I+G*0GVFYf6gqhZ{yRyVEDhbl3}i% z+;ozrl21P$Y!3;!b$7Lo?jZDtXHt+C$x2~2`x-cH_&J!MgeSaElh##l|7t7eyOwuca*PW`x zk!#@&-p-dxqjF(^2|%45SW8f(aRtsGD^cCwiuF(hiV-SO74HitakN1id;o1_|M-FB zehIaXGOtK7Z%y~d!HlhSGyi`}C3fNM)&1EhJ)iO8CVy%qs()|rs&w2GQqJ1E&-VNCnH&=UrS{VIKaZfalUvqiUADkK$b|q zMC3^1ckoP{j;++b@VUeLqD7oD^YyVye+Nm z=9@Gb2*$j;`>u|{?Dz3yJL=It%zjdS!axX@R)nMA%Fu_uho0PR$QI18_RK=_29DCX zC5R|3@J!+Pf;pgQdxetGYG2`rLY+gk0XRIK>CeVO@U3VY`T{!iKzC>f^M~rdmSh(JH|1MB z8PdHn$~(-!R1uX!9&;EiW*Xsp)-~C|rCB%CrawoLkEb_B^7B?0#|PIxThk{k^$$U! zohHk(3fXyh8zPV_Jr$9Zv5T6^qQjqxKrCw6j(7{O@!v~f@%=`z3VM1WneFNE|cSO>_)Pdu}`N)Hf zJHeVTm?1D@fALf6rC=VSAcrur*?G}C2v3P0n1J*R^suo1VJD=l{G_nx63D3uFN zihgc4Hh3%(grEzzWVRX8l{#)@CN$S_&8U9VaAT$r`BkrjI+vdf9)G3-LupS1zXN% zAGKMdj-dAB^t|ZEBWtkT+<}Hz=QEb_`ARY!h{YsF%fRwcJ-7EF?@1{|ROC`*3_(m2 z())pR8^6H(irj#M=#O%_;(vh&jqo7x8LmIHTZ_+=wWC3Y!a~r5TY9Sip2(>5hOO|= zkLz3E7ITV)T=^*Bt&S*Z5b1_LfnDS5|6|Bpi?KL=!(zpBf(4t%XzMsRHPP9Dhrb;6 zyM52&>mtai-ie9sz8V;U2Z>@DBjo17!e0WbNaDU}bi+&Nq0+Ms(UKZ1*^r(m;**a+ zozoV*yJI|Vh*C6h0B&_8aWM}sgU?*vnl{_yvYCsuF&W5q{%{c-Yp+X(zp#8tq!p=t zm#W<4{T3g9KkD**raUEeb!+P8p1=2?Jel&;M2RU0{gukHpJZwME*-iVh=#;Z+1T7p6fx zd=;;-Zun~ zTjvr?Lf=e{-HwVLnD{gB^-?gwtWoYTQl8cv+(fa-k@-`olXVtAy!L~OXMOr4YOhIh zuQmS(SO~1~0Hfpb2yUjU5lT&m^5=vXg@5hE_6aXvUx@`1WTsuOrnrs4F(bpY@dkK28Gq72ft{V8~ z1)upY>-UHA%ZBDzp7mXuKt^jV5vo_(kN7!9(Z~I43x9}8BrB$WG&(spZ`aWf&aIb% z03k2MG?f2^ZwYAxX_j%CeJaJF5ePj5;o$0_;2()j_lWz+GASJ=*gQ`pMRWJ$%LI1< z!~5eBomsfKC%nl(*(;(d>{IuXJm5h|yPq*v!jqz6FnCLxSYCkh5NS9}$doSt{C4-Y z0d%ET~{X9KRUrP;dwS8rcxXaLY$i`E$4Jx+nz6Z+~d_kQ+ z9ilmn`6(`%WBIzPUJ4SfT;Mv0ykUH0Cx4&58y7I@CAM>$S({c1X-x~HT1z)aeKWohXrJBk~6cCL_^93-BYSvWyX8=Q_Xs=HWYr5 zYh_qZqr6tzQaX2F%Fbu|3*AOYgPj43Rg*N)j@*JdL1Uruej%ps*PD*U8}%>_>Ej>f zq}Upip|1%_K4tJ0#h8=y$G&<}8sMY&Mt#E^3?$AWVC5HIhl~l=Yyl&9i{J^C-dmTC zfz{5*k{#iX*q7OSF7<1%uutG>IA^3h!3$dt1m2GXA3*~t&nVikXN_q#S#J>3cgcPO zc>$hAV)~4T(q`HvRg&xVcNi<^eN)^f06_2{jOf(bn44;mBH?@iG9%?#jQ0+ZB*qbb z#8&mrLTjIAK3(fi^;57}NM0QGda5Jg$UC7}w=;v`sCxy(;JBhEbl^GrgB79?7<*lT z@YIc;lb_e9q)_P25VD>m5Fmaf7DqK~pXS0@oLvA#cqv*4p&)H+KsB&o{G35@pxdhL zNJAm5xg-6O=UfH$sMUxl`MqAAqhzjhbnyBskvmC_NEi6=qF_vQKCOzZA`|DX1%FHIz)+vyzLzLVSl=gw0y z_%DSur~EV_gqoyPY8E7l?w#y`4Pm^Nl@{Jfm1)B7FPBv+w6hc-J0iJd$-`Oj$~=nB zpNnHBKrr|Q9iL~wR7oNI5wKIOL=1i9s-qvzl__Q27vBIGXl^X8niP?*0CGsrb1*pnVqn2;}4+WU|S@Gh8vY;M7a0P|0bX zohQKfNpVmLYLw2TQCUJMnhVGj-Esm^vD{-fc?ypTAm@Oa{11EJZ@e=Q=?FgEqjo06 zJZL<)j61ceB}}I3S3M>VSOKE*1rvK6ry>ngZgd`z7e20N+;aoQ{;;rU%cn_Y)dor2 zB4oHRu?a^J5sA&XXZMe83O3$X*#)kz#qoDe4!i{=*OZ##u%TooOtTt{j7j7C1?08{u*#GEH3EXqb~I-*)!+hD4r-iJrPP*dY0eJ~yB3{2HkzM-WK4S#@qXME9v z4mEqrPy?rb2KeC;27&Xng-gCA&~+K?A;Q1lB+hB-wyr;+D1rpR_2=@VYLNY@0u!3y zTv%FLN0%rtp1J*sg_jwUxU?Utrr?Q$&g49nXMqNph>s zoGX6<=Y3?3U-bd*)TB3!T?MDk zr#m*e^nJNf;#+ydVz(a!sp!AZj%^qLe{jY2*UrzP>hljo7&6>tfmx+BalawDK?3fh zD6?f|m#8D>*l;IVLn;IAy9j)-QNv)|dDlX1wBJu1-pLF`0rP3hkP#K|+j zkW9>H&h0zv={Be___#{jgID*Z$=|nrJ$cFG_>_GpflV_KUmz?l0{j++AXC5!vp`!s z<{UpT&){D7v3y276D!EhdI;r&M{0aBF8Qb+M7z!e16;oECz+!afYKtnj5?wlI)j+< zkT0~5dj1|yfo`tC_wefbNjHKO>ZVF1R<{WVFHU{N;+UKHtsP3zGL-I?{tMoUUyYL9 z)N$MZW4Oxf?(NCn8?aor81xlPehG>Q_(nO3->5=`EbghBjw#CM=*>xMYmQFdPu5yt zoA#w}<(|BZ^Z~G)d%gVuF&Eb`fJORx#KhLSmhp+SUe&&&1@13$J)?oY_#Urw$3m=oQtLgJb&%|1#jBf&rkw7PA^@S)Zn6=lFuE3~aG{)2)MTCWU~1;2WI= zjjcvAR4J^qFEU@YT&Y8~?z|n4iSe6a+{R3gc(Tu1g?rk|A_`;>ooezVpr=!_}KEu#@33GNtoKuleO`?z}*)F_bc2Qp9!bE;h7 z)SXPs14Y>LdB)NF!>e=++OFB232m32T?8*qX1 zKee_rE15urqzDmFHCs`LnZ^YEK8jZSRQjsip7<=D4Rknmehe6yv=n^mK+=r5JD9C; z$ChkLw0DI6mX-qsA13^uC#7qlOel16AySCO`_6rL3yvX%cdkUd+3pSwip`NoL^2{D zfUU@WA-7a#n!iDx@AwsCmp{@mA6AexAm5C5hgytTQ0NuS7h+g4-6OTU4jd{% z@kvl-4`7bWe3`!UgwdW&boV10_wy3#&Nv3Qw-e6w$^g@9Ty`GvJo$b233yR$d>skt z;ySg`fFtc5&2h^$yMUcew)T%07XcJ`XDo}&_X<<-T?;`S!1zO08eaHW?X^L)JLvKq zmTKG&O+vJ;Re!k+PQgjSnYd5?<>2&8#F1~r{{j!^>Fg8Ws?wT+k7Vsrq( z9DMCIlm3tB%tPtpozICCzv&L|SHVaHELk%#iWov^hw#>q?5k)Te|Dek^jW?gdbYXa z4fVULS&<5v{{4_g};60=gb4)FP%du#0g6FNVxogM6ld&Bc(-I8rP!w+WZES*%sZhRHov z(q>+x$E}uwf+CCqCFK@h;`eZAm6x!P+gi^!M#yRVI~^$E-%azTQA+*7n{_{K-jJMf zopZ6u?OH&$wE3s3O&CR>2?zQ#H4-88hh~7%}waHa03ri6XWgS{5`DRVxrVhH$E$U>&>!z&DLDQf!QM zC%%9%>`yeP2)NW4Am z4|^+=Qw#O6{-AHzqw#OtnCFQaF+Bb&@5q}-{_M|_)P!yyV$hB z>w65u--F565Fjwq?lD{S5H#Te-4#O8J_?EZ;*Xf`^hb{{)Hy)KBdD143TvYTG{cIg z8huUZQBAYtx7w^%0F*&iUH)8y6Kxf-BIi=0%nyu#)&Mph9bcNq^c|d0&qRlfTOlS5 zPrU^{OSgW=16n)Z?YHILv%rrqC}hpU&tT>|*M{Y$CE{oGi?gAd(F7KsXGA2VO1>*q z5q-(hxP68Hj|~tcbW4+##l_0PdP_G|4(936itVeR8j9Q3Xn3iU)D-^o~uynRi)M(~zwLCT`}_XAE^m z#&rwf1a~*ZHeH1~0Zt!Dni2LIPC*twwxVB$&A-KxvSb8Hb{eIhBjbXZsql>O)?F#M zywf-7WAT#99^V+mPr|9^IxD)PuRegZiHi(IQnJ~hVIA$|ZdoGk3c8lmB{Jcv4C-EE zESYL(W=&m#e-uj0p*i$)}@ zh0*45>G@d+LmLqB~EzeCsotq zV=%pOnqJXc&;Eqv7w6o$cu2?aYZLL0VTs)bR0@=qQoR=jF7)fXPeWu!%9}C{B-YAx zN^C^hPBqkp*>i>zu^|Q!lP=u%FXe9LJkhTE)AyuOT+JfZRR?#kAz`aeN6*}JR25B9 zv%z+moM9zf(yaEC-5+5iG-^$7j}dQ>N%PAlKg>6*QO3H13jPEd;YRN`+KK+jd6y2C z?riS`g!G;0pN?;AoI5Se~YP~((eLb*TwHC2Yb(DW3&brf%Y9%Q+ z$H`bs#_AG;awHazJ`Dp5Ab+?|q^|W>D2!Hp+H{sh0q(Nh`7J;CB7!dylQVOxWFi%r zvY~`M)Bf`L9MWA~oPDY_Qt3I?nwUF@uGw{y>zjZ=U>OT33J>#uIWA^5Es}8yy~Cz6=hTUC%y_jOhPeW=hgCU!XXtusRmuXxvUQI4>$fAT^|snwtVG zUEe@4!ZW~zUaC=VKL#pl3@|4RD_zIojpHBteYD&v6FrO@HD3ut)!C1mS9eaF6GHn^ zTB45)KVv2FMEs~HYHRlxvEc!u6?p9T5;8UvA*+c^)NTJVskylqsa{P>Kbj%SCb@Zp zoXT4kB4hhm-sg*#*Yn!aQm>4He{mq>4_$G9dRmbvN&PDWC6ZfkSP0OIL{NvHRS_qp zY);W1V#bpaapel>@tpT$DyL1>48Nsi&1pb30EJ{TOt}ZyU(ci|5B8$u&4QU8DCZ@l za~OvS-M^9V%9S*CY1Dz66U;{Bo!h0@iEc}m)5nbq&7;J4isXVW1aKJX*^2=b$hM$cEnj2@LFK2DmjXliHM5K@NyK}vuEy_@T*L%>46@{TUhOj|Jv<>SiBOLgDgkuwtAY>}Be4!O&QBJ|#5RV) zjOeR%x2G-55Dth`?`(_32RD$~n=ROuR$FNxZ>}=>9B`q_pL_)w*i3>&Cc37QyP9an0X( z`}MX-CL+|8X+N*v5k@ayxAWls>#qDH=PNm?&WVJwPK5$M?ecGXl7ng`L^-))igJhb zG%ygBk&TJYVpysRv zP6~tK(060!#CD=O*>T+J*abicT5;oe4WE*$-V`_Lkk5RA%Q1fc*versdv2~bLI(Ea zz>JD)fs=LnSkCG}c|<2uR51YpSC7yvYynS|wqgY#EO3Yeg41ob=v2YuAU?|%gug`v zN)^v?6G{)?C6HLK@Dq@@YSx-V+O<_J0$& zd4hWz?%&?jKp;PI43LU1pEZBsHi)u%l$?C|o*J)zvT4rT9C;1Tg$0?ob@ztOjS$5h zyvSNM03j@wo3e-QNy#u!h`BO9IW<4l0o6E}MX-jzQvhNaNp!MlZi^-5CMTgxd0u^NommOt9yh2ALw$1~8yD*=r zILbn4l(d7VKsN=^SxaL*&m>RAoxVM06L||0I>>;!QcF5AL)2{6_$1Rpv6jI{eJ0!y z!EPPvJ8EG(>k2Dp8BbjnanGA=yg|-xVjU~m`%;xh?Ox%pJa&hUj8U?m!zIO=Rk!ER z^0n=sA+%TaG>jF231;9Lxt{)^KY^I%{~eB73uCfF!KFtwQM#XT!<` zo$~GF{8lq-ooE4nQ802UH?jbVq;6C16f!`AiQ&`if5I3XYe^0LK{iSzf;xC`p^)E* zyP`#)%$MrL0e8BCmG>?Cj3x0%*58+CP38wkS{YpD)YhFsIw7<@RQkie>lX4v*diIo zp>waMCu=zw*(uj;E|0b_&BjBj5beLW0>9i+y)$M`CmVqhE;$x4LkS?S31ukOBhw?; z14eu7MaUGoq2_wZXHQhTEgS2^tmvmi47{0vU8XOyZjgVQcaA4JLEsnXJ{c(GBcm0o z;)lOFXz0OfLDu3g?e2aX`&2PS&o0ZLP=p8MNag>6ic?u2kwr0THqGjA}f@wln zFJXOFc0s*VCQ|Dq$Lw=yYi@pRAn`h_bKl;&C-fI7^{2D;Lal|29$EwPX%x4@)Pen8v#Nz7?yH+ zsuu><5r^AM9Sj{X6}@|%D&{-E63TQ24f-AN?2quO=$jUPNwa4kox$is&Idf`Y_;gk z5MtO;*c!g)_3djwZbkQ*2W%h|Ru@&Y1TEd1`6B!IKh!{>JZwm?xSSAFm3!aY7~zJd&yPX`0fo-45PKUK@_Mb z8&w#gK2G~Af`P8AeOSH*7rNoK=9s73U{uCENsD?>&Kvl)18-*8=&d#JfpteYgVDzd zDglF0s4KnXR63_~zEssUpTi&lneF=kR25e5LnVE87A@;CLy%c^tQa8^Y`AOHdWN92 zWgs4pJJ>ka^Vo&DNp>UB^3hgqXCsa^la@2cj~{8CuB|D~oOLKrqGoY%2 z1ie-3NR0E7{ zv`MLS*TREB1#mW^Ob`9iUR$&t*bRgRs24{EV~iBLVv?NW3Z$(&Bpx~fscquS0Il06 z3s%zB2396fDSFZ*2F9#46kBuw+y+prc%4q2cu7XqyAhy|Q=X_7By}LD4k6 z@J+zOlgM*`uq^j+*Hym4F8}-*()c4_3Hwmydixx+ia}4`u;3- z_lSg0>{HZvb~U#n>~Ba!2(?CSyma})t;?md6_BB?Wj!xa9_9rj<|l^cyg4*Wve5qw zCTtL%_sdN1yt57$^$8z9gSp@>XFyva#b%!OMbR)$S#L@xovzVrdATvt4Jju?dRQcb zKp~V6tsuL9Z}{rjIcj2gO7qQdKYH?O1y1E7^Qd3*yj5aRJYYk7L1^zlqR!J#X475I{4<$~xR1M9 zmu3L7g0=)Z5pDcsnFtnoSETNk2%@V}P1-v?N7i5#o6%8NkQ3h$Q8NZAK((mkCqsqu zHw8Md9vi0PAf@Z2yh1M4FfdNA((`dtUW<%IOK>%DzWMa-8kzswy+}lFx^I}x9?yDv zXN$wsf;$^NuV;Q7BMVlu5As@hyR?Fvg#wu(IUHw=lGR#jV50TCE;EcYo3gpGWBwVt z;QfPhZpQF>f4cO7X~q@f%9i=OpMJp8+O_C^`eX@Sf^nvju>V@g6c#883I0{fP$6Zz zyFH&~u5JRmq7APSkE92NE#MPhu&tT_GZpWAtb8@FOJpa6CWKgo@1n zjU;}1b-Ax5gS$hFlVL6wyD=-h6$mX=Q81|G@kdL0J|DmaTWkZ5;AhO^y>CG0chW?g z3Gq`|UUK4uAo-{czJxY<0RM>b(EVSy(9#^*11OgwFVEIlSHa=ALxae=@MEQmK5kT; zU1;!Nl!Aa_An^gj3&&XFP(n)F1{n_VKR?cB<#)J8F>4B8QDPj16r{WGXbf$$ZHL41 zq<`OgDfwBjK!5HDE<5Uoq$5&AF`P=>B={N<*xuC^?+~J@8zHKCJtPRd1a&tE5+alf zs;divErZFF)34prER*MHfiBz)6L!&qS!&`%71W3<84S$uzgR6XbN87R0_*0}U8wH2 z(Uvma!ba&94a(H4>m6iQC60u;JZ)z%>7}|!rPn|&eBt*Ez;>XF)t>ZZSp9Cvjer@Et;vM*neIKi z7S3fPzObtBhO6EoXW7~_7)@av8)DqOQO`%brNct2pwVdE;B@LEbfbzWPeTsW+D^Ze zY}>k7$g=}uE7AXbe}zTnrWg9S4s?>4kl_xbaep2%k!a@qe4N?A=w+xYEU()bW^2r@ z#DbeMj(DuVXN^apg1XYfZ2Adu7NL;Da9`(M$H)EIr}ClC!l_yAj`F#WoMsZKgq~tW z1Li9PQ7m(!&sJCgD{8Bsb^NHUhaxSiPRvJ;de)G<+RMae7kt)!g6XTM<=F7_( z$%vt(z25P0>e&}D9J~5s7@dYIwMCbIZw5t;;)?Ci^QbTqF1g-Jkxm)w@?IejL^yL+ zL>e|SWCLm0Wl)5h6*A-c$S1$7;;IrNhZARB<5hJCD7)C3#r9LB;hv z+El*`=HE6V8W5mWA*mXorG?%BB{XupumiGW*Z*#J@KlNXAaGIYPki7)Al_?A&2!2f zd#G36&*I#luGw#w@>{T`vC;fo z%G{&dr}gv6o~1~?QMRkhqnlq54Z(KXdlb&AV=RUJENcC}v$Zg86$$pWTVKUST+f$E zyKg@T%dHT8V7)zq+3xnrrKNCdnn~U^yV#NCL1@31tFyAq0L%iVKHl744s`_=l<5|z zrLtcQ4B-6~3>K0LZU@h<-B>elhuhwLM(k2)xe`Xda-{M+__fgUlh=I=~M986L^_UtFrnD1Zto;q27`kJ585lE3X z!>>7~u?a(Af#KyGg%r{he)fHt;q3yNpxvNirHbXDrAl$xxPfXgieYQKG`S&2hTez<|BIcD5#@#@( zUs{2?|BR2AV}{gvi;b1w>AjZAfjvYu*kn;Wog4xEhHAG9oWG*6E{&iCMI8CHDejMl z2_h`#tZmF)(MMq;y(GcWzML!k<3nUPPOkWUVIkKD-}dwa?RuM^m%Bq9o(#ysEfBOh z6Yrer6C^q${3RbRNQXO`TKcydXTY}~tW8;W9<~h|Zc{ngKLN#;@C*KYK;gFoe4fF0 zw7tx!-aijH==5&_HH_$DP{W8LQz3drJw;4Ts!l;<%D)?&7*!CWN*m5!0kg<}lQmyC za$y%S(CdW#!YxAvxKPs9YNv@8F5fluVdU_HU<;elZ(!0zXGK-L1u;Sxaa895Yd_Y7 zvf>a`P2qpuD$gS+nRR=KrKbX2*tw_~c-@hQOez|3kY^JlSlg(kD z85c*ZBZB2_iWqur)3n{a`x_b9!MdkoA#c9fm~1EeNhTnm*t0I~>ck#?9K($f$8b)m zB5eEuHb?NL{S^CB>koWEC0AO?I4~I*5EI1l%IK_60-i8{5@AAsp;HNN-q?1?ik!!~ z0%y0$%NQNK%#2aIdxmjsYQ>~g^Ak9!l0<^eOS>tq-^{V#gjGilLSvBO08Xx8rbMgv zeJ)wFN#Q!QET3;MISj^}1&Wa6kYs(v`5UeJK44mq>{Z&xs)-&?Btg z)4_aCEzWSISeyyp2%$>v|LIiTiWRY@8Q2`z6f@0k;Ggr`ENXw{_PkB7#~2pkH|7d| zf>Z+CP#nx4ktkrYdOrcG*d5t~6=vslK9oZ?TsQ@zbt0XPFuuH#g6eBRYhi@a57f+h zgm1X+m3+VP!_s^G>F;!;q_3eBSOejlv^_|6vwg zN(5l<3A*CuDMc*=dV5bZ<Vc$lRKq=Lw2rqdm`4ncZvaE*bYqpNKk z2!$|KL^y38Cr>Z$d*uSmQyxe*=$o0DVMujtOt_Q0I2vlHomt8*_VMZz+maxhz{ zq}}?*ds5~}lUw5)JLi{tOAREhMgzk7tw8yQ`76XR*T6-_tc^R&gR6X?d4`jC9kz8n zf%e%TP6_#>i`Er*e@&Eg74RR8Dud64vmRd6_!(qLF512GO4hpbW4I~1K^oV0RRZ($ z6KQBIk?-yiDW?ai)u}ZAK9!tXnJcuWOL1S6o$j^tK`Obtl!^?EfCF*0U7Ma5Us4m{X zc$@_}+z==D|Kni=?!QHd8(@>!vixQF?{k4`hcgjs#&n|yad)QFUi zPd~B%{OLI=<8NR!r<@1RowtUS8AI5Z_jNeXieb7zD+6V|AFa?VV9y@?TGTwvcOHWu zy5t`wDhsYb0{qZ_bA%6+g3yHizKusfM2;4-@0lyForyl%_ks&sZ2Pf<^3-cSMkqh= zyje^H{^XoIz_e@|k55Cv7e=B6dqShy5zwh&zneebfOqoB7jFgJ1cAW*2FmLRrQu;8 z47em6p>Lyq`lMXNB%mZVdiPK@@L&a)9LJAa_NtTFI{P6$^Q z&6^`YB@cO+KU&9D083W6Vgya8mHwkwPboVoHgY!sK>h%{c|}jZvu`>)JYv>qB6NQQ z&c3>*Z2n_$s;e3i;EfO6p^Q}m!H@8c&gWH?pPJsH;4Om+qX2=+vQF`|-$*mIqEIz9 z;QTm0&pq1m19(dxN3cD42~>l-sjz5cTt+gR7qr_uPIl-u*~k>=bN9`E|Jy>Esatnj zXVO|$>MypkP$RYJe$!Uww^KLl`GaaDAM~6bsrOnhU5AZ^9r4=5ZPuKt@X_|V4h+e; zpuJ(a$$RIsyNfryRyBMD`){Or)!Ovf^@FlYyPE5_!OJDsbGy@@_WxI%REk_vNFVUh zFNISv%@7*if^1xzg@sK%SIWy1SV>&1nuo1dhW-$GHG&dl6JI~jq{xud9*aCT08-3a zhtbLcUr;=B+V1(lM0E`^eF(}8>j?&e142C+-w=o^I_1FcOsF}SCrYaxO*m4!`G=}) zicn-nsD$zdyPq!v8+%;W$>v)N>I-gEsk>AFPmQ4A{<%D9_xawc{aUqJ9o)9?$1Ag> zG^~r-GKcry_gs3#BNnGz#od%oXdSTKF?W>ca(6g%cXg;J#)MFna4rUt{pRj9YGA?Q z&;bj43-a>wU*$20HzD4lRZW6@b`HENFG8m^MPn=y*uMQIfs^hVXu6OXX&h1&6QPe( z5>gRD6YTOMga%Do1_!_c-iK(1cDyY%Zw^=k^kvG3+=X}49jHmwSmqy~LBk?)5-E3w z#Ec#Ie;D}G5L!c11|A6KJ2@dj@wGmn?E7v8?@=|V<6HZ+T;T<}Jk)%CzBHUawa z{j6m)uHbm%4{A;M#(_cP8(MQ};pB&WUYXOov5wR7g3nswS1HqqZgdga=)|%~kN1J% zL5(?{JN{r@U1_TUuF;|!JJqvE7Rd}TQ|AFsq)XV{k1a*7685@)(cj~l{31CYLEUGH ztTcA2`8ui2pL<_HTTjxv_V#b=s1W+)R2Uf+G${E=sCS8wkFJWw19WJz4{o>?bd=>3 zdX(>WZ`rhy+5wUc0O)9gqs*^GZZJakfA#g%QCUUbwu+#1cXtZ{(j^_z2+|!Q-Hmjo zl2QU9-5?*`A4p0!NK2P=z5VI$kNfVs?>YvP@H^h6tg^srMH z7~UL!4vrfl{{oB0Z2YB<;Z5Nb{r|!{l@YJkTa!IomJfK+J2mSKe~2156BWW>Y&GyoO_684VEQ z5QkVIN(+Bj24`)#_r-xlG=0_g*_gs+-700{uy#)%6Kw$5<>dKG11sRgSm`^PF{d*tp`jGqUn~!!0^)` zpcqvp^DfUG|6g)0IYb^8MMm7VCH3nJ!=!D33KYLGQ6TbVC8b0f2|yjMtPi4!bO2we z{wu*-7`bRj8J{?Vt%+sw{dj=X|jyqmqBPc z024!<)=gFowEeC&Y_nl98T{NgeJ#Yne3A&w<7^G<*zUkeQP(dC29lNml>|1Qv);6a z^JV8bkTX^OR0%dcd2-zTsVD;-SmxB~0J&QDt5y&_*I%n_!E!HuL|S>6E$I`4(Wh$| zIl;7be_|Y+I*}M4fFuP0W+wCk=PsED61eOX%})Uzi?1FSmhwldeDuX1Lsa6;lJ`0U zv8Ntc21bmX2L{iq6wf^QUDlNzQ6&h=ciIUhV+tP2!yH-_`KYoLb1s-KH57$BTDTA#vhX&?!SZM^%t3*z0tgM~V4muLbG4Biow=a1;R4c$|A){M**T>QavKO z`ZvVEY1vSk2>sg)i9!sZMIebskO(Q}z0Fm~LY2f8x4iWJq{zdhQ-~1CnDiBsfUp-R zN?1Qga=_=w&xtsEjlKYz({_c-gH=`W_m?1a`C438I*tJgHZexegN7}ITVVB{oGIOD zS6seArdf(TA02WS2qN2LXCAC+F%3ouiSA%O*HSMoO>ytr{@#22hRA$*sFI=gUHr>?m7rAnus4 zQ7arteefokvXv1XKVO4ejD)3ct0`JIAktz6oWO)nI2I$KCOB0dmrP}&Xsrg47~ zIsniMl&m-0(NBU14~hHt)UGUmH6HtCEnKL|!3beCh0jrd=S1_v{S7{T2*+HmR8;lL zS3u`JllO*Kf{_Y0q76$9QxOw-B+rtZ6X)><;10ypou2)pG+%M2Qf?*xT7y@tc!vcK z!-&M%eivY&hLGiRfPIAvvw1K{m~w@!Q_vivR$=UTnulW_fP_OoQ!)6avjdTi{aKs9 z8ovsv>5E{mM?t$`GMWw(eTcvECD|y&zhJh(Z0A_+6egd0vTU@;&EyeY(mK;;55c6Y zxo8{NKZqpJzk$+VZfWS@J@-ZX#p5;5)ff?QqIuK?OU;0|c*z%ND3WEbiwd%;4_lp@ z4Z+U;e*YfyqE(h)8SKTi)gM^2R&j(`wLe1c3Mp0v;)PZuUc#GT?^^rir5Kp_qq5_v z)G2A0x+lcCBTBe*&vf}0Xoo1A38#3p!A5^c+P}ClL&5Yw8~WG z0w(1L?O^Q)mu_P8;$JG&sSWWJ;a6F}{2CuUkT*7V4+*{sE&xY$4;X&E?7)c)=e|{x zrMk&|6LYg?soO&5ymbsJm`324p@YZSDi|wtB0HV`n&1YVii7LQ6;m7&NzO>gfKam2 zL}R>IWGCdo1ikI`wnh0n;2ncJ(1%=00V&@1dW>b0ne<1vP;;8JmP=`sk_rx`Y+=VFZ?^ zDc8xBH8lk`-XcgR-0r&BG{wnaMysPhMuiJA-BSh9gj^K%<`*Z%#t9DyA}3ig=SSk2 z>oJf*;7~GU0W3o#~qXA z3%!WFP>Ap`IeG`!D^=cMhxF$J7bq98BgZ(N0bD1!GV9L3ySK`IUD)CMPGezql^>Vo zaI_CyT_4J5!bNzb&!v&e6cLa%~^p-h}V~YT_<2#6Z(n+u-)Q~9Im?SEc)PSaMyMLy^brO z5H}{Nkrns!RiapXd!?T1qy~*)HUEh*`MJoTRTq~yJ~SQ!jVsRw6ogT?6$f-QGcPcL zd7_oc%XI1@fe|8}FE}hFH#@Jd<12m_=EZWV#$I{FWk`m!HI_>8X9bA?39Ce2E}+w- zzY9u@e6`{%npA3R_;zaH8OTsGkv=b}LxY(qkj=%}OcWt+jZQ#eA_W^XffPQ#)>Iln zwd9>&o}zRs1oZ;O=ktw@ZIrcCQt@yn6R;a7Vf0|T3!DP6R1ie>8c z$#en+L8MPB`KiuH4ynbvg7Y9O0pPrPs?f3f4&-)RBY3Dy*CMH57^)F}nPlb1#x@D; z1LQFOag;JVTdc1>4feWkDOSzNvGZTVU2kel@2+*Mvp7kG6MEzWMOQOJg0g}hhXGGG zfe$-~XY?nY@2M%639sO%H>;BRW8C(HvoQwF3E`$dMQxYGTV%VtYrnZBkuhi;dIppF z=Q2ERgxi0=)a%;+%UyAUibBiS4(8V0Ff#`ZSNP-YoHWBn4 zG-bHh*}4se(_UJA^eq4Sy)3Q`%M8=Vm&pw!#H0-J3~^W9{g7scvX#TWyTCL~c&amh z$8?WdUmbL?T>f^j@?ofo6-)6II9OnEjP`g&7xZ*K>hdrKD2$^Rik$Ui-E#)$M1~x? z!sYJh1pNGp+MtDhkUXEeAO^+Uz62SC7?j`mS3>=h5n^#hkIWzKD9&LR7+T*%IxTZ- zc}CJaYVeh>^j><1aS(_L66!XdH_P}Xfx3PMsF~=%-h7+Kp+B$HwIw&mL4uw?6?#>Z zjfkNGeBUkFN(bfO?nt*(0=dtS(0sDnm%SD=EToG65Jof}M9EMqTcdD>;K4EjR{){YsNECW1Q{Utp?{bgSPlo6|fJHjyLqqTG2;x8KA0q2S=zX0Klc zhiT}k!8^_Kn^e=-D(@kNeZ~1KX9yGw7$wC^_lc!5MN&WL1=ZS|>xssQ4_ijaxyTkY z?$U3=7n&jE()!w0_UmaD&oBedckckYjNqR^9*R4MEB7ij)paTD;XlX}hetA9+{sH;yxZ^nc(4Mt-o6 z1~cFAH#CmgHty#|fx4B1I*9qgN6s7I)V_X#vn2NoEP6+c)nYPwF%p&zEG@HAttP9X&(>O|IxANlXUX>2!M~LHBEabujp5NIm$p zvjcLO0S`5xKlljXC=b#vw$*Oln{Ay!JrdXSp*n^yFaOnJh{1G_Ih8-KKUTip>4JH4as4nq`QcV=2CUKtRA*-;uSxx&U_drdC;vl1SM9%9|+gcr{) z<^Ua*1c;k;-PrKrzyPJn-ym>)7fvb9ew4E7!2vx{O9)PHRmXt#UJzWpJ4!wc%-?@d zxIi^F1UZ_C1_IC2r`@lD?*wgTUZ4=sFnKmBLx((R?dT1MLB^Q}ppN+yvK2g;OFfY= zA)Aw&0%s6p`(ymYbNg+3UK;!bf*s&)yV_u!{HN1&TS(G(pRLQSpfu)rAmA3mW6lo^NnJrS?7jGKVW|1?(|d zJy}9&qz&k4aG%7fJa^hNGV(f%d2h=fC$AD*gjrrSDov8F$U_S4Cwvi(^u+jp%3WFD zIA%wcHp&)I1tN41l~;@m<9U0%<0N5TU6@3hp&s2n9{Vlv)F4aD~jF(V}z43O^4p9!Z#oZ8P7S z4VuSZ|BJ}0UGzNa`3(G5LZ#a|#avZXe0g#8d;G3qQVhS9Z-Hpe9RtDdISA9z;1N)( zKyEnEta@ZZBy#&u=Wbu|k!<6pefNUEg9<+S-5VjOKYTZ%^b>NGKjNmSS?!c!VydJ< zG80IvRLqFPqnI43s1doUKdHoeoH4~heuXeH3L+c)F8k1a-?rcD{%8&K)Ff?_x`{#hAsKy?`y;|}i|0Jzq7 zFz{tuE*UL8rwpd%`ml=EnQex{H=BV}TCKBO5ozyvr{Kq9VmkJ_0|N0PUg|#-oqy=; zASru@Gq94MruMq*R!ar{__#*L?kN}N%C)0)BZ~-M zU^EAJpDg&h(`asue7Aam)Vr^~Ns1=_Xv}A4zdWDGyH-Lp`Ttk|!hXf}^i3lI? z+TUOFoE^`2Eyks%bXz`ZVsycbaUC~Nk#!*rSZo)U!⪚Nmbjh4%571OzI<(48v_3 zz;!sKT^ppd!Ngb3SM)x>Yfr+KSIw_wQ+s4wB2vnfn<$3Qng`kBMNn%u&c;HWt&<9Y z8!!*@M(GS~8n3>O3E9iDcZxPqtfVgXpkJJev>6xibw^*3_^~lo-8W}kTmM|VK3`= z@$v4cw90&!xA+?DeYBobbxKHvRB8uX3?l?kV!|iWV0K#YgGoN{fv_i-^62r0b@f+E z;BMS{#7GhBt+$Xnn5D*fHuB(#+ycAioscj6@JUPdKIq%3QZ~S;wrhPLT&{+5Kc=?> z!=;sG3PR-sP0jtt>uVCBIZ@ne%Wi`c)@gWXZVSE=os-x%?e6XY)Yk zAeZE5vUjh5({PRC_wS%I+Vfi@S^n+8Ur`u#2?3eBpm7(SGsN`nftcWaH%rReO=sNC z@-c1e&)Mo^2@y7yKKbfd7U&3apI_!d*LKTwSclkVM*d=f$XO94kNTriVX)#T9bUnU z&yZpkzCro71V4%S2Re)eqfv`5U~6(Ov0@MenfLeqpme-7fQSF4LTwXI&~`OspUfkF zRXoUj7bAp1V!y79z=!b+nLlLwP?gcTGA~0wR72%%bM<)8Qly3@T2E+jSM>!Mav6k> z)k2F2ajN)1O3D`=xgNiN3pQQ#l$98NkxR4yZU4ie_ww%7xJ&M~%O3XO8fxN;c7I(& z7E)j8ulbc^p^l6l37}7333~gTqd(S(5#Gv zzZ+WDpGfH#C(Lm^;6ia}_aTOI81B#TVsvaEQyP#!pC~yk=`-!8Xz!(tXJ%%Sw?SvL z*C`ni_{z!ZgDd&j0vklZ1Crr_qp~3%s>!Z;23vhfRB#eR0!Sz%{o(gosxKxjHK^Aj zQ&AGjH5gfX15!~RE_+42^l)d*8C}DeaIFp`N)?#TT-=X!+(b9tP?*$F+CU(0-mk^R zS@$6)zr-C@ed_mul!8-T0fBduyt{KN^U%jK?inY8>p~ayCaC8RlLe~fBB;K`SdC&J zfC)+N8>c|uv@7;@QIvf+A!Rbm(fQa2;pI?np0IBvwis=Ll9)Ni%gu#REzzqV%5M33 z4*Ij+(Z&W75|l`lgPGF)t5Rp17q)ZuGm-OwgP3M%C8?BMM+Pqm82DQ`3>W^0+{}&~ z+=FavIej3gPwbwz^*APTK@u4mQ%HYzoM*z^m--7g=NpR#AP0bq@bhEpWHGE7#;i8q z_>{J6yH9bOJpSQB?_ZfJ=j=3o=ODfAv;Ep|=jIa~0cUre6oPR-=OEWBeOor>gSI9_ zvTmH}&3txWtGI!ei(xw+_dZ*K%i9NvW|sqo_f8!K`V)SL=~9L3dSIW6{W3xXZ-r2) z2=Cv8#$9URljC>HobZLi08wupCV0Jjb&(5Qlh4PR}Q`Xj_DqR2UwIuyLW zyFEK|npAcKYt@`+D);V+oKm$p#?~OCoQz1Kpl%;?v7+foj3{bpw@!B%=5LT`a3;oh)o}Lh{ES7B3pINhwJG!x4~$spBm)` zMUVT3yIsQz&w$B%HnYmmp}bT93b3YbNXg%@IosXQ@OHkFA_zko~=&DXN$}JAT$3H5p9C@*gctKS|gc$imMdu+u7n4 zZ-~Z`iRP<=w2@D4SQ3fmVxM{rzX&OBy&>}@8j17!lgF0sSjG10eU;JAa*Tmz+u_fQ zx#GsTpb|V^an;a_pExH=3{(2DTD4-5fr9DO``VRF z?)0pRVhP$_9J&!-7nJ(>BmSC!Ep_M~LpQzJ5fHG>(eUca;3FLg*XbN1DHsc9DJ$_7Q>ZIEJ9g}#eJt{xi6TERTY$u~@Gko}Go)kkB`)MFq?>jl4&a+1nZOqXF=BVSB~d<- znul_YYMuu-m8uVZRjQC`6Vgj~Va2r8&@qHosDhdeYHUoGWB zujwW?*kW?;eA*DC7n-m;)o>Pjq-UnVb3~opD63^vozcl1E&cg@}PMlakz$Ci< z+Me-^Gm#lNaij+~uVL?C#jSfI5L@2!^t+y*2x6S*-L~>#W4P;Msf3UK40kjS74+pl z-*{c8=?E3^8mvC3YuA+=C^e+0`ZLv4OIp2dTOXtF#8i{TS&!UPW<~o-ku?%>UN@jG zA&jpIUA-kl*+|HLB=ylcV)vR@N?*SARf8<^-At-#Dj|`LipsgZiw5J@6|brjFcGIgYU{!KGE%n1!XD zrMA}HuGX*HopvJz#$EP@ZWvgICP&$+L-A0p`ED=*Oj}B}uAQ?^Gn9g_W8<;yWtwT> z^PQtMJMa1of}?%w)NfR?P22k57+6T1a6BVleg61H!Xnur`)bg#F+ZB;u-t<;D?MEQ z;+=gPcBh4?i_>#hG^rE?C+>MK!`r_PUF=sd)=}?+01^o@O2}3ZyPu@wsSPUL?V|iqZbECN@*~g0icb z5J%w$tr6F%ix)D6H*?bh+Fi)6NhwSPi#W|Fdwtl#PUTb!>L*@Ks^(o`L&O;@fbSt5t)P5qfU&yRJMGHQm z%Y8)2J1J?7Qc-46suz_pH~8JX?Pjser1y`2DSC7sV%Cx-mQdNMm7afg z1M5JVW~4q#NR7s$bxmXomJ1s8XfNO1?AvvQvITh^(AG>*(`iyTbu1Q~lEI=aTlKGX zV%Xk_blA4e=R9{Q?Kk;WFS<@HuJJkRlxVs%M{5)T32 zTfp)%XQ?jfP?#UjHDKx>d)*6r@cX6|H!}zx3Wu}7hv&u(6K4%fyPrkAy=|tKG4Ou$ z2uoH@QtZtto^OT1^qxSkNo*vTNf|lWFT5Yc4Vh}A)KMquB*glT;IE8t zu=4gRma4@iDlvxYA2Jmi%^~96cRhgN}y4W8E#ys?(IeMmawoN@&(yr`Ialu z;*7bob3GE)Cw;$$_%brx_2)(Wns{~~z3FFG%R2-)6Q&c0uLkJM44!@2apW)#X_uFZ ze>3$m@143~>3hh0Wwo(qmx*~F*G_ESDJx!|$U1Gwh>K9(eRZB2q^{3HQY2Ysj^yrR zCMm0YH}Z-IgEPL*W)DA`W+0Wb@Cts-4}_=CVd5Ybl4$tAtW{_1HJWEc5OT*5uEXIN zlQP`ddfj!ckw4$E4)xLomsR@-1zB>d%1QU1Y=xrk3{6w{l!|T1sE+VY=`MWkj`%2t z)_Q6pJcqK%Ikv-YXvFM2@f(201tmW^HgzB9C%ip*A9 ze$U|TY&ZlYv~1rBOPwySKO%&_eq2w~KV16(8MB_sPBO5WS`Z!;Pa>40OJzxuX}V_sE8SwBdVk!-ILEyM_3fX#PBg1V{6UIpsL03aIicCN;i8gIpXUa}SN>^yist!K z7yjHPWQrFZ@rspp-irpmoPrC!c1t$1VkCC1JCg#>||u8%=u`x&TexUQN)yQvgTAKxlq zAh)*uc};b{uq86hwTxOv$vbJ)Aj9v|{2`~_lWGy#lYvpZ;^_jf6x zNf|MYmsD5gOdt~LR8e|urm%mw8qjaZdz2}e^BWaG9gwk6LuC97r*UQam3zMa?#2$0 z#8gSsH498|4aY9a99QY3P<=bPM{X3Vp=MPt@3y@vOc^w0D%E zGrP&W6=AA3yzC^nmPvn-rxBZOb8SpSOM?0O>6X@+3jD)SnCKoBE-Mv8N~1Z)4yn8P zoq*D^c3vg<>OrSd)Iz$J5gNg+(k7)HI~-qyH95X9yfSU@pd>2<6CO{!2*)O`mD`NG zlt`0lGh745{iXE}uZ;2<$K`y!&P3KJgebaha=0%3X`g5!4lNFY#xvw;rYS?&1l5U@ z`}Fxm2P|g_79sC<+V2k?9~|S>%_f5su;Ul*dC|q@SyDidk(K5f-qPZZ~-k4OAWFMUDb6 zTzW#C4(1x8s#I#5@6MqM^B?fZ#^5keVyS|>mNeBD7{AEUYsY;pY;79*jgDYH(UvGP zLqswb(S%B;fJz9D_8QUbQ|%mU5Cl77AYkGJ+eOSOFAI{X*{t~E(pTe)WU<_1wFeMO zJemDY!%))M94$jtYV9Y$8s66z9ePc;j((Aj?7zshtMZX)e7)jJv#*`D2qjjj{)LSW zhk;WU1w9}(x*7(Z^6O4Rq#riPjA?nWonGm9;1j7w;W`!fD#rWZdTf7B$dSb|>uj7F zR)yNzoY$Q>!kAc5Y_y3STI_26Z!s6`6k(3&e^}oIO%UD9J|HSjCwy6mkAKs*UkF{|L*DY%A@i^H#vNHiW3p0A~J8>6DW)WjbOm1a~(OLWXi5Z&6(k1mr!s-LX zn8l*_=bwp}$)qXfyvo<*BYq6!*y@%{MytHha%RSB~fSvmfuPhEIwn zNPXPnTKtr{2uxY?OD#V#o7Y;F74Z?5CDvzc$ZM9cFgr&Lia?nMQc(HFZ8#57TE6KS zdYuzZn-4S7)&7X3^Pa4|FNWIM{dtZPUn;b5GM@G4D??qi|IXdOFD}l@0>WwIS2_0>0}5}O(^zS@H4@>r_ELCRQ2Wedlj2W2_E$~b9o zS!IZ31a=Ic)>HuOfy@xN!w`iq)A_Otq-FK=wQP<&8y49RpI$nSb+2u^+qW-cb=GMh zdp^j?bKQa~bJNt9-^$Az@5J&kUzEbgHj^M>9V@E6e9U z_$-%a)T_NTUK~o4ag6kuMd(Z`6fAdQHWfkBp^aN2{+bHaMZXS?-}Gg12omu(1gzrKOvfh$2?P zswS-QQtD0QY2LRpa@+E?cZ2J0q|49fuTROA*0_3o7oKs=G>DW5RUbzbc63j{d<8YzQTe`r(r%fXb$(r?})$3N0&(Z`WUV!u3)S*SZ%iENUSD>~icDQf$* zJ$|0g_w6sC#KybCs_m9Fku}{(Ay?z%TM2`&9gOP>81qG?r2BL z^%K+Dy7D!>1M^*j7LGWAPd!>gXMUzx^;n%2v9f*u*R31^*y8Nw?9j~Lo&rhuqmXq- zu%fx{wmfMHLFAJVBeKXLmDW&dF5IMTPxW8*_YJPMz`xxdPXP?m%qBN2fK(SZ03zo6 zuHnP=0kwF--Bf++Boae1M#1Dhly7T;GIHj&HE4nBaI3Jbz?Y(Sd0RMdIcQ06Cp($u z`ylbfqNC4+_jLdo3sJX)`Kz7vQR?Qsq<-v4ITd)cw`6Q_ zedCdz#+Hskd0ocFwUl&;JQOwreiMIQ^8V@l?T&QoM(ieQs!8iZ7?}iOdKA^?U2rPf z;D8ein?-OD>8x55-85KAu7wowK|MfZ9!!{?VgCKQKWV#x^y==n*cUkci&QaI!#%-W zCw=%{m+WX?uHhjpg85(;BZS?9gP_x0zZwBSFli1pA>wA~@=t;2;`5&nYHItv>~Ra@ zns=o9?tHLr1bW2;I#|9nu;$q8DZ%#NtW7uU(by^^7_B-qy_eJcNX=gw%R*Z{Ej#(? zxH%f_s{+dJ4n|i;69po5x>%K7nC)*fViPGVnc?rxDn6VMa!$=#&9TRMFRXdB+v0OY z^(n3uK%CRC3on7DysSEbd&*VxvIC=0cbby-ZHUGkKN$>dCOgGs0xN=#2;5Z$>hLz* zKeQ7)ygj2bf%`YWyxzz8?wO|O{1`N4qV$n=7<}T^bpD>z=UcgU6ssR}#yt6YlecZQ zh84yJzhlgj$?r4xEnnxhGTt#oZInI3UJz0G_}ry}wdQgK`wUQ$E?7;IELwEGMKrCY z1^mgdq?@0UTgVOYe2RYJF#Fjo#KXFWt_kpz=01Va)=rU$MqeHVjvJA{o>ffEXO6#h z0S{q-n2tl3k+Bo7OFrY>zhN?zMJtyWn`cu$;CYOV9C`czKm%(4ZDxCa(X0~6y8lyf zF+&Rd9{zhaxJn92S@|A*5ASP?%Pwq6f#CKZ(A8+w2hwwIM-RoM926p-H$~;X0Q;A5cwbak+?? zrehlg9pGdgx!5b73uW)Q@x~BE#Prv^mfQT`?VN#Of)*{8-_HJ%+tb1YRbrW4z~Hbk z(Yb;l&M1PmRRngA4)O-BmaN17)J@k|Moe_!e_-ZLWd-M-2os+3oe_W@6#b^^|MYnO z{yP6Y`WFdAgd>T0g_s_b)rv>~Lk7VMfpv5L-NN60*wDcCo}(t$xBv2=|K%r)@?h_8 zbev%2AL#YpJ`NvXD+y{Ap5gaU-2cx7UV;nocwJNb{0~ zVK4^V|NBjF)8DZIFnQ+rN{H0|KNlEb%M8FD8_GXqgrXY}HWQsbRhVlSTrnZulMIgK+LMy^VNiw9Frt+qs+Ac>CE2wqn}*I_7_#Y*ewp`rYb1-^{6YC}sMQ@b zw!12o-&n{A@=Ws4^bv)(errV#J&D>M0PBy%lH8XjGw0|y4*D+m=g1Y|WG64sSH@iVb8!5Klb_DR@RaS%Y|`nILh--pzy=r~w%;%@P51O_m8VGTr}ndP+s> z$ldKWriaKH19xTO)A!+m>mf$SkC@pSz2tNk*Qrp0X?RoiRzT*goswCdb>$l@XSyT~ ze#H>U^s(AV9t~xI&?4>jl~&nkKUTOiX;=wN2HnUum9P!yL^$USgOF{FCMT`l0aDp;vg54Py=G?Va>c|9 zl$ac*_VA-P#YXWp>#(q*5n{JWL>XfD04ut+@IJ$+RD{w(3=Ni}5v+|#l!xh^8hRfR zB?c*-Uny7_hq$7SdEgz|Mzk$XWFxCpi-3MIUQsccThi!WJjqs&E{^o96x18x>-Rf> zI?!EXEUJgI&N&pcLc>4EhtVR{hkJ@97GKx$*^v&#D(o_Xe<$El6mG8#Tst@xU!At)CB9j+ z*w{6o9r7mhad~$s3)8P+tn%c%FY)umyjiCJurVbnA~{ zcO+j4@fH-Yc?0kDJ47pv$Iet$L%`vYG5F{(Y(8#HScq1HrW31~#N3L(gBS5%($F4*p#U85>eJ5N|M1--uzX29bebU!~VRnVV=NocheWhbfK~ z#x{XW!hZ6HaDmMEFV#s-X;x{dX`{teKC6>Ho4xCAljPn-)`(s+aQg7VYw92@d*Tfa z_Y7l$Hk-vSuDk~K{lmdSXUi^UE5v9C=NA_oem}%~Ay=}w7awewUFy_Vj3N2Ij52N0 zo50W!mA9a;C<`MUz7jD&WC$c0>O=8>@JC1P>X1Z7!W5!=4=P~@+S7q82*5W$rGd>i zz}G;T4wN;ZpGEi`aG#C(DrkM>RRf4AcyEPChwos8atHEAh&c#M5faCOF4B!$B~FOJ z)eVOr+Ld5IK`bRUk;ot#M3aEu|0WMhGq6;o4y@W=(Qml%+6mQC%#Iu_5p@IQ6ro3f zpg24|n{&ru63#SN(SnSJcvzS^H+09x1wkuL?Q`?E*7<8Mh}O8e9OiL5WCH@c7wmnX zSV&@mdyFhKsHnb~8B%KCt3^&3pd9n-_smzy?UT!O|&fYJpJ&a zZtIhGSE_fUKSTW~#}Im#%1k+af1Z}+JZCBrBrlt8qp-Q=6^qNJ>)nx*&`~{E3O)Pk8Xn`H;Vs~O!8`C0<x*`8a$j_VeIs$c@E-3+ z*nsI4TN>LXyFH5l3M+mWF&SaCh2(Goy$C-ZG6y^-IzcktK1YMy@j!wpp4ofjhQ-Q} z<2DSt4TD$n7E$Bkxys@hA{SvdXq#w^_)Y#hM8=b|ov(j-G@YoeNB1sK6;NSNDZQ(u zqJGOG`=Ow-V6LF#ZG5`hd)6Vy;cl~%p|}sN2ES@%OLf$^_b5sR7iZs0=oq&3wDh#h zbWM{D7?tUdzO4Ts_rU$oe3unpNkmFm zW!t?DiU#R*qz2Rpsu_5#Y<1Lu&%k;LrenDVg9bv!zgNs{yxlmxxo%Y>WFiLLGNZf<<0%`Ga&l}LuUwT%CKa_9Y(MUEd0uQV zX(rn=6DG!|&1~;=#@|FE^7hC_g~J5GV8ie!Cw(D&U(Bw`cCq#<+%md=j~^g>4@u4X}TGEj-6)EyQpE6d$c%BexFw@Ri!NXS(e#wqw_&xu7G2bBZ|Xz zbA0o{x@?+B`_?nVV(B6(A*yvRc7|E&(Jb6a%hzm9?tUFU2gK_ zc<)+&vZ~uRJ!D*h`%U}1^@`(u=H+efZ!VH!ZUyJDo0S`p7!k@^$%C)+y0A6kw_)ZC(iKydZe*hbx38YynRx-9{-V=bD58ItGo7H zu6*66C?40}E?2)^n9rKWWuUxX@EAE-Ts`C8-edC6Eo|NKJQ+L6Y$vfgu_|ia@GW|% zKH3bM#we4hfY#;q<2c{_WxZ&PTi&N*-+74Ip_`D%{@Li{2BmVn>2=);Ed?cJC2FQ%hInW}v5-altC zemY+F%;Fv}=QDfrMn6toJKxhctHV7aKI|FIyf6PfQ$6ErkJHZK)4ZP1*SvMua0qvF z_Vdz@<`?V!-1YeO`UiR1tjr6!>$f*L4?Pbu-)0IcE^kh6Zt~vXS3D|%f>DNtpjU=b zK|4htmX*d>9yM2-eOysjzXORC0mCWJD*YBvVPM87d*%=54*pr2C@`1JcloPsgMg-H zG|t+7_N65iomU>1WasA>tsD>mlb{YILV1Q>3 zz?~oB<-gCxA!tBQ&#yy*K;f1k=)cFv0gq3&1mOO3&Ywr9q%aU1@E0a<_sE9)=V)k^ zY^Z;pLq4A=q%0yO1stSo>||8pRrWY zaM6%`%VTVB%VcO`Z)D2kVe0@K4FbLK-~pc6nz|T*J#204oOwL>$exej0iHj-%uEJ; zKE%bEk4!^W0W4zgWD4eFVqsz-<3|L8!Ec;Q%y^VU#s8iT{KZFR;o{=J!_4gN?#|@S z&SdXo&dkcq&CSfh#>~dX2pqxa>}ls>=)q{`O#bIX{`nkHQ)go*O9vNAdpq#c=NcN> zySng^kv)CUzqdcvY3gD5pKr2r{yQvSfXq+tFtakTF#r46z^QMZUgc4+^f0y26t%Pk z<_!1_ehzMyH_ylafA9R~8~^J}jsKj<%F6Qkzfb+IxBl;`s?MfPBKEexce?QZXTAQO z{NHc>J@F0m)6oBGD*nv#^Q*u_^CP}t{&&&%5p`XvHGy>`v=o&GcrgfwvZosaHSiD3 zpGV+11eL!fwH+}CBnXlc6;knl*w27#z!IzL8{BR>RCZKk%?2BcaPLJG7DjMObkV_z zVzFiy8pz=(uml9*u^4oL%!au4(y@QA_xU!p)!+UsarZN?FPHtn{=@l;x!vec(a}d< zXY;0#l1Eit*O7L)_uph3)sP`XI?!35|Hp#`IrPJ}|3lyX&0dAwN=KkwhU@hBW#d+Q zNvzhp!)j-|oFr5l-tYhWWESX8-NDbieApg&UU zBHkvBX^n5ge3y%d%MS<3TCU*&_vi0|B|RUmx0fc1l$U(YhG%x0w59^oKAD6_(+k!p z|Bqn;V-GYFUh#jt^j{KqI6=ES8x?47f4HbPJg8rGV>cUe`0+*Fy?MVnns>J_ck%9Q zbeRqlI%(@-M&CUc4$ZCkpzh;K7K4D>`y>BlE9*3^i)akGTH*O}C zU0wptKRRrG^mDz|G>W8L{Al0y>#+HVpUb2loy+IS;j&NQVSJc-`;+OCVWmgo>38}i zY;~PgD*4R)RDHke#doxec^Gu2uC+uq|NDDhkwA98y$05f!wk_~em{ATsd*^F{ii23 z2HoV@db~i>4>{i9`_n<@Ltr-DhlJ)c;wJYeA3`%uJSIu6W}Q zd4X0k;ZTFG@&fl?G1=jWJ+fhQ?=OK#OyY_{!aV38E2wqUme$Mbh7d1I2KD=K{;vpo zn)+q*{v-ubJjO$QdH(A>t)i7fk=|^j&1mK}hXlH4hU$o3ZCY0!4nHp8e1bx)jQt$0 zPPa&oLlrOZ(D0Je<~{Gj)%ubg??JLxK%&ug%_zTeNorkGqfvSop$)hAieom_!0)jeD10~N~ zcHJnUPLH?z0mFbWk?d+l-`~~$c41j*Sph86@5Vm*<}8n&nXunAlD5mk%ti_BL};E) zT+%}Qjt@x~hAs4b5W3Fypmv53dSF6^nW#P43=6U0l)Rd;Vm*#mPMJ~u)9Z29{n@I` zH-1QzUV2wdgYM)i)uKOk*`8(rYz|+U4Mtuu_BtJ)Za!*%)SG*}nARSJ?)!9+!qNBl z(l6kBE~eDW<+J$hwuU|sY{>fO@RPvW03A6?DRJXO9s>W4nyw`P72v*s2-q* zlB)>E&o)126O;&0R|lHF-3n9Nl@XWO5FSS^1?AFOG;?NiMjUL&vfaSn@rc9w013oy zudLqfsP$4$Q&WJ){jd~Yx^CIen_R_eQvJdS--HIf=^L7Wj;@k)EpGy8i^zzs`;x0& zACWtQ>%2|7^TSK~met=&?)tScLUwyg&Go?czILK)HWex~h;+VOa&JarZW+^c=!CHY zRwvW;8`>VbQ7QCiL>V@{W#7w;UR}@CP~64)%jF{+JrBd$epK4`MIYZYtf&nGk;O4y z+w>e+79!9Z3Vzp{xJ#QI6xG%<<#rqWiFOON^;2TB2Zy>Ze+KFP4X_S$$xu-9q6SpU z^?X8xk~ypv zrkl8&r$IwL^G#!^2Y4K51=jigV#bFmsnIhD?e3BTB?NisO3j15KM=Q&ZASy9uL2uzQew> z$(}aXxKybD>VT|Enceg5oJWV~PnIC@RT-w4I}kK@vymWDJTh6o%D9mKJO?b$=~#h+ zg_ZA?@?~&cVKfXwv86u;T8r1)EiDQoF(&VUvMKGmQTquYOsGrAE4_kNXjBuLE5E*H zH4f8CQV}i`5;qO@pT*}Ze99gm;03mm z_~b78({{>^#V{~j?J*KTY{J#|QS|-Bny+J1)suX0zfLv3`FAkLfp0M)FzM*az9u!V zMbW!Rv`>mD)O8`{$6maUflB7%4?l8X1q;#~O~nl;6c1ng=EXBQIJsfVJwU5E`^6iBK8?}ijO zHXXVh>>akleSiLADS=dE@p$!Ht#5uN#vz(art=851lOZo!zygJswV4cFl%>oDgRSV zOw;#k^agS)7Hp;r8}#b$dZ0+vYn)=0Fw@_~C=4JG}G2+B&LS! z^bE|1x=F~iciNR^{}y>uQgmvS-guIV6TI(5(3EIon^WrguHB|rzrJKG0$H!Ds$WC< z_VVb6bHRq)8HEl`g&T+Ry@#DV-$in77f8L7`3iL}8w&Nc;PRfXcpw=3^Sdn9k_u}M zCOE7TkIU{;(}SR-kPdoboo=(+Y67QsMY2L?ddzFq?r>5T7XoX{OvTko{QJZA_MpB=+o2mn zf<{|^5T*cP<*W8zJXQDtx|&QyAqoNVW=sUF=p)>Bflfw8Y6A(MlCrvVMhmkH!AY}W z(Gx0}k?ZZO3LKb*(E#>_-@8*XWmsUf7=;h5(O@*cyAuf%yIY`;H=CyEsx3uvE;@)% zV$M$#Q_B?+dTu0I;m5+rR<(($ZR!O5uD+HK)Lgs6=A{M)N27fVGf^PhLj2GQPV?Zb z>T267Z+0QCD!_T;B=QgNBLG2Yy0q?>dSYm`T*co2*RfpD{&1rwi9Krn&oRh>;$nh} zpt~sg@*UH(1sOsZW*jQh$Ay`fvEf3SCR9EWiy&jBjcaXS@pMEx15&*=Rekq+tj`@XENaYw zsjbkC7t20pvAEGGZ+Dr$VBXN4Q?pUC6*N$I&Q+eR9d7iT_9H14jAB%qE44cbJk`S% zjV~UK#uKBK^MbmQv7hMY8e23tuYnq^(f)37+V59>+Hc``Ptv^!m9#n5s}N1rKSkaG z1l>)jiCb;>EvzV^x*JpGidMP60~(M1ADAIXPM!dh1Ie!#s<@_pHKnF4n_?_UklaWC zQv=ZJb)OSXTpaQpKzxTix^nGsNE`C~?>ts-@!8Eh21+;Wn-4fCS=Rd~ z?ni4I%^f{YZ})5B)4;XFb7@CKFO*CH(0NwT`de$_m+I|^j$hDyLhH*X(5BD_-%poQ zv`y@}fx%@l21c0;CHaMK{%JKtL&{WKv#;J>mh11(ouI27t|Nu}o1_WZf|}Qjj<*FK zHG}Q9D8EU6ExS0!=OUTAa#Yvz%-@@>%I$>~(z6T+dZ|3ksshO*Oki>MfyjMP(X{1o zsky|uUczu6K0GP+N93rNOF-Uz zRWJgC@)nN1&xwkii_Yzm`?9NfmT%ol0EEJ3ZMJ!yyu4yBw}6B2u~v;yD8im_BYG|c z#StJRM#$xd+oPih`1YO8(Z%wo<;{obR*A(%KT55hk9o?oyYMDbg@OeNU}f=m;xZu% z#YqLT0V<<>SNM=&5=V(?mqzG8YN%CU*_N6Ro!!(p5U}l3OH~$UtBhT1kQ?JQ1fZpH zth44MqpgIX9EBKgjn+c8^Tuky%1M}H5qqD^Hs%b1QCwaqEVf6PT_RjKw=%a~3A2ka zHf|)L_O? zw^{^j$1KM+qxJ3{afDbPY3f4NU>2q#bJx=Je`@g%_h42@5JPxB7(JTo@bjzXk5_^x z5g6ozshs1+h*P*D@ejniv3|Jo$>-YV4GaP=bCM|2)zY2;6Nh*}Xz)@HKp7<5M<`yW zX`6j8b-6SG&_&rCr1ttk-5~xyJfQ}hK$#O%{n-1`z9BcBf1VG(;OTVN)7ti~nCLEb zx+j_hhL2ccf@?aj5hiyj(8~(ydib_xjpJQ0J8E}4FHVhtVVLlV`j zgKKQp&_=jR5@cj6z$9#Q$5=b5bTQEivtSt_Eo89|_q^Pv@u27zK_tQ?ICKnk+V-ur z&0Fc#Z;uE4FEx?=R#Y}&u#7NFeu_$YQmd%0vK;@liLSX+kJezuQB3xLEC)I&W{BFl zIYY~F*WpvnU$Ci(ZBp4RNW^}XHy?s1D>4F9f$c@dKB6yb$}xQa6qzjhIw37rjkM|I zBdrFeq@(mG?bRY>QY25X1eW`FY_{k*e|+VDgm(7^u*H%P98sPcUALT&#xJ07mg7#< z^$wTY|5@`fji+eM#?yD5Qk9#yEf(u&z5Kby`P0K#S6$J9mZrMJLA^2w6+-Hwq*FM{ zYnPUq)@}))2f21CD|kH|Wk`}uCDVt6S^zDHH5pS(!-p>d_@USULa7=>nNxWmIjg~r z^BRU>M)e5K-Ye)2W+BoCnXJu8&ujdFJitL%p<3!4RZ;%;f{%8)RC1pR6VEUdO8>3Q z!vTV+A>X&N{11Wi8Fb|x^TD|Y_?8idfu=O&bq(4>07t#C+p}%LfQ`Gqx%d&o0!{C` z5(t-^jM$(ggEiw)yR}`Yg+aFhL(2hhTWZ=MA|YMo_B$tP%V=k!SNQHE^NZ!Wyu%9J z-sEq1#S1sK@xP`%e|rJ}gqxEG-}S~jtFWg{Q!Kz{xg&`ku33%!?^f4*rjOAB7!rgz z<=D^)87ziDAN?P$r+R{3RcVBu3becKP0PB^F=jZOB!&%jZJ~X52Y-cB^&~vJ<_t?P zr+Gx&X*nzRAjrm++_QQ*hu0fllW zG7HVK_1<_9Pyur6Xo2-;>-G9(`ynR8;C3@jpKhcZ3Lj7*$h}bbu#Dm0Ye)oS=ulx855O z%U25ykOeP#IVgJG{QAPN1?Q2J;SfogYT>9z%GnP9`kbQVIR*j%Y62N@>HN}5p~G6X zFoDAypx0JK$i0Ki?JWT5IZIwF!Nn|AGp;P2Nkr(3dftX%f<&a?{Wjdr@gqP%&@;;S zm`fQO6}USN#x<;rJM2!k`lmja1P64l9MhLKe9i*0(tWyh`7eEs)zE5socu}u^Kpqb z@5@6yET!c}XLDwgDBKy6(fOB*pNDRq=vH(|JT5M_Kfpe{t12Uijj|p?18g9#_bS?d z%9xQbf8>u!>n=K3D}RGj-p^7kwE@5(v$E&?l4^+!I|Jl>vWq1?AoGzwE;F@!J@1SM)cCv#PrKmH2 z6ne5!*v&*v+`aYD{_x?dPoU{Z##r}5mkI6iitUl{5F5{BT8xa!heO>yk<-mF@I+@@ zoeZ9WJ)jz9ruCDECD^Pi#uM;s(aJ;@%(_K@aYtuxh(v?S6pksv9z>)s9mBNAGKS>z ziNkSNRsGHreAbc7pT{&Wl%*17UdE`>;J%pWw&bcFdmB8Z5>L#x*tng!r>1F&?F4$&lr`e=Qu9HI7v}YSk9(HbvWGDQ1H2#aFLjT8wk}a;`FtKf>S<94 z{z2xU@fV&0)E)!aso3!7LCw!f)xo?f0`|msFOoF(1`@3`94cYqG)g$3Q^ZREM!`l( zXg)sCMS%EiRMLoPnx8?1-VbvvtI*>)(mMgWG6R!==CM;L*jaNg%jZ;z+Kx?YIqap~ z&U~%)m+tQXnOn!vvLLO^B;}Q3Jq7U0Jd#L4~03d*o=>We$4y zW+ZjyaS})W(-zmOMyM@4nWTrD-!a6CH}M)n}Z=qhUTfh)260Z0v!C&HgSNHeg$ z!C1LqC*baH~lK`>y5#m*Plc)>o%G5yYUKR& zwEf6xXcCellw}72Qz~ZhT{zM>tdxJrODdzpVo^bb^P+GREXVK@Bca?Gtda-^tHYm` z%%Z&^2ru45wP}310#M(cyHxC^#xc4zF{KKHb*%qpihA-ZvSWTxtOjW}FeL^gQ_By3 zALcpK*1VSAMv-htiZF+3Joywc8fe#O{8xelDPGhnUGE{)At|CmqhKGfky3Dim=UaR znRuZAu?1D8wMiZ*=$SagtwM4;zKw0y+$4i&_n+3g?}!ehV%z{7GyvjVib8-xASlBA{jorGhfy?RLEg~YfuMIYkdcAdv3W9TK_WRzecIUeV*`RWb}J2 z(kb2?%)_Iy`k$vCZR?5)2uuC0P<5U_UM(!7HABD^lU}9KFDBZG z#CPBt#6*Nx$wI+4DKiRJb>kl>$PPp@U9@o@@AdJ49H|nPm0|l-@dby`1tk|OzhtP4 zb#!~wPN^yXm`msAU^4j!4s2ir0)nfK`R`?ae_i7$rw&94DN+giJ`S5c)s87wIo;4$sknC9k&t2Ft}icx&dKx zTV2;x)m1iWpSy8ML+ArqDkVByAE9$h9iN~4BFQBTsW9@((hvvBq!^hSKG7clZ%(lY z`qaroQ)KV3x$17TVvwU4@pk1V)-0#%efn$1&vtdInEaQE&h*?cZW{cWMZ`~|MAI&r zd)Cw5Vt~riWVdZ1fV@PF{O4EO*Po+;Hn3EyUuPv5T+I6%g1(%up8s~xUM#K+Z2s5!( z+(RCC^9~P+938Ep-C)3JBT<6RF+n1q+o1O0X0Iev+95ZpKIM_Ls8Se#)rLa1w&_;T z4|)*$FpZziWK)>r_pcuk2p<5Qw53$@L!9J8BsOx>* zCD0ZX&S$&ZxA_yFaiAdB=*Kzptg@}1_fAQQ0Yt1JaQ~IV$rWgGAO{=)09O)gaC6_-F4?-nR@ zItVF15v!reA^R1cqLAn^Oo!Ah74;?tu2eE2knC5E>Oean(W*Aa*JI;{gWDUp4Ksvs zau@8|b)|6nK_*0g-%ug~T6^HJkF!xrJ*uv94m~VIg<(Eb!p;Nim*W9BL>eA)$fHJy z4@fyh#C2*le@G-4JtVR%DJmW&3Ab}+7RFej?{D=AARXI>2PF0YBHp{wl7{F6U%1P} zLN-){4zCyv1WR{CwJO*Mvv-{bkhp}Le9{`^^SZH%Ip@ni{9EE~zY+!{%1D%r7-y3p)IjlLeT@! zSKJ*s^TXTZ$fQ4qAgVE$+)H)kV<2GP5D#^v2lN~C%#zh|v0jy`ph1wU6o;2hJ-3Mk zso=rcvhP`h+?!_|!D8f)>a(o_8)Ryce`v_FfC#iapf^cEGp=4XY-IP>ObfdS79>pI zrV;vOv6)6`rHEp?#We#sF$E7iyB~wkj){O>swg==v<1&PuIF+dsNj&Oh66W2ulXj!hgN-qPCZ`1~XL%8Y&J` zb43OjJ0cfd#IztUN{&orr{YNs8hBY6-gb97h+{+*C6kLux3~^8$0`DSrSw308oO_R zF6{V1{U|=UL5Os3fR^h&5PYjTuj`|Erst$d>Lt)3bi)ZG-?P^8{TB-GkperftM&6! zA_)(|jiM3aTlQE-k-v+#t(y~9egqndjI1lP8-q{ndPi;2V6^Xy%RNXT-+>&ED`ZKk zQkmBQ8HR^vQq;+3)*XQ%y(5b7aZG$Ic7-6M>uvffg%hdYaNQlrXJepjX0CkAW2egI zS9_6n(L)JSZ3>WU>ALQt&|5g>4dTPJ$-#zGtna5H1yrSm(|X>vwZ~FTN8(f}5zI9Q zbqjQgeXqOL5R*owFVNoy7gNTj{FOlhKaqhUQ{SCO%1toyU_k0LyB&OjH!4@J(31;^ z{!}-gZu(EmPNG6YMy_^ubw*YnQjll9``Rs+yjx8WphYnaxXIM@#yQlnJgIr_nv zQ*q>UTs}1@ysj!3yHLlP*4#CZ^?Pbc0?qD??$49DP6L$X?ngi6B;RSq)e7mRZ33B z#H{f*L&}#c37xo4-O-LlOL-S>Gv8?T2}dcj^NN;weoBy+i{O*+?vN~X`_zyy9A;~V zYx_G2tn!Z(wt)))S-Xl9gTT_v4s>rTaY^Yed6llkCbi7|%68v?pzyyQ z)SuMd1hVvL!v_Oe#i81;Wq7;=F*XoJpAjGlM5xuS(vxN;&#{}1^>hNuXXR@ZS9L+O@^c~OOB$-z+CURqV!NpstB zHFTMe7{!0W53j@Pacn@T@p)ST4&MwGg&+_Awwu5dXCl<4b1_^)!{YnuYsZd9qhC2D z$PhZqezyk}7mW{u4zWh*YN?&W;h>UkXbAGPN>LNDHPel`N7lhAy^+A3UNK7 zy>!ueOOWVRRoD|B&CUM{3BSSyG8h$p-D)~<)+hH1;=fNJlJJ5ak{UuY0~&qmH3_L5y!E5=DBl-HVNoG>AYB%`*qoIT z^rG&x$Vgx%&<@pL$`y~Dz@s?PU^?OC=z0{!dhLiip#kpxlxWZ*7c|#&z4dVvp=hJq zw?DEZy0@wEX162*8dk04TPu>#=PHfYsp=8()J4Thq~U~X-Rb>|4J$ze&LIul-NdxG zGjF;TR{cWpoQzV|x{V}Gz-GghhTmSn4g+2*5^&XJ%oElp2bU2%W2tI!P#|)r%w3^s zgRmrdB~j3O6N1VWBTXo~eXv){B=e&hSbLLda$1PdZOv7%e`yA%#JN@_FE zM7Hu5^ul5*(UIM5Film>CA-_9xQ3^SIhjC#W z`DtyS(9012;M_n^bT)LJD-S(&A!V#05G*AvU5v)#xtrCpciTgEn82HpC!W% zWL3z*#>f}c#aZ@KFAZ@2HtAWQccBB{VA8olW^k~RL!SbMkS+{mvq8psl|h<^ETH|B zVnS*r-zcgme*DUYL%i^n&CGl(pxz0#l6RDMJ#}!2eU61L0x7{u5O9PmKzV#65y1K{ zxDOybmItn?DsgNYej`#Lu~c&Rr^q4^gIZ+^slutA_d&j$taz~rm)3u?_#yNTLTr*% z*BM>)dtX*-+nMqV-t`!EoyUiJ4tQ-d+?uF=$q|o42Yd?8Q3tGrpB_B6iTHf=lG}nM zSTAT&I6Hw-Ny*Oo_~$DobCfYkjz$>3zXVyM+x?tTPSFimDe!b>%5}Bx^sA&hm%@<+ zpbKjLZAIciK8kk0W4}bOjRm3%TS)|sh(~2p9Fjn!Frn?^t3|*vXfdDNJB3FMa5*T| zk{{2Xiilyr6*%d))a>rg5S1K?tEZt119(Tm(x)rBWM+T;1zspYAc{A8;_!t7-ZdZ2 zU*=4JgB#=Q$(aEpJ3_(v8eobkf(RkspJd)AQ^<$UKv%DvN1R7r7J%sb_l9N93ZY=` z^*grfRBgL)WC4aKEVetHCleN6s4Az;QDPwQYiUlG4TxWEzkhDE31S6)Ju}aDrZ)Nh zWVVb*3nv_yL(C8fl$0EmbMsM|V9@p3#lJa?0NiBmNI4UL1a^7+4Xqhy2UQikbe`&ME!zV;l4af< ze;SK|$PmzuIGYequsd>)yhzD_$+j_?sY%iu&+oE%xo4z!q#d1CG>Tf-hgLY0b6DW< zEsir4>S;f419aN8cw36!x>#m5cDZ~y}Q?nk#tyx&-6Q%JK&3J0z4CqDQa4l zIBOZ4s#Y+{`^2ZmA2H}WcS5)9M1V;23N(KJfV)FUd6U&(FCiO2>Ik65x%gjwBS~+r zVQT@rbZpQ+ivJi5k)So8Nm-wy^~>%T@~?gLX?<_b#~B2Oc-?Gj$8_y(GyQHXpJepr zc>WtnsTARDkph?9Nhw*G$lJdK(o76gk~}K#sAG(4z^YWtP~A^rBd5wH#vHus`$ z=A`vdQo?EtK(CXw$O&lSlq|>>Msonl#_ClPZqYX=fYN@Gv#zAtj#Pk{2Ppk_#+o6v z^u+*fgP)$;&;Ep`b{0dUP?+&3d*8pxKrX3&mqO04qnxNfSECr;mKzcc{y7SBV?-w6Ey?jU>s zRP|=^xYu2L7n8~`M=u8j7-YZ$5^;pTfId`3k~FIHIM6-)%sUduVkl=e+^}CgxCnSL zasp{I1>RR!BF%JC!aF~TkD!}}NO$I#2~5HVwqVdb`Dv0$vV4DkXd*F6aRRz*dp=L@ z(MkA}-u4&_y4dADw{T!{DwxAV8~Xf^<(gW8fs9r}Z;%S?Llr4=!#@JN$&@Mzs18EA zbe5mqEyh})S!Qs}8bH+pHEtSzZCd9~bru0lK=tebz{C}BcZhXHac_mZkaGYcuW;lD zfI{wyUBO;e8hLs;-e3tFZ5!5%G*WeV!RwV^G^@KZKxdaLGkM~Jx)KVRH@^=`I%$wN z3rG9`F=d3e5+Xf|OL(*SsLh#ozcSbxaA~?a9|G35)+dQ<5l~x)_;|ma01fUi11;4* z3@{KBf>_S^p_0bKBQ&KkkH^8CnXrh7}H7LUH4t(J+n;N?avXtyGb!HppT6 zleu3ZK_-xp7P{wnKLCD{scL(G`pAhDDxkUg0kN}-%HwMyE%8H)9n@IAQVl<9&Gi(6iG%>gb>6NkYugIcQd!w|L_r& zFc58A)z02nj);|TD82BsV5(qd3sB?_0Xay05Nk2~3PtH<2`+=4LP3N*Oy|UoiMv_| zBHpPH;}##*v?0qzMq0`#dV~>y!zR(&xfo06w`byo=<#Uvwxu(E_ZR9tUVv}Pjk)za z34?AEW%;wj%N!*v;@k1(oHtJ%fbKq}s^0iyydCbnCTUl{TW6gsH)R#hPqkR6ee&Ua zK(}DbffN_w?ty_f+Wh1^qBubWNxXRuOq&bPb!_`K6|VbBoLaeq0x&DY-1z7z*nowDcwwN!*z$2KKP4kRRH66#E z^rlx|?0s*@fjLgNw?5w)k7DjDrA4Rf?kO=13OF2CH@a?!nb}>Asi+vgcHn6_9!>yk&}kNl&1G&ZZQhu+0TR)z=27aaE3= zrh>uyb^oXlgrwkRgEs%i2R)+{4~(GJeWn}JRL$OH;Db2q^G%YDODfu$(y}A>P^oo7t^*1D;^`54#cA!A& zv3Xtr73~_xgcC9?Db>4WGr>J;ozW@5X)zLh!yGdzgj&wb!YZCscPG`&N0i4ExJlA8 z#*uuh?Kivhqep7rMFioodr?yo+;(X(ZF3FfU^>k`l8!Y!~o~)hq zzZem6)_`(g#mt%v51YrlMM$dS8dGn%PO!7~9!cwOtFj*h5}ht-Yb4q^%L+Z=^}PdBMhVx0#t0V>iZonN5D_y<#=`?L@KuG5BP)b0T5XpMNc8ao?Hr_ zLZen1nDC0`vDN#sPBxV0b=e(5YW)Myt3Ad1Cch)`yLsBPYiq9gU(1c$59Y4cD5HpTqlMO z%6~=c#8B6+dT?x5UgHeZPU(@4{H*fRoCQXvH!ndoeeGkzE5Ju3>xsR2zSGip7jOZv ztY2;mwp_ym2sRYlptE399!UYNU|xgTaNzecQ{uC`&;6-vWN36wuu9#clg#B#Zv5_t zJlbCKTfi{j3=Gf?zgE?+5Q8qo*n;DaLns7j>_qJ|(f^YBR7Q~>aS>vCHC%m1aUr;R z)TQ+X5GG=!MCN}16+?Sw#SGz88!uz}vCD&^Tmo>1zuJ2XwD4b96gP0xw@naceBcre z#pCFfAx&U&44^@$%TuCyMbC*A3SS|jm{#50MzV}qopxbj55_CM+}Jknci|xk4bbNg5G-e zlZH0{R-2_^z#SpWE}&Ag4E$(^s!6T>dG$=mKsjBX`|P>^8%c?I%|~!23`fc-U~Et@ zi%lW23z0UxnwxvBj99`zH0CR`4&V5uh9L6Znp_%EKLTpi#MepV-k5Qsd$Y>lMbmgm z?}@EWoYx|$=^~u~5wB6;@p{yXL$S;Y$G6 z;eo{qO+cSj4WCG#pc;Io@flD>OOC~*g%GJe6nx+R1&H{nvECo9q<5tn95h0Qf3r}kpceijG=K)L*e1_tb@D!-BcAg z{N5si;+HqP#k25?yh$EAY} zvJCBar;OG(qkbAS9$ZP3y83{vyU&U!FbORadbv-UY$$&cvxh^s?3$Y{KIA*#?pTFI z#>-As1B>ajmDSCEmnt9$NBS7M^t~_szvCYt@)2;syt4lG#5@5m$x((YEHTkcopFdMa;HJ%7RdoV6L%Eh;$tkpN#M(@Ou}RhWtgK23bj+06p7AFrmAGdHv;m*weJ_`5&a*uA7szcS|`QO za6VVMnOk)ZCPy?DmPD1Mxf2=cH^0y%*i&h6CDE;Wb2^hzB*l}d#|X(-K`IE6I)OQ} zTY=OiK7#Jlhx`ak`41G^=VRDsz%Q&(c<^o)f2BWtB^yzU>|nF-6kUTw#t}I?aE0xs zz)zG!LbXX_P!kMuHi)vl7tMyh@rzW^2P4A?+l zNDm?Z2pZ*Gic(-H?pr}@V~|lDP3CYSndZ9xc1B0&F9@Mo zXnStQ324&DEr`QDfpXsgq9oaQ*K>9NRN|9{Zg9zjM3%C?FYg~F3Kcu_GXgWt&wqkN zfnA1?_L2kbEGz<&JqawXAAz5A>Am|%QSjs4#lC-_;~c$o&Ygon;(nP}R>;RnB~?c; zw{-_1;S-g=Q=0#NgOYy-ZSz|V6!MZ3)j6AH=N|=&_YrOYGB}g&<5&cMfoFGf5*;tL zIN8v92AE(@!7Ba@2tXIkL0R|VeFj03uhPT5RLe~atCu#k3oz%&_Q_gSwag@t?5SZP z=`ft3;H&{&+*ir2K5RP($Dnw^IrpL5dEi;`m*mT4>7CfKRmlR>6`pc;St(#^^F`~r zh~zIb6*;v4FQytYdnGOML(t_{vXHcIUI%$1=UU(PYuoon0Q%zh_)_mYI9^s{p=4IO zHzoBGphXcA*6u;|UY<9dnmb94HHbb3o2|B|7T;LbO=6vQCH^?L{FntZYDnNHsd0Sp zqzFi5?#53|*DItR=)a?v?5R)YTz>XOto9b5D_Va7#>m#iVCS}L)=f)+m(uyh+a0U6 z|F4uFCxUVak^3OD{e9j+@HIiAFS#vYCUgn)pa^MyaVc^91cRX>+|@E1%OQ0&k5X7T z;IICmrAsj^eCgsx6>XeUu6;~~-3sQs4N6AY9J!`s}tx|d(PyNgCu!)#nuRF36)2mh^`$3MXo{S+9W zfAyv#K4wcj8;gL2B#OVu<&wLbd2VGo5bJj`kiwGpSlso?(S=B`uz$t_+>b4^#W|G-sb z7z7!~1sW{LwKQ9QhADoAV&|K`p8v#HzW`bveI~DeCSCGWytWzrt$1tyLKvD(i<@}L zUMtmg(sah$P4WK3#J31;0=y%cK8{|}ZM8L#0&gahFftcUKd{AyO=z~R{&PiT7-_6S zz)w-YzZbg|E(s{|nGf znZKTiLk|V^277@S8_xV5=26haH2$vzX7pRT=es%Sk{y%u=QRfxsu;m zqqF08dt5AZUp>^lLV;)~f<95lzPf|vFcM63LYZ-iS8`c8GJbeLr*O z{_nE`7-Z+k8mK@lCgEUgHx4M0>uEw!j-9Vlj8pFaXsu2xZt>zj6dnN(i}QG1toz>% zVp_|9FGp8KPRWJg8b?_Kk#O?AhWzrzB!*6o`xOIMgSR-2i`)E$+)D zp00MX`cyxbKRgy0<8AoEhfsVMa!lX(*cZ9bQOL5g7dCx^V%)xF+e1z3JK!)Yn(JEE zpRXqlxAQUK8dgD0!2e_cV8c#`NeD!%fL6!5*3h%v0r?}~m>GzSl2l}|Q+(^-f92_0 z#P|3p%(n|*T&@~mUSv33bjJPbK3fOp&O{+f?B!M9s^N4Wy+Ik4yukQEX{|u$&Aw7O zn{%t&S*B$~-TTG&Pl)5Ph%_)HuwAm|6a^ZN#D%Wr*cr5#uh?N>Y4wQM7ED0> zt&f4%5&XsvD%1oh5x8)i`n+OaI#F+t&&S+&7QBFTO|EIYEOV{MVju2;O*9F@Y;6I$l&*j9wcui3v)>SwM!CU;q>F#pL@jsG!!A8&-(+IT;|X>-M(y6 z`iY($v8v?0t}QDuH_8A@qcwV`XatN^OmBZL1dCmrlRNVBBe0B)K;;_Dj3D8U#o`H8MY7u4G*uFEf-fT)w5qjhxoyC4>-~x%aXkAVNRh0-i^`g&evd`AhHTZ z`wr{rkHF)+OW>HC_*QOu>&>~*cjPkIx1O@fnn26rN{YVT$Lbn=qr0QXJju;?fgmX@ zR@L%ZdIv-|-IrwZGT08=T8KuLs@6xiZ#VG1Znqhxn#rh)TWEB|&aH|fWGcsivLTe0 zhV$72Mal8s-`_WOQ905M87J8H3xSN{`111B3GTO-uu-z>$=lQgreWNa)9E5cYoheh zgjK9$v>tzmSk6aGju@(WOA2Ybtas*7^C$cW-%_joEI7o1YErL)?dPn#XT&E3=v`Ts zi<7|1EVi+NBL`m){hH|P<4fXs-nGp${v5lq#eDY)q>BDwM}SThBcjspCm8}K9W^$a zC#EJebs0-BoCeXCymlLR%wm|(fN$LGd^z+WX|%zFjsd$H42D9_<(PKsb8k~-d8;xP zLE7<4IYC96UbPlUw$NF$^IPN(uY)s?2T{)_yP-4@g8RibL9>J&zPKU1UfT2Aq1NDnk?Q~LFl|I~#$azsKTPcrbB^de9>q1acPofuY; z3dk2}XZ&iTi_x5{5>B5aY6ZNY$*BYNrjYxz7;sJxJP%!Nj2sJ7tNLac2r<~URIjr6YO=&Q+g;_}g>jYDBi<`H;i2wcBaAJ2n)Ic>V zR6jcDVPr|3C2SzxnAoRRvHU{961F29M&@1=;_n({wQ7oYljbkn7|E<LMba)Ts41Ql)e){Tya_qB6o?&Uv2!b%1R&%Fm!^zBFX@sWN-_xQSRQWN(V0 z&W$r0sf5Su<5cQp0mWHJs~}JvDLqvfyk2eazO1LrTh{bxYLp7|?Nk)4%6@A2ZJKJ( z*V3=oUC@xUff#_3(>W20l8GYjeRR3wnd@&%l{2TINUb2OuDP0&-W!<8Zan+LzDHWF z!q$m;5J*#>qek{{8; z4IBmLs?r8Wq9+q8vpK3ddafT@3!&E{aJ@xm&PQF^m8`m=QTbVoK|8It>FHH+COn*i zKw+^phUlhPp}aHcpcpn3Z!H~lr6TdidZcVQbcCYk6OeLl1J3*;c8{B}5m72p#P2Y~ zD@2p(xG$eTJi_{6sd0zO!S|Egb8nyRRE-&n(!UR1^bip_XIkQKl_GwmawciVNl3*+ zI)!s8csrUQaPCwUOqdl=3Ey10F~szBP+|L@Tsx=dRMD)!CKgo){!sEk%<{hI%oz>&{V@xwpB??$?;TmxMUi5UPDMefLI+ z`ZX3?+lF$?SHbHrEm6NeVo1sDB=+T8U=6c&cH0ecp!|(v6^Oj15uSrWlNaVw{80P3 zxDegOG_ne2$W>VX+@Ij}R0S5n%dCl?Eaq;E!9<3b_CFr&F8WDuv$L=8zQDe(ff(Wq zUpTY_3%(s={)K3Z$+Rz9>^*2h6r+p5OQe3H!^vV2Qa>e&^FPag`Z%4V25=q@+>^Fk z|F}d}N=r!b#RW?5Uq02c*jA3|z*YNCSyK%#QW%<-hI=D+Ft84TXvb2af4yRsu>Pq| zEjr%UI{-2M!BZZy;Z3zAe3`AVr5phgELvWhJGs-dkYhD!;Yb>@ybOw5`00zbtyYm$ zx)7p26)^l)x?a{^IubRYV?+?waHz%7AjdwyIwaIsIw^m@@roivVyDv&6OTOtqbcm;IMAr?~@5c ze3j0ABCBUTc*fYpTEPz*{&LEm=|{7XMyVjIur zfT#Tx*U*$uCn7EgCMq-Gv;$!uCjJsBI)NEthe-mv@H%Z6QTRSNgyY+TCO*qlUktm% z0liq8IOQVUhFYx~5LO*bL8OPuy*ju5>gWL-0ytb)$V&y(&f=(G$EL)GJ;U}>u7*UA zD9SH)#Ef9-FdY~XMurJeUG95K6D5l>1-9o7=BT%cxv)jq!e1Ch?A`F#@{Sj+sp|Gs zDz8tj_Id-6&jRBD%!i|##@NyFt}qplW?}F(N?Q6x2e0E$$Wg%~RU+{zL_&EZGJZni zHb#!ZUB#vvGI^7J)&7>*Dz2hpzB65mv)QqJu)~-_LncKoJ z8I}4B+!srL8*pH4;rx}z{JRG#&JapDuts}-fH!|NE{kC8BMSe6ls=JOO5$4f@l~Gm zBQYVC3NcIMJ7gSpJ?mjAs`T@1A=BoEbE-Y>FzCVdav&6zYm5Bw0SjF5T>-0z(>>S= ztPf6(e8ardJ;rYX7$tAjh)gJ=1Entn9;+l53QH(8?=kUOD&+W(RCoJ8e0ptgdrxuF zF^M8BCjC6jXDX(}o!da?`79UfUXK)cG|h2Bu)(e;y-ciEcXSr}kC?31n!5w6&Uxk8 zPByAWfzl5zn~+W>tD;t(U-jGms8^{kOMkdhO3z974Nz^DzN9x4Fh?HOhho=!tdM@% z&t%rdBxW<12NFcyW>8155UD0{v;q8>~@ zU_=fo_${l5_rhw$q(%h&waqmMTv(`axj#N2vgPzwCjKa8K4{S1>Cq;iq{Ew<6B_!T zb+F*F?EM;+LYA*X85XK{6sxZ(m0}JUo!J;dy(KI=V29kb2v&E{_xTS_xt`8n+`jXe z_}@b)lYd_e>4cz=BaCDhQ)Ae_j!ie$Q%owpwQ*tX40r5g&7rm%qG0mwc}!KtT*2@* z2}TM1!;n-hSuHg?2)A5iSpUW0FdF=11bSD$1eqzws6B19%4TpE%qaD%UY)lA>-X{r z>;jX^fl?Dtc@e66`x2I#K7>`lxdY8N6vN#p5V*$fY4YlXaRwLr^qQ-Surl1&g$ljd z^1omIC@pgoF+e34gYE8iDy}Kczi*D%`Ss1sBvNX=#aM&`*goiGK7^6!=v1A?=)P8E zD>WB)*=U&MM#+AZKHf9P+j;KeX-75>nJ4=4=Pq-7FJvS!zVfpc=X8hkt3>^)t`%0f z*p2TYzVOdNSPEtlA!Q-GO^(L=;g^h(%BC}W_uGC~wkb9nq#8$eabW4YkA?JcMeVPE zT5-!eN+>(~iGLX%NGf^)kl8-WA7e)ZlnEq9ZSvusC?Lw+9u^3wD$dLJ~8qp-y8aZ!MdK>ow3I&!W_NT_pUyxRKAwFDE{Cw~&= z9k6(uaHulbyo+ZK18PiWZBK&e8k<<4tlVjoKOJ#YVR!?c^953ats9-f%7RLxV-TKE z8WB-y6w23U-LqH38oZ-c=CVsvO!U)@pjVYi^=;FxR9HQqd4vmb27X9e{9E9|}J8XVH4u6-@(GSg9V0GQCp%bw# zL9O^n#Y@wX!J~e#<_2Wa>d}NJnuc1>%J|8){ONFxG0c7p-j;Tx0JA2jZ6lN{6o8|l#9%R1pXC`Lh$XT)WK{h&&A-0)|Om5!g56~!#a=taJEDw}XLnqw(^N%%;x6kB)tHAbx-P z6SBoQd1V2qQ3YWs3bYN1vHsUcg|2sP=lNncmd8BZ_W*nh*V2D>g16*yuj%y7JUJ6Y z%V85LecZA7XF)dcF1)jnBQ<&s5mtKcO{vrArHuwl7@n$`}XgGspxjHwC? z9>woPNWpP1Eu}VwAlJa-Gx-~$13OO(^&V&8&#q&h{E_4T!SQ70qfTFGU{QJ zEjfkxVFbx?5=8+=H_9<5rzK`C7)I>!+-V4z8v?WW)<~3t7_kE{L@^kojeTuqK$U=_ zu|KAJOI|`w2pJ?cq?!8pRYsvX(I4%VmyyyV;s`ON3ounY4ei@j&S|bl&CWOpT+T3% zWe`cSCF)tgYo_W!s{%FdxXv`e)raW8Qs=~bp;ox%^qM03tvwTZk@ej@tj$)dpsp~;8_X2w;S!D%{GMlA1xCEn=Z z!sRv=ShDVA!9-Hg<&WGQFm}871p+1v{g&l2NhEv2VN$AkPUyf5tJ5=;QLPW?C#o_|Hw&<;FGnus(HxIK5-xsYBeZpf+ItKpV#(cC$0XF{9bW(YV7ejc)b(owmi5zDp=8 zS|`jS|r&z$v;^(mZZ>de`lLiX#Y>)(eM#0xe+ z0;0X(l0iMghx=9+z86MALe#)L{7mHIoMx*`=F(j{5y#GCHr4dpqtehU(WK01LQ19q zGPm9tb_1rU?J@(0#uEjuq$xUQU5tG_0^`#dyuK!nC(VV2qD#B)>mftgc!gL{zZ-0G z_R*(LVGKW=z;wrIZ#BMYBXW+N$$XtWBW02Sws>p3Bn~rYbi-T4eJ0KWp&r zBBGgJ2O9jXThKVW%n&?TUlP0gFfE7}*$}=SPzma?jE;g@;#5qtJ0BAS24&K7%`_XM zTmj$9;(yky-X}ri8gho!(XM{4pe&@b{ix4uXn=_j~H|Cy}z#ZZXQ&!m5d0hYoyqqQG*<4vlV%2rTZv)qT4cgAAbLmDF zs_L`u5be9%pGeCom-!ZSoo4mY+gHHc^-8ZosUA?HHB!&eqh$KVR18JdMG<&v7fO#; zw?JY;Rv0(om;O2ra;A06BkhPtq{d%#k<*Me<$?58G>#(QL76xx4DT^%1kapubav?C zDGCx%{@%Rac%4@iM9*4%%tY+DFp?F?@miL}4V|+kfw(r#oZ1f_fe2iDT6`*byxkJC z51Nr%>Kq&+)6d6NTsubeiLZ0vQ~4kyF@k&G#xH70{>h~i4}qCileH5*Gn57rl=NNP z89}cBfJw^0>dEL8Cw}tWCXw(!#z!+$qx`d}kRkNJzUefPszs&I=zF)$iU{_WNcKdV zz+@?H*A=Y#(@maKYm7E0RffY{_qLFq(D-IDZ(x}nuh^}c4;w9}9j&wW;Y8<2oRH5z z%~_Sz%w}T@XXMQ^_1}Y=2h~qybrVG@UYTU_f6^E!^5!X5eMUCYiAKG|d9#w*U@)-n zLg@vrg%KBqMMX8%B-|vD<`1-jYrs=AJIw7k8WBAOd8+C#Z8kCRqg!oH$eE-F+$SDE z)umjvh~51g*nTG6tVQMS1(G>u64$adJn!fgL&Y=Z9X6Bhxq8}%D{%318IoTY=4PKd zaIIT-#q$_NCV`fz;8?hy?QkTawOBWBn%EnLij%xQN)3>b3rglCAs zlI|~K?Q^pZRhYJrXY+}Am7@fQ@!Qyu*xOmxuw; z2+e&`$RTxCFzd!{fYjpgf}*o7ojD0*)Lc==u2r}uLG2YB(rM=N#gK4@fU)@o!?I)3 zYZg93`qn^%qcTk>(70~ zgE4{W>_Y^*wR2LA-l}YX4+Y{K@=r$U7$H3q`gwhZ$TT=tTC&8(ljwD zZzu22{|oEM?aAK6;H75^#!88nt&2_EmaM!^9OyL?>(1a#S?!oBwf^0$miKD4ZH>Z` zCPAIqr}0%L<)va`(!d_9xR@=rxd+K*wF^u3_)hP2OI+=X>NItH*D?h}NTRvN)P)B3 ziem5Pw7*%gbrqv7Wzx|9Ru%?~eEvF4Vc+VaUEOjD)#EOiFcZ|M^OsOJjB{D{X{LI; z*a=MJ>tN4d5Ap%O(HKObmbtX6JYk1+3kqlGWxPW8%mx>Z`KTKYr8|aOqOenNT}K1k zJ4yr0!kpiXbrm)H(RBBIb%pJrZ)J1eYkniGGq%O&Es+2Xe-AB$QovHhYLgU2KfohP zCWNpC&7wsbMeJMDODaOeNr}j#A>W<{Ni`+PxSxsb&%h(k&lN}MH}c|#{oj42Nv5>6 z=k_0X!n<%Vz8)d9Y+d%7&{--4ZDgyE{JPpM=`r%y-4r9J5Y43!>vN^|$w-o`H%(Zl z;;3^vo+Rwc6$HPsJgVD~JlfVMZm(CaI_Er^RcJh>+<*K;X*w4xeMl!hBkx?G>Nw+T za+k7G6I$8Ns`l7*Zkw|>Ujlz#UA9d$8-f5V)W*vZ7TKzf$d z`P9n9sdm^%qY5dxL9GREh2+!11Pi#tll+Q25Y!DVH-B&Dj(XlYVLR>ewH-dqePr(q zkR|dbKi?Dc@>+&>KeZ{&OGb9OFc$Y!TReM?j(r#_r(ZgU?TvItIAXNS+#7fTL8nL6 z7;%nwq3BMD#EkG|i|qE6JrZ9_ag>4IRr1jO=G8GyfWY-qVCw8-Vo6fMpZ)&Pdv6yo z&W4OlV{fG{bjRGqLVNBI$op@_w;^dfpu@TTXy#A881Be#{)&FTxT*(i<8~(QU@5A0 zW_oVx->Fa(2zR)w^79q+HhC7gQbb876PcHJ{0DL+y=i;I)ArU&dm0me+BU}?`ON=A z18wv)#)RWTt+!i|7j-G7OxK9W{2BcU?tMaib!`+Zz9&2`{Ng+3(zeBGsmfWfMtr3; zG<;|U-ZKpd_@Ir~adFe=v`0ny#{Wy|_M6FK2_Y~2|vC3W= zbbM1{Ml!|meL$=5*~g%hN2J?&jtqE9GNVIwMTbsA6KN9Z&VR$3S%ndRMC*VnSoI6Z zY&_|`SmY49$EpNmKL=|i^Ip&4l^4qFsmZB&b1{Q7gam_T*S`66rYAbSQ5M|%wJVlw zp{qLj@y*T0230PJV-H?S2S^n!25W2z8Wo%6VmkhQPjkr9&oo9mcy9t`U>}A3;jO%U z(o9pr>ZGCDV6|N5CgDi@5%OkErvrITo6FQ4;~a>G7nvEvKk03KvOI%6xW4cpNz?yn z{1*%pSD^DJ>o~}KNM?Zloiw#La=h|mIc_WZRM~e=UunKcReXxHe;nsNrm`#cGF(#` zP8MmKUctAR;YZtqRS@stL3ogCOZgL@^Qm#CN-XBH>MYfQ5JA2J^NkOKc}R~iLpK%K zex$~B2g#ehLGwUkL4`i2L{#({WD1@5vp8sGx znGawpHhA*?YAk5z>~>BQay>c+P|<$mQm2k1-#A=?7e#UNWl7IsB|;g(Y;!^-N)8du z43@b*_vYq(r>=02Uok$!BFZEnidTQi?CmVi^~D{^)Gx)%b0X`dAmL>+QT0SPJ(=DXFGb$wJ4uU!M zow}-PA^9(czMTY?_lwpmPl|=DG#5VA}g9L(k2+u=b%@B?nyZ@;6t;d9p-sRCy|zK<#E z+2fRi+Hnv!8V-?T0?J%8>K)sgfTQv*7o<}xJpAS9$l22psYz*pHp(C>JoFFb9W`sT z{ksMo1gSc*v3uYITH0lzszvB4yJ8U`n_)HdSb1+wUELbpME5iHb$nwIKlcE`2ia_5 zD`nzTyB<$x2fPY(Xya+(`-Z%^i0ibgeb;>b-$`zqT|`QI!dQ1U8IZZJLx4eU?3g&| zi7D$Kq4QS$2|>*)ybAYhU|sE@f=+T_uR_a@qD+o zWyBLW6QZ#`guD8|A&+Q^ZW0M=PRWwVNGxX4C0P&S0-Zvo-4{egj(T{!z#cVW`k^m8MUuU2r5BMYR&AxptQgUZ8GCm0F5!hvIw@x zn;78>jC!Br!0k=57!mTxNkIudzX3N_(t~-m?Lrv1Ht%2X@)SU*JT{O&23){nFiB(kwSU~IcO zS;F;HGMV;hjLV}>yK$rZ*qD)5P-CQ^oZ?Xvx8iPZ0FUE(v+4TEox**RGYMu16aPQ^ zf<^4{f3|}EZdNZox8xi>xpMXrN?-!i-1w~*&N=^<*wXCXFB2#ft&cn@qH^DP+U833 zmJNMOMm-Oz%XTO{s7@Xeld!VNXS2mq^edNn<&fyljiFEp>pEL$H3pWG32e zEr;)&Le6XbOq|hVRk`QVu2O@GiMFb9_+w`()w!X=veotAtIIIFk|e$*HPR+U=#2?u zfS^Hva}jK5f}Fg@m+m0b0%glzl*=$?xJ2$E;lm*GEFq7JeVM?UMAl)xP0H%&;+t%0 zMs3s*1-!pNXy8UfquY3EB-zOIKQGHGCXcBLx;Q9Qla$^wu-pv*zE?%K$s>-KH|Asy z1$?VuH6|F7je?KO86|5bO|ffx91S!pa(|Qp<111I$`I)c;#`FEtHZv2);v9pV@Md{gjO+e3;bY!B<#pu!Ck5X5oN^KiZY4+_t*v3qF| z?zv%E;}*^kAYJ&SXWi0g=uh4!)GaXXMxQ??qNEg~r6#5+MNSzMmIOj6n66&m?Y>1q zR$BD&aeSj3mcnnBTh=3?`bG5qypzkQd&l5bdPX6CSZvk&4V*!)7B|E4t(h}qrXu3lv?_gv(8mt)emwa){CFcuyTWt?TkA*Oi zs7=K5NUHF{%WXclScbpT`r-eW`H`oGAWOl&sqI;_H-`b^QMN@h*KW(BXNCTG^pBwT z9PRF_=uej4Z>W%DW7rowLtJpP@`l2(ACt>xZ#DX)Sw^({Z1mPBDY3KQ>wodxNKd}r zcGKFawxIzIjWdkNC{*EJ;iJSc$Xapy-E zVGT> zWk2;zf?>9!pw3(Rs~bW;57S?@G`_*3Kk z*SXg>ju5B?hai*mdX}xIMx&74ujBF6HQf{h2-=18LMv#y*gw@*Z)Y!Ipu?-L{{9yR zpGp%I%wyR#ojf$qDj^|iVt2Qo+31Fku7!1maqA^&yQJ(>5kUW4S3k?9?jVTAa_ljI z(sO`!A}U6MuPvSRL%v-jiCBbXT4mmEfAF4nj`iHPEc1B^Z+PPTp@KrFWDkzzi?Ad% z;p7j_dEjv=4k@-jrg<9!vYq`vgW(*2++Z) zqcH?!a6cZe<$zFZ?j#}D4{0Q$aZNqmga7njgF`|T% zaC*BG^_YTuKk~)GxlYI4#B1S4!h%`e%7F_h5?e&;IRjy^=9E|CJnC;KHp(~T76RrK z=`0?;G2Z82es*JJ=XA35vpeJ+pGo)S1a_D8h2Ov5!T%9QznG$e6G|~0dK|?y{DzAH z`S;3XeC;IWBfiyI9w+27_7vzRINz*0?>D5%y8EMeJU;@G`sKtXyZAJ~5-*N5yPub? zRaNp%-4Z5bV8kFt6d;YGm5F-f22rKJY7;crAwZ;Q~}54o=)4WiKlXv%pa zAHr$`Oq6I{hq8}?_qi}Lc|D(%yg`W;)4gOT>GDtXv7L#El@9s;QnIgs;YUeF#%@$( z07Tej-X)EnfVIc?-DA7xf4qZ~9=?~f^m8G%SC7x5Q9R;2gwcyQf=qxa-iJe>*KZ|h zB^Gm?6?@WsrKi#>=IoOVzE?f6Pq&!ZfrSgf%dL>l7q_7jsUnc0tXzR$+L?%KoOI8E z`iD_wD3c5lMnwe|KIchg!Hxr_myrb{f*6p#9_KxBrrqbt??t^>B>gE-+PaSAKsLLq z2hHy#>0eJL()+pb=YOE@9S+Dk7;3d^Gpjr33~M>WD8vu)&qw@XFvs(}odraP2*U*W z>>8rNr|5BTI|Hxf`s)=D&wwusB0>R19YPIaFrjCc6E2MkT6!%a-{%!1<%EewLQC`_ zvi#}StOcfywT1hh^VY*G)Uu^q8Y1K76vULua3S1ixiz5{TISU|ZDoDtYG#dQk8GVw z5#X9~p)g+E0`UdU_^Er|+?ywxOAnHo&w%6vYP>>;+S^%v>-9HZ#nB zHEHO}v53=KL;d4cj$-@9(_mub^?KYb$Z1?l#06j46JYkx8~_L8CpFcAH5!%b==h*U z`m$8)c%a4`v^f2z)hyDsiL1_71JAUVd;dMjDoMzC6q5WQSAM*UHvQp(WfS>B%7?^k zoZtEpq*eX^g`9@ko>g4#6pV<+$06VmiZKd3rgzu0H*0{T^?x&9+De^}0^Gi_5do|#4#7q~Y0qL`j8l@3Ly6;C-Gzm%Z$%Ic82pD7NzD(!Ui zhf|A`LdJ@CmxM-)qUF?E;sBkcE}}~=RHH;|HkjvKSROF^9grpas(P*ka-|p^3TvVk zZ(!+qi~85|9k`L0nl-B)AEV|J2eS*nzO`CEs z=zpnX7KL#NxzCUtcysveVHkOap%8;tj6A@AL&97`8;a(4PDJaRea){_w{g_yUPv%B zVf=Y*g+(bQq|6IMTbU)&)oX{e6dzH_bIxP|bNK`GS9P9>_LMtH9ju3vCk+F{&m!#tC7u8zW7F5;c%pufnpzD{tmq3XWl@8M!pAiTNV`2Zl^z>Oly9be3{?G-_DrZizAaR ziJQmzbfhE$*Oj(GE3}_(p=>5BmIIt6?=IuK{*q1zcWUE2wVAY`1Vkwaf;kGJ<8C6B zOr=@tO>T_$$5bPcaFEDwMd2@i(5w)oxA+-fmp~2QaJH`?bwMv;+=SF9bt&a3qLjW3 z%`;$MT!{@%pM}RjU38E(M1;O|ViaBkvUUy~k#97rq^d{VcUgXS+=O^?2`~{=VthP7 zW@AjrD^#^5g86eS+7%WBm|<$q2bvZ!Eg%PFG{*I1N~ zhkaW2bg$A1H!*jBRU7zLy|8uX=1_oTrqx!iKY&`kppQSWm;R9);%3J`W}jTJ z>NYtw@_9{8@+T4nyNn^QiQ`Kn1iPY8Rhymmiq`UAUVCvPt2B2)s3u@ryE^P`0P{{P?PZ1_Jf>J|_~Sv~7{b~%=%RLX3-%%zF8mDy!L zG34Zy?3eyB0$MR~5kd#M{q%E=VHMcl)S>EVSPo>6nrIxIr~Q{r55`kYd*xbHFM{8s z3LDIA;H}FOCvd24#q3WSsL|vY6pe#^dw_aWR&E4nol*q%EtDL8UlLe`vAn4=z^CHE$DJ&y%J;wgY``h5FU{E=`{`$I}Bd`mBCf9>ao5A<^SBghs%`Gwsf$;;4I z)jp?PS;{P(O^-G~;PJfUe|Z#x%c*p2jH|e#uau5$^0o(Usij~k@`rm^sbngwt6bEs z0tJDkQYb(u9C$2mZqP95&3JoTt2g%GT)tMBg##)v-i2a3)3D0%07F8M_61H1tu*k@ zBWDn(>V;5>f6#o>P?KWbTb8L+h!=~Z+k%oA!=j`)~PEL^oT~O4}b{cdtQqtg&o|Mk_e)vs#-K1jLiVnZb zD76ha{x%$~)zki1I=YP}Ti#4y7D{ZC%2GdB`AH#~nDHBJfex_UZSIr`{>P0;QCEZxo@$vsSf zvOrW=N@icQ(TUIvH84h9a~zE{bKh|iz@_Uf7l5>dsyoO!;0ejp@26b|Pn{>TFp$}v zdzO&S$}il2`7d#6HJ(A_Yji5)T8Va80ioGr!>HGC~sGjP9G=Hf3@~%L#3X@ zs{6ucM;~H3LS_d3>n5I1CIJ&Mr&YBEqNfwg{Yr5HneO=8xkqE~WF_?KjNd#Hqj1KA zbmzq$2|MNnwew2@g7*gYL2b1Nv1(%hmgMVQRuRG=m`xambUp{k z2<5k}uQg0_8;gi3SEE0g6Lm(C5GN=Nn3MwXQ-e-RnwML=UK^Eisur?MC}4fFE<$qG zffqWE%10%n$Ebnfc2ZBBOO=IESwD1(>rRd3V~Y`T>gUn;*a@dmT;bN-NxzBI+IQGK zo_O%%!QsXJljrNtx4w)s&7LWbUtY(n`tMyxar`zln#lYd43hI{R?C% z{f6JHD*joO@)K@?R3I-gD4@&qU#N4e>-4E+B+qjB*mcO!SC+s;>xKVBa*JIcC797z zkIn80ayUM~T#_zh+A4JAeM352mS)Qu#(%)LMN-2&Q@rY4E6JU6^nSEUMPu1<3BjBn z1rxEmv;)PJbT#xHz=S!Xy@^XTlh`xQWhk68-J`0`VcssOJVmBY56Sc&m1f*b&4i-g zY0_kCR7zDgUg>6vH!_Ys?lMo5w%s_EBo0`gyV8UdrZgVsgO`XOuJ9rb+f@b{f}G8lAQVoVPHJ z^+t!S4@$gU@yhq(gXYYAwZ}taXJ_)dW(K;IE8|<3FfHh*jd%OBeU~>D?*0Qn!kv1( zJkh6XD@6f!vL1OX0ByI@5RgM!2SA;KK{aM(#2Hwa7gP|XNL*!YjZ|^Ru-IibZa$j&4U8^up62!g|BEaKI&leHcUCdx* zVxXhc=^;glQX=#)v3bJa9UWB7N_^RT&nqw+}6kCHN#KxRhtdnG_pUruk6cJ`X`E94CHO(hr_#D zzPG?f8u5E!6>Bt)nzMI+X`T4a@9JR|Z}8lWBj|3)gG8d)7AUxYSm=nawk}HGdoenRs+B zummr9(zvHQZZlII*i*GR(+D>7KB%)N_`Xn{P{_G9V9dap|Hvhf^ZJMOw zNH>_rh<$&Awy!n6pH+;#?x=#b&CW2O9`8PGuJ^7tof#$TIasZU5Ph8(&ie+sYvEn@ zeA1xO+&+{n=4i3 zK=#;JGqZ*J8YVYO)SINL9edvL{QvlR>!_%{ zKl~SlL57l{L2BrZ0YQ-NZiG>h8W2f=k?!seL0U-(NdZAhN>UIhK~j*AR#NzFKKFjt zy6djH?ppq*o|!p&pMCazzn|yzV6J?t<2TFe-;#=~DR^?)O%mf0mEjRhbl(v5r`cW~ zIt>JtFgTBpi#I=z@7NlcR{h=?<|WkJcM~fYOpQ)PaU9F8Vi^$Bg_bRx08@X@(DJ-6 zfxotSF*0&VX+BkBwcbm5)Mmc^*?mc-9@^jR9we){%e8qvF@HS1Ub7f|C~tCqryvJ= zAp-fWR94eok5pN~v4%A1C1iS49;nO6n|1O>#Zy>yw-jT_v$_1^)9b*?-bzeZy)$S? zkS>1!>8wqW?HxZ@?cdl+7hrn{e`NtM3{g;`U)XWvD?D|`Gk}fr4D;hsD5g_6N;{c3 z2qPd7BA`!Y8os~Q3vzXSytzE`)hD&-$`rCx5S#ysqgZC4tgretP*-naIB0pXKO4y+RXdf7(LtV z&Ky|twmg#P^QzXOoS0l(?S*{Dm-iYcujXF;IQru<3KD!*7&k@gw)H$4-1<&;v#*B zho>J;SI*m-J^QGv2LTc-hxlXzua0{IO^xLNA%;|6=M7?G`4 zo>U&~S}}HQeddZbjr0rYt!$0zQy~FNj_hKeAuqvLJ^4IuKwCktCZd8-6=)<;p~7L5 z@LIL-Zd2}-+|{QKIob|&*&hfs9N|_YYXYYPA({97#{#Hbp28P?4NLd9lm4@@aTTb{ zlo-vlPG(#q@x#432XTr?4}@?S5#JNl~*ivyPh2p;`Hx-agiy4lJj*rruO0l zo8t=)J%^t3^oBX;;qBG@MNt>HSUJh*eVa^pK@>v?4ePgwj<(IR$-qB^;ASglF4kBD zWp*kDA?msQ@;yyL&5zhx8B2Y$xFJehU7AM+Y{JU#vhXW;;eOBYx^&;i(B#&vV4??Gm+EAWl zq@J*Q8)#=@l?AykzHP0{(JQv1T(X8;(S;peHy9iF<^wwaLD37I$!Kn_%+$i@11uRE zimmk@b|Ok7;+EV!vD-q%&sbNJ(0B&o`RAQ92G0)HKFb#nc`C4F{kHnt^IS^zj2fNo zI?!y{hW;GROSvn(1X;dypqM@MY+G)({>f;u`Om|z0w0S#{u|fxL^(+^9`KOeR`Rg$ z*FD6qBM>I+NM|Pz4RNLoBbZ)gw38L45f-7VKrO>>V@Y@vxS6n?Omg>!JnT*<{&W-9 z#+lXy5EQ=0=^h>Cd6D|>5f!dQbyuvv0*cPShCIUiz* zIUx<_Yj?0s@!{LCnowr9ig;Mm&nZIEO7}3lWOh9&`IlU#pH|J35Jn*#WXGu%xz#-? za6$7p7MCrsz&ni|eg{A$#(pxrsjpPo7j%6m^V#GQq(QMuE$xaH;E&Fu%+^|^0+HSH z?Qt<(O9y9vbTx%4)`JjIi9&c+U$ZX6o3tou)ejjh_ipj}>SRRwXypTx3d8Oob@ zHwEaE!_TZCsRz=3T*sW3t#(%|b$qmf>9AdIf-x{AS+MupgPs-!n~A#Zq*5<0+zt57 z$Elr<@DaB;xdgC-q`&GwJ>C$f6P%2BPpr(A{g&18LM*Yo{TrV<=`+52Cpp{UM z)OvPA^v%0p7|!7pddLp4gtC)A@zL)wsSBxCXvhP-xjEFP33$rG_=xCit)80Q3 z5@6+WIxxc**g7y&aOME;-MS32%MPzx=W`~Be)Y6YGKQccx?d8xdP&igY46E>>S}#| z0f}&&Ogj$T8b5|LMwCEMQ05I*YA!AaDHWs)z-&h%Xy(rmQNhP_dr8C~f!BuXbo5yH zNqIm(X+WAUg^pPg9-N*&o5GpGO}PB@cKc0A`#9CfG|@`PeZOg07-^b;XDh7u1Q@Vk z!a?^D_Yve$KkK~DmojsrR*24)k&jqd;Ty%32&pwCAC+16KxNEya%Z@2c&0jm==8dz z*`^HQwO$%9xFnt47KSLmED`1t(u#U=?c$2xF&*@H&hxx%Nf$+PAd`>cq2i(YnrtYy zcr&|Nj^@8(awoJxiT|V`)QRHcQ5eCWB>rg3W~5c%f%NZCxgb5}!+^$aR}1wFW;H>Q zBswMI#%+^|Qp)y?_5}j&G_OAU?r={6no`%KtBu?v1Ljw{i%R|EJW>sf1OzSy6R{dhgX|FF#Q>U zF+Zxk;JEe7rh&=LW>jJ$?r8o`d8xE%A~l=5!E4?H5^-044fFo5Kf;_CCAUhYi}J&v zR)vQ|lCF+d%`g0`Q`0W(omTHJ4V|TLsr~-xx0`(5fpSaj--F-ceUwT7^#$z)>*C|D zuZtf^SMB?EH~*+Lts23d zUA^n%RNDj1JpUB2BwzI)g=0MbMUnrSTABsau*ytw9C|GEKIRg3b&}gs`<-gtLY?-H z*I)GN*=1Efvsav{G65g1jAdX`hbP0HRPlIH>C0n|84s>#xEpUF%w7{=1tB9pR#%3l z1GDbMeDU#EEw zzWY)8+8zuf6m*q=l6(fJ33yi?ZjjM<5xK{ava;Xv)!G?*?L8`m@QqyRF^YE{URRHu zDv!T0TFz4|{Ja#%8S1Jv=1&*e4UX`mZL_nxiHz~?;xNGdUt+xjdEpw8a*Iop{Hy(rd^FQBt3 zJAF-k@aoVt+hCrh_XoEv8)qDdJrj?orYV(p|ER6pt@?*_YNr?7}uRX!oZ-vk8Ie&8kiMGnt z}uTFbGG-e zcknYLd({;$aR-^O!=hCJrk{Auf|j6Tak}xAdbo7Y$k(5e&|n=7qZBLM2oZ>FD%C5a zg<)I(?AEjvU_H9iFK>4E5Gsepk+dWVD2m|>rK1rE9J%{dD|sp}&J-V=W?;sJV*uOg ze}yD0St4q;;|t~OqWs+bl4M0^-{YRZs)zF>`w3%d$#h z_hzAT;x7ONuqumc81d@lSsUL(ZLo@r=lSm10ot&45h{ahLItOq_|@F!FoHqO?mkdi zC*NGum)STTMi-f=)@A}^<_&6$h_?nXGubD&1i%a9Ag6I9=S$2dZqkk-w?JTo8d=3s z5)-G;P1C0cx}i5-9hb@~fA9>O0=PdUg`uUXc_PP827`Uij*DDrz_cnfP4*@!CiuzJ z>+p&A9?{A^_(DkGP%k}A)=zmFi+~vyj`RhYo5~)PX##u?nYEJ`W`dZ*`SSq?K_}GO zGBq?V;Qia0obgVL7QcD)E5rn zVy2+2v{$efEQQ@7J@qePuTuHc{=UZ-U5=220! zGfgkXycUWi=*6WrwSXJkAkUrZN5l$;6(1dZ2j?eeMhkI1-he+9=Cf}rs_j+jU6o)} z-OXZsn0u0j-o{`IO{SPVB_CC z=nI-GMBYq?*5JHyZ(Xx;QvJ>FBNFD32&}OafSPs$*oDWBTc%?5vg-C)SLpJ^jlw#U zSe6Eji*#hbShGkk0Q9d!(Qj_EoJns%ki@YrNtxApq4e0UJ_b%h1ig>s;vY*Q=a>a2 zHkQu($7Ltx78QmCb|d0a=oj_1POjt59rr)o=KH79Xa~d~KXJ zQ_n`t)(`SnX3Rh63zaZQ-IiOYqR1lrH_q9+I&t8$7msx$gLbX6E&mQMlfw6o#>8f= zQ->)-%+QnkwRK>7#V(8r5}D;QWgg1W+j&xWl?@3FtL5#R1-5Y!&SK%ZO{DYD5mtVd zU`|rJMDM~(9T*K+RvdFlO59@prlvud#*oyeF?aIlwE8+sZQI(;13VDc?btKqa=Eoz zgBbdIzZ%KDU5j!cVO7F z(<-^Ribuhs2L6jpft_2G>wI_Or8iNcn*3&K9c}jmGdYd!^6Y)pbgpF#id5h&A zqkmXB-(-UH6oIr$u&4-946%4vXhn!NJ;Jw~S}+t|93P{>oV7o*<-1k?d+DqTkICBE zjw~$gu~OJ;^PVhGNG~B-Z`HdH#IZQGV#;4Xp#z$@5Benokd+LhtjEDm89CX}sla1P zd&DIGaH_ssZM%aR3@)A6wJIbkt+y(4Zd%`uZxwdagjsWt6Bmg*44BzRw^9C-?JIs< z2es4i`3ZP;Z{8SSy_}UCm;jN+?(4=+k%9_pt*vVmY!o!Cta4=vZOwEp$9VrHZYzgr zNFE!UIZ+fE$dtUT&4(d?eg6GiQPx~9n_%&q{M?GKTPg*qz#hq9p!;Dq0F{T3yUF?d z=<(*T)1@+7*LFt?t-O4|eJE>kIV|$o-Al&%AZu^S+xV+%`sh2aTB^&G_pD_;ex<|y zBcTln-q+xv;eDThW`=gopmnTw{~8`NU%pHi(h=WC+}_5#EzN7+tf+npGUR6CRhV6x z8W8G;Iq3jZ!6E!PTrqmWQ8-T?zJM5EXS=cb=~}GBoV^ef#}0onDk=ZMwn3HMtU zlIu!pft7#NvWAZakIdRgcH_~~gwIfKL3GZ_EI4wD;Oal15-88Jl01)%y!o{0A0Q{g zZtrTlc}k=(U9>zMSI6G4qOqd#!ce*@T#oYwiCJOk!x7`Og(*i4SzkIuTpl?gQ(5F{dFi8TY8|_*K*R5@ z$O9UD+9~+%_xu}EGG#yQN%Rm_HK0$RXG8gnf0H|>IRp`+B@!`r5*s_0O#Ap|N?f%V za=xoEefyvy67Z&KM@s}Rs1~niltcTK(Si3`05h>`%V8!@R=@ecE{JrN@2Al^`llF3 zY%LY5$L(pIE?>4!e^Xwhb(Tu<0Wn%n<9PGxtt_37k|z1=BlI8-egli6bhce_rn)VGh;2B zimZy0^euocT@2EYQe2!BPDBSyz%(&nnB2hjtamG>c~8^EIupdwU|uf>z~=z7>Uqa$ zuOQ2>7szM%g!-g$9w7Q6K`!rxJQ~j+Ipw$E9rL}|nL308)zE)0X0o$-rsui{#TH(? zXRq9;0r^2?_|T2ttU4(xm4vkN+vgmZ^Je#BXcL<_tm@9z%`x7-ah3$^1Z)az`>U7O8dq zGz3mQ&^V)76()Ov-Df?)`b!cW7c>a4CN9={-0#z6n1%S|6`mktm(q7LNLTAl+c1s+ zMC~A<;Gi-{BD_{h$WMsF3qy(?{fY8C5#}6cQKWJz9+9#wdd|4p6_E?O9cOq=@!fPY z5VyE0OyjzPmz9g(A^P#lS%A!1Jv*B57IRx0;J%}LU2m3J?sl`AKg$TJ$QeRutVZ+@ z0ydYv$$M6u%v4d8Ae)eyBhw?t^?qXa(sTR>&HcxHOsfgD-g(~Z(T$myGugmh3Sy^Z zL6igN#!yL<4rIF%(#M3KCGowu$@aa0%OU$&$k}IcOfy%1Ic&WkW59zR;a|NarOToTS9{rceoVL>@tlM5dA;4Wh-We!Yjz3Hx_m{G*y~@e;f|VCS8}JstgloR& zx3acCf#l_ag$ioHM48LVQRjSy_IG_ zlLIOO2$A3pBZ><>I$h~Yi9nuC_^>L*tR3J$=8AKRmHmP-t%mh)m zPp1xK8auM1gV3LK)K%_G5s1kku$kv*JROzfj*$^+xuafgC(PO6u2xce&b?1>au||) z&X9@=aR-2u#@ok_xjWX;kBM&F12Gl}7`^$dY~J__E8ocT>zn@Rgziz!SLZsGV8dD= z&boGAz4IBieEM%XrGX74v{#W>b^yX9#3#O9q^ehmj-WQ>s|1|&Y=IIE$CSGGMP;EB~cOAyVtvlV?PR&n}Dc_f7aisry^uDJ|?;J&G<}BCVi? z_UT`t7m`}|)8wL|fNH_xc_CA|ZCA~%%1o^09=wCJ&hd34ylGcU9?1I-hp8=x88;`t zbJL>5nWQ?Ly}@la z>Ww|(Tb}E$-_-DmQ_Pbv)hml!%yCH&mu>!VXnM%t)Kd(*Cu()w{O3{gho2kdoE9rt zPk^ko4#dt2Vzx72F+($N7}vc+te)3&-9j>%`0DV8wS6`ZA^-iCI4+7i+#!ve?PFfO zU#d7&(>H(-m9<6)^VJGq5{JE;%kR6U-ykK%O{B)8R;^Fn+<2`_NkgM&#LgrVJ%W%~ z`>2cD%8{JQ!JIODiQX!^H`+M4tdGZkMjrIjM@W4hiY`aRc2y;x@hs||&(mR}JScDK z!a(Cd3Wd|Ck*h5YHJmcf6uK~X-fKMuQo~yDwIoG_@zQ@6JN3FS>GZj=5MW=`B7qu4 zVO!~};SRrGTx90DNyRK&dfkKYD+&1$F038_54#1eG$PY?0j*M{)@sm{YF04|f&|_h zC%$AhK7>4cTl(M!>HLBL3ye&S%rhOf{K4R&SvAbNNhjLZ98?zG_T^xqp^pYq>`JmY z>PmxLyQ3GG>jnarpMER+Lj4Xuq34qYJ;Flgg&T2i69Zr)`Sl6YD3-}t7|~H%K#eu| z8;zi5N~m{~rMG$qv}bYX!Y(c|Rk>MdIFR9p+XGfOdF4scv04gow)^X+pctdZV|fny z8GALrl$zia?m5?sdRmQzPE+giyn{ItJ7gG>ux7C4$JkV1O%1a4-q#bi0e6Za4e58uqlxoi)c zS#uvKgAAv3{fHxU#=)3qreKty!v54{n_hX9;u*t9?2675qxBsZH{J%bleFhdN1Qz+ zkAL9&J}t!d{q+gv_e1e7y4Lh z^f9cykJ4>tI&MLN9^u`HmkCriOBN9i1cv_&)DgZueR&1vf$5>{7WBv^S!4y(8EFcI zL*?z3y{H1x0=KxSC=edu(U)VpKS)*S$2El%lazmj=oJd^`KAaBJ>RV+qq?;-m#SS< zR--Q)!(%M5Q#39x>d@QmWigF!ne<-a!vpfl7F{*C$3MLq{OTBshs5ZKhZBL;Pg#!0 zR)sBb^SA_}^=84TxTG@*Es;RIVdXD({hqZ@;R65fke9;dQ!RGBnAMPjP`yMJC^k-T zuUDlp;<4UjWUGtjwQB&6pUQHCp8RtEVa!HM_aEA!ogZ$Ds|{r@T-*BtqlR@+(*>Qy zw#-l7JosTZU6Sp52TeX-GglstoRUm>5a+)Rp{TaUg=FLgc*YOJpef@J81h1e7~`}! zBfVWUew~sMQs+h$mti9>k52m6Kr$zneeBLk6RW5GEaON9t`H8QIYg(T;eC{w{_4@g zN9F)_$;inXb)Uj0a4e z;w7k5#7A08-0#n+oc%3Tr;+cEOf9r=3MWild}=hH?^}QL9`!%!ua7AvXr>2ion@n; z-RLKcsKyA1L@kA!mm1x9Vy*%h(bH(4kl6g#aU)+6bi1#6Qv?@+!4Pc^Kwg-As<0^R zWz{2_KoA#Ck!B0kh8iY1Obn&41B5T;hI5|>iVV5KlQbsB}`0}J=s)o_EX zFzo<5*?AF!k_VxXmKt+C7%?{(9({p&D$RWtBDj-OWn75OEy2RVAInKd6N-_zgjS+fKn74KfW`> z^(09gemBm0v~*um344sN*?8NRLWEEJ*HQlU<%S{L{K500&$2V zp;#uA-1!Y$2jJG{zx=&B{gZCUQ%do3H%a#ZPWj(TO{o)$cu^VwFFTpxX>6)bxdl&0 zk5D|67m2GaRJ%`-ek`b!AqWQ)*k#>r{e2iV*t!38_P-5^0cn|18G^~xgt0<3DU3-NX(O3U55I%3lzE5c4PU5MfFjpQAVI!< z1l&#kFKL=ygn|5Nw>iTQ!EhAmGhWKMEa(%G6iGHE<6;lW4#h7|UotA-P7sJ}UM4)b z@)EKtWia(x=72=tYm(&9C^(bNe?3VTQdx?`vcQ9DQm-^m@=T8Jh7mySqq7)n@}li= z2@7kh{S-?O$L63gB0VL|<$#J7QaM74#s3VGc0ZDg!JQ)Ey5RNNvVpyPLDWG`0^KIk z`p(S8%T1jm8ArShH=TJa9r|sz^x>k23WBYZrjj|mf^r{y`Pva_D|Z*k!M#MXj60e| z;t3ABQOY+a49~xq?vW1$d~vvAIb{7Vd~5heg8pN%Em@+sJQ+>9PdiXGJf9llk#XOd z*oe@?gz9|dH9jHGBJEx=7kYya`gbP%3^LLjNk|)wOvNh%-E9h;C#}x zCBz0B01agT0rA4xxzFskuOQTU!gPHJ)_5|WodH*pX7w78@WYoTMic%Bn%4+RyDkCJ zrxQCghVY(!SH|U%zpx_ZLNe^wBhv1&P;`nr350r(ZLu_@F5FJE6=)foEt}Pg77zCt zQlC7B-d|~{AzX;Tjb(=sLwzXxq2n3^A~eWqmvReLrhAMqf&Dczq;KB{!Ey}xh&>TW zAo)*2NG?s6!krKCwuQFXr&*X+ohF)}?n?!kRg9biEgz1nL>2KF~Y(LDqNk(i|p*OEe#wf-}1O@ETGe zZ|tZSBlpiVcEN)$2(~ngQTi7Tnk?}J-%e`5nA1=LWxH*h;{4@l$As6CZ)Kj|{6kKZ ziPzJwwNkOe{ahy~eX0$cKKu@mzX)^JNXsi5dPNzbNC4S)%-U!ZuP)1PxuvPe_wbp> zwA%C@UHLjgm$bIWOEy!7B-_l?)9M-H-DKO85!Na0b?lI@8_`w+>|=fu@08?f2{=v{3bXLRJ89K&*!uOn`Y@)z0ZkJ!TwZ*!a|oX-oxRb_+kXr zd*uu=LQp#sD|mn$PN`v7uNerVgwjEoJbvl@H34yudK{?}sU3I}Y98MDqY@rrshrW` z3P}pjKvZU7T?$r&8?M=o2KtBQi@-GgN{%wgz}xO#KHuOC2(5`xlXhvpl3Zx1k|p4E z1vyut^tOcXRZp!;g}L{7-wT%@%Hcmqix7AQmlPheuKiZj>F03aqwwoZJkw$qZ#h@F z&zx8S>(iGR{o;Yn=R|qGwaS8+2yU-phr~abu)@Zvfsn)CE*!UY?=)TZp>CXWp)}li zW7qVtA&C`~{p>lkL*;oI8xPJCwq0*4ISMD+`mL$I)Yu^Y<{3w7%7IYxieSEsVYk$x zCtn#AGPl5uB+KYSuu(W0f3_+{;Z0lWw0_}_%rrsV(tipFcY$w9z0A%Rzt;61g%r~f zu=M-}ajsO>8v1rR8Bh;JwnmwdENxL*13JW>U5o`u7_p|2H9+AU(D zyX?14p*CTS6u0aSLp-NZL_MhM`a)gUJ*P^?Y?LX8!d%?x_|x!i2c3yQ$Ub{hDpaje z=0cC*t>#M78pcbn#Jp`39Pw^ml6SLY{#E|2;xA#pCr)8^X7?R}*abGjI+y<5TY9^uoJt5mNe%p9xTg(Ng(%xRd6< z3*c7Kzk@@REc+9W2cU^RXI|F~T*K_S?qy1$gXNQShQn%K%b^B%jk&bAa$VLKLmYY*lTzZM20H~?rM#a^HRlUH&&k0R<^KFg)66>^H>f$^C9_N zXjz_dh7lQK<$Str5`c$%B6PeL8{Avkk;zK(*p4>y<0H&XnH-b#Vc~)!>JSO#3!&mw)75??was~cd0jsF8(yhPOa=;8F*pjn!sEBx;cWrbk z{jjv)6pfFK|Hy7vYlC8uwjVB3=~S86hW_2-4y3KTQIBEiX%Q{r9Cmm|xEP5r0y?a? z;uY6k2a?`vDH1zk`Q#1n!ry1_-4-h^hJBmY6pT28i$LkxJIFmBlXc`ZdR9GmX4>%> z*p>29!r?_Ruyl|p02&kMw|D*<-+wAtmaz*4(0>itSmQz8lw09E^HSXe1h+ECcN)+- z`4qRIsuYa7c*4+nZ;!K?+z4^2jd*mFZBRO4H;TP5Y7frW6?ZOv+kuSY;e@&Z#lEc5 zGOAkFkpc#TykOxo;W4FCz_kQUo33g~!jm3pv~uX?S(a@ct)!1dN*E?h$-^|7H?#m> zy(dA+n!LG|jsUwi!9g^xOYmD{&|odxoQoVa05qnAqLk{RPPT;UMQsIZK_Y7e1Xw5O zE|z@c+jz0?J|b~6F>YZ$;m|E1#pTr;4cwgam$gKraZgOYMo~g&KV4s%ReXt@l%3Vy zVfKttS7JKQ(hsblnu|~n?dG3!(N~~_M1~>Jl;NaxVG@v(j5C&pq%M5bluOVldh>FZ z+`Hyq1ebc|3FL8@hfpQe_Thi`B{vR<KGme z*cx|mTNe6jPE0QWb_=sG!l)*0t<~ou#|Kf1Uly~?UPMp(MLgd@M2z``%gtcaTN03;2EslB5OcKsM_eX z9cD*4}dEaEBgoR(>WjM%(577YU{GQyB3VRN}6;S6T_nwk((cJwrmDuzOP&f}Xo z z2r~C-UOiUJY<;B^JFn22gC%4VH{#j*YEI+c$9tUY!?;u!`7iT>U4F>w3gULXY(nq# zS(4#@y(8%gM~*PpYH}u5K>_i;69KF&!iTP7AT-bRLg=JP%*18b^Kdx4Pn^Wht3}A# zP+K&;g;2At^wz=0z1{47RiP8jdre8m#h)T36Dv|J3LYCDpUhx`iPUd-Td3KG? z`_1@mNX+e)+xZ4OFD9|rfwXAnh>uHsjYS%pVv}h3Q7_~a3E%r8+Qfl0(HikQ9q}X> z2{*xX519VC>$w<>A?vdkIGNPOMlbUJZgFbILyyT>GFp6*;@NNu)%n$vegtPx zMKVWCWRahjacbh4mFG;SQKIXaPGx4lsIbW|9P0Ls7Hhs%E2nC$`RYt~ z03Ar|JKME8CjN7<;=6Cz7AlK=<(g06E}3k#n_}hMVp*M(dC^^(@B8tqZ5(c&G`fqj zo2I`(~~pgs*V24 z3!V^gSfqklBd649Zp*Vp|C2_brQlj;A%Wg$9%np{II<($z&JB1CRjS7&0K#D&qGE{ z#}oeQd7Afr&Zm(aj@RMICT~9=I&senco&D|zhoULab^jGtB+HAUp+YcHMl6;f#k@U zk8S^%_4Mk+gv3-AkY~uO8*zRwMFsLqKSL(aggX1%?;cv|K>KLWRcO8vYRn!8WhHf} zZhe_mdLWkgoD$<6Fz-q~7L+QyPQ}08Q!Y_BQ?Fy}(61LTR=)@rBVTt)bl)wO3DYf^R~}~9=<~~#MErR>Gklv9^ZxmzQogZd zsexq7B9dgB$>fRqgZa=_;aAT+C`cUUuHz(|^pKnj!|Gej0>U}{n@C-xlXLBN&M9pN zM@jPmyrTcja~P!kwD&e8$6EXQ9zx(w8Jw|m_MnXlCvb8vWS^&=X|avW$0|b^Su8Ip zSdQ(&@@M(G_%g+Dd-N<$qV|q(o3N;jSSqmHYbDbK=@VO_v!te6L#C(kxk@GjE55@q zXSa_`JRrQ3TWg*n*!6~aRv(P+zo%@+LGxj1nwjMU4-lOsBIeXqzLvuY_orAahHC}d zPVPrsSW>6wqD+KELf8#F+g+qh89P&Q1*{@0oRk*Q3D6&M`Hd!|*>h_Dc9}5csHbb( z3J(+MFRG1e#0}<@xHUxPq85(sgcAx>s(IPYy9Ko0{1pW%l{wUbl5itfR|xQyZye_IXEPp+HUIT4`(IKcTIs7>mJVTUf==od056qG2|`y zNg+H#$RkHB_KcL5EKfJj+bX#Dl!@N(`LYBD6`3)2+`GGbZZfNJuD#7`y_u{vb>v*L zH!_=PUYVmfUB23y7nbw)X-%FRMjVR^eM*UFwjRJ4F6>c5|F)9hlgVzpGgLe5T`7$^ zt}*_}A2E?*QgwTD4{R^{kkuc#okZQ(YbT>Z`{?He#H%BU+pjwq$d>ujjGn*1VSIAafdA%3w{$4Ake%_EXWe7o+K%1 ze%eOLiW~gV>)ImBDke*efD37t@-C1)r`e1X?3W&QODwFh!#U| zgnQKRcudKBGPW3&{$DQw7-OpYP*(IsqS2mto0l}(6%XP>^DLXR@I8fB1ea4@zKe`c zS-Y*aV2XDlnr1dnNXtr}FLJKCgI+yIiF;l z$^Hm@j8&sV;$AC=k`^F2a7fYmJkuSYGm@&}^86)IjN-QuQF@`=lWEYQmxCF}2Px-e zVq3o(5DWkvPh&2N2Rd{O?}iXYgO2k((d4ZE1U00Pgb3L~TzT^1=%qBS0%&XRBb=X6 zi!=C&SgiZ(*m$ua^dWbsU_CYR_-NHSuHp4})6w~)?fWBmXow(7GuZAIUZ&d9cEzEn zcCJhe#3V>AYo_F8mCD}8e)t%K?~0n|=cI6*h)@g(a^it~fVCBAsm}JF2j?Cdo#vi} zpn)xQXTbQ=s2~CTpHk{ESQxgFeEXG=ZfQH_9qPT1r^02Smz6X`pd5glEb+1WxXQRJ z{l>gF7;i6%ctp$h)Rwbd)+RCEBJ7XfZ-#!$3w>dB^(0J#F+Mm@AXSg{nF9@zlyA>9 zvlt0>dToK)n2k^WA+CcQ*!m9?^v#9S2f5R28vs16&n@*&>sV81l@Ay6CPeykRQ*Kr zoOVA>Yd#E)W*5o&~I=S@9Bx8)IsRua`3V7->Se2YHWh9g(*6AO&+q4fk`rG`B{COj&FywO-3T10`fTn|?B-f5t@bA-N+fD0+hZ6p zEY*qW>r#1vTPw30;$pW@C#jl*KUcUs+8?Z*X%}B?|o4 zNRz8TED^@HxKMOaKf#ZGDp6I*(+!W9oM4{8OG&0dXY{FO~XoS zuW;PWBgd9m@SB_WyAN4Dcdq^2vXQr5awRQ4-8o%lH0=3qn3D78Zd0FY&R%N(sahK1 zy?}+XG_-pd&zd35*S_(7*9`i{OgOa}jl4Jhu%?BDt|46)Z`~r+9lLdjXNkkFyabgtSCe%r|2l4{aJPP-9xZVc{J4ozIG z=Dww;jflx^+)`cKbBL9&=NeAb`o){mhw^r@QA9g|P2w~zdUL^zOV^_w>_2j(YHQK` zDqm0`#^jUY=Hh+PDPA)8`y(^Uv!&t>sW%t`FAolO$Xuq>J76d+7t<#3sDh=umR|9Q zgZa%d(Xlt=uSw|_t9n_I_{Xdc+H?043pR(nou!}dCQM_9w#fwZ9LxqF&;`yKL3xnP zg0G;Jdu$@z<$B~lXgfEfA#c~B+msR$w9p<*F=8{KoX2*Cee~c_Q|!s>>+KtB$3N7j z8YruV$r-};7pf3OQmDtUe|$SiJ*wlXXV0J7iGpxXg5W4EHRnV6HF8=LJqx{ge`*Pq z6Y{P+B3QeFl=3ibVgH-OCYAt5Q9&R}y`ZhmVp> z7&X~lKp$QhEf8-bEOIa`l0lmohv{P-!}9tdsRs{@+qBn^`lB!SFmfQt@ON`zkEIZZ z@8;5pvx(xYv7j&6U|^^3o?F0UxzM=E!n0j;O&CU54`TTW+l@bm^T7GJ+^tWTKfz+0 zSWjd0xSKW}1N;0{k@>>!yM4r~i3Uy#mQrSGVT@xEgZ%6xUDm3hdNwkMRmu(#rh4b( zE&-d~!d?46s^JX#k+$Pie#88KtaA4pMniL~M$yt6Sy01KW?ynB!I;ny#2P2_15kmr zFMFMbRl4sQONC#4p-!t(DU%bby-FELyeJZiOvdJQa(r<CS{|CR@bk1ZfhiAgEm>=-agi3hH3CkZM&DW4RCu7aH=<~iOHC8`@E<=>U z;7a4{#iG}t?yNvn%KGe&zhebRG7PNDMMZw|5o=-1(;g6?;-4z;vy4-KSZRMoSrREK zp>29Ie_{@DF3)n0Y3UbTgqemGd^V)ip!?bw#*LQj^p+ueKh@lFfxLy|OSju>6|JU! z{7eKhhWV?k3n7KVt}8uipGTXEYVy1?loa#$oFgWE^Y2&R1La1a)xv8aFMvZ76R-#` zJ-$2HGX4??3!pj!rW}wlq`G5tuRgd+qwFp@&J)f5x{oxV7s*Zi%^2(^KC*tBa)yC7 zT8KM&HU_0->7*Jg)ki%2^`i+}@^|7R<7e++>Bhfs8|82QXObe8LT@M{LY?F49fgN} zG;v7ua}gjgefqk}0Urprv~Mol)TCUN7F%lXKp$+(YE+Z*VW)10Wr{Y#odOVN@}wjm zexeeOXPscM1r5=JfT<_IMfpVqW?`iRzpr~XjTY9D(+gqpIqxm`5e-W(>8^^sxw7(q zSFZnmUilfx!3oSPu8{!O|d83+` zP0poRMCT0Q^28s?L5ER6URC$;x|(+PePUTbQ_apvqO zK-=Q_cKs3f>1rpELqp|2f|>Q)cfX^N-_@H@Mz>7+V&I>N6LKa=_7GjxH0{4v}ih~2K^$pYM+pR zvS{FGd$s?op9Ap)Z1U`yrXM-*`vR(vRW*ma?m6SCj~k4f(3K464^ zCDP(&oYr5UJ_#d=d@u-9!44OD9Ub$8L)#wYg$68{Z#DkAXSX1yKgiV_bKR$E)l)hC zzLK`^(c?IP?a8P)`3bAVsBhp3I z*e_$r>p1fK;B7IQx^L3)@8*kK%JLoCJ%c}E2jM%vK;&Z$u%CQ!`L`Sh8P5ZfPRrtu z#uyEWn@0;z^&1NmOj1zfr22RxqP9$S>!mbNLLhl;Pf{3kLug_exQLw}u-*J4ID{bx z-2B_Wdc-=yjaiIp(pU>SFOSBa=LcSSJNjr#{%-IX5uKg^+hzyztHYPy>3>hzwZqb< zHVGS*ZhkQw%-WjMT$gTq%j9m`K?Op7Yid)MVE(!Rmcb(Os9%XU)8IzIcdaNF7NgRw z$NGgQjKGa;$`#-dI;z^ww`xJI3ZOC%Tim25jY#gbrLbz!^RXCdKn)*&^#TDMlnV-n z-^<>9*YN|y*YaNU%U+RwBWvCVwiKd3)Nj7D;D)XL`$KeYo~Y7e>x&<%yreuuP8|T# zbOY<&Wj3NR^RjF`->P7Z4&uulqO%1y!A}OF|9yal;u7#MdqPD_pJv(Y)Q`!;`W%(K zI`ro!w*UXSOUy(pa2uHsFRJ+30|b$zO)|FHmE zgz_KUQAAk2a{1}l&I%W44zX)pozn;#1fEp;H!AUU+0GxhEAzTMfPt|WpicSDtOyE{bi1@<)p9>z$ ziZa4rVOY9SK}~C-JJ%FQqFxGLxgL2QAR=}bfn~I%ie=hG)^f{)LS}Vt_`;;){97j_ zb(+VR+cz+@&2`*OD>1xt;L)5>ECcCN0&3~+{C_(*>6(3$J->CWM#>YKSX+&_nSH;g zAutT^K56%WjL@zT+wH2R_vi!(4iD<{2jx0s|4kh5#t4R`$(OO2L_56uC>G**^;0mg z7TE8my;RkE!>vC4nKCT-c&ndeMmtYRVx?31`tm4v156|GO+RWNeNS=#PWSfrO|QVC z^YcxjOpNs{roc;A&=c|F(8{!+bF7hvoBVbro?L>{PkvG^BjRa$YkD&#Ujm^s%#uIT z40nKOEa3@*$MdN2!)^2HIY6SDx{h%#`QOaGtylokB{c~uxlp$GK3m+rfKT}waMy&> z3?QG#*DwHzA=aD!%yAz?`Z!&DU%Z<70n7tYq$j{wqUxBaEjG(0d}s9jb|vtzg*I-D zFM&W?-R~N_Rn|#TX*Wa;;N&y7*whVt^hfI7m;bFw>^C=%<^{AygZbmaSl|`r0(`eS z)7h?lw2A}SKnQ4EE%ATYd&{UQ*Dh+9lud7tl-hKIfOLb1APUmmA<~jcBP9)zBBdbR zNJ^)qNS8E-lyt*)-JJ70-}}7(zklBtXN)r(=YV_P_jRo`*IaYW^;`}TANN;M$B zWLR6HOo4HJK1fnjycx-Dy%FSuMfcU zj0HwyuF$ph!KQ?qWayxuPx-C*m-Y)85DJW78}3kYZGqFa;I#vtvK=@Jb>gQBZ>96f z-a5CUlLq2*V4E|;5VB~cT8ZiOyOR_y}ys~%bt4gZT`#T$ZC5E-Fi zze+H^IH0lgEIBlf6kN(KZ*q*TWxMxK9~dd>|p8| zRa3TuESYMbu=pt6gph;3=+f(?VIEg2o`C)lS@sy1C7ioa)cf5JkRVw@jArdI)GVYG zUHTHt2!Dr%&{FS}1J9$*qd)+99#Ymohq4EL2=~1i8i89^qMp5Q`_?JN{OO~aTrv(^ zj{n3xgGebb0ZVbu_i~x0oo*Ex3mpKy?Z8qmy0gA8{`(b;W$h=i{T_N~fT8STOW0EO z`)3v4^nJNWsQ!^s3^oj-#XaA!lDtR(;K9a!5IjK|@`5t7wx2pV`cOs>&wY72xg8&d zl-)ZZIByGOOf-jfx*#RgXo>T@f&1+^b~FUd>SgLF|2`c~=;?H%Aq=76z3q3*>97es z4%W%JB#Rd}A(LQ9uqCuZrdHYH=q>urxSoSuN62R2Ppm*Jo|ikd%TbPd{zTbD-}Zam z{^aLQpER9_Tjbpf^kfJqaOYZ=TqVq2fN8f&oEcpenG`1Y1 zd!fskDgKK;TFozWKfVeB0%?$|gPS}@QJ}X81j)!~y$tH1Mvua#?miQ0byRx|xE;Hu zv$glGy+{`+whC}H?W=we6y3l*45rY?jZueQjczR08CY5tsm-R168q^K*mhAE?`|Bv z1ZOrsIHHLZx}H@q6qp;F<&?bgpyk;hp6Hfw^W`dcLQ}ci`Qw3q({7d=@NHTJ1C#ih zL`%8G3jWyWQ_m4vp;p0PSs0M^)I$|oJ#%AF-cE&^A5VeULoYGFR#;@3WaRT&KdP3- z_=BU|AhOndJ7#@Q`@TLJ(`uhv2WE}YLi`;!!r-jgfl*7!{*vuK^>TSGt|yyo0K2`w zKnopz2G>;;Xac*1%Vm$-X|8_&SDN&{qKT63qKBWRS^l}sZ{I;j*k8@*>lOH4jBh#E z?5Znp5Tm~)MdA@@s16gdq))Q-UaK{@(>|GmiGVR#dZt-`S6@JbiD1NA%?$!jwn8f@ za-;^MZoghz={dHdo1x~X1t8U+WXa5mo8Ei_p;Y=1!ac`@MpwNg+Q$CVRM|!skpN!# z$mZU?g_{G?epEbYQ2-5g15T-FvY1@L%i|Qm4K6FKKiEb5NURZJmkXGP4{GO9! zcyDC{pbpAJHfW)*Ub+F70Vsv2NOxZ+MRiPqh;6}eTXZc^$8&`paj3x2);1ofIQiw-eJ{;t zbIm8uWw-Y5ha^yCULk5Sf1B}3w~PY7&RFC?Ymg2emr&H3vKME2(Y}<;Ur}}V;{!n| z3b*9&g(*Hsv9I5tBl!XL)s?z@_wPYmIWax?_v7eNA{((((npK&9(3|Jzn|O!De}#l zv^C%Sl=WU>;cMuUA%h!qhXz@hAawvZ!KHNn>Y24;@B|a-_nPw952rwuOuD*UmmJdf zAM()^hS6uPI!;7TWBK=m`8=DU_?2bewcF%U_YPQu9>I4TFO@I z;iaCScb>@a|DM=FM0t7%UZ3R-Ry;^+M!jM&FV|9DE?u~_d>3rytldOOsG<#-f=%gc48%%a>C!0m_EF}^!@EZxa z{mfQ4uFSaVxBOuJ#j$eI_$_YR03FqOuSG4cFy{NFKbTdz%C_*qZngw>!=kGW^2ISc z5{iv4k{4Ti4ZlBy_X`HoCG3LuNRZJQ>8XJJ5ejgQdgOeT5yeoP1-nJ2^qmLB?tZ>Q z$%2{WVo#lVXkfIf&%M}thT7@E!l{0JHspKH|HTPN(T;bmzAY)D?{t>WkaC?j{;#T@ z9L4v1B%{ArbqiFlyg!)%2k=qAepX1%o~D9&IOz70$z*E4S>CEhvwE+kSlNP58BRt7 zN3u0v_@t^{?PNxKwHwA>LMCnPbV2(zQZMEX1Ir(lEy`|o0%SN0t&H*g)AB*FL`r;X zBqv$%Z$hMW!Ooks=KDvut5!>|7n)?dWOgK_Opo;W=nM4l0=doNk?Ye-fh&v6@A+fIVvElT^P4kck| zdVUALHV=myFhF}3X(q@uu0F(T-F5ef#SqtrJz${%8cqFp@Of8Q+% z1dhuFAva97-&rB|ur?sSo>USAlc9O0l|HD$1Ug48+&+8Z+8MO!Du>UqBF2monX9k!etgTC;P&^X`Ehu?6bP7u{8 z?4tyw0VfbeV_FXFww#iu@h0lNdlFI0elRHNux8(o@}6_Ak^NkJ!pTA?N^&vqF0V4x z$GXiY5XQCG!Mf{Y0_cTQ2oCp%iQ7M-OuDQSPpF#27*a`ot=VP^uGaUD9=)Obs`-w) zU?A3QVXeSk09+KS9nA!Sw?U^5Ix6;_=JXe9@z&sBDgDOyyXRMvf`PyD?sfa#p&Hxh zfYY-zbI2%ajYMwBz3Ct=R6#z&G4gV60|0~1Q0Ak-mZWD6v_9w_QsRHSP<+aKOU$ zJ(r1Tm0rxtopHP=P=INcmlt)%(1U{0x~SV;Ar_1Z3lM1&xuYz}e8{Zw9RDzRXsm|D zm?2d}`Q*nKu~_o(jjV2SOQNe-#^6jA59m^)&;Q_p`r%iQWsn~G3}6#!R~=q~TK;SD$z)Duk3 zNts&!AN_76)(6%kb&z#T@9Y~vfmDPbVTzRKZ_>vFAbl{4{>l4X0qIN3SpvH2HB@`F zCMmvlLQ>R-T|1kYdfPBqIc9|T#H@YXRAg9SAT0U*$zvmM? z!eu8GB*km}G;OaeKU-xl^H2H+l%jzim5H}ExPWb@$emzf@TByE0zPAc?E`HCsR(0o z{{+-jYL;?9jYzL?v83ec3_`PX>50zN%{S*0%~47m^XKC=6#!;_C1pkNN8%~#p$j*s zM%0i_$;icEfSOB3yKo0&`&jhcbrjBHWEoT&s!&=1e@j3$Z2_&pwrlellNnY zJef{4CMy6cWQ?Vb8Fwa7r!1+9s!69hU0bLLaW-+vy59&Au<4_khEzLJYH~saS2b`+ z4-R@RdY5gMbcLEkb>|Jq$!@t7-EwqK{N?X^--iCr&_sny^2{w)n?pxP0ly9$d5ibV z%3N2v1uFk}91V#3a2fQ@5x!w_;DGTmyW!~cu5?(Ymlhk#A2=$@b*(lBaK=sPE`(JC zx}Jlih{ukNZ=X5`AcMGuBNdDw+L-_ib)_ZE{nwQ2{lB8UjonC?6Q9k4hSmIi{bKxa z{2sK#p(Z73avJXqitYw>!bq5bamsJn$829yBw9o;NZF$10VCK(i2kj7t5wEguak60 zOvMqdn~&!cqK>e*9~g$QJtP4v(hb2PlmPBhh9@Q)vSG_^OzH@f)>910Q)$~O69Z5d zUll&UKqg=tXw?B;RUo3o2O!CyqJ&_$WAma*Dp)gJK|FM}DeDJarK>X=EeHXkIyA>LH``Cnn;aqHQzP}G2>XvSVJvO~jw zxxP3aV1Z*JI!+b?`}m$4&embRr`}e6odF8_vkbjIe0C6wg(o853?05EYIz>Y5ANx= zVyU7DQgZl0wlhJlSk2Ou#;!6P!sjCMf?i!;d*5CK_YED-IM7@@f69cixgs(KxhsIJxSqY8z;V|)pYfmUt1(EOEkKqL!Q=>5({39|yG+HLC9Y6+j zaDDF<_+G<3Hxa=%NR~Yw_k@dlNAk>UKjWZQbsZAWCbHHVEwX96t5R4WAx)FY^F!Ww zB6+o;Y{tQB|5#JXcwYA;!|{jb|4^eZ?xApGo)S8MH8ACOWs!K5Ka+_Hz|7hHg9qzz z{rw5qwB?PZ{X0<-@wDSIJxH<9dZ1kT8LL~y7OouhwNBNf5;X~jgWnN*KfZf0-Kn3a z`|#Bhf3KEa6v3<}7%_(y!n@48wct+Q*vpC1(Jz+XAxbjFMkwEb=teaGmhDAB$b-}x z=!!MvU53;anX9_!@t83b6~;a$woAj66 zI|SgSy@iz&-10&=Bq$1}lcwREbTD#>Sv|Y28htyexFmQ5TK|R_Bp6Ph%A!~N3BV}A z6_+uCwaJWk(9^z>Vuvy&YSYon;#+pxL&A?Csstck?ZqWZ}$ z+XZ$7pcuExWa_|(RWFwYpED@FfFvf{r9^&TC!?}XGpa|%f%!FB)_$MS^(!o4#yQcF zzFKJlinXP1?vlsfvOP1o5gBDf0osfD9KHg8*xOopuZ$=5QfKScfSdanGV0QIcm&BN zQH8{kYG?9uYc2rtK7`kl{KE&KBa^5Ga`&6<30m?nQZ1Sf9jn>N=f-0f5`wpS%QA3= zLRT%`R^ORK-?0_J|2XSok@{)mQ0+FHgJjh;gvt%(o~Ropdf$m_lb72x51ty-Qdu+y zjqlfI^^ zShO;3pPR7ORRt#jWC9g9eoe+=dY1LcoIAcdZCUB9A~_+DYU*8CoXnj-ZIesCcZ$vg z@T{Nu-(2~jk1UlB6Wi;!I|aCTO6kM&=s#;X1BkGW_#*qoWuDE-!pf%VW{9lbV+OTJ$z}*9_)lW}@ z-zE}#(H7ITKujJtAh*9x`MsmuS+3e4e}j()l^7UWV{FjjdVz%kdkHf>^}H)@0Y3Bq zp2+V^xIk5zKn}JG!9Q;c;*}8UwYcNY;*S9@$20SKJ*`qhQ-`r99yHi^id0qB3W?*r z0t(}N$R}t6%1k4lbJXHVe}q3K+6;h9w@R@614QAimQI`j@vMXxMHg^vP^efv>b0C6 z`-2{UroTb|lj3eR$);`vu_2&RrCBuSC54hanecGSZ4io~;%XbGki7&)DB{E8r;+R` zIWl_FW2+cqFFbn>d&s5%nZbuLO7bpG@uXz87wQ+EB^NxpJ&7am#{qYkdQV;GAHp}% z=V;sl|7qQVfarpX0^I0|3S-EMvSOIeNEN+`2e5oX5D4#W&^aN^#!Pt=7NlMSiT-0) zJ6axFC)Ej*55orQL5DLq_4#mHkj*#HN-{~Qhfo?sTbG|FUP^vvm(xGX@LV}j4BFj5 z)8qN**X(J*H~GZ^Dlmv$)td@gsYIlG+Dlan`xB&D0mA| z%pC}T1vk-a7{OAo7eLwgg+D=Kw_@UrIB;mT4Qyy#XnTUCABsAd<0wJGt=%Lw?Z{Qa zm!@Cq{x}JM><~G;hj+^1n>!2lUfDQ5~*lUhFuDAT_Nqo_jJ^JIwHL#3{1L zjDYXuXdZRnXAombhS^?HD>pbv)7S2*t36G4$~W*JHmV__dg5D|Ptm_^UHRMB;fe5r z%h$`HVLna(-s=J=&A949FQOZ?*#E8BX9!-P%LMV0)==pOsazWQULC27(#o7hem#ep zHhwi=$DV7)U4K9S`;XDx>3<^Ky>u&WJjT+BdWhpV3vhG7Z5w+b!I}1jNTVN(CK;Lu z@RBp5$A)V&=+kPY1A9coV+1)=qP4u64-fJI%bF^C;?c5XM65zf)s6N(zPX{Xl|kpV z`_!!nfW>P%T>iZmEjbijQjOmCiMqZ0!$15-bj^>VwA%FJ14bVIJOLjmSwGO71z|bg z)wx(AZJr!I$?dwG69rc+FUYSup)3QqEUqyGbC$KB0Pb)-rF%HfwKwx}O|ee6&dgrU z)c&dQ&Ktt*zqt6$HmVJPM;zcJ-2#20YGD)8oi8@j@}09wi0S2(cXdJ5RX2dVCuv(j~0B>AJo_qPC5Q9XaTCb-7+e64)B ze|a?dF5NNcCrkk<)0|qbQ%vB=0#*}s*;v!xAnp#TAKb>9feFH|P3;-YnbTZUBS@K< z`oW|Kpl^uRg{Fv)zDVSgqC}gS0z<&mx*09~q$WdPWA{hAam}e}{=3LQ{Esy*N!p3m ze&z?p->+WU41{lV2LoNfPq@B__U*y0hn)^4fo9zlWYGt&_QL3UCVAzO3fjN23oqvq z3oZTw!|nu20X@zvdg?uUeuCa}aBN6yD)b&EW2AlsZ*kyJ6s7fT`wo@msCPW>YER~# zd!hG~@7kp^7Q{0BnUt223hYJr6|mi&WLbYC6v}`*Z4<~ClyHWvTo{c3mp0fVpNr}Lf`$z$Eukmy(WJvp6jlusXs(bm6ci6kB9^a&@46r}YzrD?a& zMq}ROlAd65VNQJNe}9fj$9ROfOBD6A^vyZbvx2`?WiaBMv#XPNL1j@Y28u*sZ3eTP zw?R?8Kj;uL7BBWD#%@1Z4xOLz;s)6^P;!LMQ78q9|M5=lbQ^i5nV(tkZ+3q18ci4Y zy}d3QZIPBXz7HdQOv4c)KIfvH?eS4;1`yA7U``{Ua47T8bU7hR#hQ{YjrKN`Sfky? z?i3_rPXG#?!EqyB?*l&@0o%#4l9_>*8&Rsuw^#OAzu@-Aiv5rp$ZN05Ql)W--2^}f;{ME=&Ef_o z4+HkYx6_D!aMFM*LOBoG<>3a1<{5n7({>38ze0=>zVEA_hwi#aY0;{<;1{?apf${y zMT@ubChtA?0oJim-CanY@z5@ViIJI2Nx8IKh1(pOf zIMwr4JMIKkTc8Yi?l*1eU%Ck*HTVQlY6L(mWM2xDE=IO&&@u?`YX4-4+734}r8tS! zjeS(agHR)xS>ht!3pQKJZJ;rLWECAA_rSdCS6b&m@WCCIpQTV@ehZ($4~$f_fz(O- z=hDTuxFbFaDO_@RSE2B9Se|AIT1>BhIS#@9E;VNanpDR$4%XR<`XeAAC!p*D^pnoX= zHkf^*dhJvQ*KqkbE34-k_=eR-_Sj0Ok@%HN~$|JdF02J2kHfa(vxq&F@1wG^u zjeL?P7#jb67O6hkzo0TlV?_Z=8}%JPo1>P>xu2(cT-T;^(6@1YqjpdpiTow->#qjv zS_sEj50#nREAa5AK|gA)$6CC*g5b722l!pHaE^-5H1PKc0EojvVu&wOd>|dDxOo_) z`;X9t3b|iyYSs8Tpp}+!57r^RU*ET?{kh2Oe<+Dym_Jk#R`+o=1j-@r5FRC_Sbkuw z_!2?JLnlD-M_jqvRRRU>40m1bimYVL@T$lHOYtHmnfth#y=2!JwK9O1r_+**MBS__ zq8wT$!BMjEX9WK7sjzmSqMGrz-0eY?4FN4oP%u$SG@>c#nMLx`eF1}}S(|qaZZkSi zvdArfBEJT7D91e<0##b$jDdT26=?ArmJqGFDa=d+Uv_xB0nK-rsF#PquqZLW%06(> zrBjD&azgz9(C8yKr&*3)_|8@gS)`9|-u?O*im56hC@}P`sue!+7fGL8k~Oma>6GFS zpePWm?t_x8C`jS?SiR80w28JP3y#%pP4TBsms!5+AAM94W8$vC%}zIMJa6`bu4i=q z%l}Y-|K6?t1T7Ita4o5RwbqM6XC7*m|L4E_d%gevpa1{n`DL~foPy+K2T_H#1^Hhp zxP8z1sC;T}Uj!R12o;2E=gk<~y`(JJoI4S}yMpLn)0OZ5ldlcE9{zHvQiRt&yPRUeE;8=LfA#1vu+_ zrl%E!rA^bIb*41~2!k!jn*+%Uxd7wUeNbY~Kr6yP^vH#}=En)8UODclhZ*2^8u)`N zRPx($F8hD~g5T6=*XzF5%sqEG9wVeqZ=ytZ%1Dj8hNC3TU(A4J1YL~nXM0KQT&dB8 zHD};IIR%WgUqYVb&BX@wx$-W#GMRi(O`LL7(C{A()4%t{ARGy;7vR}@By8HBF1vj% zYpC54Yp(xqG3K%C`R#tA~58VcKkl$k|AJmT*R;$QVvUiuWy~v zr26Fl`jsOD2-qet>of+If|9nN*@#P;*qhl>p!k3NZYzY;Y|Oj6%+nwh*aFFFEOdJ962`uO z?!4*-SjJWb$s&(I3=dA|wXGb`)WUGy0r12M!rXS3r+ZkYdZaH!9>c0IK=LtzW=_z9LA92q*k zqB6ARv?yNTbJ7CH?CLaj+Y4az=xzkhn|7e@o%3(4Z*zF={Yy zSK!64Rg}g_MjKQ$VLim%V3sR542t);J$>;&IV(_lbG>_0{bqgZVHuypNiJN)6|fF& zK>s!ayv$4_Ph0U{T2^Nn6v+~W##dP$u1{NhRJ)V4U(i) z_W>5q8kCP}rB5S)PuGm5o*i)_C%)umnfEHY5vErQW+Ca{t6wKgq(``nd+K+_!4e7= z3h+$pi@RN$K_UXHA7B%(S2ssSH^lzChF*J}@Hdcb>Ev}YtP#RB20*iM40d5;4K1%H zo8dw2F}qkfk`$Tbtpjtae{U$N4AQFwpj!9}>P`N&ohG)_e|`0JqHGOXVJ)c&NI%ce6}$^8#qSil81y>{W;v=0of=bP~`Tj=BP| z6L%sX0u6?5Hga0rK#Qvas5E?f{(hf-D***^GVmDgwLg)+EjxAb{HxT?0NXzU^m1Fk z=eGthx?Nz!pr~xjPN7tX(uW7%kHKm($c+5Wc7cY7yFWTf8IWCeX=+Z2V^NJ(c!z^pWrqfsr4?w%(m8TCN#IFd z{_RnGTc}Xnwg=xVV(z$DIL*&ZpDr5|g49kRh{9wn$~#H0;})P^EF7h_ile-PkU%f! zoYun7`)KU3GESZ2fhugHIY+SVoD3UO~K)Hnz#Np~r61~7q{CU}*c_2joFodE;>9wlO?agiktk5s; zf8O=qhnLW*!mGf0m`BCY$M z@z8(>PoAO!CxsG7tQQea*!b~U@#S!Ng9N-lJ33ZoA3)ySsjT&y>`{$^&%?X5c#Jpc z5~Q0;(*TVZK`Yp%x7h7@o;)JaN>>k3dpnaUz&}XXP%E(CUT9cBEF%b;S?X zXix~h$_90lJ{`&3`>x;ComQQjvakDn1&K1$C@#XXP^P5y@p$*8?u^RjsR$l50eunA z7xD@4uhPXwy_Cwx7%)!`RY!W$rq1FZq~apnv$jg){D2@V684y@+6dFM=WF+3n=NyO zP-)c>@@11I-WGhwbZa(77rHrphv#?!#2gMl-B;^7>)(PWvkLkNRp4AJ1RuJQdDRLnjM?9iZ4Br?K^x0pzRrgdvqF^Szq?rShfUIYu-L$Q{rAj4Zs%u8-7LqwQ zT9UQc4s*(sUMQVC^LhuJedaVac`2~~o$1%<`$b(%S-wBnZ*a1A_rLP2z0;EJzZ3ua zP=6(Qc)b_KU^~Kx)Rezs7MQ&oyx*ugzdmyS^eAXq0-)d+eVXU!Q;>A#SB^XUtY99n zd|W5)W>s}}AX=a*m33r9a7({?BE4sg)Pl078&agSg1p|qU}POLfy+A6`QBH zC+dm}a*iz+`?z$Kl*nlcG|q3%+IeoahBn>3@O;*{$0{05cAR&28ZX4Ug)2{X8Yf)S zxaSXqN*kkgUw=zhMjN@-&O^v`kS*_b{9O6lj-Rn1I9}UXPoTC+PjmM81c)a^iq$U- z0cQL@2e_ofj{buC+GtT4^0X>2F+zV(WxZ4IR)Z0t1hq`lFGvrx^D+*;rA7(dcj=)G zS$X^Gx9oNmawH*}(PBE=ie)Z|69n6*jcT!kbzG_s<`%%sUp+Ft`6$Dqw#rB}CGob}Q zRhDc>2R&oQoYX>AlR?nRlm=|BTEyI158YI*6a?UB#|KPF7Oc)5g=6%TIT>To?iAD;P@vR7jYdXu^!Bnke@{L8_&H0+d;Bj!{%A{nl7Iq zG&G%z%9$)i%9&~}Ls12=^Or@)K6)&@QvgdlR}=Kme%6-bHDAo%T+LJRZ4QA{)#M2j zM9IxVlmm|1aAO(?7_5fF-vHkLN7} zf3Orwq&FJhPJ3H^v~kzF+%uesScwYP!z}H7&0%Q|lC!Go9{m_nZ}5*c-ysOu<|VmT zI!VjM4=~14U5)KxcMZ8y=^)lV=Y&&?^c6S+trAOq{bPKuHa{S+4yY-3R8+^? z7^^~*YyW1?4yoqX2f(-J-#`6jL|6Ef23hZ*sUi<-L#6psZC~V%=srgTzl!_@BvC`x z+BH$ie`GJbaVGN9ZH0UjKQ$pfox==g4;)YpnDSltP1P3OQ1l(S*>bRv=GBB+E=pFBle;RMp zXr|J6#K){t&QC9LS>ve{k7nANs@oYFCBZs|i@(dt=yaDeYRLNwy5K3W{P`PUFng_* zajvdxEtB^!I7T9XnEAMJ%;j&IoIqpK#oX>O_v`OZ&O;evv$2H5z0pZ)>q<6?SHru9D18QQ_jRde!3_#oMIt! zQ_QRF4klLbRjbWATr~aA%x~Vii!al1<*tcBU2S48DbEqdFgI6O9I2xa_PTX~n?bQd z;7_{<6XsB^#8G}7SCr6F=rxcdZJg}P&B)`FGYE7=qGA#yDqKC6r%EuRaUZPp(fKSO z0DC#U4k1nJkt0Bvq{;*alzc*~+l+*>5-mZ)W_+LX7qt`<9AY=T*stv_U%9_r5<9uv?U$0&>Yh93O3O2N;wK?056?q= zc@MbOt|LBw9eNW&e%@zzT+#7+Zr8`%^iBg?C2h;6-)1xy+BbpBVzkBHGK&fmacMo1 zwww;vIx7)4ZPBUd6EvCzdw@I+P^aQPRZ6b*r2VN9)Kh{lsh z!WKJSZFOa4FPOhIDzgerSkRao)2tZlYQ%7t9_tYy>s~aUhBx<^RW2=^!|-P0Gsj)D zBFVTn>NlE}C+Wh^dqBt$c3)E#^^tD`IVEFxkfM6G(g}jh8z^_eUyrLa{Nn#Z&e>F{ z*-b?uUUSV29?sFDOnskmPoDyJ5M)(gwzLbh_uZ5(WZMShx8i#$V(_o@n77tL7(!eCOrRi~GXc zrxaUetTd6)LozB4U-(K#T10y)C$+;cZi^LRdVeT+a~4D^Ax8ohnbnrGnR`0mn^GFX zLd_Oi`6@!PCEb%vhQ68!U`ELivQY~WFuT#9_dLvlxj;4YG1NJgxX*o&rG6FP4Shgw zbCuupU@QAlty0@xsdK}A|NC~zynaRR@+-E4MPvT*k@PYJ5#-BXw-+NOWNU?s0<}jm zoF8FEC~;g>rXLkCz}BWrw7Oe$7XsrgOuE}(JPlZMw!;6d`&ih{0==4{82WfU~`zI1yb#VeE&>^<%S|x?Wot zq6v^zzg=cNDXD+So^qr}m;0U_<9&UQvFp1>>g&{Zgz?=2^ejqo)NJ&Cz4?ba4Bb=+ znV8bBZoEf(zsI%|X#0f2)@bw3&?IYCML)G3y$^FbNSUjRYve;7k9%8Qg~ z{0@cj(W5k~w{ZM0Q9MP_>34_+yfKTI%pZRQ^*znSz4G4moSy+Dnsr)BR3Fyh)H$JK z4Y|6;f~3E&8EF+&!3ituVQ>alg|c}|k6`1*(>Vyepz{ckC>n}xgIi+7tsJ#K)fEYv^d{P>FPcj3j;?3YGvBQxVPP9eN-r)*#aF&EIywRGY( z6TBG+&g7&JTA0dIEo&hjDTx2A!BWZF?vQfV{cJU9U3T3FkQDBVGVgVWNwOk1->PKw zbnbYZObz)6R0=)JoZNCi>*?Qums6yq5R(kwSnvByW=;0as^GVe#Qfn^Bm4%a(&?QO zpWTzVhwml@4Yc!D^q)4@M@7*Y<9Y;XXj(SQe0qpA-mxFV?PWr0?_`iolEEt4`(p0k zvJ`(!qZ}`>acym5MkIaHhxQ3Mp=Kg`iX9lD_|iXG5p|U|p6kl2ZcOD=v+r})|D&_` z7J}Nq|qlajU#n8Kb^K$#N95l4O35QN$%K6EoRzEZ8JFLKidVNiE2sIU#fyrA#zj&b_D5C< zir={n+Bec?WaCPL3D0_1&$OJ>G=o2Vx(cCradSIQ(WZ#DyG$pg9?haTs}1F zSnoL2?<)PstuWcMxe;0lBf9zZAME-b&LU$$$a~~{i)0%jkmM*aTX`9IgI7?Q=;(Oq zIY!aVbU>>}3@+_q68{O70wu6v&OX+dy_zn)nl91p;wSuZRlMUA9@kTl3m3Y7 z8>l}{+|z&pH4;pvdOIGp6u#G~-6NM(MgAPYH)o`-Lz4CLTweMt#P>8rT_yR4+Wqa* z#%()?O~a!Ctzw3B#m&Bo`AA$Ie-Gm}>XhKPnEN41D1ZAxT6L__66V$NSj{pXI{1XZ@Rr&hab-yU1=RhJGalPX^4UmI-HVJM%||rvMLKL5{tY0_=2d7d;w49Z zC2hS@Z#8JLExMx)dJ@*8)?M#alJ(mysj?#OeXix{0`^I3d>K=6)miaLjh-iKA_Sk; zv5ih;n3;(B(8g^VwiC^_(|xbc=778>X8+lB9g#yCfHo+F>-z3)W>U7xi-spj1h$2X8wxeCMW$$ta`$mhNYDQ@+^E-I1NpcLH zm+gFaCBY9*`5;oH$%{SqY2xVHmEP!?T_adzd6i=D8;lrkwl5(@{^^QMDZ zM(KvNA^v-r$nTu@xY!BQt*REDXbV=wvF;4_B65hLw`dj2W5ef|?pD)u9$S&gKOv#@X>2DfP7ue-!0w%#HcZwbfiHSntp|tvZ#(cZAAoeTrsc6sr0);PS0ik{$ zz0st%>A@y>_V@7*$C=}i6jwkYH!88jnZ3kcfM{DZ*nOj=%c_z$P<%!$?m+G=UOX6j z@>4$S#}=uaV8zb}jTeWK`YGr4yLSPv|5u{o+0(-6C|o##8CDQFJS=~{!Y~^1GIRuL ziYU}>S5xBDi%!U22{e^@t+IP}kSKJXGIGCGAjEB=;haAkbUzRM9Cw1UvIO@2Rc@Xj znHfJd!<9TAj^1e>C&;_^S6&2HZazEdH;#?!gGs5|yPc8=+T zyC+`Z70B1xz93)g+z1y?Lyt!~*e~SGIEvfN_UKjr(52E%lJ8kkekdnu*y39bM$=(X zoeg#66hp(IzCYimSACULX3fp4^digk__{f9G*^3g1?cuo`lhEhlid>4&0s?A4O)=d zXLQ;Y$-n;i5sZ~2u;&^Yw;W9r2;vsqa82~v$izW{ zjFmK}+mOvc=5wUw%RjK3l=GrEOvOL0+zHe8*UeqYy}Np%hf6F~X$y1e1;WfIaOcno>t>X6S~o0-Y{wvva}3~GHEafV8z@`vNpwtF z9X$fepSL>e>VUb#&m~I_?+c}k2&6cLCd4J3ke%pZ=!t@zNO>eCHH_8lCAZmVu8Abg z^Rqj?CjIylr^fXW2%fqdZTudX;$Nnynb~*wZxI$IEk}_Q-9p%?%NJdOO|d+h+PUBM z6G~^tQHJ;p|Jd-#KI@q*$}uR7(llxqcQ3n1n?FnY7~NxGX4fDhNr{c1#o2PN2PL`S$PxbIKS#Qwf5T7oBtrfa1?{tYi7Rv zI0;RYvj};Obxo%kPAtnfYQ-?2_iH~fUeH@M`Gtc%k0FT~k;y~9*B{lZSSO25Oi~9= zngyD!k4keB!Vx(fBrH|F2ZFwpIT6wAPO6@%``)Q)$iDQlN`$$OVfIq78cge+BVA9a z2TZbMo(U5kE}XRf(&yQXf|jkIl+*I-JHsR~IUwVfQGV z5U6&$nQ6Y7@iv#d=eeWV$>%l6Ad`a|a+FFYaNi>BUET+)C_V8b!`l;E0AVwY$6BJd zC$`_uFXWO21WEC5+;LX7KP(i>)NNLI_FvRApXU803JPxfyV-obS(iH3r5GFB`X&Ag z2kj$mo1SMTO-!uq@dC?PgvMwLWYqrv@p=K7Y!SrP9m zmA#hkUeSh7#+_tWKVuqrEBsQ^x+uKq_-m>7CyMD&$^C8p@fYLD?9-%0o<_AZ-|uBR zn1&z-C7{r%eMcZx#ff`qM=`1h+y5o=_ebsJzd#lNHP2J1=6QNpxDEcmg7ygh_wRPA zZUjN^oz4r$47V^jLcf0gX6H_p%y6cXzWbbOKG|1o9?z4VD@nycAN-ecr1#!ZS(J#b zJ>ftPY{{14$$b2(&}5Nz9LvHU+zm)&~g z1j~ZV%%whzl9;6~hZBPR+iWe{-;jwacLhCuGu3C9*dh0X!v0q`vCS5AQg4c^#MwEMe;rrXSR^u3%&LBmZm>TT zy?Q94=(=?s_HAre6Heg^#3fB~Smp8eC6~kMvxBOmBB8-{uX@MG^i45;jXH<0(!gwT z!oW3Z;WuD`f(6!w`?qB(h2OtF8xM&l_&;S(pB4NC(jQZbOk%bwN$Hn=N|4PkkkmIt z_{K`X^A}%JN;i=ZvfnbeV(4&VcuvqT*L<(1Q_@S_ENK`&l4W~JPv;Co!>|qb% zng|>=*oUSk%|5|+J;98fa=f5(@B*CqE#1ZyXlU0mQ9rM;mcQ=j8&a}CvO6CXZG#rD zKa&aasxga7^;IN(Lc?lp;s3M53_~~TK1E?I0SXHJkoL1vAGNN+QuFu=>>5V25B!S_ zPM!LyhZ%nZnX_$hO)UURIaC8PgN8ccCnZX zM1nXwxJKiTJ>R}4y+w~L^>iwMz4e1iM}^=}<0+!miiOhBYE;Y#H=GcaSR~FN)LF>g z?9m^5b+)k@$J)ey{q+*O{w>hOA(VS3`NO2EAeG*dlws})k^wrGTM#opBJ z;z3Wwu^T212-*Jd4Y_rjh^Z3|bkbb5(qwnygy%sMw)zKWR6+4$O7t<>xQ=}(>Z4CF zsR`+>t#aNzT>fRM-~P$ryeA0liz&?MWb(ljS(Nops{?7BhiumXE_mIz`<}SJo5((x z*zXe7_gqm>?{RNa|gUv%BB9Npwdf`>z&XPn!{ zq>Ad5*Q`@FV-{T^K6{o^r|Y=5mf*5n%jHJK^~S>q@pU~x!_Rh6qV*B7qsfy$Ogb*K+VFGsQbM6wws^oVc(qk_ zKJ5Rl|KfsMYMtS(nD@f!ql|p392z-lXFAR^wywVIP{IvH_JId&4*IDa9@-ma_f<$O z-c;2_$@IOHjUXluHFnO755{)q87G0UnUix23^M`1d&H5+72uyh(q{hCyYFNi^DK*_ z($YV~bj#fKMJG>K=QCJBu&0fKwqPG=HVHPxl&AR~xz@A?B=A2`HEejKva7JH5#lt! zTPl|)X8Y8tm_G6fuj0~kB=uMPX;f;Ble79f@ooB7ga9Yw^qA%?VE~*@%X&(Uq-4Gv z5aIQ%SGccfDdefJ!BJ-))`pj~#oDzkA~kyhb>wh6LQ+~Y>|ek9_vie8tJA@KX9{qsoYRP z9@72d3Ov7vP~rFAarPprM9x;;ANc&ZPu*F3OK!^;3ql}NaJaPkHuId`z&v2N$!|il z;}PnodumEa2AT?a9xB%mZl|QMfi>+D;V#cdE+MC(yaw_xi+Xhuh6AFBSHq@hZ69PV zvCSwsyK;-6ChNN9G1fTd;F0*g6io*SxqIXJ8lyUk3&+PO;k9HX*K^t_ZUpA%ddE1RDzmhh=FnJ~avacZxX=terK zJbKmA<7%%LX|5O585@pF)P4=T{W@6i_W$APt%IWc-uH2VrE6)BZct+BZcrKogr#9= z2|)zuUb>_NB}GC(N~A<-B&C!FDd|SKzGr=X-ZQ`X%{b#fy0FhV&pG#f-B%zfjaue7 z^ap+e6>VnC;qHqTDO|MQu0{hN59mlMWOvOgeY)-o=Ju3kzyD7Ck^m9mz9tEBQZ7K} zU`Mv8z!m*ItTfxep55TDxqtNinpmEfq{g#vJp@ZE4u{g$0soQOfA4XLSG2GqPPO(q z;B{VpyfktqBjGzq8ZhSZMuk{q?ei%9Sv2OvclXj^*B4~#O%!E&%Gvuc(DJ_;3iZH$ zMH~75tD)i*LJk+1&4z~`%4|NOSx+{F6!}cqJ&uU=hPb`6#Zm1w34bOMH(JL-u)x=m zU{I9CN~w56e$>&YIDk=s-V@tKyYUAInu zW<}=LbHA@a^vlggo?isSnb?h8iBwr?x=_9bCN7EIm55&v^L#NwWvU40lOBnfO{wr! zo=4T>IjE1*j02Lx(vAwc(BdNOabXF%)eKPnw0p|r&7xn06+@BX1P^N%MegST=XZBI zwZ+_USgJ49223Nb=MG{74s9#X;pl1}=>~vN>bS0|sx9f^yWVmUrm-Kjm-FQjcv*I? z>^20Azo|MC$me(E%9kGCqYqtAnAnrpX%=E~%=-7Ryju}*rNDE9wq)&JjfZtaju^^~f^q3DZ@x_I>G z#XynN&6BAJJiGThVhMb5d1c@ujgbxP90ogmu! zSEBHI_pN+oedxByCrCjEGi@@_C=RP4hqd7Sf+fF|1oP)RPohL1MUdUXz7IFJcjDa7 z|5)Dq(Zrs)m|xYV?;*aX5N`0aiV!cp1C-CS6zIrx{QQ z(hQTIKg>luzHCB6t~ZVwq(*6+91^$7@YCzHIEG zh>-F!8xwBu_@6JVFtDtDzAXCP@vrZG(kM(gA!+z;qYI)IAZy)l?!RUs`p%y}M|rma z#nH;8K2~{@lOEkq93)-A3jB%oLQ@RY8&Zj+h=^J(t{V6`!k+yu@H!lg#r8FeScPva z6Fc6cw)W-Y%HY5zIblrAud0Mos(e5ja`U*-@3BdZd&eTjVnZfNh0SQiH?_gCNTdYx55=ynA$zGOzJxw7n}<8RXRTI5+(FO)RVnFgQYS@#b1LCD!K1AYYH z`gC#Ujlsa{sjzK~je;byw+ONdwLDCd^9xlN;hr);$FDQNv=Z};mNqK`Qx$3-_)zk* zxIo9ZrYQHhv5aF+Kz!sxJjDMW`Zm7a{13j<#n~1G z{qgn*fx+d!g@B+lbT(@EEKo%xP01izm2PVA*<)!gh4Rh^o{PLS(8Z#B#6_F7fpUm6 zfAWqX2#}MWnJKB*lDH<6b%@75&1M734a(iHn-57>0RZ>iK$+!mG7IuOi1W=C*Q0Ly zo5^}8KPASO!JRgsuPwp8tfL$9y*H|Q2Opo&M<3BkHo?FrJNN`U$e&dUR*x@W0EZAH z5gs53|78Ce0t~-wz6Tx4Zw_g`?JKgd{0zD%36|t#>cN?PWnQa8R4dZ7dZiOIW3uo2 zH{1B~A=-mFHJPhEqi=$rRG1&b%c?9Ca;vlvpWC#4$m{|p{m>;{Sa=z9)LKdtTm{gK zY0q2cRNzRb-IpzDTpmwpGhw+A+gow-T4<})PyRJa{QPmni$5lYYg~rkz5pufm?UPg z*-reC?~uPmO5b#*VXUq%HGM-9f5U0d-(^gt)5?4Q)}oviJ!!O{;(w%_3KnReDis>@ z7uAU=>3ZF|CU>QE#A!nu_BbC8d(&js4$JuW#OdQBQat5x zz|3}dVO=+e+)|^;vTgE_-)$##czFEuMfFJnCwUdhV|AyUg@xZq&cB;pU55%{h#*+I z-q5FYfO1JVR7<&jdW+wl9|sqh@@{Cby)3BmQh<9+0QnhJtVs~nI|Zr=L38+?X3m!T zuZ{M{1^5~%d}gdoy5mhu_TBAJb*DrXf0PN`5(AzcRR)7OlsqZZ^x8`c{^_|w{Bwnq z=hj&%Ye+i{s^U}b)ao%=k6o+xrJQM_+WsVQh4}sNet2o+#^;r*xGx2~u-KNDL(nOd zJ}=0AVf}ve>oWb`IwDHaZk>d);+(30j)TMhWP1a+RyFbUyyr_h%Y z>LAmJ-gp>P)%WRyl-qoB`QxpckKw+TcGG84X>rqX4Gk7%JP^Sv)&eT`N zP0Hpt&p8Z+2E|jZVOeL|;Jhg3Q?2}QHLmOItBA*yL3F6~$b<@X!%WU~j$r7`v1hW8dBet^J)A6jv7%~c*YOKO zF9y0n^W?UeH$%H#3Il6_j*qWPz5iAInYf`1PLEOS4OAOo2WjY3ZIRF<+xVjUJY!zy z%ZIo+-Rh1~3qB%Xsa^0#6oKtSVKW+O?)as5DT?65yPfe*H`d2Y_Te|x75jnG4SQ7c z-CKTNrVt8Ops4XTDV=0r;~VbYp8HW|?zdJ;@BZa*=Ce#0Ql0kBm2n#mQ~%r=3b4*7 zYHz3uSi38A(aj_qakFuM3nNhYhf|k?nHRj#J#=Zb@W=JOpA)E3L0#-&$K67lNS{(lP@ za}3|`{DOaRFS0g|Rlh$3^4{C4eX2kX%@;+rvsZzYD} z-xcuo-r`cIW3wxkqbV>n)-ur=KIW2r5p{V-7bdf>kh3V`nPj$S^XTUXo+PR&G`I_l4vh#J z6x&`1yxN?9hbYG(RL&q@Tye?n|0Vc6qDP)9HaHww#YYkNiKAVe&AhSmY9WX{dLJba z_twss8uH!DeWhDHkD0>$H0&qK%jWWMyIyiV^#Nknrqh=mHTi>1wkE6w<%|^%UB9ii zIr#9^%SA-^SDmTt$cjrlTfd#kl|7UJ}A~@>m<&HPsSAM@_Z3y&#;S^FBgo})vNtzNj^XvxULkK zIp&~=as6(+&)pL>0(+Nq8HeTz?MX{Hc}w+`Yx#|4S==GVoew$ZK(%$PW~*BY8BCk^ zrBRQ$=rXY;A=`D15qpLo%90T$k*}+_vdAZ124_*~demaZN)~7E6ET)qvM4du{gv-~ zJXrzT5y^ebP6ifd3g^3qtBO4I;(IZFzja*V#uquuf>G>PEaV$HCnL~e(lGg&^(}a{ zd08kTNr{3zBH%g6yjdCZm-DB^n0q)Cp{nX#NINd2-^o9+HSII5Z zrHWGh2bzpaTQyt0CM%dZHuNbs#~4JHG?FmgV-Zy;cFa(SO^H{Hfs?A~^4Q2)U1WZj z=7=H9pt;~#Sl=MQNuyPz#qT-+jzB%nYs^^9Cb|(sJNp`98d(>eoU+x=r;Jxfqspo$ z&0iIp6TBx>TH=?bU{fFVR@;(G=|YJ8gu(*lp)r z+p;{y@b3$8(Np!#B(E)oCvMZPX~gObhu)oPaT?4DT}3K4zqxX?HJ;pEOYJuM4LBiS zBR)YP03u7jEdC(?Hc+caiv@D%lpF`YyjmN63Q_ zqifbuq)2(jJGb#A6FuBsSDlMzhmLstYMPFteN|Db~*T}FG8WGz+XQ;Wuz}w|))TF0*5$-T0Y<5uRBO{6PdKNSHCpN^ z_0%_8Nj#&5?J|+!>F0Z+^;K#8dtn9}W*Uhs-m3PvpA5sL`&^7y?Q%)k{yeZhcd~h2 zIYTy8J*yLDu*E{DZLb-IVfG&fqKXK0Y!iLb4g7a(Q-(euQ2l&AofG+6DY)}!JrP(? zr!Od0PdiQ9UMJ4&vgx-!V@dC+lJ$uo{n~=@MEvs$yeLtd?@eR56yN`f0y6jl#R#e?52MB5fU5lYaA9&gAxu;QFmQziOjX z?CT@0w`s{$8LI4gquh3aU7kLY(jBBhc2YWJE)r=IA?PCs(V*(yq^*2vqtDG`1n z$Cf_dCCMN@=kIp?-x0m!B#kYpX1DR7Rq_W3%GE?r2l8il0dPhy5-HB= z=R~A~7w-Ht6WDk%m^WUP{^nGaN^0oFVoF894AxAP?^EmLGx8ky7L5udEbRgrv^gj} zkK3&Kr+em4QA{6$HpgX;udh6(o%Lkm&c1yXb0N!E-ly9sJjt23kl1~#^4DX;i#n z?a}!#qpcwXe;4@Y4)E1*voqgyL#VY&FrHV&#GyCn0t&I8rMtkuHPNtI{?? zj2E&xQ_mHVcJqMjGmMDgd$J2^?{S3dgK{ZBKp91V5q+Zw_Pju{-B0i;h)HW7V zpU*}|o&=YD86+#a6Nq>sH%;uL&$2k^7yc$D*G9^VP zMrXYo*Sq1Ai>t$8xzt+yN<7g9_1X$SPyDIime^$a+0(dQxpkpdR|%pLpR?@=ao0M< zi_rZT9yz}H^{He*dtcNs^DB`RJgSFmXMkQOS^xRR>Y&AO1NbJQm?*NJUGaxJaFy)` zL-XOfE5@`uUv#y(CZTz&TFwJde6_c+Uy)ZU!>g*+ zW3Aeq%xwbAJh~{%ZAZBk?Qpgu1LvPBc6~K*;o*CZLvi5PP?9p`JC`Twviac)>Uvf{ z9XJZ9kIhk>{V0-rdEF4qmU!V*Q>nWC))H=IGgTD!^&H#SRw^Mmx}41Eb>&x(m2i(D zDC1rkmlV6ffXX{ZEkqG_K-v0fgXqG;tgFVn%f{Nj2fnrYT}hgZ=C;?WMo|;Zt0LSQ zVMeGBm-8xFKdoy^98IM=)p5F4K(q<%d`)5D5}^f1Cr4n+R}XE5je66<&8xY1$ug@G5A&(YC0%L_A3dz z-QZGmQl(+Cdh4%@xj1K^+J=$9lSjXmhUp#*1EL5ON6cF{slIYflw^tTL)QD_ z?FijR)dHDhdt16z>mr+*V;2w36AX&E<%p&C<(p2WH`H|cc;uoYxh1yJ?jc*HaguNA zu$n8CsI?raBgv2xO_LX>s5?I0(0YxFREP{j(IL@h@k(Bp*(p{Sw? zmNZ`uU`AA!84lrxVi1CDMpR4l@qip18y7w81u93(rEfqmaz4$|*x>9bzsZzaMC{Mv zQ3t&s(F9#%tlj?d)aGaz1#id~k?-C|S*fhn>M>><7mrURnXTxn^qh>dI!&sHAojV+ zsulXTW&C3#;=?!;ryCDSGa%|nog#qV z8}j|(IZAJ*&tqVJ*D=Qc^&h~F(Sr~PD}GP6le+$BiHEvq zoGs{!1u@PEG30+?WMJJ5hJU=h(#1HaJEFu%X+RlvC3&-BUYoyRPw&tlQ_LPvR9 zDAv2pwS#C3hD2cHQUyHdqkw|AY4h0DtIaa#oLw4N|5~Gr5iA3bA7@%e(zN%yWL4Um z|Eg4?om2C{aDoDj1R82clBpb~Ugoo5MFK@5DZLj8K8y7VWUW}E{KA0r^uf4qqxHBf&!a*6s)h8rM~VflwZJ{T2reSri%&EuHqDC@g&pa>>4xckE7 z!ylI6MI2S#4R@svL^1N7(AGvG=8!y-G~l*B+tR89CNqye5-DI3rvpbOInwdQHYL+$7o4L_uyw*kKFU07Sr9;h8-Ay?snrA5iF%03U?iZXH17?f~rB zU(hf-n*&g8f-jsxYL%`^IRMQ^^lML&rzB9iW$Vc*YrZ?>O`9KXE`iUc#qQq^0rL#R z6!8zg)mDOUw(CUp)c`6C6}2+@2(rf+H)C2#Dj5lTQBKcvpb_3J(i;?gw>7%Q0>xdh z^Bz&5LKY}RVJ*q(X5RX!bxF`9xehru4|Sb0PM8MBS-g>)5{$+fnErV;+V=I#77|PjuEw}TQ%adIb;ua3s9uFo# z^v0Y<(cLZgc8eDSEi6iY-}g4!?>e$2Ch(O4QfA{C1MzfXz95Lu`I0XP6kZy-YC4+N zeBk{wLMJuYjI~UU3&E+4x-(C5fa~gCq2JpW@bbq}zwnMj|0r<4V9*v{gxsRH`~(gi zz%3!dF2lGurTFt@=7~>f=ZTFb0}$Szb)=jEYX^~FlNOL)VQANn#-Ija?}VfBI66QU z4mGe)B|J^^1r)CiU_7!yx9lBPVgAL>>Nlb2q`d#m{cL9HEtKuc88~?6RHGz3Q_n-? zk*yP|-L}ZNiau+cpn(olhSV$&g8Ybgxp#eeN(@x%mYF~&KVwep#l#i_BuU?}U>Sgt z9C&Me143SA7J>PM#|S$fT4$~}IFj2z5vYH>6T7pktIgjUxC+YGeSc%}r^XmQoGjj{ z0ws^sA3_aKiq0zF+0fv_rBL+9K6jcMeNaU{Ox-h~-tRs-OR^9;wQPWK3&?Vnf$<_B zI=_K4@?GL)qqx<&hq?uUwdbJB!+%KcWzYcL=vkcanX~q9T+O&xlCAtPI=f4{e|GKR za8hSdr=ZNDeJ(aiFe(D z@58=_$Nnrj)Z>=z8&DIU0}G!UL{|_l2&#jNCMFGBQ*vNvGEQb`jP3J&f&Bz1+0P_f zh44p^$0LJ!tIwXEkLBI4Sy!J8YyI5?L@7qTZ)XUq4^X3TSBDxcB`DGUBFL{P&stkh zCUowg1^dVSmf3I`kjj=$St1m6b#PXv562>L3-p!cA;4Ttft{?psT2pC3kDDFbl`NX z0e=c&kfQYQR(8V76A8#Jy$Sh!6{Ft@r|*~QO0jeU8;rH?Wnk5|noA%(@+U^dln9(K zuTMk{gB?2J+MTc3lDp{Xk|@3fB+*HXCYTa1Axz3^N7ZkoO!;|9T9y8Q1e9lcd0-YW z_enbtu7j|AI5obLIs!sDddpX!3)1 z-&{o_m=wEPR3L4RTH2@TyOt@CM9X9tzv46huyj7jwOr~Q8-lZv9dhkZ{(pDhLv(u_ zb{*MsTST`Z!t`E>c8(-_lJueKpo#5_WD2TR5eVlOz~B+EW)pZ>^7z=ZSw>59AXBsH z#a=G_3TJN!xY1>yq~tL)Te#N)Z!_~#_H7s_V_Y%SD?s{&EpV_<8<1+9QTiHCF)D~A zEc{xgoRS_Ls%hpQUuOC?vxj35 zs%x^9?cX~WkN4`A$?;!AsqW#LDFMRe?S_+-qbM@i$>+}71e9Abf~9ik2#>#@18f*& zCQo$fIbGlah`ParLhI-K`jO;g3uOKua^mlbe6Z*J5SdR(8z&&^z928Qa)$1r<9)5M z-{7(Qf@KLtfU{RLS^-F9Vk1;M9 zeGGk|!p|yKmYA-bgY%@)Fx4j(tg>Rx0#qa@Kc<{_I(f2u>zNa=^%^$tDVK1vr4H_r z>?IqA85r47fU~&5Td>?S-kIBg@Xd#Ja2w;g{JgE^b6BulVMQV8o8-HKE2B2y%@0yk zZSsM%^{o0M#cn}ZJ|gr?Fz|JbTYv~AMTOk~T?&dv+5#D)y0ANVOp4F(rm;jdKt1VI z2t@!WJxg6OZkx_bxF;leb8yq?PGTZ)r58Lfjps3glt0;##XD| zR}LK7IvGCCj9Rouf54nk)9-jQ9d2cIYW5zPsZUcBz=+8iF**O$|MJW@*_BZ$Q&i9C zxdQT5NOanfl@TLCZB>9J4wC-hHS$Rr&hr3p_)FeUmqme#oKW5ht z#1^uYI*M4DcI(}YU6LWoz@;V+j^{z8KIIo0nz#^6rNOKIcf4l6ItH`vm$)GgeR$yZ zWOYnY^U37|i1XWBo^lK`D8#=_rN)CPdrwbvW}T*5~^b;)Rf80IK0WdiKF>-{5Ooz-YR&*EN4Xk_N0 zatMI?xq7R#h)QA(CMz_jRFKlwiJ7iKX{2w@_wV|IDy---1h*BL zhE`7FD?DBmYOBi;V7rw)3)Ici7lw=bZG-6Hm)CJ;+#tdwrDy9QOdf8$L9pb(IG0V2-xrS@7nW zWqxq*zx&y3mGLc}G%i6LJhoI2Y$XnqjTdd?XQ-xe94ZMSj4GR8q|cs=K^b@)ysJd3 zb45Wl={cF73hx^0T33+vWL-iLc+P78TS?~1{3KDl3Ql#lm{E>3Y5dgJoS=lJgQ!-k zf6$*{fexGzQ=Ybt-KNxxe^a%`Ym5K34+?Q?d=6UB35e*_KS2|b`Qz+=PqdJ&8TecT zna`lFaKG5;uP!mHE9Y$MWYwvh&6VuSalRcnT4>!8cvLdGsC6q*?&53+^B6LQ^HAte zX8G@nfL@nCqXc!588- z_8{qWl0KAtpYoC;9(X_|pEQlAvzBOyRtv!EK3L@@gx+eP(rk6C2Y>(Tcak~5mO2!^ z`y>wRs4>#2{Jcf5D1@NRNu{-_`^5rpTC~#n6yC+uo2FqHhWCKZGJ===6YV;xQdyAE zV5LP#Hmkr#Ho%CLFEt);bYKa@#U_`t9c?HYZdN`I z)sEHKB0;^xG@0=c)hR%Z;&Ft2h!_5DS;t47eD_*-y0V}5fWn}`=s6l-E&|TNG3VFlht$i|uTbHRnc$hu$@kVIeQp-*zN5+8;Ecjo!t7|Dp<%MV_Ap1p zu!q_CdlYPg3i{*}nh7=DbjlpDuL}SM^=@sWJ6Lgfq<|*DNQFM{x5X!U&NGo>sr&@tdHf|grauhTsQ+j z4`^0y9LmaY`pgu+T0u+a5SDIeZxP%E z&lgELfDt)aMTAJ51#kh4|7mTu}A;`G^voIG;2akP;O4lvg);S znC`P-1#I6uQ&Xc)3`5c|RxyFK$2F}#=~=>Y1K9|hT`e_HX<<53n#H_i zZh<%Zjkp@U;C6hKIh^u*cXnz_-Bg+;IL}{2CJs~an0(LH4&kB+)dCHc=#Vz$W$S(C zx`#;&y@&n$d703xet&qO{ke7GWX43N^1a*Oc4Y@0o5AKssB$s1nPh>x;Qt;R5ssDW zsHo>QQI0@w4(=r0^Ui*Dw7SHk#Ak<(-HR z4b0gLRPkyviK|ECHYv5OpkNDOy@FAaj1DMut%dmVvu7;!uj%@IGw}X(MW=ELTjI6J z#AnCh$k=dZsSa&)n+Y&WwyI!e`VEZe$Ob?A{Us>)E_iu1$#la~%;Y^%|5vi=qk1ox z&b}T1p35;^rUyY2gId`HD`}NuGzLYkD4L!(X5mo2`)X(L9BrWW3S(R(;Z*;CN z`G__rvN(Z9Tg9RQft03f)gJ?UQ5t6kr}A*n&vO8T<+W;nW7-cv9cXywj8D#jxjdtW zZyxjb4gj$E78ZLE2az1FosZYQxH=q(PK~E@)~z%~886<=H>b=Or4e>au6z-p0(sk< z4lc4Yx!$@*jo6IN7>lvDjIjwy^OEB8Q>yfmN&cvhSy2SFet(h)e3X7=gTo zJh{nFKPnj1F3ju}8E%V@OR5Ar}HY-A zAT}7t^mG+^E#0~X0=&r#O`M9XuwEz)Mj`0bzhcvl)3=2fN7=TZ!cfC;dD|jB9+a!+ z<8n1U_^NC@p@~8 z?UH6sK_2DRmpkmfK_V|P0@(wl+kB1j5B%I0JTH_CX~`(VV+I)-CY2#WH$QXzty!^( z?C2WSTx&-jFid9p(;B~OsQmEn8sfv995nk^87ebzLVWQ2K~68H{of1yc~mHFUbp?Z zZDPq=W9%Z{&VDk45Cd3n4}<7}kniDkH(ck+U-s|%p1tu{?DY)BzZd)PC5>e-m?@h4 zWDP$VeVY5>jn4`LZs!OpVViRkZ{iPJXq!~SCL5K+*b7qbH+43$SbfU&*#dKYR}ZLp zGDN!a+;yHa`-eLpKyE=Mj&{@YjTHC^!>!a0?%eog>rVh|Jrjp>q^Gp3?xb)3L`Y23{3`Q)Mv6prZC6YTYmA+?dbG3N2X|g+GZY?5%;szy zz_Bm8_&OeG_vw(@FvH_xmus|7WtB*xxVX<0^j%$*E~kZL?aS5=0}h)p$lnLLHg8A- zp7ZoS8vQkn&%h6GG_5ad@~nQPunLZ0KbO0cOP`N{tD~~5-@m!YEXi<2-Yj9vT{h67 z4M1RTJ)Af%3A-t(an2BfR7d6NGTr+o5S9vvBF;?s0Bsv#cl)w{axa#WD18lD4nW}fiuYx=ODA%pFYCA?Nb-dGB*g_G^R1NhTpp!y$@_5uj8gXAs#j75LTqQdXq3W&xk zrhwBVQKv(K?ouYRuGJl~aL$T9h@GUs2TrV`eBdT+bK9 zS0x1ud){`}bx#IWsO15tvP=2b!0qXOS}rvr-&GoYbmswe zRP>_&FeoXb3T$Vr>_71NoJ>lK!S7lsKt%0Vi_40oLn+HIO7v65g+iGj-WozOGf!TY zwN7CjAz3KjAz~egrWFrq*6oQPzwE+X67X4Tbp+J?Jo!7XOH;6dFn$c+cCyAhHp8NE z_2L69zVuEq<1J=&Wm^$V8Or84bWLmKWhs@F*TjCmr@+OW?qf0=y>`G5Cg_>k7&)0{ z!GEP4Vk=wkD!jY%GI!=c@*sWdE4WR+)xB+{l2dB6bwzZNxnyN3c6EoIt{-vXGY!bR z_DV9MYJXC#=)d3nHfcj-pxP`>v>NMNL0U1JUHthcyn7H%RDP6sb~uJ{Mr@307ko?R zA9{$5hI{Wgzd}+41;$~b2NR`yb1awmF zw+Ct5j}bWob8t>{^iY?q-!rcDIZ6JD=d~ngIN|B8kEmb&oO_8U4XZlPZo>$7Wk#hY zLKp}VhAq1(lkO_ZC=*5$ykvcNk&%tqaiE*x>p1()Ycp^xUO%?x?wW`s40Xm|Lkq*3 zfshzL^QP`ntr=fP9=zDJB82hdC($~aLJ!InbPK-eq2o@-$1BVhT$js*;gh|qgBLJd z3=pqTj=zz`Cc&;w327MQDs7Ey?8q?ME%70vD9+V60s}TMGEK%tVIwfSC*i1mPK@Xk97kTAU!pq%M;aH+O>gMq~}l^=`iR;_a&lA9Ow zXBq>Gz)LXvf&i8C#6>UK{I%l6YbeOLUU&mPG$*F8MbJ@Z|tw zlA*%FI#56fv!i{3Bb05jkGfLw6CkQCy+;Ky`Dg7WWxR6KtUgY3L1&A*e?m&a?84r^ zIYsNHjl3H<3FggvGrT4pDc3is(Ms_7cPNTdWh#6nz;@8H=p?B~@W4U183&>E2%3{TGwg*bc8nr_bmMB7XMy+*Wl(8kAE9{*^En|4~!z^JX$q zVRK0j$tx}0^?kCfBV>oVHcBz4mO?4*yfcHo&WK#rI#R@fb6^W(%{yP6BfV;lFB#I`)4RL+Fzd3QFZIe>nXZ~Nx%2H+ zCqx8QQ)U`^r zFfQ9prDxx_aCh;yo&68G`2P-hi`iRWX~dJB9NIJ^-jVmSJR+3;Olkm&gst<^mwtKv zX%3?!;#5F!^AYR^tZLFwMUxo)QC+cBdDop-pZ82=t)Hrf@`_=oWH9Lpb3J;1S$m%et8Y6n1ft_zq>=6)?1 zmM@SETOJKUC@{Sy&zKBqMoijAIG|1Wj+%-z@S*=ejh@43j$MI}(@pom2y!b?kU5ko z2j`x^q?>i%RRXLdvlXv0QxD6n+VkjwfsRIlz+IUqBE-5QvT#DyP8gn9B3+Uk)Vy8~ zt0N>geuMag?W`E_L>>`{Aw5U|pSm;?!oTOv$_1MtIvCh~8YWL(%U>4(*cE}taCbXtZ=^8lyHYg6-Cx@}&J{?K zsKvwrWPK}$-I#to78nnE4Qq#8#o+5}LfeKrqLl&`--|BCgmzV5Suv49J>y-Y)M$~y zkq7qiutR8}s>nd(YN5t8lz`=@(gi}=2F5eCgf+2dCM_Sm5w?cI zpo~O1BdV4XlbBHksZ#f+wJ*<5*vgeB`VK<(Qx1~z24Z5Tp~@ru`Fno%(K~hpBze6{ zTgd%fO-P;V-}Zn)MuNUnFl!ZhV+O)*c2^i@Y!Gs$S$_pS{0i48jotEtYb+f7j-<`{nbx*eMxU&)l70 zGo=l@&q*ZwDIfiQgfs)wLRO5}qvjU!#Pl05oq&c)lO*S-l#J+_{zLs#?qHx{dz<2F)8A42uRG%am`}dA}?i*GlXE%v+wbA-UY>rg_$q8Sx^(do3*26 zmDXx9$tETU9+f|`;Gs788M>~JtfovDuPrSMrCA6YRxYNiYuUG$=30?73t#Sj4GU9< z89~0nW019d*zVK8wbl%06nF@mJ_u+FaRAdXqDV&p1!Q#_ZS3}4pF8jHCswk-9A)AY zHFIe;2%#ji(makNS%ntr|Pre4DAlEFS1keI)3v~*W7o_To?z%oKFVLe{!oNuwh z@DX@hm^x+afXg)p_!%4~O4Ty1qvF zh;%_H#u?`Cmxs6WsFYj~io1QuCQ2G35gLgQ#%iDg@xY&@K2sDRS9!17`ePQVbU+p@8q?jdy$(9o z%*aA93R0X&^0RhiJO~9GVt$2L|B#dqu>QD^@L;yMy4X(u zr^{Zu=`g;}k)2$a4m`4f%)j>f)>c>exc z&Wl{yAQ!t(d>$W@MCvyE7Znc|(?v%fOdB~yA4p)`(>cT4jd}Sn1ip%&x5ym!vi&!8 zD0McNDI6?hnhBYUGuV(0LFz~sAGMDKUWC-+MshEIKcOeEihtDLVbSEyv!=_b;B*kp z`~!bpj*bz3<}IQeFTPYS3=fMpj6{(l7wd0534XSfwx3gXjO?NoT57w;;M?GzZ2?vR zc;Dk&vK$5kx)Z!98T8fOh(~#O1HSHsWUEH_$LBZ%KjZTI>OQrFV_)u}5_kMv>D*h< zWGA5gPzs62x=zvIlgV!gXY$B$+NDyf_*4l39rBAdhre&mk^72IJd~xUd!%`JvOAjB z7~(kbJu@Y~OSVhJo~cJC;s{2EnSFAgBkQ??J$rHk>Zf&D z%*6tcIC}Bid;gKxSfQB_53v5-LD0}yQdqd?Wa^w1mTwhjVOyOh#D9~85`a5=YyQmdX}6tqokp80lB@5@21<$S zVVBKIbd%3=FK00@yi4Th*X;bdD3VTu&^0}=U9%h$5Zg&t^*#^4`{Vk_CI&g#Ujt`=k;XwTrc74;~!07Xp(T4jpYR0_v^vVb}L0|SZL>7 zVY$ktwq=}+b^G;rgpOqylP2&KWkrl~RxoA)iUc12}JBr1lbQd4aLpXmN-Fkc} z?&HZ{Tv<2RH{%TgB_L$^*SFDn0cCSU<~pdM?6eX<5B%3R%>MPZXlMimp7g}4^b3($ zm{^1-5BbC#auLYfwv4Yq*l|nT(0WOMy6ar6?l8RgwkE;lkmN zM|P%`vsEQ~(R|S|tL&r$sfAit1d;R)$h%Wr_i#@P=8=SP6HpibC|bS0Z@ihIOeWS1 z%=uQ1#+nqjGIvv`Gf^I?)B=czIKF1YdNF=CN2JN^&GlT>r#pFVx%=4uczaRSrA%of z`~S##gsmG#D*(Y`$Me$lSL%XEu> zcZL{bTaNtyjvzWR*m6gGVlrCj4)Jy`CkOVPQDNrDpWPk%Nq=ul1kEJXoT%k=Jl1bn zd0HbAC-&d-3jdG2?~cdvfBTN?dD)k}vdJh?_RK0XJJ}GkLeqPT%&!5lh<@?RmmCJd4&d=vq@AvUOYJ;rr?7J4{gi-Z& z{&dB&|BT^gn%wEF_V)I@km5&7W)`nC;FElx4x}Yn%wUf?o#6QbRpqgW(gNUD`K>nIMI8>Rtb@Z zTA#NMg%`i*JkhuMwX=Kaw>=|?hgXkg<9pD~%9HUd9Qf1p_?XaIi)u~P(N3dABA~}sFQOR+2XzDNT3v#ZA@S=E=dw1m|N`Qh?z{Y4rj#^b8?_Ci2Nkly?Ax}ace5iGcpAU(N*-> zSjP#|@8S2$9ymFFa4jRxeI)dQ5ie0D!5b?GBZ^*Z%@WxRZmmi>LN}($*Vs}A%Xo6R z^6K-9l<|B$-@_JQ?BgKz zhzOi^Q_+w&b?et(_U{!ia?P<1!mLeRD^7svE+Y`Y1dXQPJT!gr zRCq^P<-IFaUxFaFh;88taR~ug8iFqT*^iK4AGUErZQXd7UxJW8?&$GyVkwq zDQS}!F%btCE9~c>1_cr!m-#F52H#nfK5pN)?xTq|?D5vyPAq0nNN|xVZBhzzw$nAP zz=VI6Xb*l+kkt5P#aCiJFctl)1ylNZTar=2CAB|o^j3RSC07Xxe~-#>C5(A~V9)zU zj?-9t)1gMy^pkc6P-G(IxgfkXP*m?YMU2P@w(ODwsjC}p*d7znjqL5y*7q2FQo=o( zt|d~PIw(u>>K*&YR28k%OW_%`E}9(E!uHs*r!+2t96r;| z7gEx0RW@c5n#7_g92reQI`2P`NLgm}3<;P~P@<*I=(G9p^B(IaO*v(PXjGYNj-nie zh#*U7$yw=hsb(gk@h+CpGl7zdx2Y70pV zqplqcb=Y&!&hDX#DbgayLw`rlyEv84^r_%dDwEWPZZ#_CRL)FvZOyh^V5S;ov}Q1g zO&uW+n50YlwBmEd7l#fb znNidGS1gcy&Cdl`YfY4WcKGQ!f5Q3EzCkZQkgsu{nhH<4g_z!!pgNswc=Ts7_sLNG z_xDnt;^`e(0uxCXab|uDF$wB zI8TwWoFtx!u#GQ0q_Ln_8oVCN{L`jA8h`Db@IeNB=uD}row1KLzg*&edeed&4lS1& zcMv1JH&t9S0C<~Yp_bR$6PnrbVg*dX(NsGOL{&vyXGat5!UF6mcE7aZkl(v)rX{UO zly;LqXW}dNYI|T#g8UE0Ah}+({h254(gcNV)5?cF%!jroB$0G~sARoVW?F=UY zmSw1&KRY8f-db6TOrAr3Mg|{OfUX;9&OQ;ppL#onll1(p@!ku~`n!ol)wx5jikvt{ z-t)Z6@My`f{qQBaO#X@~NiH%QlJq>u6Gz8{Z;OwW$kKRaTwO|Vr%4vmH%%NnQNdq= zXDvuu-i!MScnhw#p;}AlA`}_3x6u>k80Nm1C3o459sSkA=ADV3m%LP!>JqOIPpG}v zjT7WYx5shhe7brQ$;Ohw&7XxsKo|HyE?nKB=3Hdu6+H+vAPj&{w!?{Gxs(3EcPf} z`}4a~7PYclk3>dxESorPX0rDdw25@S5H)dh|I&Y`A1%7n`Vv3@lN4jtlH{Q@S=p+$ zi92&;KA5<0(TvMEnD)n9HzmTYD{F&N=Lz_G9^%kQ3mc!U^6LG2y(FX)$Bl}3b&Z86 zEQ67$#Y@Cx`+Kvq4G`oqUZ>xi#Kt7nZYt1{|%>CjMX8B3YwV6-S z`@>v!LNrg5K5Fijwerc5e_ZHP3iCa@>HTe?S?a)wg=ln``=B*yUhj?YzZ~L!0Ur^E z$R<+y;;e315iZCQy>+|b_-Xhh?7Qdjmm+a7dwE3{8Rv^qu|^c1Y6MWnCB54Y+U}hG zVyXVJhtBg85v8C&q?-@#$*V`Y45YWJdW})n-2^5G1v3;z0Sf(HJIo-wBC&k1$PsvO zHbXCx&PC4Vj@ZMk@cdzEjsCxfQadkM|6HeAwIy!*oTcPHLBT@VHXihzF2j~U0HIBm zx@euhUM>^nImT^R9Wb!BeBejb852RYLxt6*r;1N(ym4R>vqL5LTE8?^RAtG&zUz$_ zQsmnDOQ@!E^jC2A8G=S#B*CiBv~Fd`W`d^4Kj#ujJ<-WEyrXC=xy_ zVvgmO%MfohZ54>l+p6Sm8$RR389c)m9ubXu{`}>a+xMHOa7zS-LvS&qw5?y?Q&9ZE z)m07FC{nFB-uteiGy4~}>J9sliO8&F(A#fPVWWzXkN&X`ors0_xm>XRkA<)y@|YQS zF7bNmKX(a5K&uw5xgv3_@>fT0`h<`;*6(Zvfe?}E-Q&^z8v zeA`tINA??V=#VbS7P$Y}DZb}`G!?N2b{LqS6*au-+*h(+R)`gf3Qngas5DR@vHQar zZpzI-rx$tEmfooypRj0GR__g2-7Pt6AC!SHolCE+=JHu`=^3+G=T3?JnpvP|_>aodX$=T;)E99xjOlhZ zGLy_N{29s}Tra%%;A`nu{1WWgS%kK1yn5rI=^LJ(>o|pjjNxYv9*NqtV;--0%(vre zc5O`A0|_%;8lSjn!))cOzCcCoCHjSsF zVa#@e&x20ngGSVOJAtBkERp$$0M6HJWMAUZ4zDOF-aP6x`pTgrj;;GRF8s1{MrEUR z!QQRPZm%o}D$ClfW425QkCz%WRb!=GPqg$auRn{;qqRtaK|r}Ne74W842uo0Z@HIz zJ>eI>jhxTSFF@3DdPkw}dkR()VAWevx$HsRa!0rXnT(7E(p!f%@7+TjH+Dg35Je5T z*bER)n**Pf74$gn2`@Pp*SyXUyDqQy@H8;k$Elu<2TEB;kmu(48>rZt25(?nUXY~t z80u7wQhC zcJiLa2vwzieDL&bI!}}>x7FLvFh@iGz7+Y!IL^m&;^o*xDcNuPPp|(24$5Fl>oUyO zIhp!4@bz+vkQ@NETY(c{PT$S0yq9@d_TmSCGCx^s>5mc|G<+jhp{fy7O9_C&NP_N1 z1W?FK#+fsut(V`ir7X)%6uQjRKiCRQ3hfv6pS(@sd2Z{#HfqN;BJ3^W4^4_lh5Itr z#vZ}!Ex&CQ6_Z?)?v|;olw;Ax*!Exc2Vw6y&HUc==KT~MPJn+KhiupR*ksdF_F5X{ zjOK&f+@brK8vP?4$VahZuXASl-v4#D;&NJR!2ThNZ9gtP~x34$Ia}eiUJ>>YAq|K6 zr0=gl!{NWZ$)Q6E$ov}x-4UoY|NECCKfIfMyZWoi|L`U%=r+<+Od9lbA=^YlRFZrbTh%dZifKr?_9^d?yWP)iJWgYsBX%}9qB z(k=+&jRjpwoA9=UP6HKiMrYtxl8UI^eg$ERC&i5B{=xJ^Ryf}uw>~S8&%X7@AF1&vdG? zr8cbnr5=vf&eX4tJ&^jm++IHf=K8m54`CcOIke%}K#}=bUgn~k z&&iOzmVu@}K`EKq*FVq$7XhQcJ438@rtS@1TXMFnKU&K ztLkv*WmewaL7oh+BD|pBy%9j%j()5yv-c$LHOr+t=9bPhCW%m81oop(;B@7%sle`! zu?syH2es~~b*C2olwZE<#f4I7gW+JeW?{4!G2oiP9Zu<9!zOh%fqh}8-1s5&_$UHQcH2hh;v(Kp-O8^nq z)lty=ZKl)|oXz0#+lPG@4)OxiHjFN?yQ~@mv4UnAF>muD<*%{_VkGbX7@;gyeaiy9dkFOe5zT z8lmW+2U|ksMah)TvWC~W#uS!eY)?i}JLn)MM+QcvLO-3NR{x{J$S)5O-J2uh@ zy({!E{M?G}#_H$^nQkRcGMQ-XYUt@!0yFuYO8y%iSR7WkqOnJW?VJS~K0hDjmqQZ? zzTUOG_o5?t^d&9f$AUD$V(iKgva5gvv-}F7rnsLVjjnzpqlTHy7DpjV`yF)WJ@5 zWP^QPPG4C4HNsUEpViOZ*gnm!89GFNXiwxlzuO(fZzqLg!7rW0D&b;03Ua#5Fy&Uf ziRvWk`T!%x=QB9Q#5V1*gz(1wLr^Vg2Dq33#5ynsT6`ppJ--Qb7lzH;yo8132+LF) z;gmg>4|NH=vs4`QpJx>W1k%C$8*&a3o*q0QWKp)-co89y7GMN3oFdR<@gR^O0#khB zr{^9Qr+LU5)Bux7CcGPZoma}v(VDr4x_YaH3~Z8K4XkwEs96f1et{Js2*` zB|NQARv6md5xj`)EqMkHkOsU{p_f?mwTivq9Uyy7@3$oVg6{Caejdih(F7Mcuof?t z=^Nw2prK5+9jmqeyPt$cUS%q_vsV48g@WCsGv6Q0`k{Zn{5R@x%1}Ad zup$WzlNLD7`tQi<_^^F=0=>K8O6iXG)|Q$a2}R)hgK#8O&&h0-S%G8ctzBexr-tjX z>JhMZNWo|BU;N~=>1i>+qrg?dnSKA1w!eV*-ad`1r&o3j)}{;Zf*B1IY4eXj1^_>! zd*@00WJ(yQ9ti#bNASlMp|O)d^Lmb3Qs?XB4{r!;HP2`Z;ZV;J08jC1kdO(SXp%Z6CgQ;3%#E# zcd-L2XCS$p8PW`ww5dHWcl4dA-(9c!P|#O_8H8!h1*WQ*v*`s>-Sf#!HY9?}^fghy zRiG@P7PGk~+D4thjEte%5%Tou! zUIFE75Yd31sg&n~lSwFRl)<6ZZxXbKrQh&d7qdx^lJg3=^N)%iK)>>5X(Z?CH-4K3 zCxR5Q=MrbbGRqMejd-~~k|=}yXo2FbRNjT#0@MabHyruQt-EmQ28pK~6Iti?<{2NlD>xxhzB7Z{v8BmozbRfR{5DQ6Y$wcLHmm zQSpTbIf;<6foW)Zw5>?;1dRBV9+4Ij7EL?oeC9J7L*y8?xkryfB7?V&U|MwB%Yjdt zxq`CHpmISFgNRrF45-YOLFAc+Q6%8c)!5Zx|3qU=vJ)_P?b4pB5WoblO4n(}!SEOi*21px>kIHF}9yvsA<{uBl0 zvt7-oH{(ojcVp05|4$Pjal$eT&BVx<>SP*&&O~v)!!7sk30EI*UIHc#oe{0EWzY}C zZwk&}h!yY+zag~i{}T8Hse*OndZycqVHXz+w{pfwrn}__J^kgswE%jDMtuqmCPJTl z+Y5#tZr_K5rgJMKQjy3Go_X1MZ}?+@V8s*hJ)Lr3$BtMzD|H}i={nuHjiMHq^uCmr z!vy?cEc`A<37Lt#=#@8PRPs7apK!>9s!m{kXZRE_oXda0k$rG#jyw=^^t?<@6ER1j zh&jSzm=8nqBfYeTXnvq)@R`vA0sNJA3afQKqB@afRARA_rMGVjTmLer4q7R!#;;d= z#NgcmkNr{81!oAF$vc?6&2>nUtH?`nlB+5P(@qU9^0YLV?Kx=$n>A#ly_Y&{u5Ra%*{ai zQ-q>&g|iEOLpeA=*~Zlj*kSq*91Kvm6kdn@c99aX^|7?!-=SA#K7(7@iuLkc+flfQ zPlo#-ClU&!1rd}cF1tNQ3|PU$4V{xG<=^kgvk)TAVHWzrh!OPq@1KJP8QeqA#hPc~ z&$Y(((4MQMx-QB7+@qa-;{{N-e8UiJP9)ax`C+g0PCZydi6n{PwEIm60+#^}7JptO zx2S~k&c(QhgqWkyhvfIE4+&lhtbgsVn;D{gKtsM6n!MK)yMekz3(dDhh<9}uRJ(Zt zM~quuXAzW)*qOl0Z4`x#(0VdiA#}dI3J^Fr-p_P;oyN@sv}P;d&65Mu!L?mzqx$L7 z%mI%MLm9|G99dxb(;W)G>Vt>ILj+Qu1|Brd;CI`J$r%}?~Nk42*I zez7OyuR-I9gN7BjK_spHxV2(?8rI3ew)yJNO`}fn3#?l>gp?96oStzu@GRNZZW%aY z&#p5S=tQKhLd@?H)wp$z?3021wd8A@UN@7$< zi2wk*vpiPe01`hKB%a5ACR9ix8pd-3%}O8KXy#TtWk|jzGy!fY<=u^^$l0_-Dispu za4&ErjtCLfRdr-8%6*~h?Q1wp5@fGn3DMcqe7EU6>H_iI2;5d`4=Ho7rExmjg@MYc2;~P>6lX*vLVw>O& zZY&hWdFWbeNOjx@){w$ly?1Ko{0$S|D4t_OeCs@vl%n>+TPf}jrt5?v4P)2`aH=EV7RJJ!fR&kgUqKTc!*l>- zIA7ERG|hK}T%n3k8}lERyF+O(-mhbH`XK&0qfs7~2J12ZU-j5rZF+sGlP6Xsv%gmdA4a<2+`7T<%wPD;lHkDnG>_~4Z2iHIH2C?&o` zl4F|!WUQUlLb4GVBAR1-?~6(cC=jg&IU+qmm&70z`g|&V_dFT0V8IaDl|&95R_sF{20sGWEIu7NNf2ZyWIY$%|`AX#2VpHc9Vkanx{u{K9;4`q)dNY&i?O0LFrU879rK0 zSJ+;6BPL%pj)6uN`u-0rOdt(3H#qlJ8%S6FLOC)ux2I;v470$*OJuG-LQFhjQ=haH zUx)nodm60IcpO=wk48Vaf};&o!CQ3OZO*+Kd2~InvXQ~^BO?bft}wNjX`2^Y$&=MS zY&hBr1#kboPgw|Y9N5oEWP`mY@u-F4XVsXZ(3%|eB}Np>oj7w>EU)Pp<9i6HX%+nG z)YK`E)xW8WSdQCB09_RIRui|*Q}~L!bFAg1a%y_S;b$tji;odwkVyhB?9Qy%Anz7S>$f9 z@N{n?NBjZe8Ulz2kO*xG2_+^X^O0_54jq4Z=S{zX(P_=W6yf$d&%HS%hIJ!JINoE~ zQ&Kh=axC!C%3cu5X0AYf;dIQ$b_PDd2lg_T8|SNixU@+5JtXt^{|op3cHuHjgS$Z( zgc8y0*#d7I0lqI=t7z<@bl170&@_jP%YvXOW(DGjo%M=eKkwqC9w7P}81RlQNSVWY zD-}kfD}i1ZqC%Gxv!73~hghJ1A2%vacT^1QsM`rcUZ-191LqA_{s~MdB9Z!0s)PYS ziw?%5NQUh^jb7%5^yAl}O2QhnCB|&6Aao!{WD8OQj#qvg{~o-8?Nqg`)vli_83ffH zA;2{{grLL{a;9h^>zB~H>FUb3cE41BFT-N6#b-VU$0eOe!iCbSgXQuzpx!z^^5?&gi5Z<)DPCNtV3TP#B({r&9q zZh;aKWMzn%Awa!l3}#aaqQ|q^-4kYrKHnCoLgKDM17k^Z1I(cnx&OoXAo^(Y;a|o8 z2&lHPiSfwWzr*UX8X8-L1vU;us_OWrRWA5R3msbBX*C+%ZjtO35x$;$bO80)2&}Vs zq2QOtkD6Q%+l*8%QXxBKcgi!pcVN61o zLLo5!VYosRMJU!sP)7@g2vSH(W^ZWyO#^pGLWsu92~=Og!I-x%sZf`sazeAE#wcKY zZ#W(;WWK%OhP|@^PMr)ASI@(V=|8mfhjYXwHlRLA*b+fb{ztmQqdxW^W4BR_9 zyI=N~;Auz&lbHztV0q1|2Ao3RT}u%`o!*-`%fTv~xOOI9(~BZWyx2y|PJ@|Gu+ zl^c{{$e~L*jyUFPYD8J_VKgTAb7Tk6#0JVe1Bz~RhNv@J>6d?Ya@PGJ3ztdSXiPGe zE;Y68; zr9>KBf=@(nMPdU$N3K^x%L~ID;$$3}Yp(zDazEs}{)2i-A=1K*Of-2>*zo5#N=989 zQ6NMFZlh%4T)hN9V&+~K)D$15&b}Cxc-C60OzEqbk;re04GqlV(+lX-KGrhTtgr)ICQ%rLXiyH=(sR z$}UYqt8=Haa@0D8Yf)ywrNRO~_NA-7`UZsdwn4bGYKtIYVf(!^A&M}bw5@z9&hB~j z%SqT4U7(t~JU`m-@ZnPG0^0tI+eSkb*M+iCy#TkWz;7l7+0IBY1Oja}c@}2|9u?ZwD(4?!oe2E&2c&do6{Lir z{#yDyQIkLHZrIHBJv3C|B@bspLg*J*%rT%ZRUfA5l>bS|mW&rvGmY;3t=ue$X#g=qR9wk!ptf=B%XHgh1dlZ>CjnL(wF$K4TeSW9 z^np3u{l$bi1;?>trN@v0`)X`9%gc-pIeJ+~UN|6^oH;|6I~pC5S_a0dM&Y{OOTm#d zwZ`}ej?>ZV14)3NjJ?p}#c|1GXW@C(;R|Bnh|1Ue;N!}IFVR&e|3Qo55D|qqi|WD` zsnd|h2-}2!AL)W>r3^W16(+m`eU&b#>Ch+LCLI^qi;05HsV``inwS5+w>Dfq7+23KIY-F$WFLa#QEN^6~ewqlzn&ImeewqN*nhPfk7)Z$1j?KuGy$Yqd2xo zp~|pAc`zuCeA^4bOQ!ioRSu=7{vz=}>&B9(+iEMuet>-1+}P<|Jp@z#fzVr8!}@aD zzw$nH#l<+~C6aOLFvr}*u2Czueef0($W!;#uZHE=jY+S2|}1WqVKhXQx74I>CucSnp2zym97^x zYATGV`>0NWQVprskb{TYnSKgyiEC`eq5cD1=+YtdW^{yNla4CE8rgBFV%CY@Gg@KY=4VDTNzn!Gbm_;-&tDgpsOe6Wxs+d;ZKUeJrd$DOg z;^R&zgZk;L=(*AI+<^z!1Poi}n8nlXql^Vb9F1Xas2`)cP_2P+Od-}d(7-ZcG<>jQ zX;wiF(@uIYqdg zRL53#Ao=(Zs7Qhx>a8A1eVu zgWA30pTF>PaFdofj;`8_-SCsWzdFSC1JWid0E#wnpX8PjXtyU!=On;#EP}&& zeE6Djs}mZddWn^yCbY`o_>I-ZT{bVB1SWE}_&KIXr>GFkfQ~clc-O7EY*>jVQX?8> zyRWd7@-_K)$*ZJ+LH_m&s{~&<|B6_@swuulAIdj8R<_mBI)-E3(1qFEZ>r;Zum;nO ze+X{#NPbmqla~Vd)jb_-^ku&<_L_YNc-utl6Mfx~xQfdlTL)iz`IgTD&86#@?oTOa z=TOhxn+eYs>KILtCRUY37B~|+MnyD3OuzyFR~3?khM&r|-18)feWh>{rVWO?8^KWh z24(33wtB?tHD)iv5)wm6>_f*a8y^0)9Kk@>=Fby8IVsCu0Dt;Qrz03hUEpXy1;his zjo%RkL4@zIAUtGw^|MV4{CwGbccIXQG2d@PyP z85@7};7I}yJ^8C?AAg{o1DmbTQs)lf z`BuPyA;2)H8}P|FqkYTUkrLo6R4vGH?x-41LfuP7T)gSF37roGb@xI30_VUQi!iw| z6}Hm+0>tOH4pYiu19WVXBHbwY7R>v_S7!7AD!*9&XLf zLaa#$qM&hd39v7cTkKL&pr*3aE1i5-2=8hAT*3};4t!90(%K>ne_>`x&9+wNCj{4- zgv134YXO9Y+|TED!Q4Q%o1|hSmeRU^PL||8Ud5WCf_g>o-s@*~7!W->H`WXZOsS>T zpLMr~JMu3uMne+>#chXGd1Q;Kwpvv$K!0wmF3BF`#}7gQ{g98K$Xi}R6n$Rd(1&lS zjsY5fR-qt@wWTglzOE1-hdQ{S7;p@%c;4FgDC4yoj#AhnX~n5aJ?kxGQi8RIb1_y> zvmt_L=2eN^@a2`#F6__|`Z>-uu6QtdELT-rPraQXW$E8c1LO?L<3M|g@}t&_a&AaH z0<2Y)thFPP;O99+IeC%{)89U{_8?3Ieb;lM+7Yd>uUj+rxk0E^-a3PVaNX&<;x8+Q zF%Qa}MK!I*Y7mo}I=D;`M$r#O(OX5)U+RMR?O6|$v!h(SHF`3yYgumhuZ}$&$3VWt zvnZaBih44-pVL|u$YqcAZRlxXn~jh~{*KD$uc$}Qbcw?RWA2^BNZi}kdu%w%CMMedwiVd=#6s_Mtb?OVzU`VySn-Oo_l$HyMJZ#;gr%cjx+f7y ztpXa#M%AKluikmWm3;~dAz;86ZL9xWyFlSTmDC3LaJ+7wD0pHs8glIiYsJI?PS&o zica-Il+s$OOBwnK$$V6*?=_@qN5(H!&iP-}fk9+SCw0v=!&4vy={NiXR38(KZ#&Z& zpspT0+kgG5aWhi6%Js{p9l;=s5O@>AUsQBqZJsON_L}hs0{#X4fNDrrM%is}O2`n2 zM)!i;OvU+pDss|TRICRMZmkU{W~PlAqA!`6evNw0)RVLVCC76B0$7O+YF*%><>V?8 zV!jhda{+~DDA{&#J-uf$P&Ha@W%#4IdBcMwoN14SNIa~gYSFDY#>3I9lyV|5!T7;w zYt^onEkOs2O8=IFfH`(S>`@LXX2idW3=+bzIe!H~l* zU=e)fw3%jA8{GEq4tb%|1tKKaQmGpbCL(P#E-hs30`9?F`#J4pkGGrC4G*oGuJim1 zPtE*t`FDtrCOX2!qfw+?vXwZ?08!1OicwDl-STO}!9=zuNvn&h?6|WCk4s&!+O^;l zoxfa0zjIbXyDucq#lF-tFUr5DGS;x0?<(#uLgt=Q2xOK!B%Jk6#<^8xbWivSFYL!$ zj!X@w@P#Cli&me9-pe`9+Z#+7o zHp6G?uj>l_{J}VK`4`9;|Jn}Q5qZSEle}G#Iap6qJZh@;{>1}g&6DT-j?Zv6t_$jv zlyq5l&}H1(aQi<3hFe_qEMUEVo?{fok3Am?^0_<8qrt<+4orh;VXmZ@^sRR<(C2Ji z*YNN>t+H;_$~k^!lS?_IK!4rhH54gEIU|@|sH=ohIElKTKrG`d?P1j@pUQOh6yV z-7*HpEE{CruhQuQ>!IDYHIgC<_vhZ|l@zQl+%(l^jQhO}I&_ z6Z_e*ah^%#y%)89?HG>pY(!LmBD7EElpDb4$pu9);KDAs;{ zPcqjvn0=LeTC8B}Wz=5BYaSjtL$Q5u~(bUo=_P%FC4S=y4izLaDYEC6sx^Jy=UhLW} zf6lt2W{oNVj{MYnMCDBxkAz#Aa!ap?tu-dw*vhOE23yQ>5S6s1u`Nd1DXJp_TCDS^ z{mDb9v#^_R=4rGu-(|&I>IKAFMok1yRMkP#;)083Bieovbv9ZiIj_XUbrm>CjISfF zm%S`ymoLkGc1`g2-JgIDVVLNZnXvt3LA$OCB%gm|h^5@d1Xt0nTZYtkRezSVj$*TI zO4n;0VZ-FP0zdJ5>62A~DTYWUq!w2( zm+TdY>R>3y`AoJ*yVEV^Qe$Z(*>M?9H)K>e7GE@Mi$sr z?3`gaS$F^wGu%%6UdtLAR|y^rv%3p62!nuEA8x$RxN)qN6|>N9fUmf&9BU!ommvg7 zW9?|3NM-gms)ma1+rHWQlP$4YqOBO&Fs{0Okrn^mc9iDPgc!I1Ys=}Wz2xlJU zk^5OEX1yt}eJ~U)z!dL6_r9vtW}t+C1|8d)HG{9j*b?U|fpbvwu5kyXu+<`W0v%Oe zypy3Vj$0z0NUoN}vov*&>S!~ny}s3|aHdf|H#Jl71rXg7t@-R^BnZO!M!x&EUer`y z8)+24<)m&&?9a}M<{`C?%gq&9X`-4K%r7a;Y;f^0yiBTfMfX zqMezv%aQ2-VSK}9M&_&csR{Ylv+_KVvMU=aM1bFIn1+au@EENfD7>;qX^_uo6OJav zn4LDjTtO>+tZQ#MBpaTgBYB5}wewMrt&=Z)##m4@+%DroUxgY_l3aoPvomE3%bpiY z=RCG)Y#2SlS(WigHmivEhm<7PW2MGlEcQquBYDo}Z)x0I>9_H9P5` zAXM9>Q=$lYbmcPZS<2a1W5!q+Ji*PpwRvL_2Weap17exm*1inCleg_iSv*=PSX#Y+ zn6f8%PDEp;(^rn(VGgfs9amT=v^k|}PJ|U9pT2a%h|Crk-X{Cvy?EBkon!|Fvs`D` z%+f2!Z1zfZ*lO1gTWgY|#Z=s(cz`1M=(T33$2Q9}t}Mu2>tg(?mHGix?P7}k$9q30 zbSg_~#0$C3m@>6|uqm#PKzWUlsY(rYbnI6x0=DVwxMF;oi}FC~cf>?jse$ZN!hd$E zDvqkSw)NPR<|N}-LXr$sezby19s3P2?6caLKGBa9%nL0?AM2&$O`|iUi;9OZ;q*>v z+t~7vj&TthWmP%TEt)tbbXAv`9xP{Z#ZB;rmtMo?q$EE(S73=UG!;QPNlhq5>ITti zT3Ov`Mg-mXtKzr3uL=}4fUM`}Qtz!lKk4E*^;Y~ksbZFcf+A==pB0(+?CFIycEj{; zEbM_Qdo#LMI4nX0F)l|=ahZB=WNQ2C@XO9>Q?_2mp{y!Mro8II>UMFM#5qXFr||bX zwLv}qHqZ~j+?_4rQOHHJ*MVd|DCKH`UEFz8_W+3WkM*%(6N#X=WoMlHjC`;7_lR~> zD};%Dnc^vPRk>CdiG5;;x-m}H8idSpy5I*>lC-fnYo6$al_{I!#fnr_Y=ufitGUlU z*xqcee+S(z&kuK>8G+8M8-ZzSa$`#%h9);wO_?;O(89>MeSA?&rXU9If~+45tWcE0lCeS?>Trcyw@;kEc(;>n#5&AY zR~pF8bwNh5%A!dSfJo{4S|neIXe^A3$4Kt240g1YOA5~_{)WqOerKpNd&N9HK=h&J z&1dz4PvW-$2Zn zG8=+0riTZd28~RpnNR*jG6wlnPyMSTXR<^r8v|_%?OK~=@*SE`AHWkxf9!=U*GLb= z=fbHkSqblLj-}u|P#_U2HGfg2K_%n4-Y;1Qkd3BvcvFZUzD`_U(oX!nr zoRSz%bpFv2p<)-X{#fPGXOb^B4O6u8NT;Pv7CC@W?@l4NuH-8VY@a)2ZM5eV+4I?|sRiC; zxlPww&}E#lxrSLI2=fW#AeykhqA~)3XE0WOmL(tWN4+1m>mR)sWhk`iu?MWn?B8!`l&xn25 zGr0xD-WUCsacGn~^{@%nl6@YDXpC>2l2PLdCF76!_b&%4RERqP^HBqtDH*jIx3tAT z@AyeOJ^FOZVo?MJ;?VS-vigAwLh{Izkbfv_33pp{>oc202PFSXa|$wXYT=>B6spLK z0PHiW-KAbD}t_aqW*Nf0Vx3ddzm5 z2#VP3EAM zL=Ha07+jey{~_nv;(OMLLi;W6PXj$3n}>~9_=m<6RD`po?qnX!Q!P|6m8vr%#Z?RB zLGd-DXBYnh4`)%4bCX6O8R>b{>8=u6Bx{%#S9t-0$99!_(&h_0pd53=z!fLyeA)lu zoOi(Jn+W;l`xl3)n;m}$?+x&*E=X2UFoW(z{o;0F*2@*=*T2*lzh@iy zkCG7jkBkQo&Y|8d0R>#8AQwr%^onEn0G_L0OvxpQG39I~Xy*OSBP&#rL&_B6j?6y% z7}U8RX3fG6o$+rm=eBmAq0dD!GktQvX_y2HXKv2BXPY5mxSG{G=#i0Wqio&S@=S;p ziR>KM`F}8GaAVN!B-pwa$>u%Y5o%K^)5WS~OjY3Wd@~H?opKN${@t9v_yEVuj`4;d z1S$FsKr&-)Smz61x#gsEmC^uY9F_qpJP_1&HuXc#b%+%EnBe!Q;1IVq48MyBn2l;m z5p9*wMWld>hY|F6*eDfR6o~QDmHTI9d#){V)~0FZaC29nymnYz zuhWfpco2kP`b&l4*eX`;yOX#|LD^3R57m@!@^PrmVcT<jl->vvQo>k^Y!<{jm#bR4 zFb}v;QQ-$+ZIa>8EK-W9b(V(U-_YDM;?F;6*zY@Q`e;?H1((~w_}TL$tR&Gc<)rM+ z%aOLq5Y25QTc6>myS1GcPxPul;)0ID3oql}j?4 zzy13h^$tj<>%4bYH$U8X(lpFbm9>2guyY@r-!_zZ)P&MAe)93|Np5bUCMW$A)?)?w z$RGl)Cm|6;Yv}q?0PA64-L%Z=(@ed~{15&?s1Qm{IzCd$ z)Dgoz*}nBIQ;``xf?&>(e3N_rfEzkTaBHQ^Vlp>@k8QWo>Z-~p=+-pelztKXSg zFJNYvm3#++6to11=Nvw}joy&r1q3C4J%46lpj*Ky%?K!9elA{$TAYtXJu5!$6fu1Tlr|<6owl#DOiwnX`!mFPWQ^sj%xO2_|K@*y;B%vZmC!e!UGCW4E6{ zr`T$7wjtKgdV5?Ag%g52)`;4$Fuq#9SAQV&Y0*To=lz@14slVsD?KV_cZvvZ-g(CO zcTx4NkA{`!{1VA+g~5;6L_}t4TF;T&brQ` zlkLQ~@lkV}(2msNqJ_w!lj%B?8bqdF>-(72=50z(R$qvpu+DJpm=tK8bQF)j376>j zYEAO#&)6Vi?KP#lz1jnioHGwQhIf!f0wo zEMdF(8Sm-4a54PH>)};Cub)MA&yl(w7z;=M7uj;Y`N@$)mU^GSFZ@_x&z6w>bhlq( zbW=U=#Z+#kdaJO+zKttwQD3#K#T-_ZqCMm1ZDP)U$ggowJ)yl<={nSA zZdE&})C~(T{gY;|jMn&|2Y(TJ+NbtO{zzShb#YB2?atWymNDS2-k^$!VP7nZw*KR5 z>AXX7MFJ}MH)LK))S#{FE4wFmy8YJDe3;lggNhfgIMn91g!LWXw7vGzOl3NC=R9S4 zN1*6|WYl!mZRe<~M`Vd&6Gee`w}Z(ormWFceiYNB<#K^zFex;`EUc zxy;Z8FsvQv8sttMEh9BPpfoT8++ylbIHU(9QCY<^Hg9+p!pcRpMAA?C+|g>96}=Zs8CfESEwX zciME+qW}&K%OC8DmYH{1fcLQs5jgBKcDGdC4V{4XTeRQ;Uia6Mbay(6E)3(Dn=jhrC^MWL%iD;y) z5|(Z$KSXQeNG~?PL!Q1pX#e@bOJl~2xhtVEt#531FWATW7?B@;I^-A->7Zaooim1+ zxQpa#f%nU%P+KTrln#onAZc^L?BPuc14p?%O!&*xn4SyMW)5K4tq>Hf4_Ka9aO@HlE&F2g=dgs)q`eIS3La;YU0UU^$DS~lG~=jZ$$8+!I3Yag z7`u+=BDxHfS+iq(y5NC=n-5-F6z87oy0?hRYW`jvBOM!6lxMsg?|CTrLjSE^_r{MD z*6vJmy2#)L-$vzO+Eun?o{6}_`kynux@GU>#+h%j@Rfaw5R!Abm01hX(nZe}nJS7e zn=swBJ%y-`e!(o7|5u;5OBtK}CuE*8)}5N-WU}K{LmuNnd_DO=9y)sN6*r3CuL2H} z{2hS$@@Vv2tlbs!)JPZE)(NXmq!WXkjg`e6^-tiY31{v}t9y*#WNYDE(NaqiS z_uV;CA4yao-p9%28m5@=aHKMOJQ{7Vmj5-r?_H8-{?m6e=TU9xmcFQM!$@}ATYUx6 z`TAx7`~}N($XZ68!O{8Ug?tl8@Fem~JFzN_sLTnOM{8f>;iry|%eOs~vXm#WV$lr( z%pUk6=ibn7-4trb-YJ;g6pX;y@DAZrJ==7Cr(LonYGuy5Xx?*N!;d1?qBZ<>`1PxM z`>h9CJ3<->0m&{3^>w@?pG{SHmtgkvc!E!x47GJ3VX7txq1bsEQI>aM@)bGC+|@$u z!XCzcA2n^kxdaLaIT4z7`?Tg-25em(*^&3-q#_O`#&V(8kL@+QynJ?t1S{4uhae{B z^|3NyYv!%R0j4~->>6K~51_Cm5i$i*3M7}tP zZ6sP=+~xm;?Flfu;S=*w5{ci?e>@H>#pv}b)yQ2n5(p+9yro1gKr9fk{T#*4eSw#z zkw5lVkHSYD8s@2+OlmaOR5i%WmoVT#echjim5wAl`Oom2uWbvtHGh6(Mf}aCC?N{= zK!H{#O`V_a$HzlkjpyD=Iu)0zElk8$=&zD1KVT;LcEFOkZv4Vqr}?)HjX{x(#JD&s z%(Two`<^HtA>~dNxM{+{_kj0cx-B_Yn?@r+G8r?PT8wM?U5MUz{)wRn;iCP)tR}-1VlG=;Gf!g$qH~9cszaUJVS^ALT@sNn)N1cGm{H zKt_|#3rzpLe(qeP=IMmo1hTko=Mxz5JD|Q>78h0aS<-vX5re61qZQoh<;Qp?=N5IG zuvL214gy|c)9ZS3|E+~uua*{*c-`Mi(6ZSjPw?tojU z6B&@iH?(oL*-;&X$*8xvm!qvz(B{8#2)VV}VWZ3em$Q!jx)Z~j+fjPU(~R-Cn!=D6 z-IM~O(k-gH%W9q-Nl)naHyn(bhu?*5c$YGH@SO~OBS9CS#>pOvVPulysU-d5TA+T& zz-q~_C%h3b=~37|{UXlM-~PRo%LQn9e6*PTr;dnq;|8j1InV5Hq;3?A^9y)CsM^W! zK4MR>Go@^T-7T|h(3?4}fyzwUlj~a=F#V|@P|I{sup};!cX%iU@TU^MO}NN1dSV`k<0oYF8~UZ z!uI`M4TJ$oMy#WR-}?MuNAkDQuwpAOuTYy`g4~b7@9NdEquD{<7B7^t*^@j!*yZ06 z5#htPGLmxtLB)WsP}59rAt2j)?JL)UyFC9-hHGhWQL5teRdkz+(my3hnDatnU;$*0|c-zmOgeotJXdK3zW`HGZi%mfA+C8O8si**v9B z<&3h%OfHZ4d!H(?I8y#tH?x>K4X1lA=}}c3s3Yi`DWV;Iz^xz&K4`hp4Abg6)FMB^+d6T;IGmWqn}&JSG-2zEl&{g+HvBo|XOE!f71xfeCnE3aLP^*B78JKH{hrNGc>XJ( z-w$h#_s#Bmeq(=2rZ$OkWyLvrUurONSbALD|Guk}z&|?IdWFK77QO(z*_jYiAka;C zc43DzR-RkSHd(n!iLmKnw83jeZ*iiSo{X$3c#G9j_qf3m5dZ96F}&kJ_ZfQRHTvAw zGN5dHUE>~nuX*Lw^Zupr!z}CgfmP3f`ZxaLnZMNL&h~vMi;BVXM zxY1bq_%7Thu+9+U+tav{$lW6v4?0o?_(-z(f)hi!3HI^P*b_~Q-{Yl~)OVix_4_$+ z74#3DwLV62%)+#zvd#a+qe6}IwIG%w47D3KBZ|VJU#V}(MG3O}N>B4KLVtgOl=Xs| z;}ay_c_PD7)x)y{>CS`^I}QJ z#PhE(PPIec%~$!;9ViO&aXUcPELJ&WSYMNfObEU7tW^-#i* zDtZ6TcEcoGRH}c#upCl&d`F}<*-r{QV9RT&6VWr%xK(JcH@qcnkJuycSo==3Mu2^v znsSfnUEx`~lcZ$=Ht6zL=Tpc4V}c#^nX=##n11AfyJ{et{lajOKs2jrN@A37BsnY$ApGV}~WL{#UGy^Z+ftKSeHl z+W@}m?ow-!8GL#mVzi_1uPu?HF!G>}%)Em}&VUp3mDG4H5?K#Wh7=5(rAHwIaCJ!OTYdV68t%IPQ z#!37xn0|Uu5{0n?TZS4)gBsLl3*qw}k6(F(yeTyb-n1DTVS41u@&%c7-T2Kj8!lrA z#VeC&`U`6OGN8h-jRnQv(MB+>^im_3J2=C{V6lp#Q)Z!>GlVNgy@GSu@JY|;L)ee< zPcK9cC9TX1F9wFl(3~1C&j%g)W{O)WH;GozqE3U+Oqd8v_?3e8KFhT$snDXn-BARX z2B1G4ut2< zO{F?H?fOCn^Id`%{|D#+0{;MYa=G;0@0XZm|7RIDD*&zh%wl?rFZ1Rjhr7q$Uwg>$ z^$jbqmuK1xhmh9s^j~=DAB~QFax$XUI7=FuxCwJHM?pO405FvU>J%B*iMJat@Ok#~ zZ&RldJ8iD=(VJ80fMG@gC=JJxH&hW4^jC+b}@@DCji72yZeie zC%_NX4dg>E#YsZX(v{z6hDB5?otys8THOIQeqv(DN7 zoOKEjo-*?(+Cx;^HvkB$!M~#)dW>?7pgl&wvW&MHwvFvbgZ4b6p(`Pj++}c8Lv&!; z@y|QmpuyG$g{MI0O&nwo{yOKOaq(ptbU!sfs-1vyorn8$htAA&7Gh@x{1M!F(3ZEK zjo98m#yJ*0T&QmG89t-*)F(FoE~J|SwXi6cCgw`N!iG&vgmphlL0$u@(#ULh08+rw zMqjA`As5;>8FOhRtNQ_OjpB6nn~^9Or#%SzuN1q@IYe6=R_;J*@dksA ziPJBOt&xHE&MO(vEjsB)WHv4H3sw@HyE~ZLR)Z!<*5IA4XWmy&MA<1SFhmY}onKE5 zjsea8%$IH;ipg{~qR)tY%^Sp96W|%cY{m*kE)8`P3Sfn$V<2R48X+uZ`>)npIdg*^ z*SkmPc?Zjrrag&w5eIO{Uk-M01ILmam%5qA-P+f`KkQE3yLTS!Gm4%?+J7mN{0{f9 zl9W!S75r*g8rR=Yy8sJU?@XFp_q9;XY$C5Cj3gCWTWJh5p^uIZ1V6Y&7}@@dJwI_e zvzyaHyF<5a>l=^=D!{v=`NuAK?B&(IDMTOpv;t^p6N!8({3FLQf;|a}FBvRFrjzaf zzctcl1US3Sz)|t~5NlkF7;=mPk@?fQ@t945JpXEp^^0L?(*K9Els^&CTTxu0#J^_;fX_9bJMbHpg7|*flQxff%yAKr8Bg@Kbt`D=DM2Z>Y#}V(<8ac`%Uuqs1 zhPfhnl#Azzg@!g$MH>1niONkt%Db|?D>8;9jteCQXHjzd=IW%2w;_a!Mv&Xhy$Whf zThhXu5+ZIg1iElGN=@tLW0-i?3%(;CYmDA~di24~V4VFN7+9NuyjkE6+AOlj<>5=? zNQKN)xciqfzB3-}Y)QELrY;@B7I30kAk;ye${{TO$aG~z9Se`E+-@nm9ZaU1K-``{ zTz@k3Ky{As2hIj_Vr4p?!3%js#78C-X?q%+5UU1edT#ZCrCk@RYmaMTdUz>uHksI`W-n2!D|K9`?{3vMj+-XaCS+{KsE( zQPQ&8cA(`B*V|VhTRU6RyY!tKj;=3r&tAeAqMECXq%!}`K1Uqa7>r^vDDM6JXUYZK z4+vPa^On^=y7uJEs|&AT*EC7`0mfhN&Yj^hzs#K@j||v zP03pE3~0Zlf)4jkr?qx-;2^4L?^)MV((?m8dV693*swW+omT9qFqcI!qiU`#8Jb1+ z!f#IjI~o3WVnG|!jfk4&cpKpTlPm`{xbCCsWIep9$~anDEFuB_q{?< zsj)a z96PhVm+56g2E!kmdk5ywbUmo$3EJc9XnUd()%B?GQ^D~@8-OUJNY;bo{uyd2E{OCM@eq2N?hw2o0ZPNp?A=$W)5zErr3mK>5LW%16EwI_ zBM(J5&6JZEtbkvk^);UKnm@=j;9T3)1fn80SKtvLrvxD?I{imfj6!zx%`%)tUFM&~ z3xBV1pYG%}ej%br1f>KczG`$+0IWO0PQai4RKn+w%ug-JNESJ*2N2UQcCv~*Z0ti2 zeVnr{M6{PTMcNJQkRa>aa9r-k14L!Mv*J+|VCLr;y%CsUJ%aN%&^RRMaEJGNS&12PIYT$jP8 zsv2>yiv8joEpe9oLsd;s#9^@7&XTau>H~2CX|T#BIHD#m{FD|?np{(_{+3N1ddwA zy5S=dTOnfceC5mm_z*!pgUmn2Ed#tTLHK+7&;DZf7w^?Ugq}X*W$-?+KoGGkgA}T@ z<#1EW3_ONbKrUU&b~F7+x>}JD=2SXS$gRiztFq^(gGmuVRVBni{49w`)qwKKvD3c3 ze2V-nWgnte6yV{d=)fQH+PYsXV2t?dKW8f+xED+3ZAx;W=%_*=JCcTJgFnZ2mTCeV zcW6{7!q(wtFAq$7193qWe8c&{YjZPqpmONqI z@EJK%544?e#Bfbp;7SpSNjuo@H{w(2GXlQ;E)ckCK~wSu-dyQTo`x8#Aj81lCRYP{ zeGK9@j}tg+K08&{*<9;Ob-MXK)1CjVl(=}b%X7u~UmyGsKEIgdu?&(w$-zz#k5{`# zwLH>KU+qab!BU!vkU?M7FR+^dsoU#n%Tc&gB@#;m5!EO>K%Ta=1PE&90Sud9Hd2e{ zWXlLNQSZ1cyV1cocW7!MB0rq=;1#%0YtcVuKWfCoSrc>OsPIjVg@ePIf-L0QjlTyE z!LLMRCOec*#&_M)hNB%er2M6)Sz57Fv3=mS(|H*@(NK;^mS((jE2DrYke5VIu+=nO z+$t18vxOS_Iuu-QUO_27`bSOk+hG#k6kMIKjr$9kEQvgXMBM_}tzK7`D32b;VVJjc2=B0=Eky#kIxriE2t5 zkFMPW1tz;0A5xx;enr%@7rh^QUJlEAI*ULnys5nG6+4TNuK?A{*GdG`!Y^voyQ&PEerlSjeq4H~@X`K{Sg@l7$TXTD6HV`8oZIF^UOjUhZQ{ ziSKrSra?lTQH&G^8c~RVPCe@hl$yN=D|-Tzj&Zzw#iUgCcj%J38QS7lIK2L#Otq#>!M!(^i=-89?_GBN7t@}6mCyRU%Dd5D`FUQFns%ECHz+j z5;lcnlgE#|$aau?iRZ#~5u}x7hd6f9log2%P9 zG4u(!+j@@SY{Zf4#7) zNZ_MO#5EoBks@RmdC&jTl8a+7I<+|U_fPFcZ~j=fZHmrK{0ShZ__})g8{JzwnYa-2 zr#_?k<{ytT3g7SFCy?u3*|@p}i|;Nn)V$e-^Ak0)BvgrDBUSl~*|GYA2^WG)APW2s z-Z0J?!G7ev3?YU2E(F+7__kkYW4S;prIXf*eIS3TdCPx)G$Cj^Kh#mb9A+nwnP?%J z(Pqd=b|JVO{$Cy2*;+Ri5I;;jq)!PuZ2(vXQ~kAv>|z3?Os z%5k#(Fu%XCtn41RV5FLg1mqeRraB4<4~NIOX!bHKZh;+=OEv~8vNxo5bR<6+)qt>Q z0wy1imwBa^=1af6958e@w>02ESL&i#oP+M|)Z9VF)GJ5!hD->%y*%Jb!x3tkK}4Y} zfHyj!oo-1=NmqrqqOJ}OCBoU$FExHkAQMbq*em9V#ISfGMDscU5>^Mqm<6oCV9s6g zhDvi&S1^e09bCX{qNeBc%X0-mJ*z8}%hNI9L_c5uUUA!nfKNI1^nqy);@AN`&*@~S zXhdYZ^7`+eU4X=*rlZ7;k;Nm-QM*X9~Wo`I}7DNMI^f? zcjPClsSQ6ono?Z{pvLc+Wa6Cce1!pk*6jkIH3ojnu80=$cmLnRgx}rw4x_k0rBDMI zYu9e(U8*_aMAbCgWgEQ~$WwQx{{EgiGHE1AtT98ZXlfu>j3TBa6Aw+7T<JZGH)hNrE6wl4e~nl7?2AnEn%h(~gikLo;Jc|4PdWikRfxI{1; z7n;T9;lnv%1G^%-VbOe+!o*6p9?bo7yzC5Mw%*|0b`7yCx3L?1@H0h|q|6^QdMjWC zTn-}$g}RvCF|RKCGuFQh^v}2r=Wq?MjpvW{Yd>;SdK6y)YZ_?Q#K1ab zra^baLyJaYwzO~|M(sses%ArB2fmga2CQm=K0lyG=gpxI zQ0FeqCYA;e=h==2YJTsG9E?j3P_T#(Ud*_M|LTV)WuG9wYvP>QFdHN63~YgA1LtfD z^5+WjI0Y(rK6Ic&26A= zjFXf_yj?!`Qk~4QMEu(*n+HClHD<9+A>DwCSOebfe*3Fs7_BmICaAkPX1#_DI|K{2 znH)YHwf#DgsKRNMb_R(mU-*NqMo{$3r@POFpI^X!wFgah(<0p}g6y_O(pEyNtHjkN z`R*-`yO-6R8Lrla9U^Ej2!^?Reoz&^|Kisq%#9(RIGtHyZdOZ)Y^B!_}0;Y=-QY@?G+T-)l26SKyf~2AHSzfIuc7x=jj z`7JHR60x)_JyyfB^gUhCy2n(pm|fD*h-|fslu92X_KJz8Xl}5_J(4W#PHdRzQS03c zv;H^N6A6w8ez4 zbCc_9HN64TRO7(E*Ol`8J`h{uz~$+3r-DTkW=K1O)p@TuQy;hkWj(O{UY;m16_io` zHmm-#(Sq4f1BJ3ZQcOZ8Dmxf&^dmXnv;62)PVnFU0de}YVY$^=DB9k8gEnUr1SYIL zcgDPL1B^8R=e)D4RA}DRbRI~Zlj?D7*Y16LIw<>XGVLS_2>q8a4#7ud z0_ZZ>7wm@XjjMLAk!^KqeS`BUVN{`4p7q(RJaN$docr%=mZ7YmPY|h~$3^4F^o8iPax$UB2O3C($zoXZ&>e~eHZ>& zpinFlTW;hH&@|z`fK8K!z=uP?Py3Zkn$}wWvF2bldyceBp_OcF`}(wmo`h3o_1h7V zS!_9T{E8dd6%o_nWzHv?kLSPQF%Kh`jcr>@!FOjFeBYyKF3h|A?hr`@s}%a?qsELe zLTF-E*KvVd;QJ@!@`=>y@4+*79>`Ui;ZknZo+7b@;9-aLB6}%vKqsGfto%4(N_m%& zBqI5Tk$N{7R!kxT!SOI#Vd)q|D2}q?x>0hLx0SthXp#Qjo(fsSe1t&|8U#i0o7Qi1^2F8~?0hA6k|+b)dHH`H`zm3( zXaY)+nBt1hsX8OaYyL^Bc7Ui|1FY4z4*;*X7Ic#l>^`}kW)CldV!Q)1YVW_?qyM4u<@3$Tk^&;z> zAgeNRBAi)b1db&9%8f#|lYB&4Bg7tmi(bAuCWcvJo;0I8LBkLqLOKv6WLvb?ns zwF`oF1|bXFqp^6#d^z*y{3^-;}ACI>;O2b3t;LL$68vxz+VD+R`&oo4Og_v~?c0*V81;Kwc_ z2m6diO7+_(UIY8M?N~d^+5-4W+~4o_Z!$E+#V3-b^(8d+9p$GLkYgK+Bpr>%IXDxN zN6y^StwpX&8T#hWg7oX?bh@0-b9f%K+bG3w427os3EElJ@q=wR!@FuiGHyP4hoJ0Q z?zS>fq+$^_sTE>bankF&{J}ylc&Os}U%o>?C#cD+P`7;#d}CHd88d1FocM#Eh?kC;D|U!qM!$==ONhqMS3@(+J> z>$BqCI7Jwz@{D-ySM*-RE{8Xl#hl}F}`sk$HgkX&M}Z3zy6@+YJ3V=nkN6h z@_-T~d;S@hVG7}E;5`+SNa67S>{4mdrsm{|;9CJT#y4-Elj8zeliQ5hJ%jUiH`x+` zAt@BIzM9BKHiO#Wb73_>Qmp#<2lr(JL8`0dOa2?$s;))2OPQG0V3p6>CUX^RY0iTa zu7DY>^|86G8jSVaZBZq;Xq}-LoZ#I%q6H13nh~`lfIeHJj0m?b{N3*tP@^q(jQM(y zGXBx9|FDHuk@t8UxBPfl0uK*Amsx!JghFSWOp_7W>(Ld-CKoZQhWfc3Fgpr>v!tF8pVcM&n@~Vk>P49BjFbW*SI}4m|9U; zQrRbGrDz>%Wr=L18Ys5aSold6lMU_y2jg@AwUPf&w!89V+wJ4QLTY~i@xPDO9({Jv zf2=69?hX=DVEP2RLkPx_kYGq508d}?pY!3Q_%!i8@XYF4r%*Pro!l_L=3L%+msSNf zCE<*j?=ukya+m=xXfY(nl2H!2GiI?ToV4!cot~+Vq`Z2zQ*b=qehmCD9azn8D;0yM z$Y5D10B(plV@x+_Gew5-2|=o|h5r1q_&-a!_r{kUjH`pFPG2eiu5tuLh>-l@N&P?h z(2X4#_N8wj-5%ZZt1I7;P zlw|&w7ni$Q&+^-~#PNBA2qShy64Kjer#2Joc^}m4G0lDJy6Fw9=9{8KwytFMuEjes z8(N$(cW(x@4$yA(h7H~nRqkw;mz2=OdMmhM>3oNEqjaiHtekhU8bml)2c{cU=Ro8~X5yDLFJ;axA0p}|*MUf&8GhoJ$ zGH@&BkB9hU^cB3g8+H%Hs?O=I8qme=un?lBoO$({T^h8;@bcm=#daearj*+uEe)(% zDCT0;kqeI>!g+5{gVFat3XVVFy!Xt`c!m7#5M?S$>~?CdvbuQc`T!{TC?#|+u=_lf zf2FY?`X{4X7}cMClf&=}c|4}`%TF$!N!%7u+L)$rNvb@|-*8b@wX5e|moS4M-+mU= zPCF;TmYKG=nX^_1C9k|Med+G2m&V4`{ijH0Y!Um(xkTKIs1(16BD7^kA*z9%BiN1fR{!D;q~ph^CA;IUH4m>VV{?` z$gevuI7FSD8f2L>dzV@t!AfnxNQgmsaDA|EDXpPnm%VC2*`gzq|C`X_>j1$1)f7VH z@Vh1+InP^%4>mH9O75I0lh*Hoc(rYvP|f+}A^)RYB@bcR)U9{e8bKB=2VWTWH_#(K z>1C~L0Y7mj`%SjP>VQ8>98GD|U7D?%bgVwp@}svFLL}Q|c9&(!Bl3+iM&O4#rPV7~$hk44k^FTa{TwrMT@K)l9IW1f(~U~#rOR+het z&8DkHr2o#K)jv+3-Y*g^rAj*v7cN7d5mUl^Z?Bz4jwxeYxN>=FhI|9c$WAGmg$qA} zOR&fYiiM2Z?AEHk&XF#lw>&{R){oZQu(Z~8a^X|evofP~3ik7Xbi8K5b|H_lCT@x< zoO*<~znA$(q2$#Gk*c1D+cSChMWH)6)Aq=w>GbHMKVSKD(He)|y0E?iGz1B0HU9Te99L>!Pph$C8{DZe4IX03{Z#9ss;KkpaQ3P(Q2=c@6n z!Y+}(t~y)|*+~U3E1%2gpPV~6G7W{o)Saqw@o!eHpP?JiQJ+2VJ`DC5@f1Yh^&3o@whYor_NbNgH z`WYD4|Kk6bG$i;h-xLg)c=6BVSf_AXM0kx1LkAZhL3rw8jj`-4VPc`^lghVmNDn*S zy?UGV-kWf$F@&k~d*PLo&zB{Sd8K;i68AZ|Mra&apGWyTihloQm%oM`eK;~y({+8* z(XK#ZET(OrjQcC1bK;07VO(m}>M%Us?JZcdfd>4@p7=8|yDG!}cQjU<>~o2?Xdu@- z_py!`aCP#s7Dla1=eqfJlx16kPi`NcL%Zquy^a)Unfh16NsXWK$j8^1=^4A>*HzbP zq;aCIW!;BX!q|yU(~^GQEVM1VE{C;%A4TaSGv9Q7aBwkLATjG$-q1F|^`+VL;;k&& zzr!%$=+P6tc9=#oU1@6xE-;ua-$gl6*8-gqbLOFopKo$~;uWv24Cm5U&Wq~ZL~QSU z8LkMYY&_v2^S5(C2iC*t=C{zE-zO`5nm{()Ni9?&Ki=`kvT;a>l8Xr!8xP#I^@F^i z{&1#=RhiP9Ft4Y%RXgb6>Z1}efMk4g=n$tzjU3#T{3M4-@&5iFuLYeb+W!0XxJ`$= zjrC7zCA~36Wo>er%Dc1#M=zEQw%>}AGB8|0#oUrQ_PJS!W1;?quAy|C4gVjM8a9tu z&5MEEfMe-GhvtJv~tVyUe6KF(}x;8BMyC9n|qtLet0HD(;H3k)Uu$5*j= znYaG~pr2694NGi8_q*dRJr8wwFUQ{Oii?%}M8-}wSK9;O@^Hj}0 zyc%`(1|Ph7O>Tg-E+#B0T_xwPO!%7~1qru|`>V|?d5lif7=aIr3m-?zZSpM^M3h{e zetWOW?j)yC;>t{KiU7baVofm@G7y3?T*>VZh{}Cu>aAO}40zwG2A1U9wu3p&A2Yc+ zv1gYd{mhbVXVC~38o3Zy{OP&lO407Mq8rzOiavg{KY9YlG{&kfnHOpBDv&^QuRvN$ z?};l7gHI{F4Flhs+E?M~6kP`3+@nh?(1jMVik1zHBF3?k~RRm z`SKV>yfp=x;~rNX;X}0%GdLN}^Ofh)+cvF;{cAsS@%WBg93E$1=bTEJ;+{Rv$HbqA z&U$cA#FeFmwhHGQX>pr>!A2P)Rla&l_;vk$y>{lbx_ogXGJzr7xFdn7`20?->3O-W z3@GLF29H=7dag)CEU(~*P+xhDxAH7~U@{WFS;rBpzk(wNX1j<=OL2P`Uq`m@L9qTs zZ%!Ji*b+y|Jc@}hw8bd5Dtg3`#6V(Iyy{Z-58TxgREqcY2~o#FkL8it)%qN7k6Y9v zrCm#dvPFW%e9+`7qsFOBCyX5amCE+R#yx=cZ|e|KwPVy`y^2yQs7}RTw2k>_7PhZD zUifTfIe3D`xXI+y^IER$6n6n}+%-Ei+d50nmScp1z~wSY+K}*WL^Azaq)VEFMPZedcbi2tU_1O^z$^f``wQ=@vyam0-8lDoFaC4KRV!NFo3r}KJ(rdZ*s|WsY1lnq8#?>A zvJXqSAoW$;*Wl^Fpda4wgp1l<>&yFI?NTy*IzMGUY2DjvGH%|#Q_u2*=>%{gZJ{-a zoTG@@%D>oYEYZDnX%)RE7ONUIWxS`$I*Fe=VWuZb0IWpy3@TPUwAgy@WPVYFE`okf z?k~6=Knb4%eUMixOf4MXQHc$7;W73jzCd%K$ru%`^~khCh^F?i&CmBeTJeG9Yu|Q+ z2b_YQy&|C>Y4HHsDlT2K{X59W8^!Udpv|DDsS5X z)y#&>PtbyDR&Bn`ep`UN=VrXb!YY`e@;evb?X3cJrk( z{;(MN|67cS(EPh}eETR16yyuu7^G2(kRh=hZG9QWn@pDN+`spO${bq@m93NST|DUT zuB>TZb)pPCmLXrK65krUNFMm&rcB$!(6_F7-$LHwGl6dw%{?G{?e^yFDaMQM)|ss? z8L9AYRCQTtFuV`nJ4ul76G-~pVclv1rfXU%dy1*N3S?^5^GZ`=QsHoFCechD`c*N~ zA0t_+_yi-&eQ^-jhNE#P(ojEZRR(hpa78X>=1=Qnq3bv|>*9I5@HThx$i^9^j(bZY zlv}hI8aDzsD2P^^!-)obEe^wlZvf1~MZ)>BY8?M+q-FbjXl_MtH>Gus@v#7Nb*|KR zhZl{G=QB)c=0HS$}ae7#MU^fVZHPg5XvF3 z7{<~vyXjitV;M|!sHJr-rY@z1%kNv#Q5BeE@$eV6EkKuRsWlAd$(K`xkdZ!QdhcA$ zcin3~B&6pTO!_-fs3+AWw9 zf@=m~UNvy3@$l>p`joSzwM+hg2IYe1v-vbHt``CP=61w7>Lefx|ip>QBz zh@%(V*~1-LnlI(@T^BDWps|=cn1Ht|M)h5?k9Rb9Z;eQV9$Peur&?Pgjr}f`$LA!T zrY_s-uF5pR706e8)>=?m;CJzvvpXqHf`!M~?;`DT;HChFC;9o4&qshS3CW@UR7&VE zif$mzp&it4$8opCL&fUEWHZPoQzAzlj0r)E6$1)BEIMn zr4IjK@%n~ObX}re zNkJJ{v0;6`GA|N(7LrZ{nEjBl{rCXK@Qy5;l_ck)^Mt0_@%fp+u+OxCZQ}E6V#jIv zAO&kF(d-$69vc5DSqOeftD004NS?kT6hSX}dL{9R=uUL0_%2(&Bfq)sue2IIJntF) zt{IVJ)>Gq7^d2cCcD`JECPtYm=DIgFg4)Fa47~mlc&5 zrPua=T~)b;jvE^D8`l~lJzh`~vn~)G^q#D%^xpJ%=yd9Fe_7HO!S+*@`ZhF7U%ozVUgaOmS)_u)EjpS!?udJz};W}-!zSv1gHvmen-4_L4j7D z9eKV&J^C(MWpW%2)%q^`HqCF%nJ3o>;$LwCzLsdOk;H=ESi=s6wx70$oANVr-Pij$ zo9Hv^Z~4OLLS(4kzWd@!Bj0lJl$cMKgSzC2)QcGbx`F0VCdz&U7H5u2J_lBBy*Zi; z(U#L$eeITlmsRK6WS!|B-TSlOZWt-`G5(x4_OtU)Yksf-zpGJAnzK5#2>ZcY*AJWh z`ruRP2?>06 z&)jekHFi1~s3_AaK(!{QN)t}s^Q~?2skSO>1u=A9#k(I_wyeu5PeBizqg>4q_e9=- zOT0UMif|fr-+|gHW=VyOAmXd({-IDmpZ_J32Pb`zBdl=TiH_=-+L(%r#qE#=q0BvT zgzG)L4+03a*9<57@~``T;KjD5crF4^YI>-^%l&M#$i0Bmuh2*!zyBzfLVA3Q?x=I{ z4z=EWPZxC>E984NF`g-(veqY@nkJ`$^WFFSlBU(3wteUPwh2oBMEliZv6wvaHE=QC7Xhq1zV>m?qC{C?wKPi2Y6CfMV6_9s1_D++##z}c- zKi<-nV*DOc67Ypofc|yG20J@iCtr+o!%nWJFRv3xV3i* zLo6dik%3poJL_Xv&D_ve(3ikK}r=z(?1Vw&vHZkoZItL6cMXgsZ-O6(BLV(#_0v}QYH<=%k?7w~+{}Mk2rP6J)XP|LdvxL-gzfJZMEB$=NQ-}{70hs} zC;Y@qR@7TLiXOYjV~#3A_>+^TB&S|s<{pZLOBAMbBLoXqd?v@!g|89|6n5tA``h=W z>aQmFsWERkUtGx3Ybo^A`O?cO-TEQUWXrR!%`=%%>yOf-8=~UrZp=9PUB?{S9{hX5 z;>w=#Tb@3JQ}P16@*2HE=CyGuMN#fr^b0|bx>r9F&#fxw?AP#ZnVgq`b5uRBSeSL8 z*N{4sTBJ*EI~|8p#|KvI8#ksKRwZnByS=P?u#2Od^blOAb^D3a&3EBm6Oxo)kag-0 zJKZg3`P-L<#l!t0K1^@>zU1^*owlr+dTEnk71>Wg_X>yPD|+SM_S*bSchilnIqQnQ z_1eEQojOvVvT3t;y0y(7OkQ>xP;6V-IG>s4CBM_eu#~KHA1*%L%eKs})8oKzdKp=K z{^%dHXmtlT)@qG=BHkNS`R@@iF4ED9h79e0HKmD&X-fE!rdfJha9ggA^uG8h+#!eH zXq9v__9Tid$bJ&f9l9@-Su^Co(e=1-uA4N3$c8?Il(g(6<-C=K)u{&7n0>DGQ{D1+ z@+?&8eoW(O6sICBTdH~Ycy0Z8)jf8_pc1fGqn0$Ym-q{X>P0}C<>0qWVh!M z30hW&9H5O%CFxHBuy?|lV3p)7b>MGaVhV9LcF6rG6?>`gsd=oXLcV#dIvH2&K>S{7 z?!CW%K0eEjm+Qf!Z<(k3DEPbj&JyvcFWXRTwo+aEr^{)4w#`Rnz@45fEX1C*2w$?j zHYCGq%%5nr-dL-QUAokCs)#X`hd<|p#g+vT5xV4(s*=(34|Lw#WHJlaiW1gv(}5iK z>z`|&{8za!BqoxOzg+#9(PF@}#Swzxi@=!;kQ@Etj^1yVvvkJjrPmJkuMz(ol7BRt zWhA}zaKFuTeZ^+M%sJGo=<%+=8GhS}u9>BBqh9%<@jjata0z(eA4yLAV@AV6BCY$Z z-DYkecYxFaPw7%O3H2GY9q;Ldi0s3i7c?W{E1SVpe4Ka9)9L{2H+pqJA?z_3v&Mgh zMCswde-{gmm}jcFs$_==zuwdw>eX;Cl?!&tbMXt)nsJU&v@c%d5q0hGJ{GDzJF7bx z@43$P>X&oi|KOd4SLupyv8alR z)#;$%GmfLCFJF0fe&va7>iZ^Lk5eCb7JMIN;uja$7RAN(@cj`hb{zs%v}j+J2m4_j z8Tq!q-4oIYV=A0H3R;wA-g~r`+~x73kt${;935Uo-g=^o1wjQZ-#&X({~WXz=k;LR z2s2lF_X_NO_dZyCV4NZZe~7-=5NWSqNRx)gH2<6KY5Z?4I%=d8b&X?DN|$qc)t>5f znY=OK+dCc^>n0SFnwr7ZtP(YE8ZY0nViyw#-?zL za^)Tx_H}gORV&CIXHH3n_(k7k$|Dj-|gk1f|Bb|Hs{FmBS^>e(6!inHJ)Le z044)r#vyXjbh&=$b>8Bk5L($RVH^XAXZn6ktho3-o-I8!WM*wbChu>Dt`d=Od!1** z)gP2jG1EYM)Wdf2=$}4L1Rn)#)aQAzX(_nvX!U(1ld@7dCEz`hH6H~BPG30N9-p4 z0aFBX8}WUCV4#7)fVr_kHYe)Sg5yL*#^i?Blc=fba$<{p6?qeR3oR|@X)jqYaxx}R zsgjS!P*T33SF(6-SvN$nTEM8mI769_CdEdscfMtxzosP~6!=~b+lAHHuNjhVQbZfc zJoN1JP2hF4RAdD|qB!@HiDzCs!g2EWFPHb)d=F1o%#?|0co(D`tXuJA&-f{teJ|bv zorpdVUxzd93-&^b)6=jZJZM*O3f%t)#QH5*NUAh^{GCU)Kp&dvb_I|$#ex0(wybp* zOWa>fvqp8?`keb6{!X9#iQ()`llQ3m1Y+M*J}L2EOFSNg{fN(0FA_1L=%~|d0?w_i zwdW%~Gja(uJ2{j1s>t1=((Oc$kO%d3p)oa?GR zto9pT51g0esulUj4cNI=I{$u`mY!UxcrSW? zj~E(OHe~lb6Cwz1dLpih0;dL8rKR{+L)_?bthD?xNYccy^0xcWZI#}`zJLx>-YzO) z>u9$&*_;FES#?jQlLZ(T*HW4TvyGFt3}tg{EA zI!5DaP|8}ZmvygjO#LIpzLnaEc^P`gpg3lCd@yYd8wK*8K6BNx{pL)>^78HX>u)jO zJPR3XvKG0e7IGwT3frILDPsL*`5)J#7|U-G_LUP2LgS28wwH4-X zL~u*TjK31Yg0Nu2GZwWx6|O?@bQG6O{y9F6OX)vHfdL z8d1r{UbO#lW|Q-PL~EAe3p`^CYlPSItbGT|tOv;i!;OgPlGPW3EawGxQgVB?qa>?A zTcjRrVZrE6*m;xXysebUMmiP4VrP;+lP{^LWx{kU>38!O$M|7_2|)xML7k{jLz zhx{WXfonfZE$L35a>-Y}Hh;ts>qiLWl}%SNk-+V#K{R*&LKR2Gd=!nSZ8RV1#2D`F zv@-A8Cce*B?G5fE6f|Rs;vH_A&f?gUST1aG3;9zK52}kct$SSisPO0zl@UG~EJ+>v3Q%Jrcynwy`0mq<`5I)VU@{J(>*jIR%V5uhB4x7* z(-yO}6#9`Hi2WH$nPH4jHE!A4c{6qq6UU!Xw{5*dAxQDc?ZW+7(QQnA?Mn(<{lK5j z^|oFcp7lvy(Y!D4mVI)kept#2s3q-3JByO3+WkS(9Dua}VKLxKZ(z&w>jb)9+Fv(g zytf+eG;My~-F6xwqEDb?46)9zS#2I#o=>nJ8!D;v27?mO|f)Ug3E&;R#A}>7Xw@3_$1P zLBFG_+yVie&r3OGAfU70fCl*aiTAy=ev|DA=djf0TIX>6Z0YUB`IN(@K#V#kgvP=9e{czZd(~ zyjXQHC||y`He1-^btZ6&zM|g*uGysxJ8p1H?E5tHaYgzE2N%zmogWPzKK$tDl;Wc# zm85k0=7*jG(JMYJI0GEiH?Wbn3E03ZV_|U3fX(K9sIZ3C=?lLaPk<-eU;lHda-TO%KqIu zJJfAH+uhyyOonfpA?S*zXDcT>F5L>e`svn(We;asXvtZaxJdIP@lVT=yRhwE+Us9- zP7Oa#%Bajc-oVIw$eT;Ca9U(q=8I{n&&oe#JeX;Epn2EjTMuMz9X>c`A8>y3kT-YW z@=p(9lI|(&df<9^n&qR36%G$O{yNIYxCF|s1)aH`-9JAHyzdBksN;f$BWvJe;A;8) z8N4h~z=rclw>-%OXZ|od8%Xxm%(EBp2Obhy*zFj7+L`TOD)1D(O{KQ~n$j;Dp7%)U z(4MEMV#B>7tevBzjiFd@z2Nx*k?0E*=MPF{Oj#a0>pAd9Akzug98;|Y0|dM~l|xp` zb{JKx7dUa#_=1a#;;HoC{EHSXm*R6=u_5 zhQpoqTV8(+NVMi_%(z*1`0l5Ykk4G6_e;N9N=|+G>(uv}z6K|=9`G80>}y%dLtw?XvG3TO?e}fR}~~0uKZL?Jfds`8s8;mNgMn zc&axT9{tu5TYmQ{=&p1}UBMs$;akEYKlqZfL90xFCx>sX&YLNiobn*FVd6o@`yag= z1G3itPklG}mKsy6DsT<3!}ISAmGk4-V|Bfsf9GL5_8NGhasL$Hx##`wHUu3rtK|Q5 z+cxEdz-{0W71n$F)IptD^y70J6*#qk$8`69SuGJZ*V16uhm=U*G3#rA#}W(4TQU1S z2kCKW=1^EM!R$BiCgv@m?Ze(nm1Za|((aC&zTssGc+rm)FhTgrGns-X)gYr<2y($y zVE-Au%osxn%Go*KQD9ID3x+aGLAe5Q3EUjY%7B z1uud`SF+$GFy)%ma$I3RSchbE7-*N-RX3lvpp!5#P{yd2Aya`cauCPzKk@4?9!ve) RTxQPz1fH&bF6*2Ung9js;Jp9< literal 0 HcmV?d00001 From 04108592a362848a5d3af4332f7628a14e312174 Mon Sep 17 00:00:00 2001 From: albanD Date: Fri, 27 Aug 2021 11:53:27 -0700 Subject: [PATCH 305/530] New TLS to disable forward mode AD (#63117) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63117 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D30388097 Pulled By: albanD fbshipit-source-id: f1bc777064645db1ff848bdd64af95bffb530984 --- c10/core/AutogradState.cpp | 6 ++++-- c10/core/AutogradState.h | 15 +++++++++++++-- c10/core/GradMode.cpp | 1 - c10/core/GradMode.h | 14 ++++++++++++++ c10/core/InferenceMode.h | 10 ++++++---- torch/csrc/autograd/autograd_meta.cpp | 6 ++++++ 6 files changed, 43 insertions(+), 9 deletions(-) diff --git a/c10/core/AutogradState.cpp b/c10/core/AutogradState.cpp index 9684a76b78564..4667acb435193 100644 --- a/c10/core/AutogradState.cpp +++ b/c10/core/AutogradState.cpp @@ -4,8 +4,10 @@ namespace c10 { namespace { // By default, grad mode is enabled and inference mode is disabled -thread_local AutogradState autograd_state_tls = - AutogradState(/* grad_mode */ true, /* inference_mode */ false); +thread_local AutogradState autograd_state_tls = AutogradState( + /* grad_mode */ true, + /* inference_mode */ false, + /* fw_grad_mode */ true); } // namespace AutogradState& AutogradState::get_tls_state() { diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h index 1447594433fe4..a1d13a42891da 100644 --- a/c10/core/AutogradState.h +++ b/c10/core/AutogradState.h @@ -12,13 +12,19 @@ struct C10_API AutogradState { static AutogradState& get_tls_state(); static void set_tls_state(AutogradState state); - AutogradState(bool grad_mode, bool inference_mode) - : grad_mode_(grad_mode), inference_mode_(inference_mode) {} + AutogradState(bool grad_mode, bool inference_mode, bool fw_grad_mode) + : grad_mode_(grad_mode), + inference_mode_(inference_mode), + fw_grad_mode_(fw_grad_mode) {} void set_grad_mode(bool enabled) { grad_mode_ = enabled; } + void set_fw_grad_mode(bool enabled) { + fw_grad_mode_ = enabled; + } + void set_inference_mode(bool enabled) { inference_mode_ = enabled; } @@ -27,6 +33,10 @@ struct C10_API AutogradState { return grad_mode_; } + bool get_fw_grad_mode() const { + return fw_grad_mode_; + } + bool get_inference_mode() const { return inference_mode_; } @@ -34,6 +44,7 @@ struct C10_API AutogradState { private: bool grad_mode_ : 1; bool inference_mode_ : 1; + bool fw_grad_mode_ : 1; }; } // namespace c10 diff --git a/c10/core/GradMode.cpp b/c10/core/GradMode.cpp index a5db198083b2b..c2ea8698732d7 100644 --- a/c10/core/GradMode.cpp +++ b/c10/core/GradMode.cpp @@ -1,4 +1,3 @@ -#include #include #include diff --git a/c10/core/GradMode.h b/c10/core/GradMode.h index 1168bb1ae67c3..d83ff6d0d0d3b 100644 --- a/c10/core/GradMode.h +++ b/c10/core/GradMode.h @@ -1,5 +1,6 @@ #pragma once +#include #include namespace c10 { @@ -27,4 +28,17 @@ struct TORCH_API NoGradGuard : public AutoGradMode { NoGradGuard() : AutoGradMode(/*enabled=*/false) {} }; +// A RAII, thread local (!) guard that enables or disables forward grad mode +// upon construction, and sets it back to the original value upon destruction. +struct TORCH_API AutoFwGradMode { + AutoFwGradMode(bool enabled) + : prev_mode(AutogradState::get_tls_state().get_fw_grad_mode()) { + AutogradState::get_tls_state().set_fw_grad_mode(enabled); + } + ~AutoFwGradMode() { + AutogradState::get_tls_state().set_fw_grad_mode(prev_mode); + } + bool prev_mode; +}; + } // namespace c10 diff --git a/c10/core/InferenceMode.h b/c10/core/InferenceMode.h index 9748d6eccfb54..704c43b522c6d 100644 --- a/c10/core/InferenceMode.h +++ b/c10/core/InferenceMode.h @@ -53,10 +53,12 @@ struct TORCH_API InferenceMode { InferenceMode(bool enabled = true) : prev_mode(AutogradState::get_tls_state()), prev_keyset(c10::impl::tls_local_dispatch_key_set()) { - // Enabling inference mode means disabling grad mode - // And disabling inference mode means enabling grad mode - AutogradState::set_tls_state( - AutogradState(/* grad_mode */ !enabled, /* inference_mode */ enabled)); + // Enabling inference mode means disabling grad modes + // And disabling inference mode means enabling grad modes + AutogradState::set_tls_state(AutogradState( + /* grad_mode */ !enabled, + /* inference_mode */ enabled, + /* fw_grad_mode */ !enabled)); DispatchKeySet included = enabled ? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView) : prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView); diff --git a/torch/csrc/autograd/autograd_meta.cpp b/torch/csrc/autograd/autograd_meta.cpp index 248847f66ca60..f35c122225831 100644 --- a/torch/csrc/autograd/autograd_meta.cpp +++ b/torch/csrc/autograd/autograd_meta.cpp @@ -183,6 +183,12 @@ void AutogradMeta::set_fw_grad(const Variable& new_grad_, const Variable& self, } const Variable& AutogradMeta::fw_grad(uint64_t level, const Variable& self) const { + // TLS that disables forward AD + // This is only used for custom Function implementation + if (!c10::AutogradState::get_tls_state().get_fw_grad_mode()) { + return ForwardGrad::undef_grad(); + } + // Ensure that concurent fw_grad() "reads" are thread safe std::lock_guard lock(mutex_); From 6a76ee04de5f10b76cc8f97cc254da43905d170b Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 27 Aug 2021 12:45:01 -0700 Subject: [PATCH 306/530] Adding alltoall_single collective to collective quantization API (#63154) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63154 The collective quantization API now supports alltoall, alltoall_single, and allscatter. The test is also included. ghstack-source-id: 136856877 Test Plan: buck test mode/dev-nosan //caffe2/test/distributed/algorithms/quantization:DistQuantizationTests_nccl -- test_all_to_all_single_bfp16 Reviewed By: wanchaol Differential Revision: D30255251 fbshipit-source-id: 856f4fa12de104689a03a0c8dc9e3ecfd41cad29 --- .../quantization/test_quantization.py | 61 +++++++++++++++++++ .../algorithms/quantization/quantization.py | 14 +++-- 2 files changed, 71 insertions(+), 4 deletions(-) diff --git a/test/distributed/algorithms/quantization/test_quantization.py b/test/distributed/algorithms/quantization/test_quantization.py index 505f805b2cc10..e60539face11c 100644 --- a/test/distributed/algorithms/quantization/test_quantization.py +++ b/test/distributed/algorithms/quantization/test_quantization.py @@ -148,6 +148,46 @@ def test_all_to_all_bfp16(self): dtype=torch.float32, qtype=DQuantType.BFP16) + @requires_nccl() + @sandcastle_skip_if(BACKEND != "nccl", "Only nccl backend supports all_to_all_single_fp16") + @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"])) + def test_all_to_all_single_fp16(self): + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='nccl') + device = torch.device(f"cuda:{self.rank}") + group = list(range(0, self.world_size)) + group_id = dist.new_group(range(self.world_size)) + rank_to_GPU = self._init_multigpu_helper() + self._test_all_to_all_single( + group, + group_id, + self.rank, + cuda=True, + rank_to_GPU=rank_to_GPU, + dtype=torch.float32, + qtype=DQuantType.FP16 + ) + + @requires_nccl() + @sandcastle_skip_if(BACKEND != "nccl", "Only nccl backend supports all_to_all_single_bfp16") + @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"])) + def test_all_to_all_single_bfp16(self): + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='nccl') + device = torch.device(f"cuda:{self.rank}") + group = list(range(0, self.world_size)) + group_id = dist.new_group(range(self.world_size)) + rank_to_GPU = self._init_multigpu_helper() + self._test_all_to_all_single( + group, + group_id, + self.rank, + cuda=True, + rank_to_GPU=rank_to_GPU, + dtype=torch.float32, + qtype=DQuantType.BFP16 + ) + def _test_all_gather( self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float, qtype=None): for dest in group: @@ -203,5 +243,26 @@ def _test_all_to_all( for t1, t2 in zip(out_tensors, expected_tensors): self.assertEqual(t1, t2) + def _test_all_to_all_single( + self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float, qtype=DQuantType.FP16 + ): + if group_id is not None: + size = len(group) + in_splits = [i + 1 for i in group] + out_splits = [rank + 1 for _ in group] + in_tensor = torch.ones([sum(in_splits), size], dtype=dtype) * rank + out_tensor = torch.ones([(rank + 1) * size, size], dtype=dtype) + expected_tensor = torch.cat( + [torch.ones([rank + 1, size], dtype=dtype) * i for i in group] + ) + if cuda: + rank_to_GPU = rank_to_GPU[rank][0] + in_tensor = in_tensor.cuda(rank_to_GPU) + expected_tensor = expected_tensor.cuda(rank_to_GPU) + out_tensor = out_tensor.cuda(rank_to_GPU) + quantize_alltoall_single = quant.auto_quantize(dist.all_to_all_single, qtype, quant_loss=None) + quantize_alltoall_single(out_tensor, in_tensor, out_splits=out_splits, in_splits=in_splits, group=group_id) + self.assertEqual(out_tensor, expected_tensor) + if __name__ == "__main__": run_tests() diff --git a/torch/distributed/algorithms/quantization/quantization.py b/torch/distributed/algorithms/quantization/quantization.py index d58c58cad09e2..a5e9b4652a805 100644 --- a/torch/distributed/algorithms/quantization/quantization.py +++ b/torch/distributed/algorithms/quantization/quantization.py @@ -90,18 +90,14 @@ def auto_quantize(func, qtype, quant_loss=None): """ This is a prototype API that automatically quantize the input tensors, choose the precision types, and pass other necessary arguments and then dequantizes the output. - Currently it only supports: . FP16 and BFP16 quantization method supported for gloo and nccl backends . all_gather, all_to_all collective ops - Note: BFP16 only supports 2D tensors. - Args: func (callable): A function representing collective operations. qtype (QuantType): Quantization method quant_loss (float, optional): This can be used to improve accuracy in the dequantization. - Returns: (callable): the same collective as func but enables automatic quantization/dequantization. """ @@ -129,6 +125,16 @@ def wrapper(*args, **kwargs): for i, t in enumerate(_dequantize_tensor_list(out_tensors, qtype, quant_loss=quant_loss)): tensors[i] = t + elif (func == dist.all_to_all_single): + tensors = args[0] + out_splits = kwargs.get('out_splits', None) + in_splits = kwargs.get('in_splits', None) + # Quantizing the input/output tensor + input_tensors = _quantize_tensor(args[1], qtype) + out_tensors = _quantize_tensor(tensors, qtype) + dist.all_to_all_single(out_tensors, input_tensors, out_splits, in_splits, group=group) + for i, t in enumerate(_dequantize_tensor(out_tensors, qtype, quant_loss=quant_loss)): + tensors[i] = t else: raise RuntimeError( f"The collective op {func} is not supported yet" From 8fc1064b7f562ec5b5c6a5e6a0b59069101e6c87 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 27 Aug 2021 12:55:26 -0700 Subject: [PATCH 307/530] [PyTorch] Reduce code size of register_prim_ops.cpp (#61494) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61494 Creating a constexpr array and then looping over it is much cheaper than emitting a function call per item. ghstack-source-id: 136639302 Test Plan: fitsships Buildsizebot some mobile apps to check size impact. Reviewed By: dhruvbird, iseeyuan Differential Revision: D29646977 fbshipit-source-id: 6144999f6acfc4e5dcd659845859702051344d88 --- torch/csrc/jit/runtime/custom_operator.h | 2 +- torch/csrc/jit/runtime/operator.h | 13 +- torch/csrc/jit/runtime/register_ops_utils.h | 95 +- torch/csrc/jit/runtime/register_prim_ops.cpp | 3074 +++++++++--------- 4 files changed, 1629 insertions(+), 1555 deletions(-) diff --git a/torch/csrc/jit/runtime/custom_operator.h b/torch/csrc/jit/runtime/custom_operator.h index 45ad6676376ce..e39789bfe9da3 100644 --- a/torch/csrc/jit/runtime/custom_operator.h +++ b/torch/csrc/jit/runtime/custom_operator.h @@ -19,7 +19,7 @@ struct TORCH_API RegisterOperators { /// Registers a vector of already created `Operator`s. /// The operator element is now optional to filter null ops. It's backward /// compatible and works for selective operator registration. - RegisterOperators(std::vector> operators) { + explicit RegisterOperators(std::vector> operators) { for (c10::optional& o : operators) { if (o) { registerOperator(std::move(o.value())); diff --git a/torch/csrc/jit/runtime/operator.h b/torch/csrc/jit/runtime/operator.h index e243e8ff57f2d..ccdbfa03f5e8c 100644 --- a/torch/csrc/jit/runtime/operator.h +++ b/torch/csrc/jit/runtime/operator.h @@ -220,13 +220,24 @@ TORCH_API bool aliasAnalysisHasSpecialCaseFor(c10::Symbol sym); // string. template c10::optional OperatorGenerator( - torch::detail::SelectiveStr schema_str, + const char* schema_str, Func&& op, AliasAnalysisKind alias_analysis) { return c10::optional(Operator( std::string(schema_str), std::forward(op), alias_analysis)); } +template +c10::optional OperatorGenerator( + torch::detail::SelectiveStr schema_str, + Func&& op, + AliasAnalysisKind alias_analysis) { + return OperatorGenerator( + static_cast(schema_str), + std::forward(op), + alias_analysis); +} + template c10::optional OperatorGenerator( torch::detail::SelectiveStr schema_str, diff --git a/torch/csrc/jit/runtime/register_ops_utils.h b/torch/csrc/jit/runtime/register_ops_utils.h index e068b7877aff1..5d00872d9ca7e 100644 --- a/torch/csrc/jit/runtime/register_ops_utils.h +++ b/torch/csrc/jit/runtime/register_ops_utils.h @@ -35,15 +35,15 @@ namespace torch { namespace jit { -inline c10::AliasAnalysisKind aliasAnalysisFromSchema() { +constexpr inline c10::AliasAnalysisKind aliasAnalysisFromSchema() { return c10::AliasAnalysisKind::FROM_SCHEMA; } -inline c10::AliasAnalysisKind aliasAnalysisConservative() { +constexpr inline c10::AliasAnalysisKind aliasAnalysisConservative() { return c10::AliasAnalysisKind::CONSERVATIVE; } -inline c10::AliasAnalysisKind aliasAnalysisSpecialCase() { +constexpr inline c10::AliasAnalysisKind aliasAnalysisSpecialCase() { return c10::AliasAnalysisKind::INTERNAL_SPECIAL_CASE; } @@ -430,9 +430,46 @@ void listCopyAndSort(Stack* stack); void listSetItem(Stack* stack); +struct OperatorGeneratorArgs { + const char* schema_str; + bool isOperationCreator; + union { + void (*operation)(Stack*); + OperationCreator operationCreator; + }; + AliasAnalysisKind aliasAnalysis; + + explicit constexpr OperatorGeneratorArgs( + torch::detail::SelectiveStr schema_str, + void (*op)(Stack*), + AliasAnalysisKind aa) + : schema_str(schema_str), + isOperationCreator(false), + operation(op), + aliasAnalysis(aa) {} + + explicit constexpr OperatorGeneratorArgs( + torch::detail::SelectiveStr schema_str, + OperationCreator opCreator, + AliasAnalysisKind aa) + : schema_str(schema_str), + isOperationCreator(true), + operationCreator(opCreator), + aliasAnalysis(aa) {} + + template + explicit constexpr OperatorGeneratorArgs( + torch::detail::SelectiveStr, + Args...) + : schema_str(nullptr), + isOperationCreator(false), + operation(nullptr), + aliasAnalysis(AliasAnalysisKind::INTERNAL_SPECIAL_CASE) {} +}; + #define DEFINE_GENERIC_BINARY_OP( \ aten_op, op, int_float_result, complex_result) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op \ ".int_int(int a, int b) -> " #int_float_result), \ [](Stack* stack) { \ @@ -441,7 +478,7 @@ void listSetItem(Stack* stack); push(stack, op); \ }, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op \ ".float_float(float a, float b) -> " #int_float_result), \ @@ -451,7 +488,7 @@ void listSetItem(Stack* stack); push(stack, op); \ }, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op \ ".complex_complex(complex a, complex b) -> " #complex_result), \ @@ -464,7 +501,7 @@ void listSetItem(Stack* stack); // define implementations for primitive number ops #define DEFINE_GENERIC_OP(aten_op, int_op, float_op, int_result, float_result) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> " #int_result), \ [](Stack* stack) { \ int64_t a, b; \ @@ -472,7 +509,7 @@ void listSetItem(Stack* stack); push(stack, int_op); \ }, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".float(float a, float b) -> " #float_result), \ [](Stack* stack) { \ @@ -483,7 +520,7 @@ void listSetItem(Stack* stack); aliasAnalysisFromSchema()) #define DEFINE_INT_FLOAT_OP(aten_op, op, result) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op \ ".int_float(int a, float b) -> " #result), \ [](Stack* stack) { \ @@ -493,7 +530,7 @@ void listSetItem(Stack* stack); push(stack, op); \ }, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op \ ".float_int(float a, int b) -> " #result), \ [](Stack* stack) { \ @@ -505,7 +542,7 @@ void listSetItem(Stack* stack); aliasAnalysisFromSchema()) #define DEFINE_INT_OP(aten_op, op) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> int"), \ [](Stack* stack) { \ int64_t a, b; \ @@ -515,7 +552,7 @@ void listSetItem(Stack* stack); aliasAnalysisFromSchema()) #define DEFINE_STR_CMP_OP(aten_op, op) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".str(str a, str b) -> bool"), \ [](Stack* stack) { \ auto b = pop(stack).toStringRef(); \ @@ -530,7 +567,7 @@ void listSetItem(Stack* stack); // in unintended implicit conversions #define DEFINE_SCALAR_BINARY_OP_AVOID_COLLISION_GENERIC( \ aten_op, int_op, float_op, result, string_val) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op string_val \ "(Scalar a, Scalar b) -> " #result), \ [](Stack* stack) { \ @@ -586,7 +623,7 @@ void listSetItem(Stack* stack); DEFINE_STR_CMP_OP(aten_op, op) #define DEFINE_UNARY_INT_OP(aten_op, op, result) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a) -> " #result), \ [](Stack* stack) { \ int64_t a; \ @@ -596,7 +633,7 @@ void listSetItem(Stack* stack); aliasAnalysisFromSchema()) #define DEFINE_UNARY_FLOAT_OP(aten_op, op, result) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".float(float a) -> " #result), \ [](Stack* stack) { \ double a; \ @@ -608,7 +645,7 @@ void listSetItem(Stack* stack); #define DEFINE_UNARY_OP(aten_op, op, int_result, float_result) \ DEFINE_UNARY_INT_OP(aten_op, op, int_result), \ DEFINE_UNARY_FLOAT_OP(aten_op, op, float_result), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".Scalar(Scalar a) -> Scalar"), \ [](Stack* stack) { \ IValue x; \ @@ -623,7 +660,7 @@ void listSetItem(Stack* stack); }, \ aliasAnalysisFromSchema()) #define DEFINE_BOOL_OP(aten_op, op) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".bool(bool a, bool b) -> bool"), \ [](Stack* stack) { \ bool a, b; \ @@ -632,7 +669,7 @@ void listSetItem(Stack* stack); }, \ aliasAnalysisFromSchema()) #define DEFINE_STRING_OP(op_name, string_op, result) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#op_name ".str(str a, str b) ->" #result), \ [](Stack* stack) { \ auto b = pop(stack).toStringRef(); \ @@ -646,7 +683,7 @@ void listSetItem(Stack* stack); //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- #define DEFINE_UNARY_COMPLEX_OP(aten_op, op, result) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".complex(complex a) -> " #result), \ [](Stack* stack) { \ c10::complex a; \ @@ -670,7 +707,7 @@ void listSetItem(Stack* stack); DEFINE_UNARY_INT_OP(aten_op, op, int_result), \ DEFINE_UNARY_FLOAT_OP(aten_op, op, float_result), \ DEFINE_UNARY_COMPLEX_OP(aten_op, op, complex_result), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".Scalar(Scalar a) -> Scalar"), \ [](Stack* stack) { \ IValue x; \ @@ -700,7 +737,7 @@ void listSetItem(Stack* stack); int_result, \ float_result, \ complex_result) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> " #int_result), \ [](Stack* stack) { \ int64_t a, b; \ @@ -708,7 +745,7 @@ void listSetItem(Stack* stack); push(stack, int_op); \ }, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".complex(complex a, complex b) -> " #complex_result), \ [](Stack* stack) { \ @@ -717,7 +754,7 @@ void listSetItem(Stack* stack); push(stack, complex_op); \ }, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".float(float a, float b) -> " #float_result), \ [](Stack* stack) { \ @@ -728,7 +765,7 @@ void listSetItem(Stack* stack); aliasAnalysisFromSchema()) #define DEFINE_INT_COMPLEX_OP(aten_op, op, result) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op \ ".int_complex(int a, complex b) -> " #result), \ [](Stack* stack) { \ @@ -738,7 +775,7 @@ void listSetItem(Stack* stack); push(stack, op); \ }, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".complex_int(complex a, int b) -> " #result), \ [](Stack* stack) { \ @@ -750,7 +787,7 @@ void listSetItem(Stack* stack); aliasAnalysisFromSchema()) #define DEFINE_FLOAT_COMPLEX_OP(aten_op, op, result) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".float_complex(float a, complex b) -> " #result), \ [](Stack* stack) { \ @@ -760,7 +797,7 @@ void listSetItem(Stack* stack); push(stack, op); \ }, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".complex_float(complex a, float b) -> " #result), \ [](Stack* stack) { \ @@ -773,7 +810,7 @@ void listSetItem(Stack* stack); #define DEFINE_SCALAR_BINARY_OP_WITH_COMPLEX_AVOID_COLLISION_GENERIC( \ aten_op, int_op, float_op, complex_op, result, string_val) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op string_val \ "(Scalar a, Scalar b) -> " #result), \ [](Stack* stack) { \ @@ -821,7 +858,7 @@ void listSetItem(Stack* stack); #define DEFINE_SCALAR_BINARY_OP_WITH_COMPLEX_WITHOUT_INT_COMPLEX_PAIR( \ aten_op, int_op, float_op, complex_op, result) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op "(Scalar a, Scalar b) -> " #result), \ [](Stack* stack) { \ IValue x, y; \ diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index 60458a0ae11e5..2953b686ee379 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -86,943 +86,858 @@ auto powWrapper(T a, U b) { return pow(a, b); } -RegisterOperators reg( - {OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::str(t elem) -> str"), - [](Stack* stack) { - std::stringstream ss; - ss << pop(stack); - push(stack, ss.str()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::list(str t) -> str[]"), - [](Stack* stack) { - auto str = pop(stack).toStringRef(); - c10::List chars; - chars.reserve(str.size()); - for (auto c : str) { - chars.push_back(std::string(1, c)); - } - push(stack, std::move(chars)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::cpu(Tensor(a) self) -> Tensor(a|b)"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.cpu()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::layout(Tensor a) -> int"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.layout()); - }, - aliasAnalysisFromSchema()), - Operator( - prim::tolist, - // This operator has to be unschematized because the return type - // depends on the type hint and input. The implementation of this - // operator below is intended to be as close to the Python - // implementation in torch/csrc/utils/tensor_list.cpp as possible. - [](const Node* /*node*/) -> Operation { - return [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int elem_ty_val; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int dim_val; - at::Tensor t; - - pop(stack, elem_ty_val); - pop(stack, dim_val); - pop(stack, t); - - // If the Tensor is not on the CPU, transfer it. - if (!t.device().is_cpu()) { - t = t.cpu(); - } - - // Rebuild the output type using elem_ty_val and dim_val. Start - // with the element type corresponding to elem_ty_val. - TypePtr out_ty; - if (elem_ty_val == 0) { - out_ty = IntType::get(); - } else if (elem_ty_val == 1) { - out_ty = FloatType::get(); - } else if (elem_ty_val == 2) { - out_ty = BoolType::get(); - } else if (elem_ty_val == 3) { - out_ty = ComplexType::get(); - } else { - TORCH_CHECK( - false, - "Unsupported element type for tolist; only int, float, complex and bool are supported"); - } - - // Check that type of the Tensor matches that of the annotation. - // Make an exception for the case in which the annotated type is - // float/complex and the Tensor data type is also float/complex; - // the elements will be casted to double/c10::complex - // later. - TORCH_CHECK( - (out_ty == FloatType::get() && t.is_floating_point()) || - (out_ty == ComplexType::get() && t.is_complex()) || - tryScalarTypeFromJitType(out_ty) == t.scalar_type(), - "Output annotation element type and runtime tensor element type must match for tolist()"); - - // Check that the dimension of the Tensor matches that of the - // annotation. - TORCH_CHECK( - dim_val == t.dim(), - "Output annotation list dimension and runtime tensor dimension must match for tolist()"); - - // Wrap out_ty in a ListType dim times. - for (const auto i : c10::irange(dim_val)) { - (void)i; // Suppress unused variable warning - out_ty = ListType::create(out_ty); - } - - int64_t dim = t.dim(); - auto sizes = t.sizes(); - auto strides = t.strides(); - size_t element_size = t.element_size(); - char* data = static_cast(t.data_ptr()); - auto result = tensorToListRecursive( - data, - 0, - dim, - out_ty, - t.scalar_type(), - sizes, - strides, - element_size); - push(stack, std::move(result)); - }; - }, - aliasAnalysisSpecialCase()), - // only used internally in range() translation - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::__range_length(int lo, int hi, int step) -> int"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t lo, hi, step; - pop(stack, lo, hi, step); - // error handling when step_val = 0 during runtime - if (step == 0) { - throw std::runtime_error("range() arg 3 must not be zero"); - } - if (step > 0 && lo < hi) { - push(stack, 1 + (hi - 1 - lo) / step); - } else if (step < 0 && lo > hi) { - push(stack, 1 + (lo - 1 - hi) / (0 - step)); - } else { - push(stack, 0); - } - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::__derive_index(int index, int start, int step) -> int"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t index, start, step; - pop(stack, index, start, step); - push(stack, start + index * step); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::TupleUnpack(Any tup) -> ..."), - [](Stack* stack) { tupleUnpack(*stack); }, - aliasAnalysisSpecialCase()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::unchecked_cast(t x) -> t"), - noop, - aliasAnalysisSpecialCase()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::IntImplicit(Tensor a) -> int"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - checkImplicitTensorToNum(a, /*to int*/ true); - push(stack, a.item()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::ComplexImplicit(Tensor a) -> complex"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - checkImplicitTensorToNum(a, /*to int*/ false); - push(stack, a.item>()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::FloatImplicit(Tensor a) -> float"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - checkImplicitTensorToNum(a, /*to int*/ false); - push(stack, a.item()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::ScalarImplicit(Tensor a) -> Scalar"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - checkImplicitTensorToNum(a, /*to int*/ false); - push(stack, a.item()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Bool.Tensor(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.is_nonzero()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Bool.int(int a) -> bool"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t i; - pop(stack, i); - push(stack, (bool)i); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Bool.float(float a) -> bool"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - double d; - pop(stack, d); - push(stack, (bool)d); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Int.Tensor(Tensor a) -> int"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.item()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Int.bool(bool a) -> int"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool b; - pop(stack, b); - push(stack, static_cast(b)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Int.float(float a) -> int"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - double d; - pop(stack, d); - push(stack, static_cast(d)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Int.Scalar(Scalar a) -> int"), - [](Stack* stack) { - IValue scalar; - pop(stack, scalar); - if (scalar.isInt()) { - push(stack, std::move(scalar)); - } else { - // toScalar() needed to avoid strict type check in IValue::toInt. - push(stack, static_cast(scalar.toScalar().toInt())); - } - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Int.str(str a) -> int"), - [](Stack* stack) { - auto s = pop(stack).toString(); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - std::string::size_type sz; - int64_t val = static_cast(c10::stoll(s->string(), &sz)); - if (sz == s->string().size()) { - push(stack, val); - } else { - std::stringstream error_str; - error_str << "invalid literal for int() " - << "with base 10: '" << s->string() << "'"; - throw std::runtime_error(error_str.str()); - } - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Float.Tensor(Tensor a) -> float"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.item()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Float.Scalar(Scalar a) -> float"), - [](Stack* stack) { - IValue scalar; - pop(stack, scalar); - if (scalar.isDouble()) { - push(stack, std::move(scalar)); - } else if (scalar.isComplexDouble()) { - push(stack, scalar.toComplexDouble().real()); - } else { - push(stack, static_cast(scalar.toInt())); - } - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Float.int(int a) -> float"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t i; - pop(stack, i); - push(stack, (float)i); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Float.bool(bool a) -> float"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool b; - pop(stack, b); - push(stack, (float)b); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Float.str(str a) -> float"), - [](Stack* stack) { - auto s = pop(stack).toString(); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - std::string::size_type sz; - double b = c10::stod(s->string(), &sz); - if (sz == s->string().size()) { - push(stack, b); - } else { - std::stringstream error_str; - error_str << "could not convert string " - << "to float: '" << s->string() << "'"; - throw std::runtime_error(error_str.str()); - } - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Complex.Scalar(Scalar a) -> complex"), - [](Stack* stack) { - IValue scalar; - pop(stack, scalar); - if (scalar.isComplexDouble()) { - push(stack, std::move(scalar)); - } else if (scalar.isDouble()) { - push(stack, c10::complex(scalar.toDouble(), 0)); - } else { - push(stack, c10::complex(scalar.toInt(), 0)); - } - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::Complex.Tensor_Tensor(Tensor a, Tensor b) -> complex"), - [](Stack* stack) { - at::Tensor a, b; - pop(stack, a, b); - push( - stack, c10::complex(a.item(), b.item())); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::format(str self, ...) -> str"), - [](Stack* stack) { - size_t num_inputs = pop(stack).toInt(); - format(*stack, num_inputs); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::einsum.sublist(Tensor a, ...) -> Tensor"), - [](Stack* stack) { - size_t num_inputs = pop(stack).toInt(); - einsum(*stack, num_inputs); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.Scalar(Scalar a) -> Tensor"), - [](Stack* stack) { - at::Scalar s; - pop(stack, s); - push(stack, at::scalar_to_tensor(s)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::RaiseException(str msg) -> ()"), - [](Stack* stack) { throw JITException(pop(stack).toStringRef()); }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Size(int[] sizes) -> int[]"), - [](Stack* stack) {}, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::size(Tensor self) -> int[]"), - [](Stack* stack) { - auto t = std::move(pop(stack)).toTensor(); - pack(stack, t.sizes().vec()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::EnumName(AnyEnumType enum) -> str"), - [](Stack* stack) { - IValue e = pop(stack); - push(stack, e.toEnumHolder()->name()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::EnumValue.int(AnyEnumType enum) -> int"), - [](Stack* stack) { - IValue e = pop(stack); - push(stack, e.toEnumHolder()->value()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "prim::EnumValue.float(AnyEnumType enum) -> float"), - [](Stack* stack) { - IValue e = pop(stack); - push(stack, e.toEnumHolder()->value()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::EnumValue.str(AnyEnumType enum) -> str"), - [](Stack* stack) { - IValue e = pop(stack); - push(stack, e.toEnumHolder()->value()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - // note the compiler knows to type TupleIndex more accurately than it - // is listed here. - TORCH_SELECTIVE_SCHEMA("prim::TupleIndex(Any tup, int i) -> Any"), - [](Stack* stack) { - int64_t index = pop(stack).toInt(); - auto tuple = pop(stack).toTuple(); - auto norm_index = normalizeIndex(index, tuple->elements().size()); - if (norm_index < 0 || - norm_index > static_cast(tuple->elements().size())) { - throw std::out_of_range("Tuple list index out of range"); - } - stack->emplace_back(tuple->elements()[norm_index]); - }, - aliasAnalysisSpecialCase()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::ne.int_list(int[] a, int[] b) -> bool"), - listNe, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "prim::unchecked_unwrap_optional(t(a)? optional) -> t(a)"), - noop, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::device(Tensor a) -> Device"), - [](Stack* stack) { push(stack, pop(stack).toTensor().device()); }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::dtype(Tensor a) -> int"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, static_cast(a.scalar_type())); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::__not__(bool self) -> bool"), - [](Stack* stack) { push(stack, !pop(stack).toBool()); }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::__is__(t1 self, t2 obj) -> bool"), - [](Stack* stack) { - IValue self, obj; - pop(stack, self, obj); - push(stack, self.is(obj)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::__isnot__(t1 self, t2 obj) -> bool"), - [](Stack* stack) { - IValue self, obj; - pop(stack, self, obj); - push(stack, !self.is(obj)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::element_size(Tensor self) -> int"), - [](Stack* stack) { - at::Tensor arg = pop(stack).toTensor(); - push(stack, arg.element_size()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::numel(Tensor self) -> int"), - [](Stack* stack) { - at::Tensor arg = pop(stack).toTensor(); - push(stack, arg.numel()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::dim(Tensor self) -> int"), - [](Stack* stack) { - at::Tensor arg = pop(stack).toTensor(); - push(stack, arg.dim()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::get_device(Tensor self) -> int"), - [](Stack* stack) { - RECORD_FUNCTION("get_device", std::vector()); - auto result = - at::get_device((std::move(peek(stack, 0, 1))).toTensor()); - drop(stack, 1); - pack(stack, result); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::storage_offset(Tensor self) -> int"), - [](Stack* stack) { - RECORD_FUNCTION("storage_offset", std::vector()); - auto result = - ((std::move(peek(stack, 0, 1))).toTensor()).storage_offset(); - drop(stack, 1); - pack(stack, result); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::is_contiguous(Tensor self) -> bool"), - [](Stack* stack) { - RECORD_FUNCTION("is_contiguous", std::vector()); - auto result = - ((std::move(peek(stack, 0, 1))).toTensor()).is_contiguous(); - drop(stack, 1); - pack(stack, result); - }, - aliasAnalysisFromSchema()), - // these ops are generic over the list element type. - // CREATING GENERIC_LIST_OPS - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::select.t(t[](a) list, int idx) -> t(*)"), - listSelect, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::__getitem__.t(t[](a) list, int idx) -> t(*)"), - listSelect, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::append.t(t[](a!) self, t(c -> *) el) -> t[](a!)"), - listAppend, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::reverse.t(t[](a!) self) -> ()"), - listReverse, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::extend.t(t[](a!) self, t[] other) -> ()"), - listExtend, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::copy.t(t[](a) self) -> t[]"), - listCopy, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::_set_item.t(t [](a!) l, int idx, t(b -> *) el) -> t[](a!)"), - listSetItem, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::clear.t(t[](a!) self) -> ()"), - listClear, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::Delete.t(t[](a!) self, int idx) -> ()"), - listDelete, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::insert.t(t[](a!) self, int idx, t(b -> *) el) -> ()"), - listInsert, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::pop.t(t[](a!) self, int idx=-1) -> t(*)"), - listPop, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::add.t(t[] a, t[] b) -> t[]"), - listAdd, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::add_.t(t[](a!) self, t[] b) -> t[]"), - listInplaceAdd, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> t[]"), - listSlice, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::list.t(t[] l) -> t[]"), - listList, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::mul.left_t(t[] l, int n) -> t[]"), - listMulIntLeft, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::mul.right_(int n, t[] l) -> t[]"), - listMulIntRight, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::mul_.t(t[](a!) l, int n) -> t[](a!)"), - listMulIntLeftInPlace, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::len.t(t[] a) -> int"), - listLen, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::eq.int_list(int[] a, int[] b) -> bool"), - listEq, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::eq.device(Device a, Device b) -> bool"), - [](Stack* stack) { - auto a = pop(stack).toDevice(); - auto b = pop(stack).toDevice(); - push(stack, a == b); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::ne.device(Device a, Device b) -> bool"), - [](Stack* stack) { - auto a = pop(stack).toDevice(); - auto b = pop(stack).toDevice(); - push(stack, a != b); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::eq.bool(bool a, bool b) -> bool"), - [](Stack* stack) { - auto a = pop(stack); - auto b = pop(stack); - push(stack, a == b); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::ne.bool(bool a, bool b) -> bool"), - [](Stack* stack) { - auto a = pop(stack); - auto b = pop(stack); - push(stack, a != b); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::Uninitialized() -> Any"), - [](Stack* stack) { push(stack, IValue::uninitialized()); }, - aliasAnalysisSpecialCase()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::Print(...) -> ()"), - [](Stack* stack) { - auto num_inputs = pop(stack).toInt(); - std::stringstream ss; - bool first = true; - for (const IValue& i : last(stack, num_inputs)) { - if (!first) - ss << " "; - first = false; - ss << i; - } - drop(stack, num_inputs); - ss << std::endl; - auto* handler = getPrintHandler(); - TORCH_INTERNAL_ASSERT(handler); - handler(ss.str()); - }, - aliasAnalysisSpecialCase()), - // This is an alternative to aten::cat op that takes variable number of - // parameters as input. - // Format: - // prim::VarConcat(Tensors..., dim) -> Tensor - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::VarConcat(...) -> Tensor"), - [](Stack* stack) { - auto num_inputs = pop(stack).toInt(); - auto dim = pop(stack).toInt(); - std::vector inputs(num_inputs - 1); - for (int i = 0; i < num_inputs - 1; ++i) { - inputs[num_inputs - 2 - i] = pop(stack).toTensor(); - } - push(stack, at::cat(inputs, dim)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::VarStack(...) -> Tensor"), - [](Stack* stack) { - auto num_inputs = pop(stack).toInt(); - auto dim = pop(stack).toInt(); - std::vector inputs(num_inputs - 1); - for (int i = 0; i < num_inputs - 1; ++i) { - inputs[num_inputs - 2 - i] = pop(stack).toTensor(); - } - push(stack, at::stack(inputs, dim)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::eq.enum(AnyEnumType a, AnyEnumType b) -> bool"), - [](Stack* stack) { - IValue x = pop(stack); - IValue y = pop(stack); - push(stack, x == y); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::ne.enum(AnyEnumType a, AnyEnumType b) -> bool"), - [](Stack* stack) { - IValue x = pop(stack); - IValue y = pop(stack); - push(stack, x != y); - }, - aliasAnalysisFromSchema()), - // We define aten::dequantize in both native_functions.yaml and here, - // however, aten::dequantize.any defined here overrides - // aten::dequantize.tensors in native_functions.yaml. The variants here - // are only for graph mode quantization, and they should be removed once - // we deprecate graph mode quantization, and use the variants in - // native_functions.yaml. - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::dequantize.tensor(Tensor qtensor) -> Tensor"), - [](Stack* stack) { - at::Tensor qtensor; - pop(stack, qtensor); - push(stack, at::dequantize(qtensor)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::dequantize.list(Tensor[] qtensors) -> Tensor[]"), - [](Stack* stack) { - auto qtensors = pop(stack).toTensorVector(); - push(stack, at::dequantize(qtensors)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::dequantize.any(Any tensors) -> Any"), - [](Stack* stack) { dequantize(*stack); }, - aliasAnalysisFromSchema()), - DEFINE_UNARY_OP_WITH_COMPLEX(aten::log, std::log(a), float, float), - DEFINE_STRING_OP(aten::add, a + b, str), - DEFINE_COMPARISON_OP_WITH_COMPLEX(aten::eq, a == b), - DEFINE_COMPARISON_OP_WITH_COMPLEX(aten::ne, a != b), - DEFINE_GENERIC_OP( - aten::polar, - c10::polar(static_cast(a), static_cast(b)), - c10::polar(static_cast(a), static_cast(b)), - complex, - complex), - DEFINE_INT_FLOAT_OP( - aten::polar, - c10::polar(static_cast(a), static_cast(b)), - complex), - DEFINE_SCALAR_BINARY_OP_AVOID_COLLISION( - aten::polar, - c10::polar(static_cast(a), static_cast(b)), - c10::polar(static_cast(a), static_cast(b)), - Scalar), - DEFINE_COMPARISON_OP(aten::lt, a < b), - DEFINE_COMPARISON_OP(aten::gt, a > b), - DEFINE_COMPARISON_OP(aten::le, a <= b), - DEFINE_COMPARISON_OP(aten::ge, a >= b), - DEFINE_BINARY_OP_WITH_COMPLEX(aten::add, a + b), - DEFINE_BINARY_OP_WITH_COMPLEX(aten::sub, a - b), - DEFINE_BINARY_OP_WITH_COMPLEX(aten::mul, a* b), - DEFINE_BOOL_OP(aten::__and__, a&& b), - DEFINE_BOOL_OP(aten::__or__, a || b), - DEFINE_BOOL_OP(aten::__xor__, a != b), - DEFINE_UNARY_OP(aten::round, round_to_even(a), float, float), - DEFINE_UNARY_OP(aten::floor, floor(a), int, int), - DEFINE_UNARY_OP(aten::ceil, ceil(a), int, int), - DEFINE_UNARY_OP_WITH_COMPLEX(aten::neg, -a, int, float), - DEFINE_UNARY_OP_WITH_COMPLEX(aten::exp, std::exp(a), float, float), - // Pass in two ops for handling int and float separately as % in C++ only - // works for int The modulus calculation is different between C++ and - // Python (on negative), we preserve the python behavior as it's more - // common and match python syntax, hence the conversion. - DEFINE_GENERIC_OP( - aten::remainder, - (b + (a % b)) % b, - fmod((b + fmod(a, b)), b), - int, - float), - DEFINE_INT_FLOAT_OP(aten::remainder, fmod((b + fmod(a, b)), b), float), - DEFINE_SCALAR_BINARY_OP( - aten::remainder, - (b + (a % b)) % b, - fmod((b + fmod(a, b)), b), - Scalar), - // NB: This is the python truediv operation - DEFINE_GENERIC_OP_WITH_COMPLEX( - aten::div, - static_cast(a) / static_cast(b), - a / b, - a / b, - float, - float, - complex), - DEFINE_SCALAR_BINARY_OP( - aten::div, - static_cast(a) / static_cast(b), - a / b, - float), - DEFINE_GENERIC_OP( - aten::floordiv, - floordiv(a, b), - std::floor(a / b), - int, - float), - DEFINE_INT_FLOAT_OP(aten::floordiv, std::floor(a / b), float), - DEFINE_SCALAR_BINARY_OP( - aten::floordiv, - floordiv(a, b), - std::floor(a / b), - Scalar), - // int ** int produces a float, because negative exponents produce float - // results - DEFINE_GENERIC_OP_WITH_COMPLEX( - aten::pow, - static_cast(powWrapper(a, b)), - static_cast(powWrapper(a, b)), - static_cast>(pow(a, b)), - float, - float, - complex), - DEFINE_INT_FLOAT_OP( - aten::pow, - static_cast(powWrapper(a, b)), - float), - DEFINE_FLOAT_COMPLEX_OP(aten::pow, pow(a, b), complex), - DEFINE_SCALAR_BINARY_OP_AVOID_COLLISION( - aten::pow, - static_cast(pow(a, b)), - static_cast(pow(a, b)), - float), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::pow.int_to_int(int a, int b) -> int"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t a, b; - pop(stack, a, b); - push(stack, powWrapper(a, b)); - }, - aliasAnalysisFromSchema()), - // min and max are in prim:: because there is a difference between - // the python builtin 'min' and 'torch.min' - DEFINE_BINARY_OP(prim::min, a < b ? a : b), - DEFINE_BINARY_OP(prim::max, a > b ? a : b), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::type(Device self) -> str"), - [](Stack* stack) { - auto d = pop(stack); - push( - stack, - DeviceTypeName(d.toDevice().type(), /* lower_case=*/true)); - }, - aliasAnalysisFromSchema()), - // tensor length op (size of 1st dimension) - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::len.Tensor(Tensor t) -> int"), - [](Stack* stack) { - at::Tensor t = pop(stack).toTensor(); - if (t.dim() == 0) { - AT_ERROR("len() of a 0-d tensor"); - } - push(stack, t.sizes()[0]); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::ord(str string) -> int"), - [](Stack* stack) { - auto string = pop(stack).toStringRef(); - TORCH_CHECK( - string.size() == 1, - "String for ord() must be 1 character, found ", - string.size()); - uint8_t ord = string.at(0); - push(stack, int64_t(ord)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::lower(str self) -> str"), - [](Stack* stack) { - auto string = pop(stack).toStringRef(); - std::stringstream ss; - for (char c : string) { - ss << static_cast(::tolower(c)); - } - push(stack, ss.str()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::__contains__.int_list(int[] l, int item) -> bool"), - listContains, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::__contains__.str_list(str[] l, str item) -> bool"), - listContains, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::len.str(str s) -> int"), - [](Stack* stack) { - auto string = pop(stack).toStringRef(); - push(stack, static_cast(string.size())); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::dict() -> Dict(str, Tensor)"), - [](Stack* stack) { - auto dict = - c10::impl::GenericDict(StringType::get(), TensorType::get()); - push(stack, dict); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::__getitem__.str(str s, int index) -> str"), - [](Stack* stack) { - auto index = pop(stack).toInt(); - auto string = pop(stack).toStringRef(); - auto norm_index = normalizeIndex(index, string.size()); - char c = string.at(norm_index); - push(stack, std::string(&c, 1)); - }, - aliasAnalysisFromSchema()), +static const OperatorGeneratorArgs opGenArgs[] = { + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::str(t elem) -> str"), + [](Stack* stack) { + std::stringstream ss; + ss << pop(stack); + push(stack, ss.str()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::list(str t) -> str[]"), + [](Stack* stack) { + auto str = pop(stack).toStringRef(); + c10::List chars; + chars.reserve(str.size()); + for (auto c : str) { + chars.push_back(std::string(1, c)); + } + push(stack, std::move(chars)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::cpu(Tensor(a) self) -> Tensor(a|b)"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.cpu()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::layout(Tensor a) -> int"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.layout()); + }, + aliasAnalysisFromSchema()), + + // only used internally in range() translation + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::__range_length(int lo, int hi, int step) -> int"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int64_t lo, hi, step; + pop(stack, lo, hi, step); + // error handling when step_val = 0 during runtime + if (step == 0) { + throw std::runtime_error("range() arg 3 must not be zero"); + } + if (step > 0 && lo < hi) { + push(stack, 1 + (hi - 1 - lo) / step); + } else if (step < 0 && lo > hi) { + push(stack, 1 + (lo - 1 - hi) / (0 - step)); + } else { + push(stack, 0); + } + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::__derive_index(int index, int start, int step) -> int"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int64_t index, start, step; + pop(stack, index, start, step); + push(stack, start + index * step); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::TupleUnpack(Any tup) -> ..."), + [](Stack* stack) { tupleUnpack(*stack); }, + aliasAnalysisSpecialCase()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::unchecked_cast(t x) -> t"), + noop, + aliasAnalysisSpecialCase()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::IntImplicit(Tensor a) -> int"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + checkImplicitTensorToNum(a, /*to int*/ true); + push(stack, a.item()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::ComplexImplicit(Tensor a) -> complex"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + checkImplicitTensorToNum(a, /*to int*/ false); + push(stack, a.item>()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::FloatImplicit(Tensor a) -> float"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + checkImplicitTensorToNum(a, /*to int*/ false); + push(stack, a.item()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::ScalarImplicit(Tensor a) -> Scalar"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + checkImplicitTensorToNum(a, /*to int*/ false); + push(stack, a.item()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Bool.Tensor(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_nonzero()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Bool.int(int a) -> bool"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int64_t i; + pop(stack, i); + push(stack, (bool)i); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Bool.float(float a) -> bool"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + double d; + pop(stack, d); + push(stack, (bool)d); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Int.Tensor(Tensor a) -> int"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.item()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Int.bool(bool a) -> int"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool b; + pop(stack, b); + push(stack, static_cast(b)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Int.float(float a) -> int"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + double d; + pop(stack, d); + push(stack, static_cast(d)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Int.Scalar(Scalar a) -> int"), + [](Stack* stack) { + IValue scalar; + pop(stack, scalar); + if (scalar.isInt()) { + push(stack, std::move(scalar)); + } else { + // toScalar() needed to avoid strict type check in IValue::toInt. + push(stack, static_cast(scalar.toScalar().toInt())); + } + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Int.str(str a) -> int"), + [](Stack* stack) { + auto s = pop(stack).toString(); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::string::size_type sz; + int64_t val = static_cast(c10::stoll(s->string(), &sz)); + if (sz == s->string().size()) { + push(stack, val); + } else { + std::stringstream error_str; + error_str << "invalid literal for int() " + << "with base 10: '" << s->string() << "'"; + throw std::runtime_error(error_str.str()); + } + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Float.Tensor(Tensor a) -> float"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.item()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Float.Scalar(Scalar a) -> float"), + [](Stack* stack) { + IValue scalar; + pop(stack, scalar); + if (scalar.isDouble()) { + push(stack, std::move(scalar)); + } else if (scalar.isComplexDouble()) { + push(stack, scalar.toComplexDouble().real()); + } else { + push(stack, static_cast(scalar.toInt())); + } + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Float.int(int a) -> float"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int64_t i; + pop(stack, i); + push(stack, (float)i); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Float.bool(bool a) -> float"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool b; + pop(stack, b); + push(stack, (float)b); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Float.str(str a) -> float"), + [](Stack* stack) { + auto s = pop(stack).toString(); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::string::size_type sz; + double b = c10::stod(s->string(), &sz); + if (sz == s->string().size()) { + push(stack, b); + } else { + std::stringstream error_str; + error_str << "could not convert string " + << "to float: '" << s->string() << "'"; + throw std::runtime_error(error_str.str()); + } + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Complex.Scalar(Scalar a) -> complex"), + [](Stack* stack) { + IValue scalar; + pop(stack, scalar); + if (scalar.isComplexDouble()) { + push(stack, std::move(scalar)); + } else if (scalar.isDouble()) { + push(stack, c10::complex(scalar.toDouble(), 0)); + } else { + push(stack, c10::complex(scalar.toInt(), 0)); + } + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::Complex.Tensor_Tensor(Tensor a, Tensor b) -> complex"), + [](Stack* stack) { + at::Tensor a, b; + pop(stack, a, b); + push(stack, c10::complex(a.item(), b.item())); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::format(str self, ...) -> str"), + [](Stack* stack) { + size_t num_inputs = pop(stack).toInt(); + format(*stack, num_inputs); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::einsum.sublist(Tensor a, ...) -> Tensor"), + [](Stack* stack) { + size_t num_inputs = pop(stack).toInt(); + einsum(*stack, num_inputs); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.Scalar(Scalar a) -> Tensor"), + [](Stack* stack) { + at::Scalar s; + pop(stack, s); + push(stack, at::scalar_to_tensor(s)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::RaiseException(str msg) -> ()"), + [](Stack* stack) { throw JITException(pop(stack).toStringRef()); }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Size(int[] sizes) -> int[]"), + [](Stack* stack) {}, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::size(Tensor self) -> int[]"), + [](Stack* stack) { + auto t = std::move(pop(stack)).toTensor(); + pack(stack, t.sizes().vec()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::EnumName(AnyEnumType enum) -> str"), + [](Stack* stack) { + IValue e = pop(stack); + push(stack, e.toEnumHolder()->name()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::EnumValue.int(AnyEnumType enum) -> int"), + [](Stack* stack) { + IValue e = pop(stack); + push(stack, e.toEnumHolder()->value()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "prim::EnumValue.float(AnyEnumType enum) -> float"), + [](Stack* stack) { + IValue e = pop(stack); + push(stack, e.toEnumHolder()->value()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::EnumValue.str(AnyEnumType enum) -> str"), + [](Stack* stack) { + IValue e = pop(stack); + push(stack, e.toEnumHolder()->value()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + // note the compiler knows to type TupleIndex more accurately than it + // is listed here. + TORCH_SELECTIVE_SCHEMA("prim::TupleIndex(Any tup, int i) -> Any"), + [](Stack* stack) { + int64_t index = pop(stack).toInt(); + auto tuple = pop(stack).toTuple(); + auto norm_index = normalizeIndex(index, tuple->elements().size()); + if (norm_index < 0 || + norm_index > static_cast(tuple->elements().size())) { + throw std::out_of_range("Tuple list index out of range"); + } + stack->emplace_back(tuple->elements()[norm_index]); + }, + aliasAnalysisSpecialCase()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::ne.int_list(int[] a, int[] b) -> bool"), + listNe, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "prim::unchecked_unwrap_optional(t(a)? optional) -> t(a)"), + noop, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::device(Tensor a) -> Device"), + [](Stack* stack) { push(stack, pop(stack).toTensor().device()); }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::dtype(Tensor a) -> int"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, static_cast(a.scalar_type())); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::__not__(bool self) -> bool"), + [](Stack* stack) { push(stack, !pop(stack).toBool()); }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::__is__(t1 self, t2 obj) -> bool"), + [](Stack* stack) { + IValue self, obj; + pop(stack, self, obj); + push(stack, self.is(obj)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::__isnot__(t1 self, t2 obj) -> bool"), + [](Stack* stack) { + IValue self, obj; + pop(stack, self, obj); + push(stack, !self.is(obj)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::element_size(Tensor self) -> int"), + [](Stack* stack) { + at::Tensor arg = pop(stack).toTensor(); + push(stack, arg.element_size()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::numel(Tensor self) -> int"), + [](Stack* stack) { + at::Tensor arg = pop(stack).toTensor(); + push(stack, arg.numel()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::dim(Tensor self) -> int"), + [](Stack* stack) { + at::Tensor arg = pop(stack).toTensor(); + push(stack, arg.dim()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::get_device(Tensor self) -> int"), + [](Stack* stack) { + RECORD_FUNCTION("get_device", std::vector()); + auto result = + at::get_device((std::move(peek(stack, 0, 1))).toTensor()); + drop(stack, 1); + pack(stack, result); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::storage_offset(Tensor self) -> int"), + [](Stack* stack) { + RECORD_FUNCTION("storage_offset", std::vector()); + auto result = + ((std::move(peek(stack, 0, 1))).toTensor()).storage_offset(); + drop(stack, 1); + pack(stack, result); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::is_contiguous(Tensor self) -> bool"), + [](Stack* stack) { + RECORD_FUNCTION("is_contiguous", std::vector()); + auto result = + ((std::move(peek(stack, 0, 1))).toTensor()).is_contiguous(); + drop(stack, 1); + pack(stack, result); + }, + aliasAnalysisFromSchema()), + // these ops are generic over the list element type. + // CREATING GENERIC_LIST_OPS + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::select.t(t[](a) list, int idx) -> t(*)"), + listSelect, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::__getitem__.t(t[](a) list, int idx) -> t(*)"), + listSelect, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::append.t(t[](a!) self, t(c -> *) el) -> t[](a!)"), + listAppend, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::reverse.t(t[](a!) self) -> ()"), + listReverse, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::extend.t(t[](a!) self, t[] other) -> ()"), + listExtend, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::copy.t(t[](a) self) -> t[]"), + listCopy, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::_set_item.t(t [](a!) l, int idx, t(b -> *) el) -> t[](a!)"), + listSetItem, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::clear.t(t[](a!) self) -> ()"), + listClear, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::Delete.t(t[](a!) self, int idx) -> ()"), + listDelete, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::insert.t(t[](a!) self, int idx, t(b -> *) el) -> ()"), + listInsert, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::pop.t(t[](a!) self, int idx=-1) -> t(*)"), + listPop, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::add.t(t[] a, t[] b) -> t[]"), + listAdd, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::add_.t(t[](a!) self, t[] b) -> t[]"), + listInplaceAdd, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> t[]"), + listSlice, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::list.t(t[] l) -> t[]"), + listList, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::mul.left_t(t[] l, int n) -> t[]"), + listMulIntLeft, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::mul.right_(int n, t[] l) -> t[]"), + listMulIntRight, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::mul_.t(t[](a!) l, int n) -> t[](a!)"), + listMulIntLeftInPlace, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::len.t(t[] a) -> int"), + listLen, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::eq.int_list(int[] a, int[] b) -> bool"), + listEq, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::eq.device(Device a, Device b) -> bool"), + [](Stack* stack) { + auto a = pop(stack).toDevice(); + auto b = pop(stack).toDevice(); + push(stack, a == b); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::ne.device(Device a, Device b) -> bool"), + [](Stack* stack) { + auto a = pop(stack).toDevice(); + auto b = pop(stack).toDevice(); + push(stack, a != b); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::eq.bool(bool a, bool b) -> bool"), + [](Stack* stack) { + auto a = pop(stack); + auto b = pop(stack); + push(stack, a == b); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::ne.bool(bool a, bool b) -> bool"), + [](Stack* stack) { + auto a = pop(stack); + auto b = pop(stack); + push(stack, a != b); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::Uninitialized() -> Any"), + [](Stack* stack) { push(stack, IValue::uninitialized()); }, + aliasAnalysisSpecialCase()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::Print(...) -> ()"), + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + std::stringstream ss; + bool first = true; + for (const IValue& i : last(stack, num_inputs)) { + if (!first) + ss << " "; + first = false; + ss << i; + } + drop(stack, num_inputs); + ss << std::endl; + auto* handler = getPrintHandler(); + TORCH_INTERNAL_ASSERT(handler); + handler(ss.str()); + }, + aliasAnalysisSpecialCase()), + // This is an alternative to aten::cat op that takes variable number of + // parameters as input. + // Format: + // prim::VarConcat(Tensors..., dim) -> Tensor + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::VarConcat(...) -> Tensor"), + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + auto dim = pop(stack).toInt(); + std::vector inputs(num_inputs - 1); + for (int i = 0; i < num_inputs - 1; ++i) { + inputs[num_inputs - 2 - i] = pop(stack).toTensor(); + } + push(stack, at::cat(inputs, dim)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::VarStack(...) -> Tensor"), + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + auto dim = pop(stack).toInt(); + std::vector inputs(num_inputs - 1); + for (int i = 0; i < num_inputs - 1; ++i) { + inputs[num_inputs - 2 - i] = pop(stack).toTensor(); + } + push(stack, at::stack(inputs, dim)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::eq.enum(AnyEnumType a, AnyEnumType b) -> bool"), + [](Stack* stack) { + IValue x = pop(stack); + IValue y = pop(stack); + push(stack, x == y); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::ne.enum(AnyEnumType a, AnyEnumType b) -> bool"), + [](Stack* stack) { + IValue x = pop(stack); + IValue y = pop(stack); + push(stack, x != y); + }, + aliasAnalysisFromSchema()), + // We define aten::dequantize in both native_functions.yaml and here, + // however, aten::dequantize.any defined here overrides + // aten::dequantize.tensors in native_functions.yaml. The variants here + // are only for graph mode quantization, and they should be removed once + // we deprecate graph mode quantization, and use the variants in + // native_functions.yaml. + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::dequantize.tensor(Tensor qtensor) -> Tensor"), + [](Stack* stack) { + at::Tensor qtensor; + pop(stack, qtensor); + push(stack, at::dequantize(qtensor)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::dequantize.list(Tensor[] qtensors) -> Tensor[]"), + [](Stack* stack) { + auto qtensors = pop(stack).toTensorVector(); + push(stack, at::dequantize(qtensors)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::dequantize.any(Any tensors) -> Any"), + [](Stack* stack) { dequantize(*stack); }, + aliasAnalysisFromSchema()), + DEFINE_UNARY_OP_WITH_COMPLEX(aten::log, std::log(a), float, float), + DEFINE_STRING_OP(aten::add, a + b, str), + DEFINE_COMPARISON_OP_WITH_COMPLEX(aten::eq, a == b), + DEFINE_COMPARISON_OP_WITH_COMPLEX(aten::ne, a != b), + DEFINE_GENERIC_OP( + aten::polar, + c10::polar(static_cast(a), static_cast(b)), + c10::polar(static_cast(a), static_cast(b)), + complex, + complex), + DEFINE_INT_FLOAT_OP( + aten::polar, + c10::polar(static_cast(a), static_cast(b)), + complex), + DEFINE_SCALAR_BINARY_OP_AVOID_COLLISION( + aten::polar, + c10::polar(static_cast(a), static_cast(b)), + c10::polar(static_cast(a), static_cast(b)), + Scalar), + DEFINE_COMPARISON_OP(aten::lt, a < b), + DEFINE_COMPARISON_OP(aten::gt, a > b), + DEFINE_COMPARISON_OP(aten::le, a <= b), + DEFINE_COMPARISON_OP(aten::ge, a >= b), + DEFINE_BINARY_OP_WITH_COMPLEX(aten::add, a + b), + DEFINE_BINARY_OP_WITH_COMPLEX(aten::sub, a - b), + DEFINE_BINARY_OP_WITH_COMPLEX(aten::mul, a* b), + DEFINE_BOOL_OP(aten::__and__, a&& b), + DEFINE_BOOL_OP(aten::__or__, a || b), + DEFINE_BOOL_OP(aten::__xor__, a != b), + DEFINE_UNARY_OP(aten::round, round_to_even(a), float, float), + DEFINE_UNARY_OP(aten::floor, floor(a), int, int), + DEFINE_UNARY_OP(aten::ceil, ceil(a), int, int), + DEFINE_UNARY_OP_WITH_COMPLEX(aten::neg, -a, int, float), + DEFINE_UNARY_OP_WITH_COMPLEX(aten::exp, std::exp(a), float, float), + // Pass in two ops for handling int and float separately as % in C++ only + // works for int The modulus calculation is different between C++ and + // Python (on negative), we preserve the python behavior as it's more + // common and match python syntax, hence the conversion. + DEFINE_GENERIC_OP( + aten::remainder, + (b + (a % b)) % b, + fmod((b + fmod(a, b)), b), + int, + float), + DEFINE_INT_FLOAT_OP(aten::remainder, fmod((b + fmod(a, b)), b), float), + DEFINE_SCALAR_BINARY_OP( + aten::remainder, + (b + (a % b)) % b, + fmod((b + fmod(a, b)), b), + Scalar), + // NB: This is the python truediv operation + DEFINE_GENERIC_OP_WITH_COMPLEX( + aten::div, + static_cast(a) / static_cast(b), + a / b, + a / b, + float, + float, + complex), + DEFINE_SCALAR_BINARY_OP( + aten::div, + static_cast(a) / static_cast(b), + a / b, + float), + DEFINE_GENERIC_OP( + aten::floordiv, + floordiv(a, b), + std::floor(a / b), + int, + float), + DEFINE_INT_FLOAT_OP(aten::floordiv, std::floor(a / b), float), + DEFINE_SCALAR_BINARY_OP( + aten::floordiv, + floordiv(a, b), + std::floor(a / b), + Scalar), + // int ** int produces a float, because negative exponents produce float + // results + DEFINE_GENERIC_OP_WITH_COMPLEX( + aten::pow, + static_cast(powWrapper(a, b)), + static_cast(powWrapper(a, b)), + static_cast>(pow(a, b)), + float, + float, + complex), + DEFINE_INT_FLOAT_OP( + aten::pow, + static_cast(powWrapper(a, b)), + float), + DEFINE_FLOAT_COMPLEX_OP(aten::pow, pow(a, b), complex), + DEFINE_SCALAR_BINARY_OP_AVOID_COLLISION( + aten::pow, + static_cast(pow(a, b)), + static_cast(pow(a, b)), + float), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::pow.int_to_int(int a, int b) -> int"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int64_t a, b; + pop(stack, a, b); + push(stack, powWrapper(a, b)); + }, + aliasAnalysisFromSchema()), + // min and max are in prim:: because there is a difference between + // the python builtin 'min' and 'torch.min' + DEFINE_BINARY_OP(prim::min, a < b ? a : b), + DEFINE_BINARY_OP(prim::max, a > b ? a : b), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::type(Device self) -> str"), + [](Stack* stack) { + auto d = pop(stack); + push( + stack, DeviceTypeName(d.toDevice().type(), /* lower_case=*/true)); + }, + aliasAnalysisFromSchema()), + // tensor length op (size of 1st dimension) + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::len.Tensor(Tensor t) -> int"), + [](Stack* stack) { + at::Tensor t = pop(stack).toTensor(); + if (t.dim() == 0) { + AT_ERROR("len() of a 0-d tensor"); + } + push(stack, t.sizes()[0]); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::ord(str string) -> int"), + [](Stack* stack) { + auto string = pop(stack).toStringRef(); + TORCH_CHECK( + string.size() == 1, + "String for ord() must be 1 character, found ", + string.size()); + uint8_t ord = string.at(0); + push(stack, int64_t(ord)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::lower(str self) -> str"), + [](Stack* stack) { + auto string = pop(stack).toStringRef(); + std::stringstream ss; + for (char c : string) { + ss << static_cast(::tolower(c)); + } + push(stack, ss.str()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::__contains__.int_list(int[] l, int item) -> bool"), + listContains, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::__contains__.str_list(str[] l, str item) -> bool"), + listContains, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::len.str(str s) -> int"), + [](Stack* stack) { + auto string = pop(stack).toStringRef(); + push(stack, static_cast(string.size())); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::dict() -> Dict(str, Tensor)"), + [](Stack* stack) { + auto dict = + c10::impl::GenericDict(StringType::get(), TensorType::get()); + push(stack, dict); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::__getitem__.str(str s, int index) -> str"), + [](Stack* stack) { + auto index = pop(stack).toInt(); + auto string = pop(stack).toStringRef(); + auto norm_index = normalizeIndex(index, string.size()); + char c = string.at(norm_index); + push(stack, std::string(&c, 1)); + }, + aliasAnalysisFromSchema()), #define CREATE_COPY_OP(other_type, c_type) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::copy_." #other_type \ "(Tensor(a!) self, " #other_type \ " other) -> Tensor(a!)"), \ @@ -1035,170 +950,168 @@ RegisterOperators reg( }, \ aliasAnalysisFromSchema()) - CREATE_COPY_OP(Tensor, at::Tensor), - CREATE_COPY_OP(int, int64_t), - CREATE_COPY_OP(float, double), + CREATE_COPY_OP(Tensor, at::Tensor), + CREATE_COPY_OP(int, int64_t), + CREATE_COPY_OP(float, double), #undef CREATE_COPY_OP - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::backward(Tensor self, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()"), - [](Stack* stack) { - bool create_graph = pop(stack).toBool(); - auto retain_graph = pop(stack).toOptional(); - IValue gradient_ivalue = pop(stack); - at::Tensor gradient = gradient_ivalue.isNone() - ? at::Tensor() - : gradient_ivalue.toTensor(); - at::Tensor self = pop(stack).toTensor(); - bool keep_graph = retain_graph ? retain_graph.value() : create_graph; - self.backward(gradient, keep_graph, create_graph); - }, - aliasAnalysisConservative()), - // - // create a clone of these declarations with a _hacked_twin overload name - // and nullability scrubbed from TensorList arg types - // TOOD find out why this exists and how to do it without the hack - // - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"), - [](Stack* stack) { - auto indices = pop(stack).to>>(); - auto self = pop(stack).toTensor(); - auto result = at::index(self, indices); - push(stack, std::move(result)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::_index_put_impl_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)"), - [](Stack* stack) { - auto unsafe = pop(stack).toBool(); - auto accumulate = pop(stack).toBool(); - auto values = pop(stack).toTensor(); - auto indices = pop(stack).to>>(); - auto self = pop(stack).toTensor(); - auto result = - at::_index_put_impl_(self, indices, values, accumulate, unsafe); - push(stack, std::move(result)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::index_put_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)"), - [](Stack* stack) { - auto accumulate = pop(stack).toBool(); - auto values = pop(stack).toTensor(); - auto indices = pop(stack).to>>(); - auto self = pop(stack).toTensor(); - auto result = at::index_put_(self, indices, values, accumulate); - push(stack, std::move(result)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::index_put.hacked_twin(Tensor self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor"), - [](Stack* stack) { - auto accumulate = pop(stack).toBool(); - auto values = pop(stack).toTensor(); - auto indices = pop(stack).to>>(); - auto self = pop(stack).toTensor(); - auto result = at::index_put_(self, indices, values, accumulate); - push(stack, std::move(result)); - }, - aliasAnalysisFromSchema()), - // reference function parse_to_conversion in python_arg_parsing.h - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::to.prim_Device(Tensor(a) self, Device? device, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool non_blocking; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool copy; - pop(stack, non_blocking, copy); - c10::optional scalarType = - pop(stack).toOptional(); - c10::optional device = - pop(stack).toOptional(); - at::Tensor self = pop(stack).toTensor(); - push( - stack, - to_dispatch(self, device, scalarType, non_blocking, copy)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::to.prim_dtype(Tensor(a) self, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool non_blocking; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool copy; - pop(stack, non_blocking, copy); - c10::optional scalarType = - pop(stack).toOptional(); - c10::optional device = c10::nullopt; - at::Tensor self = pop(stack).toTensor(); - push( - stack, - to_dispatch(self, device, scalarType, non_blocking, copy)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::is_cuda(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.is_cuda()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::is_xpu(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.is_xpu()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::data(Tensor(a) a) -> Tensor(a)"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, autograd::Variable(a).variable_data()); - }, - aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::backward(Tensor self, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()"), + [](Stack* stack) { + bool create_graph = pop(stack).toBool(); + auto retain_graph = pop(stack).toOptional(); + IValue gradient_ivalue = pop(stack); + at::Tensor gradient = gradient_ivalue.isNone() + ? at::Tensor() + : gradient_ivalue.toTensor(); + at::Tensor self = pop(stack).toTensor(); + bool keep_graph = retain_graph ? retain_graph.value() : create_graph; + self.backward(gradient, keep_graph, create_graph); + }, + aliasAnalysisConservative()), + // + // create a clone of these declarations with a _hacked_twin overload name + // and nullability scrubbed from TensorList arg types + // TOOD find out why this exists and how to do it without the hack + // + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"), + [](Stack* stack) { + auto indices = pop(stack).to>>(); + auto self = pop(stack).toTensor(); + auto result = at::index(self, indices); + push(stack, std::move(result)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::_index_put_impl_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)"), + [](Stack* stack) { + auto unsafe = pop(stack).toBool(); + auto accumulate = pop(stack).toBool(); + auto values = pop(stack).toTensor(); + auto indices = pop(stack).to>>(); + auto self = pop(stack).toTensor(); + auto result = + at::_index_put_impl_(self, indices, values, accumulate, unsafe); + push(stack, std::move(result)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::index_put_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)"), + [](Stack* stack) { + auto accumulate = pop(stack).toBool(); + auto values = pop(stack).toTensor(); + auto indices = pop(stack).to>>(); + auto self = pop(stack).toTensor(); + auto result = at::index_put_(self, indices, values, accumulate); + push(stack, std::move(result)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::index_put.hacked_twin(Tensor self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor"), + [](Stack* stack) { + auto accumulate = pop(stack).toBool(); + auto values = pop(stack).toTensor(); + auto indices = pop(stack).to>>(); + auto self = pop(stack).toTensor(); + auto result = at::index_put_(self, indices, values, accumulate); + push(stack, std::move(result)); + }, + aliasAnalysisFromSchema()), + // reference function parse_to_conversion in python_arg_parsing.h + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::to.prim_Device(Tensor(a) self, Device? device, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool non_blocking; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool copy; + pop(stack, non_blocking, copy); + c10::optional scalarType = + pop(stack).toOptional(); + c10::optional device = + pop(stack).toOptional(); + at::Tensor self = pop(stack).toTensor(); + push( + stack, to_dispatch(self, device, scalarType, non_blocking, copy)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::to.prim_dtype(Tensor(a) self, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool non_blocking; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool copy; + pop(stack, non_blocking, copy); + c10::optional scalarType = + pop(stack).toOptional(); + c10::optional device = c10::nullopt; + at::Tensor self = pop(stack).toTensor(); + push( + stack, to_dispatch(self, device, scalarType, non_blocking, copy)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::is_cuda(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_cuda()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::is_xpu(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_xpu()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::data(Tensor(a) a) -> Tensor(a)"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, autograd::Variable(a).variable_data()); + }, + aliasAnalysisFromSchema()), // these ops are not defined for Tensor #define CREATE_COMPARATOR_LIST_OPS_SPECIALIZED(decl_type, value_type) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("prim::min." decl_type "_list(" decl_type \ "[] l, " decl_type "[] r) -> " decl_type "[]"), \ minList, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("prim::max." decl_type "_list(" decl_type \ "[] l, " decl_type "[] r) -> " decl_type \ "[]"), \ maxList, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("prim::min.self_" decl_type "(" decl_type \ "[] self) -> " decl_type), \ listMin, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("prim::max.self_" decl_type "(" decl_type \ "[] self) -> " decl_type), \ listMax, \ aliasAnalysisFromSchema()), - CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("int", int64_t) - CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("float", double) - CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("bool", bool) + CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("int", int64_t) + CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("float", double) + CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("bool", bool) #undef CREATE_COMPARATOR_LIST_OPS_SPECIALIZED // python string is methods return false if empty #define DEFINE_STRING_IS_OP(op_name, char_op) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#op_name "(str self) -> bool"), \ [](Stack* stack) { \ auto string = pop(stack).toStringRef(); \ @@ -1211,15 +1124,15 @@ RegisterOperators reg( }, \ aliasAnalysisFromSchema()) - DEFINE_STRING_IS_OP(aten::isdigit, ::isdigit), - DEFINE_STRING_IS_OP(aten::isspace, ::isspace), - DEFINE_STRING_IS_OP(aten::isalnum, ::isalnum), - DEFINE_STRING_IS_OP(aten::isalpha, ::isalpha), - DEFINE_STRING_IS_OP(aten::isdecimal, ::isdigit), - DEFINE_STRING_IS_OP(aten::isnumeric, ::isdigit), + DEFINE_STRING_IS_OP(aten::isdigit, ::isdigit), + DEFINE_STRING_IS_OP(aten::isspace, ::isspace), + DEFINE_STRING_IS_OP(aten::isalnum, ::isalnum), + DEFINE_STRING_IS_OP(aten::isalpha, ::isalpha), + DEFINE_STRING_IS_OP(aten::isdecimal, ::isdigit), + DEFINE_STRING_IS_OP(aten::isnumeric, ::isdigit), #define DEFINE_STRING_CHAR_MAP_OP(op_name, char_op) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#op_name "(str self) -> str"), \ [](Stack* stack) { \ auto string = pop(stack).toStringRef(); \ @@ -1231,14 +1144,121 @@ RegisterOperators reg( }, \ aliasAnalysisFromSchema()) - DEFINE_STRING_CHAR_MAP_OP(aten::upper, ::toupper), - DEFINE_STRING_CHAR_MAP_OP(aten::swapcase, ([](char c) { - if (c == static_cast(::toupper(c))) { - return static_cast(::tolower(c)); - } else { - return static_cast(::toupper(c)); - } - }))}); + DEFINE_STRING_CHAR_MAP_OP(aten::upper, ::toupper), + DEFINE_STRING_CHAR_MAP_OP(aten::swapcase, ([](char c) { + if (c == static_cast(::toupper(c))) { + return static_cast(::tolower(c)); + } else { + return static_cast(::toupper(c)); + } + }))}; + +static std::vector> createOperators( + const OperatorGeneratorArgs* args, + int length) { + std::vector> result; + result.reserve(length); + for (int ii = 0; ii < length; ++ii) { + if (args[ii].schema_str) { + if (args[ii].isOperationCreator) { + result.push_back(OperatorGenerator( + args[ii].schema_str, + args[ii].operationCreator, + args[ii].aliasAnalysis)); + } else { + result.push_back(OperatorGenerator( + args[ii].schema_str, args[ii].operation, args[ii].aliasAnalysis)); + } + } + } + return result; +} + +RegisterOperators reg(([]() { + auto v = createOperators(opGenArgs, sizeof(opGenArgs) / sizeof(opGenArgs[0])); + v.push_back(Operator( + prim::tolist, + // This operator has to be unschematized because the return type + // depends on the type hint and input. The implementation of this + // operator below is intended to be as close to the Python + // implementation in torch/csrc/utils/tensor_list.cpp as possible. + [](const Node* /*node*/) -> Operation { + return [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int elem_ty_val; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int dim_val; + at::Tensor t; + + pop(stack, elem_ty_val); + pop(stack, dim_val); + pop(stack, t); + + // If the Tensor is not on the CPU, transfer it. + if (!t.device().is_cpu()) { + t = t.cpu(); + } + + // Rebuild the output type using elem_ty_val and dim_val. Start + // with the element type corresponding to elem_ty_val. + TypePtr out_ty; + if (elem_ty_val == 0) { + out_ty = IntType::get(); + } else if (elem_ty_val == 1) { + out_ty = FloatType::get(); + } else if (elem_ty_val == 2) { + out_ty = BoolType::get(); + } else if (elem_ty_val == 3) { + out_ty = ComplexType::get(); + } else { + TORCH_CHECK( + false, + "Unsupported element type for tolist; only int, float, complex and bool are supported"); + } + + // Check that type of the Tensor matches that of the annotation. + // Make an exception for the case in which the annotated type is + // float/complex and the Tensor data type is also float/complex; + // the elements will be casted to double/c10::complex + // later. + TORCH_CHECK( + (out_ty == FloatType::get() && t.is_floating_point()) || + (out_ty == ComplexType::get() && t.is_complex()) || + tryScalarTypeFromJitType(out_ty) == t.scalar_type(), + "Output annotation element type and runtime tensor element type must match for tolist()"); + + // Check that the dimension of the Tensor matches that of the + // annotation. + TORCH_CHECK( + dim_val == t.dim(), + "Output annotation list dimension and runtime tensor dimension must match for tolist()"); + + // Wrap out_ty in a ListType dim times. + for (const auto i : c10::irange(dim_val)) { + (void)i; // Suppress unused variable warning + out_ty = ListType::create(out_ty); + } + + int64_t dim = t.dim(); + auto sizes = t.sizes(); + auto strides = t.strides(); + size_t element_size = t.element_size(); + char* data = static_cast(t.data_ptr()); + auto result = tensorToListRecursive( + data, + 0, + dim, + out_ty, + t.scalar_type(), + sizes, + strides, + element_size); + push(stack, std::move(result)); + }; + }, + aliasAnalysisSpecialCase())); + return v; +})()); void dictSetItem(Stack* stack) { auto value = pop(stack); @@ -1408,123 +1428,125 @@ void dictConstructFromList(Stack* stack) { } #define CREATE_DICT_OPS(key_type) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::len.Dict_" key_type "(Dict(" key_type \ ", t) self) -> int"), \ dictLen, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::keys." key_type "(Dict(" key_type \ ", t) self) -> " key_type "[](*)"), \ dictKeys, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::values." key_type "(Dict(" key_type \ ", t) self) -> t[](*)"), \ dictValues, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::__getitem__.Dict_" key_type \ "(Dict(" key_type ", t) self, " key_type \ " key) -> t(*)"), \ dictIndex, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::get." key_type "(Dict(" key_type \ ", t) self, " key_type " key) -> t(*)?"), \ dictGet, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::get.default_" key_type \ "(Dict(" key_type ", t) self, " key_type \ " key, t default_value) -> t(*)"), \ dictGet, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ "aten::setdefault." key_type "(Dict(" key_type \ ", t)(a!) self, " key_type \ "(b -> *) key, t(c -> *) default_value) -> t(*)"), \ dictSetDefault, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::Delete.Dict_" key_type \ "(Dict(" key_type ", t)(a!) self, " key_type \ " key) -> ()"), \ dictDelete, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::pop.Dict_" key_type "(Dict(" key_type \ ", t)(a!) self, " key_type " key) -> t(*)"), \ dictPop, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::pop.Dict_default_" key_type \ "(Dict(" key_type ", t)(a!) self, " key_type \ " key, t default_value) -> t(*)"), \ dictPop, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::popitem." key_type "(Dict(" key_type \ ", t)(a!) self) -> ((" key_type ", t))"), \ dictPopItem, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::clear." key_type "(Dict(" key_type \ ", t)(a!) self) -> ()"), \ dictClear, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::update." key_type "(Dict(" key_type \ ", t)(a!) self, Dict(" key_type \ ", t)(a!) to_add) -> ()"), \ dictUpdate, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::items." key_type "(Dict(" key_type \ ", t) self) -> ((" key_type ", t)[])"), \ dictItems, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::copy.Dict_" key_type "(Dict(" key_type \ ", t)(a) self) -> Dict(" key_type ", t)"), \ dictCopy, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::__contains__." key_type \ "(Dict(" key_type ", t) dict, " key_type \ " key) -> bool"), \ dictContains, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::_set_item." key_type "(Dict(" key_type \ ", t)(a!) l, " key_type \ "(b -> *) idx, t(c -> *) v) -> ()"), \ dictSetItem, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::dict." key_type "((" key_type \ ", tVal)[] inputs) -> Dict(" key_type \ ", tVal)"), \ dictConstructFromList, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::dict.Dict_" key_type "(Dict(" key_type \ ", t)(a) self) -> Dict(" key_type ", t)"), \ dictCopy, \ aliasAnalysisFromSchema()) -RegisterOperators reg_dict_ops({ +static const OperatorGeneratorArgs dict_ops[] = { CREATE_DICT_OPS("str"), CREATE_DICT_OPS("int"), CREATE_DICT_OPS("bool"), CREATE_DICT_OPS("float"), CREATE_DICT_OPS("complex"), CREATE_DICT_OPS("Tensor"), -}); +}; +RegisterOperators reg_dict_ops( + createOperators(dict_ops, sizeof(dict_ops) / sizeof(dict_ops[0]))); // NOLINTNEXTLINE(clang-diagnostic-unused-function) -c10::AliasAnalysisKind aliasAnalysisFromSchema() { +constexpr c10::AliasAnalysisKind aliasAnalysisFromSchema() { return c10::AliasAnalysisKind::FROM_SCHEMA; } @@ -2095,393 +2117,394 @@ TORCH_LIBRARY_IMPL(aten, CatchAll, m) { }); } +static const OperatorGeneratorArgs opGenArgs1[] = { + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::rangelist(int n) -> int[]"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int64_t n; + pop(stack, n); + c10::List elems; + elems.reserve(n); + for (const auto i : c10::irange(n)) { + elems.push_back(i); + } + push(stack, std::move(elems)); + }, + aliasAnalysisFromSchema()), + // note: this op needs to share a name with the Scalar -> Tensor conversion + // because all _to_tensor conversion have to have the same operator namet + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.bool(bool a) -> Tensor"), + [](Stack* stack) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool b; + pop(stack, b); + push(stack, at::scalar_to_tensor(b)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::device(str a) -> Device"), + [](Stack* stack) { + push(stack, c10::Device(pop(stack).toStringRef())); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::percentFormat(str self, ...) -> str"), + [](Stack* stack) { + size_t num_inputs = pop(stack).toInt(); + percentFormat(*stack, num_inputs); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::to.prim_other(Tensor(a) self, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"), + [](Stack* stack) { + at::Tensor self; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool non_blocking; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool copy; + pop(stack, self, non_blocking, copy); + c10::optional device = c10::nullopt; + c10::optional scalarType = c10::nullopt; + push( + stack, to_dispatch(self, device, scalarType, non_blocking, copy)); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::requires_grad(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.requires_grad()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::grad(Tensor a) -> Tensor(*)"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.grad()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::is_sparse(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_sparse()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::is_sparse_csr(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_sparse_csr()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::is_mkldnn(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_mkldnn()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::is_mlc(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_mlc()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::is_vulkan(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_vulkan()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::is_quantized(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_quantized()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::is_meta(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_meta()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::is_ort(Tensor a) -> bool"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.is_ort()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::name(Tensor a) -> str?"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + if (a.name() == "") { + push(stack, IValue()); + } else { + push(stack, a.name()); + } + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::index(Device self) -> int?"), + [](Stack* stack) { + auto d = pop(stack).toDevice(); + if (d.has_index()) { + push(stack, d.index()); + } else { + push(stack, IValue()); + } + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + // TODO return generator object when torchscript supports RNG + // first-class + TORCH_SELECTIVE_SCHEMA("aten::manual_seed(int seed) -> ()"), + [](Stack* stack) { at::manual_seed(pop(stack).toInt()); }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("aten::cuda(Tensor(a) self) -> Tensor(a|b)"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.cuda()); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::AutogradZero() -> Tensor"), + [](Stack* stack) { stack->emplace_back(at::Tensor()); }, + aliasAnalysisSpecialCase()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "prim::ReductionSizes(int[] size, int[] red_axes, bool keepdim = False) -> int[]"), + [](Stack* stack) { + bool keepdim = pop(stack).toBool(); + c10::List axes = pop(stack).toIntList(); + c10::List size = pop(stack).toIntList(); + if (keepdim) { + for (const auto& axis : axes) { + size.set(axis, 1); + } + } else { + int64_t index = 0; + auto iter = size.begin(); + std::sort(axes.begin(), axes.end()); + for (const auto& axis : axes) { + // move iter to the next axis + iter += axis - index; + + // input iter points to axis and is updated to axis + 1 + iter = size.erase(iter); + + // update current index for iter + index = axis + 1; + } + } + push(stack, IValue(std::move(size))); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::BroadcastSizes(...) -> int[]"), + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + std::vector size; + size.reserve(8); + for (const auto i : c10::irange(num_inputs)) { + size = + at::infer_size(size, peek(stack, i, num_inputs).toIntVector()); + } + drop(stack, num_inputs); + push(stack, IValue(size)); + }, + aliasAnalysisSpecialCase()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::warn(str message, int stacklevel=2) -> ()"), + [](Stack* stack) { + TORCH_CHECK(false, "warn is implemented directly in the interpreter"); + }, + aliasAnalysisFromSchema()), + + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "onnx::Reshape(Tensor input, Tensor shape) -> Tensor"), + [](Stack* stack) { + at::Tensor input, shape; + pop(stack, input, shape); + shape = shape.contiguous(); + AT_ASSERT(shape.ndimension() == 1); + at::IntArrayRef shape_list(shape.data_ptr(), shape.size(0)); + push(stack, input.reshape(shape_list)); + }, + aliasAnalysisSpecialCase()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("onnx::Shape(Tensor t) -> Tensor"), + [](Stack* stack) { + auto t = pop(stack).toTensor(); + at::IntArrayRef sizes = t.sizes(); + auto sizes_tensor = torch::empty( + {static_cast(sizes.size())}, at::dtype(at::kLong)); + auto accessor = sizes_tensor.accessor(); + for (const auto i : c10::irange(sizes.size())) { + accessor[i] = sizes[i]; + } + stack->emplace_back(sizes_tensor); + }, + aliasAnalysisSpecialCase()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::AutogradAnyNonZero(...) -> bool"), + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + bool result = false; + for (const IValue& v : last(stack, num_inputs)) { + if (v.isTensor()) { + if (v.toTensor().defined()) { + result = true; + break; + } + } else if (v.isTensorList()) { + for (const at::Tensor& t : v.toTensorVector()) { + if (t.defined()) { + result = true; + } + } + if (result) { + break; + } + } else { + TORCH_INTERNAL_ASSERT(false); + } + } + drop(stack, num_inputs); + stack->emplace_back(result); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::AutogradAllZero(...) -> bool"), + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + bool result = true; + for (const IValue& v : last(stack, num_inputs)) { + TORCH_INTERNAL_ASSERT(v.isTensor()); + if (v.toTensor().defined()) { + result = false; + break; + } + } + drop(stack, num_inputs); + stack->emplace_back(result); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::AutogradAllNonZero(...) -> bool"), + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + bool result = true; + for (const IValue& v : last(stack, num_inputs)) { + TORCH_INTERNAL_ASSERT(v.isTensor()); + if (!v.toTensor().defined()) { + result = false; + break; + } + } + drop(stack, num_inputs); + stack->emplace_back(result); + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA("prim::AutogradAdd(Any a, Any b) -> Any"), + [](Stack* stack) { + at::Tensor a, b; + pop(stack, a, b); + // NOLINTNEXTLINE(bugprone-branch-clone) + if (!a.defined() && !b.defined()) { + // undef + undef == undef + stack->emplace_back(a); + } else if (!a.defined()) { + stack->emplace_back(b); + } else if (!b.defined()) { + stack->emplace_back(a); + } else { + stack->emplace_back(a + b); + } + }, + aliasAnalysisSpecialCase()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::_size_if_not_equal(int[] self_size, int[] other_size) -> int[]?"), + [](Stack* stack) { + IValue self_size, other_size; + pop(stack, self_size, other_size); + auto s = self_size.toIntVector(); + auto o = other_size.toIntVector(); + if (s == o) { + push(stack, IValue()); + } else { + push(stack, s); + } + }, + aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "aten::_unwrap_optional(t(a)? optional) -> t(a)"), + [](Stack* stack) { + auto val = pop(stack); + TORCH_CHECK(!val.isNone(), "Unwrapping null optional"); + push(stack, std::move(val)); + }, + aliasAnalysisFromSchema())}; + RegisterOperators reg1( - {OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::rangelist(int n) -> int[]"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t n; - pop(stack, n); - c10::List elems; - elems.reserve(n); - for (const auto i : c10::irange(n)) { - elems.push_back(i); - } - push(stack, std::move(elems)); - }, - aliasAnalysisFromSchema()), - // note: this op needs to share a name with the Scalar -> Tensor conversion - // because all _to_tensor conversion have to have the same operator namet - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.bool(bool a) -> Tensor"), - [](Stack* stack) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool b; - pop(stack, b); - push(stack, at::scalar_to_tensor(b)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::device(str a) -> Device"), - [](Stack* stack) { - push(stack, c10::Device(pop(stack).toStringRef())); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::percentFormat(str self, ...) -> str"), - [](Stack* stack) { - size_t num_inputs = pop(stack).toInt(); - percentFormat(*stack, num_inputs); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::to.prim_other(Tensor(a) self, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"), - [](Stack* stack) { - at::Tensor self; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool non_blocking; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool copy; - pop(stack, self, non_blocking, copy); - c10::optional device = c10::nullopt; - c10::optional scalarType = c10::nullopt; - push( - stack, - to_dispatch(self, device, scalarType, non_blocking, copy)); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::requires_grad(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.requires_grad()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::grad(Tensor a) -> Tensor(*)"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.grad()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::is_sparse(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.is_sparse()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::is_sparse_csr(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.is_sparse_csr()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::is_mkldnn(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.is_mkldnn()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::is_mlc(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.is_mlc()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::is_vulkan(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.is_vulkan()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::is_quantized(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.is_quantized()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::is_meta(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.is_meta()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::is_ort(Tensor a) -> bool"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.is_ort()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::name(Tensor a) -> str?"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - if (a.name() == "") { - push(stack, IValue()); - } else { - push(stack, a.name()); - } - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::index(Device self) -> int?"), - [](Stack* stack) { - auto d = pop(stack).toDevice(); - if (d.has_index()) { - push(stack, d.index()); - } else { - push(stack, IValue()); - } - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - // TODO return generator object when torchscript supports RNG - // first-class - TORCH_SELECTIVE_SCHEMA("aten::manual_seed(int seed) -> ()"), - [](Stack* stack) { at::manual_seed(pop(stack).toInt()); }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("aten::cuda(Tensor(a) self) -> Tensor(a|b)"), - [](Stack* stack) { - at::Tensor a; - pop(stack, a); - push(stack, a.cuda()); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::AutogradZero() -> Tensor"), - [](Stack* stack) { stack->emplace_back(at::Tensor()); }, - aliasAnalysisSpecialCase()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "prim::ReductionSizes(int[] size, int[] red_axes, bool keepdim = False) -> int[]"), - [](Stack* stack) { - bool keepdim = pop(stack).toBool(); - c10::List axes = pop(stack).toIntList(); - c10::List size = pop(stack).toIntList(); - if (keepdim) { - for (const auto& axis : axes) { - size.set(axis, 1); - } - } else { - int64_t index = 0; - auto iter = size.begin(); - std::sort(axes.begin(), axes.end()); - for (const auto& axis : axes) { - // move iter to the next axis - iter += axis - index; - - // input iter points to axis and is updated to axis + 1 - iter = size.erase(iter); - - // update current index for iter - index = axis + 1; - } - } - push(stack, IValue(std::move(size))); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::BroadcastSizes(...) -> int[]"), - [](Stack* stack) { - auto num_inputs = pop(stack).toInt(); - std::vector size; - size.reserve(8); - for (const auto i : c10::irange(num_inputs)) { - size = - at::infer_size(size, peek(stack, i, num_inputs).toIntVector()); - } - drop(stack, num_inputs); - push(stack, IValue(size)); - }, - aliasAnalysisSpecialCase()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::warn(str message, int stacklevel=2) -> ()"), - [](Stack* stack) { - TORCH_CHECK( - false, "warn is implemented directly in the interpreter"); - }, - aliasAnalysisFromSchema()), - - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "onnx::Reshape(Tensor input, Tensor shape) -> Tensor"), - [](Stack* stack) { - at::Tensor input, shape; - pop(stack, input, shape); - shape = shape.contiguous(); - AT_ASSERT(shape.ndimension() == 1); - at::IntArrayRef shape_list(shape.data_ptr(), shape.size(0)); - push(stack, input.reshape(shape_list)); - }, - aliasAnalysisSpecialCase()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("onnx::Shape(Tensor t) -> Tensor"), - [](Stack* stack) { - auto t = pop(stack).toTensor(); - at::IntArrayRef sizes = t.sizes(); - auto sizes_tensor = torch::empty( - {static_cast(sizes.size())}, at::dtype(at::kLong)); - auto accessor = sizes_tensor.accessor(); - for (const auto i : c10::irange(sizes.size())) { - accessor[i] = sizes[i]; - } - stack->emplace_back(sizes_tensor); - }, - aliasAnalysisSpecialCase()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::AutogradAnyNonZero(...) -> bool"), - [](Stack* stack) { - auto num_inputs = pop(stack).toInt(); - bool result = false; - for (const IValue& v : last(stack, num_inputs)) { - if (v.isTensor()) { - if (v.toTensor().defined()) { - result = true; - break; - } - } else if (v.isTensorList()) { - for (const at::Tensor& t : v.toTensorVector()) { - if (t.defined()) { - result = true; - } - } - if (result) { - break; - } - } else { - TORCH_INTERNAL_ASSERT(false); - } - } - drop(stack, num_inputs); - stack->emplace_back(result); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::AutogradAllZero(...) -> bool"), - [](Stack* stack) { - auto num_inputs = pop(stack).toInt(); - bool result = true; - for (const IValue& v : last(stack, num_inputs)) { - TORCH_INTERNAL_ASSERT(v.isTensor()); - if (v.toTensor().defined()) { - result = false; - break; - } - } - drop(stack, num_inputs); - stack->emplace_back(result); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::AutogradAllNonZero(...) -> bool"), - [](Stack* stack) { - auto num_inputs = pop(stack).toInt(); - bool result = true; - for (const IValue& v : last(stack, num_inputs)) { - TORCH_INTERNAL_ASSERT(v.isTensor()); - if (!v.toTensor().defined()) { - result = false; - break; - } - } - drop(stack, num_inputs); - stack->emplace_back(result); - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA("prim::AutogradAdd(Any a, Any b) -> Any"), - [](Stack* stack) { - at::Tensor a, b; - pop(stack, a, b); - // NOLINTNEXTLINE(bugprone-branch-clone) - if (!a.defined() && !b.defined()) { - // undef + undef == undef - stack->emplace_back(a); - } else if (!a.defined()) { - stack->emplace_back(b); - } else if (!b.defined()) { - stack->emplace_back(a); - } else { - stack->emplace_back(a + b); - } - }, - aliasAnalysisSpecialCase()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::_size_if_not_equal(int[] self_size, int[] other_size) -> int[]?"), - [](Stack* stack) { - IValue self_size, other_size; - pop(stack, self_size, other_size); - auto s = self_size.toIntVector(); - auto o = other_size.toIntVector(); - if (s == o) { - push(stack, IValue()); - } else { - push(stack, s); - } - }, - aliasAnalysisFromSchema()), - OperatorGenerator( - TORCH_SELECTIVE_SCHEMA( - "aten::_unwrap_optional(t(a)? optional) -> t(a)"), - [](Stack* stack) { - auto val = pop(stack); - TORCH_CHECK(!val.isNone(), "Unwrapping null optional"); - push(stack, std::move(val)); - }, - aliasAnalysisFromSchema())}); + createOperators(opGenArgs1, sizeof(opGenArgs1) / sizeof(opGenArgs1[0]))); void hashValue(Stack* stack) { auto value = pop(stack); push(stack, value.hash()); } -RegisterOperators reg2({ +static const OperatorGeneratorArgs opGenArgs2[] = { // registered as Any[] so that heterogenous tuples can be called with len() - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::len.any(Any[] a) -> int"), listLen, aliasAnalysisFromSchema()), // these ops have a specialized implementation for the list element type #define CREATE_SPECIALIZED_LIST_OPS(decl_type, value_type) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ "aten::remove." decl_type "(" decl_type \ "[](a!) self, \ " decl_type " el) -> ()"), \ listRemove, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ "aten::index.list_" decl_type "(" decl_type \ "[] self, \ " decl_type " el) -> int"), \ listIndex, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ "aten::count." decl_type "(" decl_type \ "[] self, \ @@ -2500,100 +2523,100 @@ RegisterOperators reg2({ // `listContains` is not implemented for non-primitive types // TODO: Add List[bool] once .to> doesn't throw an error - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::__contains__.float_list(float[] l, float item) -> bool"), listContains, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::sort.int(int[](a!) self, bool reverse=False) -> ()"), listSort, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::sort.float(float[](a!) self, bool reverse=False) -> ()"), listSort, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::sort.Tensor(Tensor[](a!) self, bool reverse=False) -> ()"), listSort, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::sort.bool(bool[](a!) self, bool reverse=False) -> ()"), listSort, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::sort.str(str[](a!) self, bool reverse=False) -> ()"), listSort, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::sorted.int(int[](a) input) -> (int[])"), listCopyAndSort, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::sorted.float(float[](a) input) -> (float[])"), listCopyAndSort, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::sorted.Tensor(Tensor[](a) input) -> (Tensor[])"), listCopyAndSort, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::sorted.bool(bool[](a) input) -> (bool[])"), listCopyAndSort, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::sorted.str(str[](a) input) -> (str[])"), listCopyAndSort, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::eq.float_list(float[] a, float[] b) -> bool"), listEq, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::eq.Tensor_list(Tensor[] a, Tensor[] b) -> bool"), listEq, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::eq.bool_list(bool[] a, bool[] b) -> bool"), listEq, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::eq.str_list(str[] a, str[] b) -> bool"), listEq, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::ne.float_list(float[] a, float[] b) -> bool"), listNe, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::ne.Tensor_list(Tensor[] a, Tensor[] b) -> bool"), listNe, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::ne.bool_list(bool[] a, bool[] b) -> bool"), listNe, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::ne.str_list(str[] a, str[] b) -> bool"), listNe, aliasAnalysisFromSchema()), #define DEFINE_CONVERT_BASE_OP(op_name, prefix, char_op) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#op_name "(int i) -> str"), \ [](Stack* stack) { \ auto i = pop(stack).toInt(); \ @@ -2610,7 +2633,7 @@ RegisterOperators reg2({ DEFINE_CONVERT_BASE_OP(aten::hex, "x", std::hex), DEFINE_CONVERT_BASE_OP(aten::oct, "o", std::oct), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::bin(int i) -> str"), [](Stack* stack) { auto i = pop(stack).toInt(); @@ -2630,7 +2653,7 @@ RegisterOperators reg2({ }, aliasAnalysisFromSchema()), // TODO: deprecate this in favor of aten::getelem - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "prim::StringIndex(str string, int index) -> str"), [](Stack* stack) { @@ -2641,7 +2664,7 @@ RegisterOperators reg2({ push(stack, std::string(&c, 1)); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::chr(int i) -> str"), [](Stack* stack) { auto i = pop(stack).toInt(); @@ -2659,7 +2682,7 @@ RegisterOperators reg2({ // only used in loop unrolling, not exposed to end users DEFINE_INT_OP(aten::__round_to_zero_floordiv, a / b), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::modf(float a) -> (float, float)"), [](Stack* stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -2671,7 +2694,7 @@ RegisterOperators reg2({ push(stack, b, c); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::frexp(float a) -> (float, int)"), [](Stack* stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -2685,7 +2708,7 @@ RegisterOperators reg2({ push(stack, m, e); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::ldexp(float x, int i) -> float"), [](Stack* stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -2785,7 +2808,7 @@ RegisterOperators reg2({ float, float, float), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::abs(Tensor x) -> Tensor"), [](Stack* stack) { at::Tensor x; @@ -2808,7 +2831,7 @@ RegisterOperators reg2({ std::copysign(a, b), std::copysign(a, b), float), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::_tensor_to_list(Tensor self) -> int[]"), [](Stack* stack) { at::Tensor t; @@ -2821,7 +2844,7 @@ RegisterOperators reg2({ push(stack, std::move(elems)); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::_list_to_tensor(int[] self) -> Tensor"), [](Stack* stack) { c10::List l = pop(stack).toIntList(); @@ -2833,7 +2856,7 @@ RegisterOperators reg2({ push(stack, std::move(t)); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::sum.int(int[] self) -> int"), [](Stack* stack) { c10::List l = pop(stack).toIntList(); @@ -2844,7 +2867,7 @@ RegisterOperators reg2({ push(stack, sum); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::sum.float(float[] self) -> float"), [](Stack* stack) { c10::List l = pop(stack).toDoubleList(); @@ -2855,7 +2878,7 @@ RegisterOperators reg2({ push(stack, sum); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::sum.complex(complex[] self) -> complex"), [](Stack* stack) { c10::List> l = pop(stack).toComplexDoubleList(); @@ -2866,7 +2889,7 @@ RegisterOperators reg2({ push(stack, sum); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::sum.bool(bool[] self) -> int"), [](Stack* stack) { c10::List l = pop(stack).toBoolList(); @@ -2879,7 +2902,7 @@ RegisterOperators reg2({ push(stack, sum); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::any.str(str[] self) -> bool"), [](Stack* stack) { auto l = pop(stack).toList(); @@ -2892,7 +2915,7 @@ RegisterOperators reg2({ push(stack, false); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::any.int(int[] self) -> bool"), [](Stack* stack) { c10::List l = pop(stack).toIntList(); @@ -2905,7 +2928,7 @@ RegisterOperators reg2({ push(stack, false); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::any.float(float[] self) -> bool"), [](Stack* stack) { c10::List l = pop(stack).toDoubleList(); @@ -2918,7 +2941,7 @@ RegisterOperators reg2({ push(stack, false); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::any.bool(bool[] self) -> bool"), [](Stack* stack) { c10::List l = pop(stack).toBoolList(); @@ -2931,7 +2954,7 @@ RegisterOperators reg2({ push(stack, false); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::all.int(int[] self) -> bool"), [](Stack* stack) { c10::List l = pop(stack).toIntList(); @@ -2944,7 +2967,7 @@ RegisterOperators reg2({ push(stack, true); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::all.float(float[] self) -> bool"), [](Stack* stack) { c10::List l = pop(stack).toDoubleList(); @@ -2957,7 +2980,7 @@ RegisterOperators reg2({ push(stack, true); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::all.bool(bool[] self) -> bool"), [](Stack* stack) { c10::List l = pop(stack).toBoolList(); @@ -2970,7 +2993,7 @@ RegisterOperators reg2({ push(stack, true); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::divmod.int(int x, int y) -> (int, int)"), [](Stack* stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -2992,7 +3015,7 @@ RegisterOperators reg2({ static_cast(divresult.rem)); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::divmod.float(float x, float y) -> (float, float)"), [](Stack* stack) { @@ -3010,7 +3033,7 @@ RegisterOperators reg2({ push(stack, (a - rem) / b, rem); }, aliasAnalysisFromSchema()), - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::id(AnyClassType? x) -> int"), [](Stack* stack) { IValue a; @@ -3024,7 +3047,7 @@ RegisterOperators reg2({ aliasAnalysisFromSchema()), #define DEFINE_DIVMOD_MIXED_OP(type_a, type_b) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::divmod." #type_a "_" #type_b "(" #type_a \ " x," #type_b " y) -> (float, float)"), \ [](Stack* stack) { \ @@ -3044,13 +3067,13 @@ RegisterOperators reg2({ DEFINE_DIVMOD_MIXED_OP(float, int), #undef DEFINE_DIVMOD_MIXED_OP - OperatorGenerator( + OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::hash.generic(t value) -> int"), hashValue, aliasAnalysisFromSchema()), #define DEFINE_COMPLEX_OP(type_a, type_b, actual_type_a, actual_type_b) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_a "_" #type_b "(" #type_a \ " x," #type_b " y) -> complex"), \ [](Stack* stack) { \ @@ -3064,7 +3087,7 @@ RegisterOperators reg2({ #define DEFINE_COMPLEX_OP_WITH_TENSOR_ARG( \ type_a, type_b, actual_type_a, actual_type_b) \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_a "_" #type_b "(" #type_a \ " x," #type_b " y) -> complex"), \ [](Stack* stack) { \ @@ -3075,7 +3098,7 @@ RegisterOperators reg2({ push(stack, comp); \ }, \ aliasAnalysisFromSchema()), \ - OperatorGenerator( \ + OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_b "_" #type_a \ "(" #type_b " x," #type_a " y) -> complex"), \ [](Stack* stack) { \ @@ -3099,7 +3122,10 @@ RegisterOperators reg2({ DEFINE_COMPLEX_OP_WITH_TENSOR_ARG(Tensor, float, at::Tensor, double), DEFINE_COMPLEX_OP_WITH_TENSOR_ARG(Tensor, int, at::Tensor, int), DEFINE_COMPLEX_OP_WITH_TENSOR_ARG(Tensor, bool, at::Tensor, bool), -}); +}; + +RegisterOperators reg2( + createOperators(opGenArgs2, sizeof(opGenArgs2) / sizeof(opGenArgs2[0]))); } // namespace } // namespace jit From ff4569ae2939c3e81092fdf43c9d5f2f08453c42 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Fri, 27 Aug 2021 13:21:04 -0700 Subject: [PATCH 308/530] Sparse CUDA: rename files *.cu -> *.cpp (#63894) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63894 This PR introduces a few code structure changes. There is no need to use .cu extension for pure c++ code without cuda. Moved `s_addmm_out_csr_sparse_dense_cuda_worker` to a separate cpp file from cu file. cc nikitaved pearu cpuhrsch IvanYashchuk ngimel Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D30548771 Pulled By: cpuhrsch fbshipit-source-id: 6f12d36e7e506d2fdbd57ef33eb73192177cd904 --- aten/src/ATen/native/sparse/cuda/SoftMax.cu | 2 +- .../native/sparse/cuda/SparseBlasLegacy.cpp | 74 +++++++++++++++++++ .../native/sparse/cuda/SparseBlasLegacy.h | 18 +++++ .../{SparseCUDABlas.cu => SparseCUDABlas.cpp} | 4 +- .../{SparseCUDABlas.cuh => SparseCUDABlas.h} | 0 .../sparse/cuda/SparseCUDATensorMath.cu | 61 +-------------- .../sparse/cuda/SparseCUDATensorMath.cuh | 2 - .../native/sparse/cuda/SparseCsrTensorMath.cu | 3 +- .../ATen/native/sparse/cuda/SparseMatMul.cu | 2 +- 9 files changed, 100 insertions(+), 66 deletions(-) create mode 100644 aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp create mode 100644 aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.h rename aten/src/ATen/native/sparse/cuda/{SparseCUDABlas.cu => SparseCUDABlas.cpp} (99%) rename aten/src/ATen/native/sparse/cuda/{SparseCUDABlas.cuh => SparseCUDABlas.h} (100%) diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu index d5bc66b7fb23b..c55ea3b540b5a 100644 --- a/aten/src/ATen/native/sparse/cuda/SoftMax.cu +++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include diff --git a/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp b/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp new file mode 100644 index 0000000000000..b13e7fe595d8f --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp @@ -0,0 +1,74 @@ +/* +Functions here use deprecated cuSPARSE API that was removed in CUDA 11. +This file will be removed eventually. +*/ +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, const Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, const Tensor& dense) { + TORCH_INTERNAL_ASSERT(nnz > 0); + + // No half support, so we don't have to use CUDATypeConversion + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + values.scalar_type(), "addmm_sparse_cuda", [&] { + scalar_t cast_beta = beta.to(); + scalar_t cast_alpha = alpha.to(); + Tensor r__; + if (cast_beta == scalar_t(0)) { + r_.zero_(); + } else if (!at::sparse::is_same_tensor(t, r_)) { + r_.copy_(t); + } + if (r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) { + r__ = r_; + } else { + // Note: This storage arrangement is preferred due to most of the CUDA kernels handle only contiguous tensors + r__ = r_.transpose(0, 1).clone(at::MemoryFormat::Contiguous); + r__.transpose_(0, 1); + } + TORCH_INTERNAL_ASSERT(r__.transpose(-1, -2).is_contiguous()); + Tensor dense_; + char transpose_dense; + if (dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { + transpose_dense = 'n'; + dense_ = dense; + } else if (dense.stride(1) == 1 && dense.stride(0) == dense.size(1)) { + transpose_dense = 't'; + dense_ = dense; + } else { + transpose_dense = 't'; + dense_ = dense.contiguous(); + } + + sparse::cuda::csrmm2( + 'n', + transpose_dense, + m, + n, + k, + nnz, + cast_alpha, + values.data_ptr(), + crow_indices.data_ptr(), + col_indices.data_ptr(), + dense_.data_ptr(), + (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)), + cast_beta, + r__.data_ptr(), + r__.stride(1)); + + if (!at::sparse::is_same_tensor(r__, r_)) { + r_.copy_(r__); + } + } + ); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.h b/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.h new file mode 100644 index 0000000000000..67eaffb13a75c --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.h @@ -0,0 +1,18 @@ +#pragma once + +#include +#include + +/* +Functions here use deprecated cuSPARSE API that was removed in CUDA 11. +Here only 32-bit indices sparse indices are supported. +This file will be removed eventually. +*/ + +namespace at { +namespace native { + +void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, const Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, const Tensor& dense); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp similarity index 99% rename from aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu rename to aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp index dd03e2bfeacbe..db0088a084c6d 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include @@ -14,7 +14,7 @@ // Using these APIs in any other systems will result in compile-time or run-time failures. // Their support will be extended in the next releases. -#if defined(__CUDACC__) && (CUSPARSE_VERSION >= 11000 || (!defined(_MSC_VER) && CUSPARSE_VERSION >= 10301)) +#if defined(CUDART_VERSION) && (CUSPARSE_VERSION >= 11000 || (!defined(_MSC_VER) && CUSPARSE_VERSION >= 10301)) #define IS_SPMM_AVAILABLE() 1 #else #define IS_SPMM_AVAILABLE() 0 diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.h similarity index 100% rename from aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh rename to aten/src/ATen/native/sparse/cuda/SparseCUDABlas.h diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 511e69ef4b408..0331f5e4d932e 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -5,8 +5,9 @@ #include #include #include +#include #include -#include +#include #include #include #include @@ -50,64 +51,6 @@ namespace { } } -void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& crow_indices, Tensor& col_indices, Tensor& values, const Tensor& dense) { - TORCH_INTERNAL_ASSERT(nnz > 0); - - // No half support, so we don't have to use CUDATypeConversion - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( - values.scalar_type(), "addmm_sparse_cuda", [&] { - scalar_t cast_beta = beta.to(); - scalar_t cast_alpha = alpha.to(); - Tensor r__; - if (cast_beta == scalar_t(0)) { - r_.zero_(); - } else if (!is_same_tensor(t, r_)) { - r_.copy_(t); - } - if(r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) { - r__ = r_; - } else { - // Note: This storage arrangement is preferred due to most of the CUDA kernels handle only contiguous tensors - r__ = r_.transpose(0, 1).clone(at::MemoryFormat::Contiguous); - r__.transpose_(0, 1); - } - Tensor dense_; - char transpose_dense; - if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { - transpose_dense = 'n'; - dense_ = dense; - } else if(dense.stride(1) == 1 && dense.stride(0) == dense.size(1)) { - transpose_dense = 't'; - dense_ = dense; - } else { - transpose_dense = 't'; - dense_ = dense.contiguous(); - } - - sparse::cuda::csrmm2( - 'n', - transpose_dense, - m, - n, - k, - nnz, - cast_alpha, - values.data_ptr(), - crow_indices.data_ptr(), - col_indices.data_ptr(), - dense_.data_ptr(), - (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)), - cast_beta, - r__.data_ptr(), - r__.stride(1)); - - if (!is_same_tensor(r__, r_)) { - r_.copy_(r__); - } - } - ); -} - // NB: Deleted spaddcmul (aka addcmul_, but not actually wired up), spaddcdiv (not // wired at all) diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cuh index 1a99e818e1bad..9448b2aa46b6c 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cuh +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cuh @@ -6,8 +6,6 @@ namespace at { namespace native { -void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& crow_indices, Tensor& col_indices, Tensor& values, const Tensor& dense); - void s_addmm_out_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& indices, Tensor& values, const Tensor& dense); }} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu index ea765e076fb04..b21d892fcdf84 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu @@ -19,7 +19,8 @@ #include #include -#include +#include +#include #include #include diff --git a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu index 2d041de6ea411..d5f31a1980bac 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include From 18cb3fc91004ac9e551301748246aaa2a5a5dd04 Mon Sep 17 00:00:00 2001 From: Patrick Hu Date: Fri, 27 Aug 2021 13:37:38 -0700 Subject: [PATCH 309/530] [FX] Validate data type of target on Node Construction (#64050) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64050 Test Plan: Imported from OSS Reviewed By: jamesr66a Differential Revision: D30585535 Pulled By: yqhu fbshipit-source-id: 96778a87e75f510b4ef42f0e5cf76b35b7b2f331 --- test/test_fx.py | 6 ++++++ torch/fx/graph.py | 9 ++++++++- torch/fx/node.py | 14 ++++++++++---- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index 27f64e1cd1827..47873d7ef9b41 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -1280,6 +1280,12 @@ def test_wrong_topo(self): with self.assertRaisesRegex(RuntimeError, 'was used before it has been defined'): graph.lint() + def test_wrong_target_type(self): + graph : torch.fx.Graph = torch.fx.Graph() + with self.assertRaises(ValueError): + n = torch.fx.Node(graph=graph, name='foo', op='call_function', target='foo', + args=(), kwargs={}) + def test_example_shape_prop(self): class TestCase(torch.nn.Module): def __init__(self): diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 88c7b54a06ce4..1ee6f05f79809 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -1066,8 +1066,15 @@ def check_arg(arg : Node, n : Optional[Node] = None) -> None: # Check targets are legit if self.owning_module: for node in self.nodes: + if node.op == 'call_function': + if not callable(node.target): + raise ValueError(f'Node {node} target {node.target} has type {torch.typename(node.target)} but ' + 'a Callable is expected') + else: + if not isinstance(node.target, str): + raise ValueError(f'Node {node} target {node.target} has type {torch.typename(node.target)} but ' + 'a str is expected') if node.op in ['get_attr', 'call_module']: - assert isinstance(node.target, str) target_atoms = node.target.split('.') m_itr = self.owning_module for i, atom in enumerate(target_atoms): diff --git a/torch/fx/node.py b/torch/fx/node.py index e00f25f47a2ee..8c4faf7d4fa27 100644 --- a/torch/fx/node.py +++ b/torch/fx/node.py @@ -114,13 +114,19 @@ class Node: """ def __init__(self, graph: 'Graph', name: str, op: str, target: 'Target', args: Tuple['Argument', ...], kwargs: Dict[str, 'Argument'], - type : Optional[Any] = None) -> None: + return_type : Optional[Any] = None) -> None: self.graph = graph self.name = name # unique name of value being created assert op in ['placeholder', 'call_method', 'call_module', 'call_function', 'get_attr', 'output', 'root'] self.op = op # the kind of operation = placeholder|call_method|call_module|call_function|get_attr - if op in ['call_method', 'call_module']: - assert isinstance(target, str) + if op == 'call_function': + if not callable(target): + raise ValueError(f'Node [graph = {graph}, name = \'{name}\'] target {target} has type {torch.typename(target)} ' + 'but a Callable is expected') + else: + if not isinstance(target, str): + raise ValueError(f'Node [graph = {graph}, name = \'{name}\'] target {target} has type {torch.typename(target)} ' + 'but a str is expected') self.target = target # for method/module/function, the name of the method/module/function/attr # being invoked, e.g add, layer1, or torch.add @@ -146,7 +152,7 @@ def __init__(self, graph: 'Graph', name: str, op: str, target: 'Target', # generated function return type. (Note this is a special case. ``return`` # does not produce a value, it's more of a notation. Thus, this value # describes the type of args[0] in the ``return`` node. - self.type : Optional[Any] = type + self.type : Optional[Any] = return_type self._prev = self self._next = self self._erased = False From 358c46f99eea23ec86e4358a5d4253e4059e962c Mon Sep 17 00:00:00 2001 From: Vincent Phan Date: Fri, 27 Aug 2021 13:51:38 -0700 Subject: [PATCH 310/530] Implement leaky relu op Summary: Implemented leaky relu op as per: https://www.internalfb.com/tasks/?t=97492679 Test Plan: buck build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 //xplat/caffe2:pt_vulkan_api_test_binAndroid\#android-arm64 --show-output adb push buck-out/gen/xplat/caffe2/pt_vulkan_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_api_test adb shell "/data/local/tmp/vulkan_api_test" all tests pass, including new ones Reviewed By: SS-JIA Differential Revision: D30186225 fbshipit-source-id: fdb1f8f7b3a28b5504581822185c0475dcd53a3e --- .../ATen/native/vulkan/glsl/leaky_relu.glsl | 28 +++++ .../ATen/native/vulkan/glsl/leaky_relu_.glsl | 27 ++++ aten/src/ATen/native/vulkan/ops/Clamp.cpp | 117 ++++++++++++++++++ aten/src/ATen/test/vulkan_api_test.cpp | 43 +++++++ 4 files changed, 215 insertions(+) create mode 100644 aten/src/ATen/native/vulkan/glsl/leaky_relu.glsl create mode 100644 aten/src/ATen/native/vulkan/glsl/leaky_relu_.glsl diff --git a/aten/src/ATen/native/vulkan/glsl/leaky_relu.glsl b/aten/src/ATen/native/vulkan/glsl/leaky_relu.glsl new file mode 100644 index 0000000000000..f947e78f1843d --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/leaky_relu.glsl @@ -0,0 +1,28 @@ +#version 450 core +#define PRECISION $precision + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + float negative_slope; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const vec4 inval = texelFetch(uInput, pos, 0); + const vec4 negative_values = vec4(lessThan(inval, vec4(0.0f))); + const vec4 positive_values = vec4(1.0) - negative_values; + const vec4 mask = negative_values * vec4(uBlock.negative_slope) + positive_values; + const vec4 outval = inval * mask; + imageStore(uOutput, pos, outval); + } +} diff --git a/aten/src/ATen/native/vulkan/glsl/leaky_relu_.glsl b/aten/src/ATen/native/vulkan/glsl/leaky_relu_.glsl new file mode 100644 index 0000000000000..345e66942c155 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/leaky_relu_.glsl @@ -0,0 +1,27 @@ +#version 450 core +#define PRECISION $precision + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION restrict Block { + ivec4 size; + float negative_slope; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const vec4 inval = imageLoad(uOutput, pos); + const vec4 negative_values = vec4(lessThan(inval, vec4(0.0f))); + const vec4 positive_values = vec4(1.0) - negative_values; + const vec4 mask = negative_values * vec4(uBlock.negative_slope) + positive_values; + const vec4 outval = inval * mask; + imageStore(uOutput, pos, outval); + } +} diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index c6f046e84fd17..7982b0eda0d7a 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -404,6 +404,121 @@ Tensor& hardshrink_( return self; } +Tensor leaky_relu( + const Tensor& self_arg, + const Scalar& negative_slope) { + api::Context* const context = api::context(); + + const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); + const vTensor& v_self = convert(self); + + vTensor v_output{ + context, + v_self.sizes(), + v_self.options(), + }; + + api::Command::Pool& command_pool = context->command().pool; + api::Command::Buffer& command_buffer = command_pool.stream(); + { + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + const struct Block final { + uvec3 extents; + uint32_t _; + float negative_slope; + } block { + v_output.extents(), + 0u, + negative_slope.to(), + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(leaky_relu), + v_output.extents(), + context->gpu().adapter->local_work_group_size(), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_self.image( + command_buffer, + vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_pool.submit(context->gpu().queue, command_buffer); + + return convert(v_output); +} + +Tensor& leaky_relu_( + Tensor& self, + const Scalar& negative_slope) { + api::Context* const context = api::context(); + + TORCH_CHECK( + self.is_vulkan(), + "Vulkan: In-place leaky relu is only supported on Vulkan tensors."); + + vTensor& v_self = convert(self); + + api::Command::Pool& command_pool = context->command().pool; + api::Command::Buffer& command_buffer = command_pool.stream(); + { + if C10_LIKELY(v_self.has_image()) { + const struct Block final { + uvec3 extents; + uint32_t _; + float negative_slope; + } block { + v_self.extents(), + 0u, + negative_slope.to(), + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(leaky_relu_), + v_self.extents(), + context->gpu().adapter->local_work_group_size(), + // Read-Write access triggers an async synchronization if necessory + // and inserts appropriate barriers if hazards are detected. + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_pool.submit(context->gpu().queue, command_buffer); + + return self; +} + Tensor sigmoid(const Tensor& self) { return ops::activation(self, VK_KERNEL(sigmoid)); } @@ -433,6 +548,8 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl(TORCH_SELECTIVE_NAME("aten::hardswish_"), hardswish_); m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh"), hardtanh); m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh_"), hardtanh_); + m.impl(TORCH_SELECTIVE_NAME("aten::leaky_relu"), leaky_relu); + m.impl(TORCH_SELECTIVE_NAME("aten::leaky_relu_"), leaky_relu_); m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid"), sigmoid); m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid_"), sigmoid_); m.impl(TORCH_SELECTIVE_NAME("aten::tanh"), tanh); diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 2873d3c0584c8..d4b466aa920f2 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -979,6 +979,49 @@ TEST(VulkanAPITest, hardshrink_) { } } +TEST(VulkanAPITest, leaky_relu) { + if (!at::is_vulkan_available()) { + return; + } + + for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) { + const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + const auto in_vulkan = in_cpu.vulkan(); + + const auto out_cpu = at::leaky_relu(in_cpu, negative_slope); + const auto out_vulkan = at::leaky_relu(in_vulkan, negative_slope); + + const auto check = almostEqual(out_cpu, out_vulkan.cpu()); + + if (!check) { + showRtol(out_cpu, out_vulkan.cpu()); + } + + ASSERT_TRUE(check); + } +} + +TEST(VulkanAPITest, leaky_relu_) { + if (!at::is_vulkan_available()) { + return; + } + + for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) { + auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + auto vulkan = cpu.vulkan(); + + at::leaky_relu_(cpu, negative_slope); + at::leaky_relu_(vulkan, negative_slope); + + const auto check = almostEqual(cpu, vulkan.cpu()); + if (!check) { + showRtol(cpu, vulkan.cpu()); + } + + ASSERT_TRUE(check); + } +} + TEST(VulkanAPITest, hardswish) { if (!at::is_vulkan_available()) { return; From bdde898d9c26edc7f9bc37164cf5480c450d5189 Mon Sep 17 00:00:00 2001 From: Erjia Guan Date: Fri, 27 Aug 2021 14:15:23 -0700 Subject: [PATCH 311/530] Add README to datapipes (#63982) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63982 Add a readme to `datapipes` for developer. This is can be a replacement of https://github.com/pytorch/pytorch/blob/master/torch/utils/data/datapipes_tutorial_dev_loaders.ipynb After this PR is landed, the README.md will be added to PyTorch Wiki Test Plan: Imported from OSS Reviewed By: soulitzer Differential Revision: D30554198 Pulled By: ejguan fbshipit-source-id: 6091aae8ef915c7c1f00fbf45619c86c9558d308 --- torch/utils/data/datapipes/README.md | 103 ++++++++++ .../data/datapipes_tutorial_dev_loaders.ipynb | 178 ------------------ 2 files changed, 103 insertions(+), 178 deletions(-) create mode 100644 torch/utils/data/datapipes/README.md delete mode 100644 torch/utils/data/datapipes_tutorial_dev_loaders.ipynb diff --git a/torch/utils/data/datapipes/README.md b/torch/utils/data/datapipes/README.md new file mode 100644 index 0000000000000..69cd56d3cfbd1 --- /dev/null +++ b/torch/utils/data/datapipes/README.md @@ -0,0 +1,103 @@ +The [`datapipes`](https://github.com/pytorch/pytorch/tree/master/torch/utils/data/datapipes) folder holds the implementation of the `IterDataPipe` and `MapDataPipe`. + +This document serves as an entry point for DataPipe implementation. + +## Implementing DataPipe +For the sake of an example, let us implement an `IterDataPipe` to apply a callable over data under [`iter`](https://github.com/pytorch/pytorch/tree/master/torch/utils/data/datapipes/iter). +For `MapDataPipe`, please take reference from files in [map](https://github.com/pytorch/pytorch/tree/master/torch/utils/data/datapipes/map) folder and implement the corresponding `__getitem__` method. + +### Naming +The naming convention for DataPipe is Operation-er and with suffix of `IterDataPipe` because each DataPipe behaves like a container to apply the operation to data yielded from the source DataPipe. +And, when importing the DataPipe into `iter` module under `datapipes`, each DataPipe will be aliased as Op-er without the suffix of `IterDataPipe`. +Please check [`__init__.py`](https://github.com/pytorch/pytorch/blob/master/torch/utils/data/datapipes/iter/__init__.py) in `iter` module for how we aliasing each DataPipe class. +Like the example of `IterDataPipe` to map a function, we are going to name it as `MapperIterDataPipe` and alias it as `iter.Mapper` under `datapipes`. + +### Constructor +As DataSet now constructed by a stack of DataPipe-s, each DataPipe normally takes a source DataPipe as the first argument. +```py +class MapperIterDataPipe(IterDataPipe): + def __init__(self, dp, fn): + super().__init__() + self.dp = dp + self.fn = fn +``` +Note: Avoid loading data from the source DataPipe in `__init__` function, in order to support lazy data loading and save memory. + +### Iterator +For `IterDataPipe`, an `__iter__` function is needed to consume data from the source `IterDataPipe` then apply operation over the data before yield. +```py +class MapperIterDataPipe(IterDataPipe): + ... + + def __iter__(self): + for d in self.dp: + yield self.fn(d) +``` + +### Length +In the most common cases, as the example of `MapperIterDataPipe` above, the `__len__` method of DataPipe should return the length of source DataPipe. +```py +class MapperIterDataPipe(IterDataPipe): + ... + + def __len__(self): + return len(self.dp) +``` +Note that `__len__` method is optional for `IterDataPipe`. +Like `CSVParserIterDataPipe` in the [Using DataPipe sector](#using-datapipe), `__len__` is not implemented because the size of each file streams is unknown for us before loading it. + +Besides, in some special cases, `__len__` method can be provided, but it would either return an integer length or raise Error depending on the arguments of DataPipe. +And, the Error is required to be `TypeError` to support Python's build-in functions like `list(dp)`. +Please check NOTE [ Lack of Default `__len__` in Python Abstract Base Classes ] for detailed reason in PyTorch. + +### Registering DataPipe with functional API +Each DataPipe can be registered to support functional API using the decorator `functional_datapipe`. +```py +@functional_datapipe("map") +class MapperIterDataPipe(IterDataPipe): + ... +``` +Then, the stack of DataPipe can be constructed in functional-programming manner. +```py +>>> import torch.utils.data.datapipes as dp +>>> datapipes1 = dp.iter.FileLoader(['a.file', 'b.file']).map(fn=decoder).shuffle().batch(2) + +>>> datapipes2 = dp.iter.FileLoader(['a.file', 'b.file']) +>>> datapipes2 = dp.iter.Mapper(datapipes2) +>>> datapipes2 = dp.iter.Shuffler(datapipes2) +>>> datapipes2 = dp.iter.Batcher(datapipes2, 2) +``` +In the above example, `datapipes1` and `datapipes2` represent the exact same stack of `IterDataPipe`-s. + +## Using DataPipe +For example, we want to load data from CSV files with the following data pipeline: +- List all csv files +- Load csv files +- Parse csv file and yield rows + +To support the above pipeline, `CSVParser` is registered as `parse_csv_files` to consume file streams and expand them as rows. +```py +@functional_datapipe("parse_csv_files") +class CSVParserIterDataPipe(IterDataPipe): + def __init__(self, dp, **fmtparams): + self.dp = dp + self.fmtparams = fmtparams + + def __iter__(self): + for filename, stream in self.dp: + reader = csv.reader(stream, **self.fmtparams) + for row in reader: + yield filename, row +``` +Then, the pipeline can be assembled as following: +```py +>>> import torch.utils.data.datapipes as dp + +>>> FOLDER = 'path/2/csv/folder' +>>> datapipe = dp.iter.FileLister([FOLDER]).filter(fn=lambda filename: filename.endswith('.csv')) +>>> datapipe = dp.iter.FileLoader(datapipe, mode='rt') +>>> datapipe = datapipe.parse_csv_files(delimiter=' ') + +>>> for d in datapipe: # Start loading data +... pass +``` diff --git a/torch/utils/data/datapipes_tutorial_dev_loaders.ipynb b/torch/utils/data/datapipes_tutorial_dev_loaders.ipynb deleted file mode 100644 index 0a9b834a86862..0000000000000 --- a/torch/utils/data/datapipes_tutorial_dev_loaders.ipynb +++ /dev/null @@ -1,178 +0,0 @@ -{ - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": "python3610jvsc74a57bd0eb5e09632d6ea1cbf3eb9da7e37b7cf581db5ed13074b21cc44e159dc62acdab", - "display_name": "Python 3.6.10 64-bit ('dataloader': conda)" - } - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "source": [ - "## DataPipes development tutorial. Loaders DataPipes." - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "source": [ - "As DataSet now constructed by stacking `DataPipe`-s it is recommended to keep `DataPipe` functionality as primitive as possible. For example loading data from CSV file will look like sequence of DataPipes: ListFiles FileLoader CSVParser.\n", - "\n" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "source": [ - "`ExampleListFilesDataPipe` scans all files in `root` folder and yields full file names. Avoid loading entire list in `__init__` function to save memory." - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import csv\n", - "import io\n", - "import os\n", - "\n", - "from torch.utils.data import IterDataPipe, functional_datapipe\n", - "\n", - "\n", - "class ExampleListFilesDataPipe(IterDataPipe):\n", - " def __init__(self, *, root):\n", - " self.root = root\n", - "\n", - " def __iter__(self):\n", - " for (dirpath, dirnames, filenames) in os.walk(self.root):\n", - " for file_name in filenames:\n", - " yield os.path.join(dirpath, file_name)" - ] - }, - { - "source": [ - "`ExampleFileLoaderDataPipe` registered as `load_files_as_string` consumes file names from source_datapipe and yields file names and file lines." - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "@functional_datapipe('load_files_as_string')\n", - "class ExampleFileLoaderDataPipe(IterDataPipe):\n", - " def __init__(self, source_datapipe):\n", - " self.source_datapipe = source_datapipe\n", - "\n", - " def __iter__(self):\n", - " for file_name in self.source_datapipe:\n", - " with open(file_name) as file:\n", - " lines = file.read()\n", - " yield (file_name, lines)\n" - ] - }, - { - "source": [ - "`ExampleCSVParserDataPipe` registered as `parse_csv_files` consumes file lines and expands them as CSV rows." - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "@functional_datapipe('parse_csv_files')\n", - "class ExampleCSVParserDataPipe(IterDataPipe):\n", - " def __init__(self, source_datapipe):\n", - " self.source_datapipe = source_datapipe\n", - "\n", - " def __iter__(self):\n", - " for file_name, lines in self.source_datapipe:\n", - " reader = csv.reader(io.StringIO(lines))\n", - " for row in reader:\n", - " yield [file_name] + row\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "['/home/vitaly/dataset/data/datapipes/load/iter/test/example_2.csv', '10', \" 'foo'\"]\n['/home/vitaly/dataset/data/datapipes/load/iter/test/example_2.csv', '11', \" 'bar'\"]\n['/home/vitaly/dataset/data/datapipes/load/iter/test/example_1.csv', '12', \" 'aaaa'\"]\n['/home/vitaly/dataset/data/datapipes/load/iter/test/example_1.csv', '13', \" 'bbbb'\"]\n" - ] - } - ], - "source": [ - "FOLDER = 'define your folder with csv files here'\n", - "FOLDER = '/home/vitaly/dataset/data'\n", - "dp = ExampleListFilesDataPipe(root = FOLDER).filter(lambda filename: filename.endswith('.csv')).load_files_as_string().parse_csv_files()\n", - "\n", - "for data in dp:\n", - " print(data)" - ] - }, - { - "source": [ - "This approach allows to replace any DataPipe to get different functionality. For example you can pick individual files.\n" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "['/home/vitaly/dataset/data/datapipes/load/iter/test/example_1.csv', '12', \" 'aaaa'\"]\n['/home/vitaly/dataset/data/datapipes/load/iter/test/example_1.csv', '13', \" 'bbbb'\"]\n" - ] - } - ], - "source": [ - "FILE = 'define your file with csv data here'\n", - "FILE = '/home/vitaly/dataset/data/datapipes/load/iter/test/example_1.csv'\n", - "dp = ExampleFileLoaderDataPipe([FILE]).parse_csv_files()\n", - "\n", - "for data in dp:\n", - " print(data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ] -} \ No newline at end of file From 8406dba65af414f4ac9bb569ca8d70752611e4ba Mon Sep 17 00:00:00 2001 From: Jessica Choi Date: Fri, 27 Aug 2021 14:46:31 -0700 Subject: [PATCH 312/530] Removing references to ProcessGroupAgent in comments (#64051) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64051 cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23 Test Plan: Imported from OSS Reviewed By: mrshenli Differential Revision: D30587076 Pulled By: jaceyca fbshipit-source-id: 414cb95faad0b4da0eaf2956c0668af057f93574 --- torch/csrc/distributed/rpc/message.h | 6 +++--- torch/testing/_internal/dist_utils.py | 2 -- .../testing/_internal/distributed/rpc/dist_autograd_test.py | 3 +-- .../distributed/rpc/faulty_rpc_agent_test_fixture.py | 2 -- torch/testing/_internal/distributed/rpc/rpc_test.py | 2 +- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/torch/csrc/distributed/rpc/message.h b/torch/csrc/distributed/rpc/message.h index 93eff094243f8..17a7808912b11 100644 --- a/torch/csrc/distributed/rpc/message.h +++ b/torch/csrc/distributed/rpc/message.h @@ -101,9 +101,9 @@ enum MessageType { // can then serialize and send tensors chunck-by-chunk, in the streaming // fashion. // type (MessageType): type of the message. -// id (int64_t): message id, this is used by ProcessGroupAgent to match -// request and response. Other implementation can ignore it -// if they have their own ways to do matching. +// id (int64_t): message id, this is used to match request and response. +// Other implementation can ignore it if they have their own +// ways to do matching. // // Layers above ``RpcAgent`` only converts ScriptCall, ScriptResp, PythonCall, // and PythonResp into a Message, and it is up to the RpcAgent diff --git a/torch/testing/_internal/dist_utils.py b/torch/testing/_internal/dist_utils.py index bdb21a7941c17..284a541444cdd 100644 --- a/torch/testing/_internal/dist_utils.py +++ b/torch/testing/_internal/dist_utils.py @@ -171,8 +171,6 @@ def wait_until_owners_and_forks_on_rank( def initialize_pg(init_method, rank: int, world_size: int) -> None: # This is for tests using `dist.barrier`. - # For `RpcAgent` other than `ProcessGroupAgent`, - # no `_default_pg` is initialized. if not dist.is_initialized(): dist.init_process_group( backend="gloo", diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index e50c30d4974b7..017a61b7debf5 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -1463,8 +1463,7 @@ def test_backward_node_failure_python_udf(self): dist_autograd.backward(context_id, [res.sum()]) # Mark rank 0 is done in the store, since the RPC framework on - # some nodes might be broken at this point (listenLoop() in - # ProcessGroupAgent might've exited). + # some nodes might be broken at this point. store.set('test_backward_node_failure_python_udf_rank0_done', "True") else: # Wait for backward to finish on rank 0. diff --git a/torch/testing/_internal/distributed/rpc/faulty_rpc_agent_test_fixture.py b/torch/testing/_internal/distributed/rpc/faulty_rpc_agent_test_fixture.py index ae151137a4705..24f7ab81c5594 100644 --- a/torch/testing/_internal/distributed/rpc/faulty_rpc_agent_test_fixture.py +++ b/torch/testing/_internal/distributed/rpc/faulty_rpc_agent_test_fixture.py @@ -50,8 +50,6 @@ def setup_fault_injection(self, faulty_messages, messages_to_delay): def get_shutdown_error_regex(self): error_regexes = [ - "Encountered exception in ProcessGroupAgent::enqueueSend", - "Encountered exception in ProcessGroupAgent::listenLoop()", "Exception in thread pool task", "Connection reset by peer", "Connection closed by peer" diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index ae57ea5f40f8c..1a44ef6e63b65 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -2897,7 +2897,7 @@ def test_handle_send_exceptions(self): ) rpc._set_rpc_timeout(10) # This barrier is needed to ensure that some workers do not exit before - # others have been brought up, for non ProcessGroupAgent backends. + # others have been brought up. initialize_pg(self.file_init_method, self.rank, self.world_size) dist.barrier() if self.rank == 1: From 90a6498a1288a4248b4cfe603949fd5b2e60dc0f Mon Sep 17 00:00:00 2001 From: soulitzer Date: Fri, 27 Aug 2021 14:59:08 -0700 Subject: [PATCH 313/530] Add autograd not implemented boxed fallback (#63458) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63458 See description and discussion from https://github.com/pytorch/pytorch/pull/62450 Test Plan: Imported from OSS Reviewed By: heitorschueroff Differential Revision: D30518572 Pulled By: soulitzer fbshipit-source-id: 3b1504d49abb84560ae17077f0dec335749c9882 --- test/cpp/api/autograd.cpp | 257 ++++++++++++++++++ tools/build_variables.bzl | 1 + torch/csrc/api/include/torch/autograd.h | 1 + .../autograd_not_implemented_fallback.cpp | 189 +++++++++++++ .../autograd_not_implemented_fallback.h | 11 + torch/csrc/autograd/function.h | 8 + 6 files changed, 467 insertions(+) create mode 100644 torch/csrc/autograd/autograd_not_implemented_fallback.cpp create mode 100644 torch/csrc/autograd/autograd_not_implemented_fallback.h diff --git a/test/cpp/api/autograd.cpp b/test/cpp/api/autograd.cpp index 80d892d5195c9..edb73f90852a2 100644 --- a/test/cpp/api/autograd.cpp +++ b/test/cpp/api/autograd.cpp @@ -1,6 +1,8 @@ #include +#include #include +#include #include @@ -869,6 +871,261 @@ TEST(CustomAutogradTest, BackwardWithCreateGraphWarns) { } } +/** + * Tests for AutogradNotImplementedFallback + * - Check that we created the NotImplemented kernel when inputs require grad + * but when no inputs require grad, we should not create this node + * - check_inplace logic + * - view ops (TODO: not an official view yet, update this once InplaceOrView kernel is landed) + * - TODO: Tests for NDEBUG checks? + * - tensorlist input and output + * - multiple outputs / non-tensor output + * - rebase_history vs set_history + */ +namespace { + +torch::Tensor inplace_op(const torch::Tensor& self, const torch::Tensor& other) { + return self.add_(other); +} + +std::tuple two_arg_inplace_op(const torch::Tensor& self, const torch::Tensor& other) { + other.add_(self); + self.add_(other); + return std::tuple(self, other); +} + +std::tuple two_pairs_of_view_op(const torch::Tensor& self, const torch::Tensor& other) { + // This is not allowed. We test below that this calling into the boxed kernel will raise an error + auto self_view = self.view(-1); + auto other_view = other.view(-1); + return std::tuple(self_view, other_view); +} + +int64_t ret_single_non_tensor(const torch::Tensor& self, const torch::Tensor& other) { + return 12; +} + +torch::Tensor opt_op(const torch::Tensor& self, const c10::optional& other) { + if (other.has_value()) { + return self + other.value(); + } else { + return self.clone(); + } +} + +torch::Tensor my_custom_op(const torch::Tensor& self, const torch::Tensor& other) { + return self + other; +} + +std::tuple ret_tuple_non_tensor(const torch::Tensor& self, const torch::Tensor& other) { + auto a = self - other; + auto b = self + other; + return std::tuple(a, b, 12); +} + +torch::Tensor view_op(const torch::Tensor& self, const torch::Tensor& other) { + return self.view(-1); +} + +std::vector ret_tensor_vector(const torch::Tensor& self, const torch::Tensor& other) { + std::vector out; + out.push_back(self + other); + out.push_back(self - other); + return out; +} + +torch::Tensor tensorlist_op(const torch::Tensor& self, at::TensorList other) { + const auto& res = self.clone(); + for (const auto& t : other) { + res.add_(t); + } + return res; +} + +#define REGISTER_TEST_OP(name, schema, fn) \ + auto m = MAKE_TORCH_LIBRARY(_test); \ + m.def(schema); \ + auto m_autograd = MAKE_TORCH_LIBRARY_IMPL(_test, Autograd); \ + auto m_cpu = MAKE_TORCH_LIBRARY_IMPL(_test, CPU); \ + m_cpu.impl(name, c10::DispatchKey::CPU, TORCH_FN(fn)); \ + m_autograd.impl(name, c10::DispatchKey::Autograd, autogradNotImplementedFallback()); + +template +void assertBasicChecks(F op) { + auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true); + auto b = torch::tensor({1.}, {torch::kFloat32}); + auto c = torch::tensor({1.}, {torch::kFloat32}); + + // If any inputs require grad, + auto out1 = op(a, b); + ASSERT_THROWS_WITH(out1.backward(), "is not implemented"); + + // # Should not have grad_fn if none require grad + auto out2 = op(b, c); + ASSERT_THROWS_WITH(out2.backward(), "element 0 of tensors does not require grad and does not have a grad_fn"); + + // TODO: Forward AD Tests? +} + +} // namespace + +TEST(TestAutogradNotImplementedFallback, RetSingleNonTensor) { + REGISTER_TEST_OP("ret_single_non_tensor", "_test::ret_single_non_tensor(Tensor self, Tensor other) -> int", ret_single_non_tensor); + auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_single_non_tensor", ""); + auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) { + return callOpUnboxed(opHandle, _1, _2); + }; + + auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true); + auto b = torch::tensor({1.}, {torch::kFloat32}); + + ASSERT_EQ(op(a, b), ret_single_non_tensor(a, b)); +} + +TEST(TestAutogradNotImplementedFallback, DoubleViewOP) { + REGISTER_TEST_OP("two_pairs_of_view_op", "_test::two_pairs_of_view_op(Tensor(a) self, Tensor(b) other) -> (Tensor(a), Tensor(b))", two_pairs_of_view_op); + auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::two_pairs_of_view_op", ""); + auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) { + return callOpUnboxed, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2); + }; + auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true); + auto b = torch::tensor({1.}, {torch::kFloat32}); + ASSERT_THROWS_WITH(op(a, b), + "Expected only a single output in the operator schema to have a non-write alias annotation"); +} + +TEST(TestAutogradNotImplementedFallback, InplaceOp) { + REGISTER_TEST_OP("inplace_op", "_test::inplace_op(Tensor(a!) self, Tensor other) -> Tensor(a!)", inplace_op); + auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::inplace_op", ""); + auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) { + return callOpUnboxed(opHandle, _1, _2); + }; + + auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true); + auto b = torch::tensor({1.}, {torch::kFloat32}); + + // Check in-place + ASSERT_THROWS_WITH(op(a, b), + "a leaf Variable that requires grad is being used in an in-place operation"); + op(b, a); + a = a.clone(); + b = b.clone(); + auto c = op(a, b); + ASSERT_TRUE(torch::allclose(c, inplace_op(a, b))); + + // Test in-place on view + auto base = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true).clone(); + auto view = base.view(-1); + auto t = torch::tensor({1.}, {torch::kFloat32}); + + torch::Tensor v_nograd; + { + c10::NoGradGuard guard; + v_nograd = base.view(-1); + op(v_nograd, t); + } + + ASSERT_THROWS_WITH(op(v_nograd, t), "A view was created in no_grad mode"); + ASSERT_EQ(op(view, t).unsafeGetTensorImpl(), view.unsafeGetTensorImpl()); + + // TODO: once we have InplaceOrView kernel, renable this since version counter would actually + // be incremented + // ASSERT_THAT(op(view, t).grad_fn()->name(), ::testing::HasSubstr("AsStridedBackward")); +} + +TEST(TestAutogradNotImplementedFallback, DoubleInplaceOp) { + REGISTER_TEST_OP("two_arg_inplace_op", "_test::two_arg_inplace_op(Tensor(a!) self, Tensor(b!) other) -> (Tensor(a!), Tensor(b!))", two_arg_inplace_op); + auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::two_arg_inplace_op", ""); + auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) { + return callOpUnboxed, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2); + }; + auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true); + auto b = torch::tensor({1.}, {torch::kFloat32}); + + // Both are modified in-place! + ASSERT_THROWS_WITH(op(a, b), + "a leaf Variable that requires grad is being used in an in-place operation"); + ASSERT_THROWS_WITH(op(b, a), + "a leaf Variable that requires grad is being used in an in-place operation"); +} + +TEST(TestAutogradNotImplementedFallback, OptOp) { + REGISTER_TEST_OP("opt_op", "_test::opt_op(Tensor self, Tensor? other) -> Tensor", opt_op); + auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::opt_op", ""); + auto op = [&](const torch::Tensor& _1, const c10::optional& _2) { + return callOpUnboxed&>(opHandle, _1, _2); + }; + + auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true); + auto b = torch::tensor({1.}, {torch::kFloat32}); + + ASSERT_TRUE(torch::allclose(op(a, b), opt_op(a, b))); + ASSERT_TRUE(torch::allclose(op(a, {}), opt_op(a, {}))); +} + +TEST(TestAutogradNotImplementedFallback, OutOfPlaceAddition) { + REGISTER_TEST_OP("my_custom_op", "_test::my_custom_op(Tensor self, Tensor other) -> Tensor", my_custom_op); + auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::my_custom_op", ""); + auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) { + return callOpUnboxed(opHandle, _1, _2); + }; + + assertBasicChecks(op); +} + +TEST(TestAutogradNotImplementedFallback, RetTupleNonTensor) { + REGISTER_TEST_OP("ret_tuple_non_tensor", "_test::ret_tuple_non_tensor(Tensor self, Tensor other) -> (Tensor, Tensor, int)", ret_tuple_non_tensor); + auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_tuple_non_tensor", ""); + auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) { + torch::Tensor out0; + torch::Tensor out1; + int64_t out2; + auto out = callOpUnboxed, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2); + std::tie(out0, out1, out2) = std::move(out); + return out0; + }; + + assertBasicChecks(op); +} + +TEST(TestAutogradNotImplementedFallback, ViewOp) { + REGISTER_TEST_OP("view_op", "_test::view_op(Tensor(a) self, Tensor other) -> Tensor(a)", view_op); + auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::view_op", ""); + auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) { + return callOpUnboxed(opHandle, _1, _2); + }; + assertBasicChecks(op); +} + +TEST(TestAutogradNotImplementedFallback, RetTensorVector) { + REGISTER_TEST_OP("ret_tensor_vector", "_test::ret_tensor_vector(Tensor self, Tensor other) -> Tensor[]", ret_tensor_vector); + auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_tensor_vector", ""); + auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) { + return callOpUnboxed, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2)[0]; + }; + assertBasicChecks(op); +} + +TEST(TestAutogradNotImplementedFallback, TensorlistOp) { + REGISTER_TEST_OP("tensorlist_op", "_test::tensorlist_op(Tensor self, Tensor[] other) -> Tensor", tensorlist_op); + auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::tensorlist_op", ""); + auto op = [&](torch::Tensor _1, at::TensorList _2) { + return callOpUnboxed(opHandle, _1, _2); + }; + + auto a = torch::tensor({1.}, {torch::kFloat32}); + auto b = torch::tensor({1.}, {torch::kFloat32}); + auto c = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true); + std::vector vec = {b, c}; + auto out = op(a, vec); + + ASSERT_THROWS_WITH(torch::autograd::grad({out}, {vec[0]}), "One of the differentiated Tensors does not require grad"); + ASSERT_THROWS_WITH(torch::autograd::grad({out}, {vec[1]}), "is not implemented"); + + ASSERT_TRUE(at::allclose(op(a, vec), tensorlist_op(a, vec))); +} + + // TODO add these tests if needed // test_once_differentiable // test_sparse_backward diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 650830b3143f0..b2a1016118d28 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -128,6 +128,7 @@ libtorch_edge_profiler_sources = libtorch_profiler_sources + [ core_trainer_sources = [ "torch/csrc/autograd/anomaly_mode.cpp", "torch/csrc/autograd/autograd.cpp", + "torch/csrc/autograd/autograd_not_implemented_fallback.cpp", "torch/csrc/autograd/cpp_hook.cpp", "torch/csrc/autograd/custom_function.cpp", "torch/csrc/autograd/engine.cpp", diff --git a/torch/csrc/api/include/torch/autograd.h b/torch/csrc/api/include/torch/autograd.h index 83aa102de0128..809fbe8bd3350 100644 --- a/torch/csrc/api/include/torch/autograd.h +++ b/torch/csrc/api/include/torch/autograd.h @@ -2,3 +2,4 @@ #include #include +#include diff --git a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp new file mode 100644 index 0000000000000..ab9cb49ec63a7 --- /dev/null +++ b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp @@ -0,0 +1,189 @@ +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace torch { namespace autograd { + +namespace { + +template +void _foreach_tensor( + F fn, + torch::jit::Stack* stack, + size_t stack_start, + size_t size) { + // Enumerate over tensors in a stack, including ones in TensorLists + int idx_tensor = 0; + for (const auto idx_arg : c10::irange(size)) { + auto& ivalue = (*stack)[stack_start + idx_arg]; + if (ivalue.isTensor()) { // true for optional tensor that has value + const auto& tensor = ivalue.toTensor(); + fn(idx_tensor, idx_arg, tensor); + idx_tensor++; + } else if (ivalue.isTensorList()) { + for (const auto& iv : ivalue.toListRef()) { + const auto& tensor = iv.toTensor(); + fn(idx_tensor, idx_arg, tensor); + idx_tensor++; + } + } + } +} + +} + +void autogradNotImplementedFallbackImpl(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { + // Mimics the logic of a VariableType NotImplemented kernel + const auto& schema = op.schema(); + const auto& op_name = schema.operator_name().name; + const auto& arguments = schema.arguments(); + const auto& returns = schema.returns(); + const auto num_arguments = arguments.size(); + const auto num_returns = returns.size(); + const auto stack_start = stack->size() - num_arguments; + const bool grad_mode = GradMode::is_enabled(); + std::vector tensors_requiring_grad_on_stack; + + // Keep track of which outputs are output of in-place modification + // so we can rebase_history if necessary + std::vector is_inplace_output; + bool any_is_inplace_output = false; + std::vector is_aliased_output; + is_inplace_output.reserve(num_returns); + is_aliased_output.reserve(num_returns); + + for (const auto i : c10::irange(num_returns)) { + const auto& alias_info = returns[i].alias_info(); + is_inplace_output.push_back(alias_info.has_value() && alias_info->isWrite()); + any_is_inplace_output |= alias_info.has_value() && alias_info->isWrite(); + is_aliased_output.push_back(alias_info.has_value()); + + } + int aliased_input_idx = -1; + int aliased_output_idx = -1; + for (const auto i : c10::irange(num_returns)) { + const auto& alias_info = returns[i].alias_info(); + if (alias_info.has_value() && !alias_info->isWrite()) { + AT_ASSERT( + aliased_output_idx == -1, + "Expected only a single output in the operator schema to have a non-write alias annotation (i.e., 'Tensor(a)'). " + "Non-composite functions where multiple outputs are aliased with inputs aren't supported." + "Please rewrite your function as a composite function."); + aliased_output_idx = i; + } + } + for (const auto i : c10::irange(num_arguments)) { + const auto& alias_info = arguments[i].alias_info(); + if (alias_info.has_value() && !alias_info->isWrite()) { + AT_ASSERT( + aliased_input_idx == -1, + "Expected only a single input in the operator schema to have a non-write alias annotation (i.e., 'Tensor(a)'). " + "Non-composite functions where multiple inputs are aliased with outputs aren't supported. " + "Please rewrite your function as a composite function."); + aliased_input_idx = i; + } + } + + size_t num_tensor_inputs = 0; // Only used for DEBUG-only checks + + _foreach_tensor([&](size_t _, size_t idx_arg, const at::Tensor& t) { + if (grad_mode && t.requires_grad()) { + tensors_requiring_grad_on_stack.push_back(&t); + } + num_tensor_inputs++; + TORCH_CHECK_NOT_IMPLEMENTED(!isFwGradDefined(t), "Trying to use forward AD with ", op_name, " that does not support it."); + }, stack, stack_start, num_arguments); + + const bool any_requires_grad = tensors_requiring_grad_on_stack.size() > 0; + + _foreach_tensor([&](size_t _, size_t i, const at::Tensor& t) { + const auto& alias_info = arguments[i].alias_info(); + if (alias_info.has_value() && alias_info->isWrite()) { + check_inplace(t, any_requires_grad); + } + }, stack, stack_start, num_arguments); + + std::shared_ptr grad_fn; + if (any_requires_grad) { + grad_fn = std::shared_ptr(new NotImplemented(op_name), deleteNode); + grad_fn->set_next_edges(collect_next_edges(tensors_requiring_grad_on_stack)); + } + + #ifndef NDEBUG + // See NOTE [ TensorImpl and Storage Pointer Sanity Checks ] + auto stack_args_copy = std::vector(stack->begin() + stack_start, stack->end()); + std::vector> impl_saved; + impl_saved.reserve(num_tensor_inputs); + std::vector> storage_saved; + storage_saved.reserve(num_tensor_inputs); + _foreach_tensor([&](size_t idx, size_t _, const at::Tensor& t) { + storage_saved.push_back(t.has_storage() ? c10::optional(t.storage()) : c10::nullopt); + impl_saved.push_back(t.getIntrusivePtr()); + }, &stack_args_copy, 0, num_arguments); + #endif + if (aliased_input_idx != -1 || any_is_inplace_output) { + at::AutoDispatchBelowAutograd guard; + op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack); + } else { + // If neither in-place nor view + at::AutoDispatchBelowADInplaceOrView guard; + op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack); + } + #ifndef NDEBUG + _foreach_tensor([&](size_t idx_tensor, size_t _, const at::Tensor& t) { + if (storage_saved.at(idx_tensor).has_value()) + TORCH_INTERNAL_ASSERT(storage_saved.at(idx_tensor).value().is_alias_of(t.storage()), op_name); + if (impl_saved.at(idx_tensor)) + TORCH_INTERNAL_ASSERT(impl_saved.at(idx_tensor) == t.getIntrusivePtr(), op_name); + }, &stack_args_copy, 0, num_arguments); + _foreach_tensor([&](size_t idx_tensor, size_t idx_ret, const at::Tensor& t) { + if (!is_inplace_output[idx_ret]) + TORCH_INTERNAL_ASSERT(t.use_count() <= 1, op_name); // Okay to return undefined tensor + if (!is_aliased_output[idx_ret] && t.has_storage()) + TORCH_INTERNAL_ASSERT(t.storage().use_count() == 1); + }, stack, stack->size() - num_returns, num_returns); + // There should be only a single base-view pair, make sure their storage is aliased + if (aliased_input_idx != -1 && aliased_output_idx != -1) { + const c10::IValue& aliased_input_iv = stack_args_copy[aliased_input_idx]; + const c10::IValue& aliased_output_iv = (*stack)[stack->size() - num_returns + aliased_output_idx]; + // We do not support views embedded inside tensorlist + TORCH_INTERNAL_ASSERT(aliased_input_iv.isTensor(), op_name); + TORCH_INTERNAL_ASSERT(aliased_output_iv.isTensor(), op_name); + const at::Tensor& aliased_input = aliased_input_iv.toTensor(); + const at::Tensor& aliased_output = aliased_input_iv.toTensor(); + if(is_aliased_output[aliased_input_idx] && aliased_input.has_storage()) + TORCH_INTERNAL_ASSERT(aliased_input.storage().is_alias_of(aliased_output.storage()), op_name); + } + #endif + + if (any_requires_grad) { + _foreach_tensor([&](size_t idx_tensor, size_t idx_ret, const at::Tensor& t) { + if (isDifferentiableType(t.scalar_type())) { + if (is_inplace_output[idx_ret]) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + rebase_history(const_cast(t), grad_fn); + } else { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + set_history(const_cast(t), grad_fn); + } + } + }, stack, stack->size() - num_returns, num_returns); + } +} + +torch::CppFunction autogradNotImplementedFallback() { + return torch::CppFunction::makeFromBoxedFunction<&autogradNotImplementedFallbackImpl>(); +} + +}} // namespace torch::autograd diff --git a/torch/csrc/autograd/autograd_not_implemented_fallback.h b/torch/csrc/autograd/autograd_not_implemented_fallback.h new file mode 100644 index 0000000000000..4b2cbd14b9d86 --- /dev/null +++ b/torch/csrc/autograd/autograd_not_implemented_fallback.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include + +namespace torch { +namespace autograd { + +TORCH_API torch::CppFunction autogradNotImplementedFallback(); + +}} // namespace torch::autograd diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 25336dfa9d911..2a1de8e82a774 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -563,6 +563,14 @@ struct MakeNextFunctionList : IterArgs { next_edges.emplace_back(); } } + void operator()(const Variable* variable) { + // NOLINTNEXTLINE(bugprone-branch-clone) + if (variable->defined()) { + next_edges.push_back(impl::gradient_edge(*variable)); + } else { + next_edges.emplace_back(); + } + } void operator()(const c10::optional& variable) { // NOLINTNEXTLINE(bugprone-branch-clone) if (variable.has_value() && variable->defined()) { From 6ccb74b837535f8f5e7a687ee1a17fea52972ab3 Mon Sep 17 00:00:00 2001 From: soulitzer Date: Fri, 27 Aug 2021 14:59:08 -0700 Subject: [PATCH 314/530] Update codegen to use boxed kernel (#63459) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63459 - Replaces the usual registration basically when "requires_derivative" is True (as in we still need a grad_fn), but `fn.info` is `None` (TODO maybe make sure differentiable inputs > 0 also to match requires_derivative). - Adds some (temporary?) fixes to some sparse functions See: https://github.com/pytorch/pytorch/issues/63549 - To remove the codegen that generates NotImplemented node (though that should only be one line), because there are some ops listed under `RESET_GRAD_ACCUMULATOR` that have a extra function call. We would need to make this list of ops available to c++, but this would either mean we'd have to codegen a list of strings, or move the RESET_GRAD_ACCUMULATOR to cpp land. We could do this in a future PR if necessary. Test Plan: Imported from OSS Reviewed By: heitorschueroff Differential Revision: D30518571 Pulled By: soulitzer fbshipit-source-id: 99a35cbced46292d1b4e51594ae4d534c2caf8b6 --- tools/autograd/gen_inplace_or_view_type.py | 4 ++ tools/autograd/gen_variable_type.py | 46 ++++++++++++++++++---- tools/autograd/templates/VariableType.h | 1 + 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/tools/autograd/gen_inplace_or_view_type.py b/tools/autograd/gen_inplace_or_view_type.py index 6c42bec1e5d12..524cca262f4f2 100644 --- a/tools/autograd/gen_inplace_or_view_type.py +++ b/tools/autograd/gen_inplace_or_view_type.py @@ -124,6 +124,10 @@ ); """) +AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION = CodeTemplate("""\ +m.impl("${unqual_operator_name_with_overload}", torch::autograd::autogradNotImplementedFallback()); +""") + INPLACE_REDISPATCH = CodeTemplate("""\ { at::AutoDispatchBelowADInplaceOrView guard; diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index a64f7341e281c..d0a9048df47f3 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -1,5 +1,8 @@ # Generates VariableType.h/cpp # +# **If any changes are being made to the VariableType codegen please also check +# if updates are needed in torch/csrc/autograd/autograd_not_implemented_fallback.cpp +# # VariableType is a subclass of at::Type that provides the binding code # necessary to provide a differentiable version of ATen operators. There are a # number of different things we could mean: @@ -30,7 +33,8 @@ from .gen_inplace_or_view_type import ( get_view_info, is_tensor_type, is_tensor_list_type, unpack_args, get_base_name, use_derived, modifies_arguments, WRAPPER_REGISTRATION, TMP_VAR, METHOD_DEFINITION, - ASSIGN_RETURN_VALUE, gen_formals, ALL_VIEW_FUNCTIONS, unpacked_name + ASSIGN_RETURN_VALUE, gen_formals, ALL_VIEW_FUNCTIONS, unpacked_name, + AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION ) from tools.codegen.api.types import (Binding, DispatcherSignature, BaseCType, intArrayRefT, @@ -404,13 +408,39 @@ def gen_variable_type_func( name = cpp.name(f.func) formals = gen_formals(f) - type_definition = METHOD_DEFINITION.substitute( - return_type=cpp.returns_type(f.func.returns).cpp_type(), - type_wrapper_name=type_wrapper_name(f), - type_definition_body=emit_body(fn), - formals=formals, - ) - wrapper_registration = gen_wrapper_registration(f) + if fn.info is None and not get_base_name(f) in RESET_GRAD_ACCUMULATOR \ + and not get_base_name(f) in DONT_REQUIRE_DERIVATIVE \ + and len(gen_differentiable_outputs(fn)) > 0 \ + and not get_base_name(f) in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE \ + and not get_base_name(f) in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT \ + and not get_base_name(f) in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT: + # NOTE: [ Registering AutogradNotImplemented boxed kernel ] + # + # When there is no derivatives.yaml entry, we register a generic boxed + # NotImplemented kernel to set grad_fn to be NotImplemented, so that forward + # proceeds as usual but an error is properly produced on backward. + # TODO: it would be nice to not have these special cases + # + # There are several cases where still let codegen handle it: + # 1) ops that need to reset grad accumulator (we let codegen handle this case + # because) the list is (currently) only accessible in Python. + # 2) User explicitly specifies DONT_REQUIRE_DERIVATIVE. This basically makes + # autograd a fallthrough with NDEBUG checks. This can be useful for when all + # outputs are integral. + # 3) When there are no differentiable outputs. This is similar to (2). + # 4) There are certain ops where we skip certain NDEBUG checks. this is similar + # to (1). + type_definition = "" + wrapper_registration = AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION.substitute( + unqual_operator_name_with_overload=f.func.name) + else: + type_definition = METHOD_DEFINITION.substitute( + return_type=cpp.returns_type(f.func.returns).cpp_type(), + type_wrapper_name=type_wrapper_name(f), + type_definition_body=emit_body(fn), + formals=formals, + ) + wrapper_registration = gen_wrapper_registration(f) # See Note [Manual Backend kernels] assert (name in MANUAL_BACKEND) == f.manual_kernel_registration diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index fc8ffa5799c11..333e8a0d7ada5 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -7,6 +7,7 @@ #include #include +#include #include // for size_t #include // for function From 3abbcf079d38d468a45073b13cb13627c9c0f367 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Fri, 27 Aug 2021 16:02:49 -0700 Subject: [PATCH 315/530] .github: Add cpp_docs job to current gcc5 workflow (#64044) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64044 Adds the cpp_docs job to the current workflow, also modifies the scripts surrounding building docs so that they can be powered through environment variables with sane defaults rather than having to have passed arguments. Ideally should not break current jobs running in circleci but those should eventually be turned off anyways. Coincides with work from: * https://github.com/seemethere/upload-artifact-s3/pull/1 * https://github.com/seemethere/upload-artifact-s3/pull/2 Signed-off-by: Eli Uriegas cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D30610010 Pulled By: seemethere fbshipit-source-id: f67adeb1bd422bb9e24e0f1ec0098cf9c648f283 --- .circleci/scripts/cpp_doc_push_script.sh | 25 +++++++---- .circleci/scripts/python_doc_push_script.sh | 27 ++++++++---- .github/templates/linux_ci_workflow.yml.j2 | 42 +++++++++++-------- .../generated-linux-xenial-py3.6-gcc5.4.yml | 42 +++++++++++-------- 4 files changed, 85 insertions(+), 51 deletions(-) diff --git a/.circleci/scripts/cpp_doc_push_script.sh b/.circleci/scripts/cpp_doc_push_script.sh index c6b4f00a06f0f..aa26a740aca7d 100755 --- a/.circleci/scripts/cpp_doc_push_script.sh +++ b/.circleci/scripts/cpp_doc_push_script.sh @@ -10,21 +10,30 @@ pt_checkout="/var/lib/jenkins/workspace" # Since we're cat-ing this file, we need to escape all $'s echo "cpp_doc_push_script.sh: Invoked with $*" +# for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}} +# the order of operations goes: +# 1. Check if there's an argument $1 +# 2. If no argument check for environment var DOCS_INSTALL_PATH +# 3. If no environment var fall back to default 'docs/' + +# NOTE: It might seem weird to gather the second argument before gathering the first argument +# but since DOCS_INSTALL_PATH can be derived from DOCS_VERSION it's probably better to +# try and gather it first, just so we don't potentially break people who rely on this script +# Argument 2: What version of the Python API docs we are building. +version="${2:-${DOCS_VERSION:-master}}" +if [ -z "$version" ]; then +echo "error: cpp_doc_push_script.sh: version (arg2) not specified" + exit 1 +fi + # Argument 1: Where to copy the built documentation for Python API to # (pytorch.github.io/$install_path) -install_path="$1" +install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}" if [ -z "$install_path" ]; then echo "error: cpp_doc_push_script.sh: install_path (arg1) not specified" exit 1 fi -# Argument 2: What version of the Python API docs we are building. -version="$2" -if [ -z "$version" ]; then -echo "error: cpp_doc_push_script.sh: version (arg2) not specified" - exit 1 -fi - is_master_doc=false if [ "$version" == "master" ]; then is_master_doc=true diff --git a/.circleci/scripts/python_doc_push_script.sh b/.circleci/scripts/python_doc_push_script.sh index ed8f28d54f333..167eaca2d1df1 100755 --- a/.circleci/scripts/python_doc_push_script.sh +++ b/.circleci/scripts/python_doc_push_script.sh @@ -13,28 +13,37 @@ echo "python_doc_push_script.sh: Invoked with $*" set -ex +# for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}} +# the order of operations goes: +# 1. Check if there's an argument $1 +# 2. If no argument check for environment var DOCS_INSTALL_PATH +# 3. If no environment var fall back to default 'docs/' + +# NOTE: It might seem weird to gather the second argument before gathering the first argument +# but since DOCS_INSTALL_PATH can be derived from DOCS_VERSION it's probably better to +# try and gather it first, just so we don't potentially break people who rely on this script +# Argument 2: What version of the docs we are building. +version="${2:-${DOCS_VERSION:-master}}" +if [ -z "$version" ]; then +echo "error: python_doc_push_script.sh: version (arg2) not specified" + exit 1 +fi + # Argument 1: Where to copy the built documentation to # (pytorch.github.io/$install_path) -install_path="$1" +install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}" if [ -z "$install_path" ]; then echo "error: python_doc_push_script.sh: install_path (arg1) not specified" exit 1 fi -# Argument 2: What version of the docs we are building. -version="$2" -if [ -z "$version" ]; then -echo "error: python_doc_push_script.sh: version (arg2) not specified" - exit 1 -fi - is_master_doc=false if [ "$version" == "master" ]; then is_master_doc=true fi # Argument 3: The branch to push to. Usually is "site" -branch="$3" +branch="${3:-${DOCS_BRANCH:-site}}" if [ -z "$branch" ]; then echo "error: python_doc_push_script.sh: branch (arg3) not specified" exit 1 diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 52c0a09a9e1c5..7d9020790710e 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -441,11 +441,15 @@ jobs: {% endblock %} {%- endif -%} {%- if enable_doc_jobs %} - pytorch_python_doc_build: + pytorch_doc_build: runs-on: linux.2xlarge + strategy: + matrix: + docs_type: [cpp, python] needs: [calculate-docker-image, build, !{{ ciflow_config.root_job_name }}] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + DOCS_TYPE: ${{ matrix.docs_type }} steps: - name: Log in to ECR run: | @@ -483,7 +487,7 @@ jobs: - name: Unzip artifacts run: | unzip -o artifacts.zip - - name: Build Python Doc in Docker + - name: Build ${{ matrix.docs_type }} docs run: | set -ex time docker pull "${DOCKER_IMAGE}" > /dev/null @@ -496,6 +500,8 @@ jobs: -e IN_CI \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e CIRCLE_SHA1="$GITHUB_SHA" \ + -e DOCS_VERSION="${target}" \ + -e DOCS_TYPE \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ @@ -505,34 +511,36 @@ jobs: -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ -w /var/lib/jenkins/workspace \ "${DOCKER_IMAGE}" \ - bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/python_doc_push_script.sh docs/$target $target site" + bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh" - name: Chown workspace run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: driazati/upload-artifact-s3@21c31d0a7bcb056ca50bd6ce197ba6507c26a1be - if: ${{ github.event_name == 'pull_request' }} - name: Upload Docs Preview + - uses: seemethere/upload-artifact-s3@v3 + name: Upload Python Docs Preview + if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }} with: - name: deploy retention-days: 14 if-no-files-found: error - path: pytorch.github.io/docs/merge - - name: Show Docs Preview URL (Click Me) - if: ${{ github.event_name == 'pull_request' }} - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - run: | - echo "See rendered docs at https://docs-preview.pytorch.org/$PR_NUMBER/" + path: pytorch.github.io/docs/merge/ + s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }} + - uses: seemethere/upload-artifact-s3@v3 + name: Upload C++ Docs Preview + if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }} + with: + retention-days: 14 + if-no-files-found: error + path: cppdocs/ + s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}/cppdocs - name: Archive artifacts into zip run: | - zip -r pytorch_github_io.zip "${GITHUB_WORKSPACE}/pytorch.github.io" + zip -r "docs_${DOCS_TYPE}.zip" "${GITHUB_WORKSPACE}/pytorch.github.io" "${GITHUB_WORKSPACE}/cppdocs" - uses: actions/upload-artifact@v2 name: Store PyTorch Build Artifacts with: - name: pytorch_github_io + name: docs_${{ matrix.docs_type }} + path: docs_${{ matrix.docs_type }}.zip if-no-files-found: error - path: pytorch_github_io.zip - name: Hold runner for 2 hours or until ssh sessions have drained # Always hold for active ssh sessions if: always() diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index b5f062c53cb05..6cc391ba15991 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -429,11 +429,15 @@ jobs: # Prune all of the docker images docker system prune -af - pytorch_python_doc_build: + pytorch_doc_build: runs-on: linux.2xlarge + strategy: + matrix: + docs_type: [cpp, python] needs: [calculate-docker-image, build, ciflow_should_run] env: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + DOCS_TYPE: ${{ matrix.docs_type }} steps: - name: Log in to ECR run: | @@ -474,7 +478,7 @@ jobs: - name: Unzip artifacts run: | unzip -o artifacts.zip - - name: Build Python Doc in Docker + - name: Build ${{ matrix.docs_type }} docs run: | set -ex time docker pull "${DOCKER_IMAGE}" > /dev/null @@ -487,6 +491,8 @@ jobs: -e IN_CI \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e CIRCLE_SHA1="$GITHUB_SHA" \ + -e DOCS_VERSION="${target}" \ + -e DOCS_TYPE \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ @@ -496,34 +502,36 @@ jobs: -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ -w /var/lib/jenkins/workspace \ "${DOCKER_IMAGE}" \ - bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/python_doc_push_script.sh docs/$target $target site" + bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh" - name: Chown workspace run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: driazati/upload-artifact-s3@21c31d0a7bcb056ca50bd6ce197ba6507c26a1be - if: ${{ github.event_name == 'pull_request' }} - name: Upload Docs Preview + - uses: seemethere/upload-artifact-s3@v3 + name: Upload Python Docs Preview + if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }} with: - name: deploy retention-days: 14 if-no-files-found: error - path: pytorch.github.io/docs/merge - - name: Show Docs Preview URL (Click Me) - if: ${{ github.event_name == 'pull_request' }} - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - run: | - echo "See rendered docs at https://docs-preview.pytorch.org/$PR_NUMBER/" + path: pytorch.github.io/docs/merge/ + s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }} + - uses: seemethere/upload-artifact-s3@v3 + name: Upload C++ Docs Preview + if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }} + with: + retention-days: 14 + if-no-files-found: error + path: cppdocs/ + s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}/cppdocs - name: Archive artifacts into zip run: | - zip -r pytorch_github_io.zip "${GITHUB_WORKSPACE}/pytorch.github.io" + zip -r "docs_${DOCS_TYPE}.zip" "${GITHUB_WORKSPACE}/pytorch.github.io" "${GITHUB_WORKSPACE}/cppdocs" - uses: actions/upload-artifact@v2 name: Store PyTorch Build Artifacts with: - name: pytorch_github_io + name: docs_${{ matrix.docs_type }} + path: docs_${{ matrix.docs_type }}.zip if-no-files-found: error - path: pytorch_github_io.zip - name: Hold runner for 2 hours or until ssh sessions have drained # Always hold for active ssh sessions if: always() From 2d75ab0c8fe793ceddd3aee74f25c956d5d8d2ec Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Fri, 27 Aug 2021 16:15:55 -0700 Subject: [PATCH 316/530] [TensorExpr] Update tutorial. (#64109) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64109 Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D30614050 Pulled By: ZolotukhinM fbshipit-source-id: e8f9bd9ef2483e6eafbc0bd5394d311cd694c7b2 --- test/cpp/tensorexpr/tutorial.cpp | 488 +++++++++++++++++++------------ 1 file changed, 304 insertions(+), 184 deletions(-) diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp index 16605e5e6d501..0ec0968bebf8f 100644 --- a/test/cpp/tensorexpr/tutorial.cpp +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -38,21 +38,30 @@ #include #include +#include +#include #include #include #include #include +#include +#include #include #include #include +#include using namespace torch::jit::tensorexpr; +// Helper function to print a snippet from a big multi-line string +static void printLinesToFrom(const std::string& input_str, int from, int to); + int main(int argc, char* argv[]) { - std::cout << "*** Structure of tensor expressions ***" << std::endl; + std::cout << "*** Structure of tensor expressions and statements ***" + << std::endl; { // A tensor expression is a tree of expressions. Each expression has a type, - // and that type defines what sub-expressions it the current expression has. + // and that type defines what sub-expressions the current expression has. // For instance, an expression of type 'Mul' would have a type 'kMul' and // two subexpressions: LHS and RHS. Each of these two sub-expressions could // also be a 'Mul' or some other expression. @@ -72,15 +81,21 @@ int main(int argc, char* argv[]) { // like we did in the previous example). Expression handles overload common // operations and allow us to express the same semantics in a more natural // way: - ExprHandle l = 1; + ExprHandle l = 5; ExprHandle r = Var::make("x", kInt); ExprHandle m = l * r; std::cout << "Tensor expression: " << *m.node() << std::endl; - // Prints: Tensor expression: 1 * x + // Prints: Tensor expression: 5 * x + + // Converting from handles to raw expressions and back is easy: + ExprHandle handle = Var::make("x", kInt); + ExprPtr raw_expr_from_handle = handle.node(); + ExprPtr raw_expr = alloc("x", kInt); + ExprHandle handle_from_raw_expr = ExprHandle(raw_expr); - // In a similar fashion we could construct arbitrarily complex expressions - // using mathematical and logical operations, casts between various data - // types, and a bunch of intrinsics. + // We could construct arbitrarily complex expressions using mathematical + // and logical operations, casts between various data types, and a bunch of + // intrinsics. ExprHandle a = Var::make("a", kInt); ExprHandle b = Var::make("b", kFloat); ExprHandle c = Var::make("c", kFloat); @@ -96,238 +111,232 @@ int main(int argc, char* argv[]) { // placeholder similar to Var, but with dimensions info. // // Let's construct a simple load: - BufHandle A("A", {ExprHandle(64), ExprHandle(32)}, kInt); - ExprHandle i = Var::make("i", kInt), j = Var::make("j", kInt); + BufHandle A("A", {64, 32}, kInt); + VarPtr i_var = alloc("i", kInt), j_var = alloc("j", kInt); + ExprHandle i(i_var), j(j_var); ExprHandle load = Load::make(A.dtype(), A, {i, j}); std::cout << "Tensor expression: " << *load.node() << std::endl; // Prints: Tensor expression: A[i, j] - } - std::cout << "*** Tensors, Functions, and Placeholders ***" << std::endl; - { - // A tensor computation is represented by Tensor class objects and - // consists of the following pieces: - // - domain, which is specified by a Buf expression - // - a tensor statement, specified by a Stmt object, that computation to - // be performed in this domain - - // Let's start with defining a domain. We do this by creating a Buf object. - - // First, let's specify the sizes: - std::vector dims = { - alloc(64), - alloc(32)}; // IntImm stands for Integer Immediate - // and represents an integer constant - - // Now we can create a Buf object by providing a name, dimensions, and a - // data type of the elements: - BufPtr buf = alloc("X", dims, kInt); - - // Next we need to spefify the computation. We can do that by either - // constructing a complete tensor statement for it (statements are - // examined in details in subsequent section), or by using a convenience - // method where we could specify axis and an element expression for the - // computation. In the latter case a corresponding statement would be - // constructed automatically. - - // Let's define two variables, i and j - they will be axis in our - // computation. - VarPtr i = alloc("i", kInt); - VarPtr j = alloc("j", kInt); - std::vector args = {i, j}; - - // Now we can define the body of the tensor computation using these - // variables. What this means is that values in our tensor are: - // X[i, j] = i * j - ExprPtr body = alloc(i, j); - - // Finally, we pass all these pieces together to Tensor constructor: - Tensor X = Tensor(buf, args, body); - std::cout << "Tensor computation: " << X << std::endl; + // Tensor Expressions constitute Tensor Statements, which are used to + // represent computation of a given operator or a group of operators from a + // fusion group. + // + // There are three main kinds of tensor statements: + // - block + // - store + // - loop + // + // A Store represents a store to a single element of a tensor (or to a + // group of elements if it's a vectorized store). Store statements, + // similarly to Load expressions, have a base and indices, but on top of + // that they also include a value - an expression representing what needs + // to be stored at the given memory location. Let's create a Store stmt: + StmtPtr store_a = Store::make(A, {i, j}, i + j); + std::cout << "Store statement: " << *store_a << std::endl; + // Prints: Store statement: A[i, j] = i + j; + + // An operator fills the entire tensor, not just a single element, and to + // represent this we need to use For stmt: let's wrap our store stmt with + // two nested loops to represent that variables i and j need to iterate + // over some ranges. + ForPtr loop_j_a = For::make(VarHandle(j_var), 0, 32, store_a); + ForPtr loop_i_a = For::make(VarHandle(i_var), 0, 64, loop_j_a); + + std::cout << "Nested for loops: " << std::endl << *loop_i_a << std::endl; // Prints: - // Tensor computation: Tensor X[64, 32]: + // Nested for loops: // for (int i = 0; i < 64; i++) { // for (int j = 0; j < 32; j++) { - // X[i, j] = i * j; + // A[i, j] = i + j; // } // } - // TODO: Add an example of constructing a Tensor with a complete Stmt. - - // Similarly to how we provide a more convenient way of using handles for - // constructing Exprs, Tensors also have a more convenient API for - // construction. It is based on Compute API, which takes a name, - // dimensions, and a lambda specifying the computation body: - Tensor Z = Compute( - "Z", - {{64, "i"}, {32, "j"}}, - [](const VarHandle& i, const VarHandle& j) { return i / j; }); - std::cout << "Tensor computation: " << Z << std::endl; + // A Block statement is used when we need a sequence of other statements. + // E.g. if a fusion group contains several operators, we initially define + // separate loopnest for each of them and put them all into a common block: + BufHandle B("B", {64, 32}, kInt); + StmtPtr store_b = Store::make(B, {i, j}, A.load(i, j)); + ForPtr loop_j_b = For::make(VarHandle(j_var), 0, 32, store_b); + ForPtr loop_i_b = For::make(VarHandle(i_var), 0, 64, loop_j_b); + + BlockPtr block = Block::make({loop_i_a, loop_i_b}); + std::cout << "Compound Block statement: " << std::endl + << *block << std::endl; // Prints: - // Tensor computation: Tensor Z[64, 32]: - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { - // Z[i, j] = i / j; + // Compound Block statement: + // { + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // A[i, j] = i + j; + // } + // } + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // B[i, j] = A[i, j]; + // } // } // } - // Tensors might access other tensors and external placeholders in their - // expressions. It can be done like so: - Placeholder P("P", kInt, {64, 32}); - Tensor R = Compute( - "R", + // Manually constructing nested loops and blocks to represent a computation + // might be laborious, and instead we can use a 'Compute' API. This API + // requires us to specify dimensions and a lambda to compute a single + // element of the resulting tensor and returns a `Tensor` structure. This + // structure is simply a pair of a buffer that was created to represent the + // result of the computation (BufPtr) and a statement representing the + // computation itself (StmtPtr). + Tensor C = Compute( + "C", {{64, "i"}, {32, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { - return Z.load(i, j) * P.load(i, j); - }); - std::cout << "Tensor computation: " << R << std::endl; + [&](const VarHandle& i, const VarHandle& j) { return i * j; }); + std::cout << "Stmt produced by 'Compute' API: " << std::endl + << *C.stmt() << std::endl; // Prints: - // Tensor computation: Tensor R[64, 32]: + // Stmt produced by 'Compute' API: // for (int i = 0; i < 64; i++) { // for (int j = 0; j < 32; j++) { - // R[i, j] = (Z(i, j)) * (P[i, j]); + // C[i, j] = i * j; // } // } - // Placeholders could be thought of as external tensors, i.e. tensors for - // which we don't have the element expression. In other words, for `Tensor` - // we know an expression specifying how its elements can be computed (a - // mathematical formula). For external tensors, or placeholders, we don't - // have such an expression. They need to be considered as coming to us as - // inputs from outside - we can only load data from them. - // - // TODO: Show how reductions are represented and constructed + // To construct statements to represent computations with reductions, we + // can use a 'Reduce' API - it is similar to 'Compute' but takes a couple + // of extra arguments defining how to perform the reduction. Let's define a + // simple 2D sum of C using that: + Tensor D = Reduce( + "D", + {}, + Sum(), + [&](const VarHandle& i, const VarHandle& j) { return C.load(i, j); }, + {{64, "i"}, {32, "j"}}); + std::cout << "Stmt produced by 'Reduce' API: " << std::endl + << *D.stmt() << std::endl; } - std::cout << "*** Loopnests and Statements ***" << std::endl; + std::cout << "*** Loopnests transformations ***" << std::endl; { - // Creating a tensor expression is the first step to generate an executable - // code for it. A next step is to represent it as a loop nest and apply - // various loop transformations in order to get an optimal implementation. - // In Halide's or TVM's terms the first step was to define the algorithm of - // computation (what to compute?) and now we are getting to the schedule of - // the computation (how to compute?). + // When a statement for the computation is generated, we might want to + // apply some optimizations to it. These transformations allow us to end up + // with a statement producing the same results, but more efficiently. // - // Let's create a simple tensor expression and construct a loop nest for it. - Placeholder A("A", kFloat, {64, 32}); - Placeholder B("B", kFloat, {64, 32}); - Tensor X = Compute( - "X", + // Let's look at a couple of transformations that are used in NNC. We will + // begin with constructing a Block statement like we did before. + + Tensor C = Compute( + "C", {{64, "i"}, {32, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { - return A.load(i, j) + B.load(i, j); - }); - Tensor Y = Compute( - "Y", + [&](const VarHandle& i, const VarHandle& j) { return i * (j + 1); }); + BufHandle c_buf(C.buf()); + Tensor D = Compute( + "D", {{64, "i"}, {32, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return sigmoid(X.load(i, j)); + return c_buf.load(i, j) - i; }); - std::cout << "Tensor computation X: " << X << "Tensor computation Y: " << Y - << std::endl; + StmtPtr block = Block::make({C.stmt(), D.stmt()}); + std::cout << "Stmt produced by 'Compute' API: " << std::endl + << *block << std::endl; // Prints: - // Tensor computation X: Tensor X[64, 32]: - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { - // X[i, j] = (A[i, j]) + (B[i, j]); + // Stmt produced by 'Compute' API: + // { + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // C[i, j] = i * (j + 1); + // } // } - // } - - // Tensor computation Y: Tensor Y[64, 32]: - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { - // Y[i, j] = sigmoid(X(i, j)); + // for (int i_1 = 0; i_1 < 64; i_1++) { + // for (int j_1 = 0; j_1 < 32; j_1++) { + // D[i_1, j_1] = (C[i_1, j_1]) - i_1; + // } // } // } - // Creating a loop nest is as quite simple, we just need to specify a list - // of all and a list of output tensors: - // NOLINTNEXTLINE(bugprone-argument-comment) - LoopNest loopnest(/*outputs=*/{Y}, /*all=*/{X, Y}); - - // An IR used in LoopNest is based on tensor statements, represented by - // `Stmt` class. Statements are used to specify the loop nest structure, and - // to take a sneak peek at them, let's print out what we got right after - // creating our LoopNest object: - std::cout << *loopnest.root_stmt() << std::endl; + // One transformation we can apply to this computation is inlining: i.e. + // taking the expression that defines values of C and substituting a load + // from C with it. + // To do that, we first need to create a special object called LoopNest - + // all transformations are methods of this class. To create a loopnest we + // need to provide a list of output buffers and the root statement: + LoopNest nest(block, {D.buf()}); + + // We can always retrieve the Stmt back from LoopNest: + std::cout << "LoopNest root stmt: " << std::endl + << *nest.root_stmt() << std::endl; // Prints: + // LoopNest root stmt: // { // for (int i = 0; i < 64; i++) { // for (int j = 0; j < 32; j++) { - // X[i, j] = (A[i, j]) + (B[i, j]); + // C[i, j] = i * (j + 1); // } // } // for (int i_1 = 0; i_1 < 64; i_1++) { // for (int j_1 = 0; j_1 < 32; j_1++) { - // Y[i_1, j_1] = sigmoid(X(i_1, j_1)); + // D[i_1, j_1] = (C[i_1, j_1]) - i_1; // } // } // } - // To introduce statements let's first look at their three main types (in - // fact, there are more than 3 types, but the other types would be easy to - // understand once the overall structure is clear): - // 1) Block - // 2) For - // 3) Store - // - // A `Block` statement is simply a list of other statements. - // A `For` is a statement representing one axis of computation. It contains - // an index variable (Var), boundaries of the axis (start and end - both are - // `Expr`s), and a `Block` statement body. - // A `Store` represents an assignment to a tensor element. It contains a Buf - // representing the target tensor, a list of expressions for indices of the - // element, and the value to be stored, which is an arbitrary expression. - - // Once we've constructed the loop nest, we can apply various tranformations - // to it. To begin with, let's inline computation of X into computation of Y - // and see what happens to our statements. - loopnest.computeInline(loopnest.getLoopBodyFor(X)); - std::cout << *loopnest.root_stmt() << std::endl; + // Now we can apply the inlining transformation: + nest.computeInline(C.buf()); + std::cout << "Stmt after inlining:" << std::endl + << *nest.root_stmt() << std::endl; // Prints: + // Stmt after inlining: // { // for (int i = 0; i < 64; i++) { // for (int j = 0; j < 32; j++) { - // Y[i, j] = sigmoid((A[i, j]) + (B[i, j])); + // D[i, j] = i * (j + 1) - i; // } // } // } - // - // As you can see, the first two loops have disappeared and the expression - // for X[i,j] has been inserted into the Y[i,j] computation. - - // Loop transformations can be composed, so we can do something else with - // our loop nest now. Let's split the inner loop with a factor of 9, for - // instance. - std::vector loops = loopnest.getLoopStmtsFor(Y); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - ForPtr j_inner; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - ForPtr j_tail; - int split_factor = 9; - loopnest.splitWithTail( - loops[1], // loops[0] is the outer loop, loops[1] is inner - split_factor, - &j_inner, // further transformations - &j_tail); - // loops[1] will become the outer loop, j_outer, after splitWithTail. - std::cout << *loopnest.root_stmt() << std::endl; + + // We can also apply algebraic simplification to a statement: + StmtPtr simplified = IRSimplifier::simplify(nest.root_stmt()); + std::cout << "Stmt after simplification:" << std::endl + << *simplified << std::endl; // Prints: + // Stmt after simplification: // { // for (int i = 0; i < 64; i++) { - // for (int j_outer = 0; j_outer < (32 - 0) / 9; j_outer++) { - // for (int j_inner = 0; j_inner < 9; j_inner++) { - // Y[i, j_outer * 9 + j_inner] = sigmoid((A[i, j_outer * 9 + ... + // for (int j = 0; j < 32; j++) { + // D[i, j] = i * j; + // } + // } + // } + + // Many loopnest transformations are stateless and can be applied without + // creating a LoopNest object. In fact, we plan to make all transformations + // stateless. + // splitWithTail is one such transformation: it splits an iteration space + // of a given loop into two with a given factor. + ForPtr outer_loop = to(to(simplified)->stmts().front()); + LoopNest::splitWithTail(outer_loop, 13); + // Call simplifier once more to fold some arithmetic. + simplified = IRSimplifier::simplify(simplified); + std::cout << "Stmt after splitWithTail:" << std::endl + << *simplified << std::endl; + // Prints: + // Stmt after splitWithTail: + // { + // for (int i_outer = 0; i_outer < 4; i_outer++) { + // for (int i_inner = 0; i_inner < 13; i_inner++) { + // for (int j = 0; j < 32; j++) { + // D[i_inner + 13 * i_outer, j] = i_inner * j + 13 * (i_outer * j); // } // } - // for (int j_tail = 0; j_tail < (32 - 0) % 9; j_tail++) { - // Y[i, j_tail + ((32 - 0) / 9) * 9] = sigmoid((A[i, j_tail + ... + // } + // for (int i_tail = 0; i_tail < 12; i_tail++) { + // for (int j = 0; j < 32; j++) { + // D[i_tail + 52, j] = i_tail * j + 52 * j; // } // } // } - // TODO: List all available transformations - // TODO: Show how statements can be constructed manually + // NNC supports a wide range of loop nest transformations, which we are not + // listing here. Please refer to documentation in + // https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/tensorexpr/loopnest.h + // for more details. } std::cout << "*** Codegen ***" << std::endl; @@ -335,13 +344,14 @@ int main(int argc, char* argv[]) { // An ultimate goal of tensor expressions is to be provide a mechanism to // execute a given computation in the fastest possible way. So far we've // looked at how we could describe what computation we're interested in, but - // we haven't looked at how to actually execute it. So far all we've been - // dealing with was just symbols with no actual data associated, in this - // section we would look at how we can bridge that gap. + // we haven't looked at how to actually execute it. + // + // All we've been dealing with was just symbols with no actual data + // associated, in this section we would look at how we can bridge that gap. // Let's start by constructing a simple computation for us to work with: - Placeholder A("A", kInt, {64, 32}); - Placeholder B("B", kInt, {64, 32}); + BufHandle A("A", {64, 32}, kInt); + BufHandle B("B", {64, 32}, kInt); Tensor X = Compute( "X", {{64, "i"}, {32, "j"}}, @@ -349,7 +359,8 @@ int main(int argc, char* argv[]) { return A.load(i, j) + B.load(i, j); }); - // And let's lower it to a loop nest, as we did in the previous section: + // And let's lower it to a loop nest, as we did in the previous section. We + // can pass Tensor object directly: LoopNest loopnest({X}); std::cout << *loopnest.root_stmt() << std::endl; // Prints: @@ -416,6 +427,115 @@ int main(int argc, char* argv[]) { // X[10] = A[10] + B[10] = 8 } - // TODO: Show how TorchScript IR is translated to TE + std::cout << "*** Lowering TorchScript IR to TensorExpr IR ***" << std::endl; + { + // This section requires a LLVM-enabled PyTorch build, so we have to use a + // guard: +#ifdef TORCH_ENABLE_LLVM + + // Often we would like to convert a TorchScript IR to TE rather than + // construct TE IR from scratch. NNC provides an API to perform such + // lowering: it takes a TorchScript graph and returns an object that can be + // used to invoke the generated kernel. + // This API is currently used by the TorchScript JIT fuser and can also be + // used ahead of time to pre-compile parts of a model. + // + // To get familiar with this API let's first start with defining a simple + // TorchScript graph: + const auto graph_string = R"IR( + graph(%A : Float(5, 3, strides=[3, 1], device=cpu), + %B : Float(5, 3, strides=[3, 1], device=cpu)): + %AB : Float(5, 3, strides=[3, 1]) = aten::mul(%A, %B) + %one : int = prim::Constant[value=1]() + %AAB : Float(5, 3, strides=[3, 1]) = aten::mul(%A, %AB) + %AAB_plus_B: Float(5, 3, strides=[3, 1]) = aten::add(%AAB, %B, %one) + return (%AAB_plus_B))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + + // This graph defines a simple computation of A*A*B + B where A and B are + // input 5x3 tensors. + + // To lower this TorchScript graph to TE, we just need to create a + // TensorExprKernel object. In its constructor it constructs the + // corresponding TE IR and compiles it for the given backend (in this + // example for CPU using LLVM compiler). + TensorExprKernel kernel(graph); + + // We can retrieve the generated TE stmt from the kernel object: + StmtPtr kernel_stmt = kernel.getCodeGenStmt(); + std::cout << "TE Stmt constructed from TorchScript: " << std::endl + << *kernel_stmt << std::endl; + // Prints: + // TE Stmt constructed from TorchScript: + // { + // for (int v = 0; v < 5; v++) { + // for (int _tail_tail = 0; _tail_tail < 3; _tail_tail++) { + // aten_add[_tail_tail + 3 * v] = (tA[_tail_tail + 3 * v]) * + // ((tA[_tail_tail + 3 * v]) * (tB[_tail_tail + 3 * v])) + + // (tB[_tail_tail + 3 * v]); + // } + // } + // } + + // We can also examine generated LLVM IR and assembly code: + std::cout << "Generated LLVM IR: " << std::endl; + auto ir_str = kernel.getCodeText("ir"); + printLinesToFrom(ir_str, 15, 20); + // Prints: + // Generated LLVM IR: + // %9 = bitcast float* %2 to <8 x float>* + // %10 = load <8 x float>, <8 x float>* %9 ... + // %11 = bitcast float* %5 to <8 x float>* + // %12 = load <8 x float>, <8 x float>* %11 ... + // %13 = fmul <8 x float> %10, %12 + // %14 = fmul <8 x float> %10, %13 + + std::cout << "Generated assembly: " << std::endl; + auto asm_str = kernel.getCodeText("asm"); + printLinesToFrom(asm_str, 10, 15); + // Prints: + // Generated assembly: + // vmulps %ymm1, %ymm0, %ymm2 + // vfmadd213ps %ymm1, %ymm0, %ymm2 + // vmovups %ymm2, (%rax) + // vmovss 32(%rcx), %xmm0 + // vmovss 32(%rdx), %xmm1 + // vmulss %xmm1, %xmm0, %xmm2 + + // We can also execute the generated kernel: + auto A = + at::ones({5, 3}, torch::TensorOptions(torch::kCPU).dtype(at::kFloat)) * + 2.0; + auto B = + at::ones({5, 3}, torch::TensorOptions(torch::kCPU).dtype(at::kFloat)) * + 3.0; + std::vector inputs = {A, B}; + std::vector stack = torch::fmap(inputs); + kernel.run(stack); + auto R = stack[0].toTensor(); + + // Let's print one of the elements from the result tensor to verify that the + // computation did happen and was correct: + std::cout << "R[2][2] = " << R[2][2] << std::endl; + // Prints: + // R[2][2] = 15 + // [ CPUFloatType{} ] +#endif + } return 0; } + +void printLinesToFrom(const std::string& input_str, int from, int to) { + std::istringstream f(input_str); + std::string s; + int idx = 0; + while (getline(f, s)) { + if (idx > from) { + std::cout << s << "\n"; + } + if (idx++ > to) { + break; + } + } +} From 07c5cb8c48d655ba73adc2da2b88399f3ab48638 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Fri, 27 Aug 2021 17:37:05 -0700 Subject: [PATCH 317/530] [Static Runtime] Optimize memory planner initialization (#64101) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64101 Checking `getOutOfPlaceOperation(n)` is a very expensive operation, especially in multithreaded environments, due to a lock acquisition when the NNC cache is queried. This slows down the memory planner initialization time, and by extension, the latency for the first static runtime inference. There are two optimizations in this diff: * Cache the result of `p_node->has_out_variant()` to avoid the call to `getOutOfPlaceOperation`. This speeds up calls to `canReuseInputOutputs`, which in turn speeds up `isOptimizableContainerType` * Precompute all `isOptimizableContainerType` during static runtime initialization to avoid a pass over all of each node's inputs. Test Plan: All unit tests pass: `buck test caffe2/benchmarks/static_runtime/...` Reviewed By: movefast1990 Differential Revision: D30595579 fbshipit-source-id: 70aaa7af9589c739c672788bf662f711731864f2 --- torch/csrc/jit/runtime/static/impl.cpp | 31 ++++++++++++++++++-------- torch/csrc/jit/runtime/static/impl.h | 11 +++++++++ torch/csrc/jit/runtime/static/ops.cpp | 29 +++++++++++++++--------- torch/csrc/jit/runtime/static/ops.h | 8 +++++-- 4 files changed, 57 insertions(+), 22 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 643842a74691c..ee8e9038b1c48 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -319,7 +319,9 @@ LivenessMap GetLivenessMap( // first: Values that are candidates for memory planning // second: A deterministc order of all values std::pair, std::vector> -GetMemoryPlanningCandidates(const std::shared_ptr& graph) { +GetMemoryPlanningCandidates( + const std::shared_ptr& graph, + const FastMap& node_has_out_variant) { // for determinism FastSet seen_values; std::vector all_values; @@ -328,7 +330,8 @@ GetMemoryPlanningCandidates(const std::shared_ptr& graph) { // these need to be removed from "can_reuse" after analyzing all nodes FastSet cannot_reuse; for (auto* n : graph->nodes()) { - bool can_reuse_inputs_outputs = canReuseInputsOutputs(n); + bool can_reuse_inputs_outputs = + canReuseInputsOutputs(n, node_has_out_variant); for (const auto* v : n->inputs()) { if (!seen_values.count(v)) { all_values.emplace_back(v); @@ -628,6 +631,7 @@ StaticModule::StaticModule( // construct SSA definition for non-constant nodes int node_idx = 0; + FastMap node_has_out_variant; for (Node* node : graph_->nodes()) { if (node->kind() == prim::Constant) { continue; @@ -639,14 +643,22 @@ StaticModule::StaticModule( input_ssa_defs.emplace_back(value_to_ssa_def.at(input)); } node_inputs_ssa_def_map_[node_idx] = input_ssa_defs; - nodes_.emplace_back( - ProcessedNode(node, std::move(ivalue_inputs), opts.enable_out_variant)); + auto pnode = + ProcessedNode(node, std::move(ivalue_inputs), opts.enable_out_variant); + node_has_out_variant.emplace(node, pnode.has_out_variant()); + nodes_.emplace_back(std::move(pnode)); for (const auto i : c10::irange(node->outputs().size())) { value_to_ivalue[node->outputs()[i]] = nullptr; value_to_ssa_def[node->outputs()[i]] = std::make_pair(node_idx, i); } node_idx++; } + for (auto& pnode : nodes_) { + if (pnode.outputs().size() == 1 && + isOptimizableContainerType(pnode.node(), node_has_out_variant)) { + node_is_optimizable_container_type_.emplace(pnode.node()); + } + } for (auto output : graph_->outputs()) { output_ssa_defs_.emplace_back(value_to_ssa_def[output]); } @@ -657,7 +669,7 @@ StaticModule::StaticModule( if (opts_.optimize_memory) { auto lm = GetLivenessMap(graph_, external_values_, alias_db); - auto values = GetMemoryPlanningCandidates(graph_); + auto values = GetMemoryPlanningCandidates(graph_, node_has_out_variant); value_to_same_storage_values_ = GenerateSameStorageValues(lm, external_values_, values, alias_db); } @@ -1177,7 +1189,8 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) { // check for intermediates if (!ival->isNone()) { TORCH_CHECK( - ival->isTensor() || isOptimizableContainerType(pnode.node()), + ival->isTensor() || + static_module_.is_optimizable_container_type(pnode.node()), error_msg); if (ival->isTensor()) { const auto& t = ival->toTensor(); @@ -1262,9 +1275,9 @@ MemoryPlanner::MemoryPlanner( const auto& type = out_v->type(); if (type->castRaw()) { managed_tensor_values.insert(out_v); - } else if (isOptimizableContainerType(pnode.node())) { - // We "leak" certain container types because their allocations take - // a long time + } else if (runtime->is_optimizable_container_type(pnode.node())) { + // We "leak" certain container types because their allocations + // take a long time leaked_values.insert(out_v); } } diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index 6cff047b4d2ce..d8a99f78cad2d 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -160,6 +160,11 @@ class TORCH_API StaticModule { return nodes_; } + bool is_optimizable_container_type(Node* n) const { + auto it = node_is_optimizable_container_type_.find(n); + return it != node_is_optimizable_container_type_.end(); + } + const c10::optional& schema() const { return schema_; } @@ -204,6 +209,8 @@ class TORCH_API StaticModule { // map a value to the set of values that may share the same storage with it FastMap> value_to_same_storage_values_; + + FastSet node_is_optimizable_container_type_; }; class TORCH_API StaticRuntime { @@ -287,6 +294,10 @@ class TORCH_API StaticRuntime { void check_for_memory_leak(bool output_returned = true); + bool is_optimizable_container_type(Node* n) const { + return static_module_.is_optimizable_container_type(n); + } + private: // helper method for copying input args/kwargs into inputs_ void set_inputs( diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index f171d2889f551..3b586689a6c5c 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -25,6 +25,7 @@ #include #include #include +#include C10_DEFINE_bool( static_runtime_enable_fast_math, @@ -312,27 +313,33 @@ bool hasVarArgs(Node* n) { return false; } -// Expensive check, use sparingly. -// This is needed to make sure that we only switch to out variants for the -// supported overloads, which is checked in the `Generate` step in -// `SROperatorRegistry()->Create(op_name)->Generate(n)` -bool canReuseInputsOutputs(Node* n) { +bool canReuseInputsOutputs( + Node* n, + const FastMap& node_has_out_variant) { + auto it = node_has_out_variant.find(n); + if (it != node_has_out_variant.end()) { + return it->second; + } return getOutOfPlaceOperation(n) != nullptr; } // returns true if the producers of the inputs // to this operations are out of place. // This means the IValues will not change run to run -bool inputsCanRunOutOfPlace(Node* n) { +bool inputsCanRunOutOfPlace( + Node* n, + const FastMap& node_has_out_variant) { for (auto* input : n->inputs()) { - if (!canReuseInputsOutputs(input->node())) { + if (!canReuseInputsOutputs(input->node(), node_has_out_variant)) { return false; } } return true; } -bool isOptimizableContainerType(Node* n) { +bool isOptimizableContainerType( + Node* n, + const FastMap& node_has_out_variant) { const auto& type = n->output()->type(); bool is_supported_type = false; if (type->kind() == TypeKind::ListType) { @@ -348,7 +355,7 @@ bool isOptimizableContainerType(Node* n) { }); is_supported_type = iter != types.end(); } - return is_supported_type && inputsCanRunOutOfPlace(n); + return is_supported_type && inputsCanRunOutOfPlace(n, node_has_out_variant); } REGISTER_OPERATOR_FUNCTOR( @@ -356,7 +363,7 @@ REGISTER_OPERATOR_FUNCTOR( prim_ListConstruct, [](Node* n) -> SROperator { const auto& type = n->output()->type()->expectRef(); - bool can_optimize = isOptimizableContainerType(n); + bool can_optimize = isOptimizableContainerType(n, FastMap()); return [can_optimize, &type](ProcessedNode* p_node) { const auto& out_l = p_node->Output(0); if (!out_l.isNone() && can_optimize) { @@ -376,7 +383,7 @@ REGISTER_OPERATOR_FUNCTOR( prim::TupleConstruct, prim_TupleConstruct, [](Node* n) -> SROperator { - bool can_optimize = isOptimizableContainerType(n); + bool can_optimize = isOptimizableContainerType(n, FastMap()); return [can_optimize](ProcessedNode* p_node) { const auto& out_l = p_node->Output(0); if (!out_l.isNone() && can_optimize) { diff --git a/torch/csrc/jit/runtime/static/ops.h b/torch/csrc/jit/runtime/static/ops.h index ff5d69e1cb895..311143ca7392f 100644 --- a/torch/csrc/jit/runtime/static/ops.h +++ b/torch/csrc/jit/runtime/static/ops.h @@ -133,8 +133,12 @@ bool opIsRegistered(const c10::Symbol& op_name); // as native ops in Static Runtime bool nativeOpIsRegistered(const c10::Symbol& op_name); -bool canReuseInputsOutputs(Node* n); -bool isOptimizableContainerType(Node* n); +bool canReuseInputsOutputs( + Node* n, + const FastMap& node_has_out_variant); +bool isOptimizableContainerType( + Node* n, + const FastMap& node_has_out_variant); std::function getOutOfPlaceOperation(Node* n); std::function getNativeOperation(Node* n); From 9ccb9299e072bb611fc67169e6f0d1fb9e49bedd Mon Sep 17 00:00:00 2001 From: Ilqar Ramazanli Date: Fri, 27 Aug 2021 18:51:09 -0700 Subject: [PATCH 318/530] To add Nesterov Adam algorithm description to documentation (#63793) Summary: It has been discussed before that adding description of Optimization algorithms to PyTorch Core documentation may result in a nice Optimization research tutorial. In the following tracking issue we mentioned about all the necessary algorithms and links to the originally published paper https://github.com/pytorch/pytorch/issues/63236. In this PR we are adding description of Nesterov Adam Algorithm to the documentation. For more details, we refer to the paper https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ NAdam Pull Request resolved: https://github.com/pytorch/pytorch/pull/63793 Reviewed By: NivekT Differential Revision: D30617057 Pulled By: iramazanli fbshipit-source-id: cd2054b0d9b6883878be74576e86e307f32f1435 --- torch/optim/nadam.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/torch/optim/nadam.py b/torch/optim/nadam.py index 55a790610c5a5..deaaf20b1d710 100644 --- a/torch/optim/nadam.py +++ b/torch/optim/nadam.py @@ -6,7 +6,34 @@ class NAdam(Optimizer): r"""Implements NAdam algorithm. - It has been proposed in `Incorporating Nesterov Momentum into Adam`_. + .. math:: + \begin{aligned} + &\rule{110mm}{0.4pt} \\ + &\textbf{input} : \gamma_t \text{ (lr)}, \: \beta_1,\beta_2 \text{ (betas)}, + \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)} \\ + &\hspace{13mm} \: \lambda \text{ (weight decay)}, \:\psi \text{ (momentum decay)} \\ + &\textbf{initialize} : m_0 \leftarrow 0 \text{ ( first moment)}, + v_0 \leftarrow 0 \text{ ( second moment)} \\[-1.ex] + &\rule{110mm}{0.4pt} \\ + &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do} \\ + &\hspace{5mm}g_t \leftarrow \nabla_{\theta} f_t (\theta_{t-1}) \\ + &\hspace{5mm}if \: \lambda \neq 0 \\ + &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1} \\ + &\hspace{5mm} \mu_t \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{t \psi} \big) \\ + &\hspace{5mm} \mu_{t+1} \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{(t+1)\psi}\big)\\ + &\hspace{5mm}m_t \leftarrow \beta_1 m_{t-1} + (1 - \beta_1) g_t \\ + &\hspace{5mm}v_t \leftarrow \beta_2 v_{t-1} + (1-\beta_2) g^2_t \\ + &\hspace{5mm}\widehat{m_t} \leftarrow \mu_{t+1} m_t/(1-\prod_{i=1}^{t+1}\mu_i)\\[-1.ex] + & \hspace{11mm} + (1-\mu_t) g_t /(1-\prod_{i=1}^{t} \mu_{i}) \\ + &\hspace{5mm}\widehat{v_t} \leftarrow v_t/\big(1-\beta_2^t \big) \\ + &\hspace{5mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/ + \big(\sqrt{\widehat{v_t}} + \epsilon \big) \\ + &\rule{110mm}{0.4pt} \\[-1.ex] + &\bf{return} \: \theta_t \\[-1.ex] + &\rule{110mm}{0.4pt} \\[-1.ex] + \end{aligned} + + For further details regarding the algorithm we refer to `Incorporating Nesterov Momentum into Adam`_. Args: params (iterable): iterable of parameters to optimize or dicts defining From a3a7a67048c11ee74fbdd54037a6dbaf90367964 Mon Sep 17 00:00:00 2001 From: Yuchen Huang Date: Fri, 27 Aug 2021 18:57:22 -0700 Subject: [PATCH 319/530] [iOS][GPU] Consolidate array and non-array kernel for hardswish (#63369) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63369 ghstack-source-id: 136918152 (Note: this ignores all push blocking failures!) Test Plan: - `buck test pp-macos` - Op tests in PyTorchPlayground app - Run mobilenetv3 test https://pxl.cl/1Ncls Reviewed By: xta0 Differential Revision: D30354454 fbshipit-source-id: 88bf4f8b5871e63170161b3f3e44f99b8a3086c6 --- aten/src/ATen/native/metal/MetalShaders.h | 41 ++++++++++--------- .../native/metal/mpscnn/tests/MPSCNNTests.h | 1 + .../native/metal/mpscnn/tests/MPSCNNTests.mm | 12 ++++++ .../metal/mpscnn/tests/MetalOpTestRunner.mm | 1 + .../ATen/native/metal/ops/MetalHardswish.mm | 4 +- 5 files changed, 37 insertions(+), 22 deletions(-) diff --git a/aten/src/ATen/native/metal/MetalShaders.h b/aten/src/ATen/native/metal/MetalShaders.h index 5c256723a59e5..0ee703f2ee261 100644 --- a/aten/src/ATen/native/metal/MetalShaders.h +++ b/aten/src/ATen/native/metal/MetalShaders.h @@ -393,31 +393,32 @@ kernel void clamp(texture2d_array in_arr[[texture(0), functi } } -kernel void hardswish(texture2d_array in[[texture(0)]], - texture2d_array out[[texture(1)]], +constant bool hardswish_is_arr = (ushort_arg_0 > 1 || ushort_arg_1 > 4); +constant bool hardswish_is_tex = !hardswish_is_arr; +kernel void hardswish(texture2d_array in_arr[[texture(0), function_constant(hardswish_is_arr)]], + texture2d in_tex[[texture(0), function_constant(hardswish_is_tex)]], + texture2d_array out_arr[[texture(1), function_constant(hardswish_is_arr)]], + texture2d out_tex[[texture(1), function_constant(hardswish_is_tex)]], ushort3 gid[[thread_position_in_grid]]) { - if (gid.x >= out.get_width() || gid.y >= out.get_height()) { + const ushort oH = ushort_arg_2; + const ushort oW = ushort_arg_3; + if (gid.x >= oW || gid.y >= oH) { return; } ushort2 gid_ = gid.xy; - half4 value = in.read(gid_, gid.z); - half4 mask1 = half4(value < 3.0); - half4 mask2 = half4(value > -3.0); - half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value); - out.write(outval, gid_, gid.z); -} - -kernel void hardswish_nonarray(texture2d in[[texture(0)]], - texture2d out[[texture(1)]], - ushort2 gid[[thread_position_in_grid]]) { - if (gid.x >= out.get_width() || gid.y >= out.get_height()) { - return; + if (hardswish_is_arr) { + half4 value = in_arr.read(gid_, gid.z); + half4 mask1 = half4(value < 3.0); + half4 mask2 = half4(value > -3.0); + half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value); + out_arr.write(outval, gid_, gid.z); + } else { + half4 value = in_tex.read(gid_); + half4 mask1 = half4(value < 3); + half4 mask2 = half4(value > -3.0); + half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value); + out_tex.write(outval, gid_); } - half4 value = in.read(gid); - half4 mask1 = half4(value < 3); - half4 mask2 = half4(value > -3.0); - half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value); - out.write(outval, gid); } constant bool out_is_arr = (ushort_arg_3 > 1 || ushort_arg_2 > 4); diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.h b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.h index ee992d9db5abd..599f2ceb64f4c 100644 --- a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.h +++ b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.h @@ -41,6 +41,7 @@ bool test_softmax(); bool test_sigmoid(); bool test_hardsigmoid(); bool test_hardswish(); +bool test_hardswish2(); bool test_upsampling_nearest2d_vec(); bool test_upsampling_nearest2d_vec2(); bool test_adaptive_avg_pool2d(); diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm index 69497a976a130..5a8f6de86996b 100644 --- a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm +++ b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm @@ -262,6 +262,18 @@ bool test_hardswish() { }); } +bool test_hardswish2() { + __block std::vector size{1, 3, 44, 44}; + return TEST(size, __PRETTY_FUNCTION__, ^bool { + auto X = + at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat)) * 12 - 6; + auto X2 = X.metal(); + auto Y1 = at::hardswish_(X); + auto Y2 = at::hardswish_(X2).cpu(); + return almostEqual(Y1, Y2); + }); +} + bool test_addmm() { bool result = true; for (int i = 0; i < ITER_COUNT; ++i) { diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm b/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm index d8b69adcc9d1e..f337e1dfc824e 100644 --- a/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm +++ b/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm @@ -69,6 +69,7 @@ - (void)registerTests { REG_TEST("test_sigmoid", test_sigmoid); REG_TEST("test_hardsigmoid", test_hardsigmoid); REG_TEST("test_hardswish", test_hardswish); + REG_TEST("test_hardswish2", test_hardswish2); REG_TEST("test_upsampling_nearest2d_vec", test_upsampling_nearest2d_vec); REG_TEST("test_upsampling_nearest2d_vec2", test_upsampling_nearest2d_vec2); REG_TEST("test_adaptive_avg_pool2d", test_adaptive_avg_pool2d); diff --git a/aten/src/ATen/native/metal/ops/MetalHardswish.mm b/aten/src/ATen/native/metal/ops/MetalHardswish.mm index 8d3526a4c6b2a..d571e483233dd 100644 --- a/aten/src/ATen/native/metal/ops/MetalHardswish.mm +++ b/aten/src/ATen/native/metal/ops/MetalHardswish.mm @@ -24,9 +24,9 @@ id encoder = [commandBuffer.buffer computeCommandEncoder]; id state = [[MetalContext sharedInstance] - specializedPipelineState:mpscnn::kernelFor( - X, "hardswish", "hardswish_nonarray") + specializedPipelineState:"hardswish" Constants:@[ + @(X.numberOfImages), @(X.featureChannels), @(X.height), @(X.width) From 0d0605eaa9243c938faddd3fb60f922c4a48c953 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Fri, 27 Aug 2021 20:58:20 -0700 Subject: [PATCH 320/530] [quant][graphmode][fx] Add reference quantized linear module (#63627) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63627 Added reference quantized linear module for the custom backend flow, the reference quantized module will have the following code: ``` w(float) -- quant - dequant \ x(float) ------------- F.linear --- ``` In the full model, we will see ``` w(float) -- quant - *dequant \ x -- quant --- *dequant -- *F.linear --- *quant - dequant ``` and the backend should be able to fuse the ops with `*` into a quantized linear Test Plan: python test/test_quantization.py TestQuantizeFx.test_conv_linear_reference Imported from OSS Reviewed By: vkuzo Differential Revision: D30504750 fbshipit-source-id: 5729921745c2b6a0fb344efc3689f3b170e89500 --- .../core/test_quantized_module.py | 51 +++---- test/quantization/fx/test_quantize_fx.py | 72 +++++++++- .../quantized/_reference/modules/__init__.py | 2 - .../_reference/modules/linear_relu.py | 28 ---- .../nn/quantized/_reference/modules/linear.py | 124 +++++++++++++----- .../nn/quantized/_reference/modules/utils.py | 45 +++++++ .../quantization/fx/quantization_patterns.py | 19 ++- torch/quantization/quantization_mappings.py | 5 +- 8 files changed, 240 insertions(+), 106 deletions(-) delete mode 100644 torch/nn/intrinsic/quantized/_reference/modules/linear_relu.py create mode 100644 torch/nn/quantized/_reference/modules/utils.py diff --git a/test/quantization/core/test_quantized_module.py b/test/quantization/core/test_quantized_module.py index 10d5831e87758..bc8a6b397eef8 100644 --- a/test/quantization/core/test_quantized_module.py +++ b/test/quantization/core/test_quantized_module.py @@ -6,7 +6,6 @@ import torch.nn.quantized as nnq import torch.nn.quantized._reference as nnqr import torch.nn.quantized.dynamic as nnqd -import torch.nn.functional as F import torch.quantization from torch.quantization import ( @@ -70,24 +69,21 @@ def test_linear_api(self): [4, 8], [True, False], [True, False], - [True, False], [True, False]) for (batch_size, in_features, out_features, use_bias, - use_fused, per_channel, is_reference) in options: + use_fused, per_channel) in options: self._test_linear_api_impl( batch_size, in_features, out_features, use_bias, use_fused, - per_channel, is_reference) + per_channel) - def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, use_fused, per_channel, is_reference): + def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, use_fused, per_channel): if torch.backends.quantized.engine == 'qnnpack': per_channel = False - # (use_fused, is_reference) -> quantized class + # use_fused -> quantized class class_map = { - (True, True) : nniqr.LinearReLU, - (True, False) : nniq.LinearReLU, - (False, True) : nnqr.Linear, - (False, False) : nnq.Linear, + True: nniq.LinearReLU, + False: nnq.Linear, } W = torch.rand(out_features, in_features).float() @@ -107,7 +103,7 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, B = torch.rand(out_features).float() if use_bias else None scale = 0.5 zero_point = 3 - qlinear = class_map[(use_fused, is_reference)](in_features, out_features) + qlinear = class_map[use_fused](in_features, out_features) qlinear_copy = qlinear # deepcopy does not work right now # qlinear_copy = copy.deepcopy(qlinear) @@ -127,21 +123,11 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, # Check if the module implementation matches calling the # ops directly - if is_reference: - weight = qlinear._qweight - bias = qlinear._bias - weight_dequant = weight.dequantize() - X_q_dq = X_q.dequantize() - Z_ref = F.linear(X_q_dq, weight_dequant, bias) - if use_fused: - Z_ref = F.relu(Z_ref, inplace=True) - Z_ref = torch.quantize_per_tensor(Z_ref, scale, zero_point, torch.quint8) + W_pack = qlinear._packed_params._packed_params + if use_fused: + Z_ref = torch.ops.quantized.linear_relu(X_q, W_pack, scale, zero_point) else: - W_pack = qlinear._packed_params._packed_params - if use_fused: - Z_ref = torch.ops.quantized.linear_relu(X_q, W_pack, scale, zero_point) - else: - Z_ref = torch.ops.quantized.linear(X_q, W_pack, scale, zero_point) + Z_ref = torch.ops.quantized.linear(X_q, W_pack, scale, zero_point) self.assertEqual(Z_ref, Z_q) self.assertTrue( @@ -163,16 +149,12 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, else: self.assertEqual(model_dict[key], loaded_dict[key]) - loaded_qlinear = class_map[(use_fused, is_reference)]( + loaded_qlinear = class_map[use_fused]( in_features, out_features) loaded_qlinear.load_state_dict(loaded_dict) - if is_reference: - self.assertEqual(qlinear._qweight, loaded_qlinear._qweight) - self.assertEqual(qlinear._bias, loaded_qlinear._bias) - else: - linear_unpack = torch.ops.quantized.linear_unpack - self.assertEqual(linear_unpack(qlinear._packed_params._packed_params), - linear_unpack(loaded_qlinear._packed_params._packed_params)) + linear_unpack = torch.ops.quantized.linear_unpack + self.assertEqual(linear_unpack(qlinear._packed_params._packed_params), + linear_unpack(loaded_qlinear._packed_params._packed_params)) self.assertEqual(qlinear.scale, loaded_qlinear.scale) self.assertEqual(qlinear.zero_point, loaded_qlinear.zero_point) # make sure loaded_qlinear has the same dir as qlinear since @@ -180,8 +162,7 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, self.checkScriptable(loaded_qlinear, [[X_q]], check_save_load=True) self.assertTrue(dir(qlinear) == dir(loaded_qlinear)) self.assertEqual(qlinear._weight_bias(), loaded_qlinear._weight_bias()) - if not is_reference: - self.assertEqual(qlinear._weight_bias(), torch.ops.quantized.linear_unpack(qlinear._packed_params._packed_params)) + self.assertEqual(qlinear._weight_bias(), torch.ops.quantized.linear_unpack(qlinear._packed_params._packed_params)) Z_q2 = loaded_qlinear(X_q) self.assertEqual(Z_q, Z_q2) diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 762919eeb04ea..7ae29e03f6a46 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -3,6 +3,7 @@ import torch.nn.functional as F import torch.nn as nn import torch.nn.quantized as nnq +import torch.nn.quantized._reference as nnqr import torch.nn.quantized.dynamic as nnqd import torch.nn.intrinsic as nni import torch.nn.intrinsic.quantized as nniq @@ -571,7 +572,7 @@ def forward(self, x): LinearModule, (), (linear_module_input,), - ns.call_module(nn.Linear) if is_reference else ns.call_module(nnqd.Linear), + ns.call_module(nnqr.Linear) if is_reference else ns.call_module(nnqd.Linear), None, ), ( @@ -579,7 +580,7 @@ def forward(self, x): LinearModule, (), (linear_module_input,), - ns.call_module(nn.Linear if is_reference else nnq.Linear), + ns.call_module(nnqr.Linear if is_reference else nnq.Linear), None, ), ] @@ -608,6 +609,13 @@ def test_conv_linear_reference(self): """ Test quantizing functional conv and linear with reference option """ tests = self._get_conv_linear_test_cases(is_reference=True) + + def _get_keys(prefix, is_dynamic): + all_keys = [prefix + "." + k for k in ["weight_qscheme", "weight_dtype"]] + if not is_dynamic: + all_keys.extend([prefix + "." + k for k in ["weight_scale", "weight_zero_point"]]) + return all_keys + for (is_dynamic, ModuleClass, module_constructor_inputs, inputs, quantized_node, weight_prepack_node) in tests: quant_type = QuantType.DYNAMIC if is_dynamic else QuantType.STATIC @@ -623,13 +631,19 @@ def test_conv_linear_reference(self): qr = result_dict["quantized_reference"] def checkWeightQParams(model): - for module_name in ("linear", "conv"): + for module_name in ("conv",): if hasattr(model, module_name): self.assertTrue(hasattr(qr.get_submodule(module_name), "_weight_qparams")) self.assertTrue("Reference" in qr.get_submodule(module_name)._get_name()) + for module_name in ("linear",): + if hasattr(model, module_name): + self.assertTrue(hasattr(qr.get_submodule(module_name), "weight_qscheme")) + self.assertTrue(hasattr(qr.get_submodule(module_name), "weight_scale")) + self.assertTrue(hasattr(qr.get_submodule(module_name), "weight_zero_point")) + self.assertTrue("Reference" in qr.get_submodule(module_name)._get_name()) - def checkSerDeser(model): - for module_name in ("linear", "conv"): + def checkSerDeser(model, is_dynamic): + for module_name in ("conv",): if hasattr(model, module_name): # make sure seralization works state_dict = copy.deepcopy(model.state_dict()) @@ -641,6 +655,20 @@ def checkSerDeser(model): module._weight_qparams["scale"] = None model.load_state_dict(state_dict) self.assertTrue(torch.equal(prev_scale, module._weight_qparams["scale"])) + for module_name in ("linear",): + if hasattr(model, module_name): + # make sure seralization works + state_dict = copy.deepcopy(model.state_dict()) + all_keys = _get_keys(module_name, is_dynamic) + for key in all_keys: + self.assertTrue(key in state_dict) + # check load_state_dict restores states + module = getattr(model, module_name) + prev_scale = module.weight_scale + module.weight_scale = None + model.load_state_dict(state_dict) + module = getattr(model, module_name) + self.assertTrue(torch.equal(prev_scale, module.weight_scale)) checkWeightQParams(qr) @@ -648,7 +676,7 @@ def checkSerDeser(model): # make sure the qparams are preserved after copy checkWeightQParams(qr) - checkSerDeser(qr) + checkSerDeser(qr, is_dynamic) @skipIfNoFBGEMM def test_dynamic_quant_weight_observer(self): @@ -2941,6 +2969,38 @@ def forward(self, x): ] self.checkGraphModuleNodes(m, expected_node_list=node_list) + def test_ref_linear_module(self): + """ Make sure the numerics for models with ref linear module + matches models with fbgemm/qnnpack module + """ + class M1(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(10, 5) + + def forward(self, x): + return self.linear(x) + + class M2(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(10, 5) + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(self.linear(x)) + + for M in [M1, M2]: + m = M().eval() + m = prepare_fx(m, {"": default_qconfig}) + m_copy = copy.deepcopy(m) + m = convert_fx(m, is_reference=False) + m_ref = convert_fx(m_copy, is_reference=True) + data = torch.randn(5, 10) + result = m(data) + result_ref = m_ref(data) + self.assertTrue(torch.equal(result, result_ref)) + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops diff --git a/torch/nn/intrinsic/quantized/_reference/modules/__init__.py b/torch/nn/intrinsic/quantized/_reference/modules/__init__.py index bf8ff3a3db5e1..33b18d8cf7d3f 100644 --- a/torch/nn/intrinsic/quantized/_reference/modules/__init__.py +++ b/torch/nn/intrinsic/quantized/_reference/modules/__init__.py @@ -1,9 +1,7 @@ import torch -from .linear_relu import LinearReLU from .conv_relu import ConvReLU1d, ConvReLU2d, ConvReLU3d __all__ = [ - 'LinearReLU', 'ConvReLU1d', 'ConvReLU2d', 'ConvReLU3d', diff --git a/torch/nn/intrinsic/quantized/_reference/modules/linear_relu.py b/torch/nn/intrinsic/quantized/_reference/modules/linear_relu.py deleted file mode 100644 index 39c595376fded..0000000000000 --- a/torch/nn/intrinsic/quantized/_reference/modules/linear_relu.py +++ /dev/null @@ -1,28 +0,0 @@ -import torch -import torch.nn.intrinsic as nni -import torch.nn.quantized._reference as nnqr -import torch.nn.functional as F - -class LinearReLU(nnqr.Linear): - _FLOAT_MODULE = nni.LinearReLU - - def __init__( - self, - in_features, - out_features, - bias=True, - dtype=torch.qint8): - super().__init__(in_features, out_features, bias, dtype) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x_dequant = x.dequantize() - weight_dequant = self._qweight.dequantize() - float_result = F.linear(x_dequant, weight_dequant, self._bias) - float_result = F.relu(float_result, inplace=True) - # NEEDFIX: we don't have dtype in the Linear module APIs right now! - result = torch.quantize_per_tensor( - float_result, self.scale, self.zero_point, torch.quint8) - return result - - def _get_name(self): - return "QuantizedLinearReLU(Reference)" diff --git a/torch/nn/quantized/_reference/modules/linear.py b/torch/nn/quantized/_reference/modules/linear.py index 276dc0161ded8..1df5499433d1c 100644 --- a/torch/nn/quantized/_reference/modules/linear.py +++ b/torch/nn/quantized/_reference/modules/linear.py @@ -1,51 +1,115 @@ import torch -import torch.nn.quantized as nnq +import torch.nn as nn import torch.nn.functional as F -from typing import Optional +from typing import Optional, Dict, Any +from .utils import _quantize_and_dequantize_weight +from .utils import _save_weight_qparams +from .utils import _get_weight_qparam_keys -class Linear(nnq.Linear): - """ A backend independent version of nn.quantized.Linear - we will not pack the parameters in this module, since weight packing is an - optimization for quantized backends supported in PyTorch (fbgemm/qnnpack), - this is useful when user want to use this module in other backends like Glow. +class Linear(nn.Linear): + """ A reference quantized linear module that fits into the FX + Graph Mode Quantization workflow + activation will be floating point Tensor, we will store floating + point weight as well in the module, but in forward we'll quantize + and dequantize the weight before running the floating point functional + linear operator. """ - def __init__(self, in_features, out_features, bias_=True, - dtype=torch.qint8): - super().__init__(in_features, out_features, bias_, dtype) - self._qweight, self._bias = self._packed_params._weight_bias() - del self._packed_params + def __init__( + self, + in_features: int, + out_features: int, + bias_: bool = True, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + weight_qparams: Optional[Dict[str, Any]] = None): + super().__init__(in_features, out_features, bias_, device, dtype) + if weight_qparams is None: + weight_qparams = { + "qscheme": torch.per_tensor_affine, + "dtype": torch.quint8, + "scale": 1.0, + "zero_point": 0 + } + self.weight_qscheme = weight_qparams["qscheme"] + self.weight_dtype = weight_qparams["dtype"] + assert self.weight_qscheme in [None, torch.per_tensor_affine, torch.per_channel_affine], \ + Exception(f"qscheme: {self.weight_qscheme} is not support in reference quantized linear module") + if self.weight_qscheme is not None: + self.register_buffer( + "weight_scale", + torch.tensor(weight_qparams["scale"], dtype=torch.float, device=device)) + self.register_buffer( + "weight_zero_point", + torch.tensor( + weight_qparams["zero_point"], + dtype=torch.int, device=device)) + if self.weight_qscheme == torch.per_channel_affine: + self.register_buffer( + "weight_axis", + torch.tensor(weight_qparams["axis"], dtype=torch.int, device=device)) + else: + # added for TorchScriptability, not used + self.register_buffer( + "weight_axis", + torch.tensor(0, dtype=torch.int, device=device)) def _get_name(self): return "QuantizedLinear(Reference)" + def get_weight(self): + """ + Fake quantize (quantize and dequantize) the weight with + the quantization parameters for weight, this is used to + simulate the numerics for the quantized weight in a quantized + model + """ + # supress mypy warning + assert isinstance(self.weight, torch.Tensor) + assert isinstance(self.weight_scale, torch.Tensor) + assert isinstance(self.weight_zero_point, torch.Tensor) + assert isinstance(self.weight_axis, torch.Tensor) + return _quantize_and_dequantize_weight( + self.weight, self.weight_qscheme, self.weight_dtype, self.weight_scale, + self.weight_zero_point, self.weight_axis) + def forward(self, x: torch.Tensor) -> torch.Tensor: - x_dequant = x.dequantize() - weight_dequant = self._qweight.dequantize() - float_result = F.linear(x_dequant, weight_dequant, self._bias) - # NEEDFIX: we don't have dtype in the Linear module APIs right now! - result = torch.quantize_per_tensor( - float_result, self.scale, self.zero_point, torch.quint8) + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.linear --- + + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.linear --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized linear + """ + weight_dequant = self.get_weight() + result = F.linear(x, weight_dequant, self.bias) return result def _save_to_state_dict(self, destination, prefix, keep_vars): super()._save_to_state_dict(destination, prefix, keep_vars) - destination[prefix + '_qweight'] = self._qweight - destination[prefix + '_bias'] = self._bias + _save_weight_qparams( + destination, prefix, self.weight_qscheme, self.weight_dtype, + self.weight_scale, self.weight_zero_point, self.weight_axis) def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): - self._qweight = state_dict[prefix + '_qweight'] - self._bias = state_dict[prefix + '_bias'] - state_dict.pop(prefix + '_qweight') - state_dict.pop(prefix + '_bias') + for key in _get_weight_qparam_keys(state_dict, prefix): + setattr(self, key, state_dict[prefix + key]) + state_dict.pop(prefix + key) super()._load_from_state_dict( state_dict, prefix, local_metadata, False, missing_keys, unexpected_keys, error_msgs) - def _weight_bias(self): - return self._qweight, self._bias - - def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: - self._qweight = w - self._bias = b + @classmethod + def from_float(cls, float_linear, weight_qparams): + qref_linear = Linear( + float_linear.in_features, float_linear.out_features, + float_linear.bias is not None, device=float_linear.weight.device, + dtype=float_linear.weight.dtype, weight_qparams=weight_qparams) + qref_linear.weight = torch.nn.Parameter(float_linear.weight.detach()) + if float_linear.bias is not None: + qref_linear.bias = torch.nn.Parameter(float_linear.bias.detach()) + return qref_linear diff --git a/torch/nn/quantized/_reference/modules/utils.py b/torch/nn/quantized/_reference/modules/utils.py new file mode 100644 index 0000000000000..7c366503dd872 --- /dev/null +++ b/torch/nn/quantized/_reference/modules/utils.py @@ -0,0 +1,45 @@ +import torch +from typing import Dict, Any + +def _quantize_and_dequantize_weight( + weight: torch.Tensor, + weight_qscheme: torch.qscheme, + weight_dtype: torch.dtype, + weight_scale: torch.Tensor, + weight_zero_point: torch.Tensor, + weight_axis: torch.Tensor): + """ Quantize and then dequantize the weight based on + the quantization parameters + """ + if weight_qscheme == torch.per_tensor_affine: + weight = torch.quantize_per_tensor(weight, weight_scale, weight_zero_point, weight_dtype) + weight_dequant = weight.dequantize() + elif weight_qscheme == torch.per_channel_affine: + weight = torch.quantize_per_channel( + weight, weight_scale, + weight_zero_point, weight_axis.item(), weight_dtype) # type: ignore[arg-type] + weight_dequant = weight.dequantize() + else: + weight_dequant = weight + return weight_dequant + +def _save_weight_qparams(destination, prefix, weight_qscheme, weight_dtype, weight_scale, weight_zero_point, weight_axis): + destination[prefix + "weight_qscheme"] = weight_qscheme + destination[prefix + "weight_dtype"] = weight_dtype + if weight_qscheme is not None: + destination[prefix + "weight_scale"] = weight_scale + destination[prefix + "weight_zero_point"] = weight_zero_point + if weight_qscheme == torch.per_channel_affine: + destination[prefix + "weight_axis"] = weight_axis + +def _get_weight_qparam_keys( + state_dict: Dict[str, Any], + prefix: str): + keys = ["weight_qscheme", "weight_dtype"] + weight_qscheme = state_dict[prefix + "weight_qscheme"] + if weight_qscheme is not None: + keys.append("weight_scale") + keys.append("weight_zero_point") + if weight_qscheme == torch.quantize_per_channel: + keys.append("weight_axis") + return keys diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 6362961ad8daa..e8b873658b504 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -869,6 +869,7 @@ def convert(self, # Get the float linear and attach qscheme and qparams # the the module float_linear = self.linear + fused_linear = None if isinstance(float_linear, (torch.nn.qat.Linear, torch.nn.intrinsic.qat.LinearReLU)): float_linear = float_linear.to_float() # change qat linear to linear @@ -876,10 +877,12 @@ def convert(self, setattr(modules[parent_name], name, float_linear) # Attach weight fake quant to the linear module if isinstance(float_linear, torch.nn.intrinsic.LinearReLU): + fused_linear = float_linear float_linear = float_linear[0] weight_post_process = self.linear.weight_fake_quant else: if isinstance(float_linear, torch.nn.intrinsic.LinearReLU): + fused_linear = float_linear float_linear = self.linear[0] # type: ignore[index] # Attach the weight observer to the module weight_post_process = qconfig.weight() # type: ignore[union-attr] @@ -887,7 +890,21 @@ def convert(self, weight_post_process(float_linear.weight) # type: ignore[operator] weight_qparams = get_qparam_dict(weight_post_process) - _to_reference(float_linear, weight_qparams) + # TODO: include the configuration in backend_config_dict + # we can have a map from module to reference module + # and allow user to register new ones + qlinear_cls = get_static_quant_module_class( + type(float_linear), is_reference=is_reference) + ref_linear = qlinear_cls.from_float(float_linear, weight_qparams) + + # if the parent is a fused linear (Sequential), we can replace the first + # item to ref linear, otherwise we can update + # the linear instance in the module tree + if fused_linear is not None: + fused_linear[0] = ref_linear + else: + parent_name, name = _parent_name(self.linear_node.target) + setattr(modules[parent_name], name, ref_linear) op_out = quantized_graph.create_node( 'call_module', self.linear_node.target, diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py index 775d40bb23efa..03b177805bac3 100644 --- a/torch/quantization/quantization_mappings.py +++ b/torch/quantization/quantization_mappings.py @@ -25,16 +25,14 @@ # Default map for swapping float module to reference quantized modules DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = { + nn.Linear: nnqr.Linear, nn.Conv1d: nnqr.Conv1d, nn.Conv2d: nnqr.Conv2d, nn.Conv3d: nnqr.Conv3d, - nn.Linear: nnqr.Linear, nni.ConvReLU1d: nniqr.ConvReLU1d, nni.ConvReLU2d: nniqr.ConvReLU2d, nni.ConvReLU3d: nniqr.ConvReLU3d, - nni.LinearReLU: nniqr.LinearReLU, # QAT Modules - nnqat.Linear: nnqr.Linear, nnqat.Conv2d: nnqr.Conv2d, nnqat.Conv3d: nnqr.Conv3d, nniqat.ConvBn1d: nnqr.Conv1d, @@ -45,7 +43,6 @@ nniqat.ConvBnReLU3d: nniqr.ConvReLU3d, nniqat.ConvReLU2d: nniqr.ConvReLU2d, nniqat.ConvReLU3d: nniqr.ConvReLU3d, - nniqat.LinearReLU: nniqr.LinearReLU, } # Default map for swapping float module to quantized ones From f4496528e38684b4482636998bf4bb63d5dd3140 Mon Sep 17 00:00:00 2001 From: Priya Ramani Date: Fri, 27 Aug 2021 22:50:20 -0700 Subject: [PATCH 321/530] [Light] Fix error message (#64010) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64010 Fixing typos in a error message Test Plan: Error message before fix: Lite Interpreter verson number does not match. The model version must be between 3 and 5But the model version is 6 Error message after fix: Lite Interpreter version number does not match. The model version must be between 3 and 5 but the model version is 6 Reviewed By: larryliu0820 Differential Revision: D30568367 fbshipit-source-id: 205f3278ee8dcf38579dbb828580a9e986ccacc1 --- torch/csrc/jit/mobile/import.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index db9f0b8c20cf5..d2865d071ea72 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -317,12 +317,12 @@ void BytecodeDeserializer::parseMethods( caffe2::serialize::kMinSupportedBytecodeVersion <= model_version && // NOLINTNEXTLINE(clang-diagnostic-sign-compare) model_version <= caffe2::serialize::kMaxSupportedBytecodeVersion, - "Lite Interpreter verson number does not match. ", + "Lite Interpreter version number does not match. ", "The model version must be between ", caffe2::serialize::kMinSupportedBytecodeVersion, " and ", caffe2::serialize::kMaxSupportedBytecodeVersion, - "But the model version is ", + " but the model version is ", model_version); bool has_debug_handles = debug_handles.has_value(); From d0c63e857d12f3ddc04a80defb2530694b4f263d Mon Sep 17 00:00:00 2001 From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" Date: Sat, 28 Aug 2021 11:44:58 -0700 Subject: [PATCH 322/530] Enhancement for smart serialization for out schemas (#63096) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63096 Test Plan: Imported from OSS Reviewed By: gmagogsfm Differential Revision: D30415255 Pulled By: tugsbayasgalan fbshipit-source-id: eb40440a3b46258394d035479f5fc4a4baa12bcc --- test/cpp/jit/test_interpreter.cpp | 9 ++++ test/cpp/jit/test_utils.cpp | 15 ++++++ test/cpp/jit/test_utils.h | 1 + test/jit/test_ignorable_args.py | 7 +++ .../jit/runtime/calculate_necessary_args.h | 43 ++++++++++++---- .../csrc/jit/runtime/interpreter/code_impl.h | 25 +++++++++- torch/csrc/jit/serialization/python_print.cpp | 50 ++++++++++++++----- 7 files changed, 126 insertions(+), 24 deletions(-) diff --git a/test/cpp/jit/test_interpreter.cpp b/test/cpp/jit/test_interpreter.cpp index a2418918336c5..bfdc1f3a0cb7e 100644 --- a/test/cpp/jit/test_interpreter.cpp +++ b/test/cpp/jit/test_interpreter.cpp @@ -175,6 +175,15 @@ TEST(InterpreterTest, IgnorableArgsInSchema) { ASSERT_TRUE(op_to_specified_args_non_const["aten::conv2d"] == 6); } +TEST(InterpreterTest, IgnorableArgsInSchemaWithOut) { + auto graph = build_mobile_export_with_out(); + MobileCode function(graph, ""); + auto op_to_specified_args = function.op_to_num_specified_args(); + ASSERT_TRUE(op_to_specified_args.size() == 1); + // this should be 3 when the add_out flag is set to True + ASSERT_TRUE(op_to_specified_args["aten::add.out"] == 4); +} + TEST(InterpreterTest, runAsyncBasicTest) { /* TODO: there are some problem with C++ parsing script program involving diff --git a/test/cpp/jit/test_utils.cpp b/test/cpp/jit/test_utils.cpp index 27667f068588b..f2fb9e1fb0606 100644 --- a/test/cpp/jit/test_utils.cpp +++ b/test/cpp/jit/test_utils.cpp @@ -123,6 +123,21 @@ std::shared_ptr build_mobile_export_analysis_graph() { return g; } +std::shared_ptr build_mobile_export_with_out() { + const auto graph_string = R"IR( + graph(%x.1 : Tensor, + %y.1 : Tensor): + %8 : NoneType = prim::Constant() + %6 : int = prim::Constant[value=1]() + %7 : Tensor = aten::add(%x.1, %y.1, %6, %y.1) + return (%8))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph_string, g.get()); + g->lint(); + return g; +} + std::shared_ptr build_mobile_export_analysis_graph_nested() { // this is pretty much same test as build_mobile_export_analysis_graph(), // but some aten::slice operators are hidden under block statement to check diff --git a/test/cpp/jit/test_utils.h b/test/cpp/jit/test_utils.h index 5e640aed0114a..1a1e1b82b10e8 100644 --- a/test/cpp/jit/test_utils.h +++ b/test/cpp/jit/test_utils.h @@ -74,6 +74,7 @@ std::pair runGradient( std::shared_ptr build_lstm(); std::shared_ptr build_mobile_export_analysis_graph(); +std::shared_ptr build_mobile_export_with_out(); std::shared_ptr build_mobile_export_analysis_graph_with_vararg(); std::shared_ptr build_mobile_export_analysis_graph_nested(); std::shared_ptr build_mobile_export_analysis_graph_non_const(); diff --git a/test/jit/test_ignorable_args.py b/test/jit/test_ignorable_args.py index b195e3cc4faaa..fb63c1973bf0e 100644 --- a/test/jit/test_ignorable_args.py +++ b/test/jit/test_ignorable_args.py @@ -1,5 +1,6 @@ import os import sys +import torch from torch._C import parse_ir from torch.testing import FileCheck @@ -43,3 +44,9 @@ def test_slice_ignorable_args_for_slice(self): # because in %16, %15 and %0 are default values for the schema. FileCheck().check("torch.slice(torch.slice(torch.tensor(_0), 0, 2), 1, None, 1)").run(src) self.assertEqual(function(), function_copy()) + + def test_add_out_ignorable_args(self): + @torch.jit.script + def fn(x: torch.Tensor, y: torch.Tensor): + torch.add(x, y, out=y) + FileCheck().check("torch.add(x, y, out=y)").run(fn.code) diff --git a/torch/csrc/jit/runtime/calculate_necessary_args.h b/torch/csrc/jit/runtime/calculate_necessary_args.h index 5f37660ee14a8..07df670b01040 100644 --- a/torch/csrc/jit/runtime/calculate_necessary_args.h +++ b/torch/csrc/jit/runtime/calculate_necessary_args.h @@ -7,18 +7,42 @@ namespace torch { namespace jit { -inline size_t CalculateNecessaryArgs( +inline std::pair CalculateNecessaryArgs( const std::vector& schema_args, - at::ArrayRef actual_inputs) { + at::ArrayRef actual_inputs, + bool allow_trailing_out_args) { + if (schema_args.size() == 0) { + return std::make_pair(0, 0); + } + + // count number of out arguments + auto schema_idx = schema_args.size() - 1; + if (allow_trailing_out_args) { + // skip over out arguments in the end. + while (schema_idx >= 0) { + auto current_arg = schema_args.at(schema_idx); + if (!current_arg.is_out()) { + break; + } + schema_idx--; + } + } + + auto num_out = schema_args.size() - schema_idx - 1; + if (schema_args.size() < actual_inputs.size()) { - return actual_inputs.size(); + return std::make_pair(actual_inputs.size(), num_out); + } + + // if it is the default args, we reset the index to the last element + if (!allow_trailing_out_args) { + schema_idx = schema_args.size() - 1; } // keeps track of trailing unnecessary args - int schema_size = schema_args.size(); - for (int schema_idx = schema_size - 1; schema_idx > -1; schema_idx--) { + while (schema_idx >= 0) { // this means it is not default argument, so it is necessary if (!schema_args.at(schema_idx).default_value().has_value()) { - return schema_idx + 1; + return std::make_pair(schema_idx + 1, num_out); } else { auto schema_value = schema_args.at(schema_idx).default_value().value().toIValue(); @@ -27,16 +51,17 @@ inline size_t CalculateNecessaryArgs( // well. auto actual_value = toIValue(actual_inputs[schema_idx]); if (!actual_value.has_value()) { - return schema_idx + 1; + return std::make_pair(schema_idx + 1, num_out); } // if the IR has same value as default value of the schema, // it is not neccessary argument. if (schema_value != actual_value.value()) { - return schema_idx + 1; + return std::make_pair(schema_idx + 1, num_out); } } + schema_idx--; } - return 0; + return std::make_pair(0, num_out); } } // namespace jit diff --git a/torch/csrc/jit/runtime/interpreter/code_impl.h b/torch/csrc/jit/runtime/interpreter/code_impl.h index 00648de905767..682c695138674 100644 --- a/torch/csrc/jit/runtime/interpreter/code_impl.h +++ b/torch/csrc/jit/runtime/interpreter/code_impl.h @@ -105,6 +105,8 @@ struct CodeImpl { // This is because for all usages, at most 3 args are used. std::unordered_map op_to_num_specified_args_; + std::unordered_map op_to_num_out_args_; + // running count of uses as we emit. When we reach use_count_[v] = // v.uses().size() we know it is the final use and we can move rather than // load. @@ -292,6 +294,12 @@ struct CodeImpl { } } + void emitLoadInputs(at::ArrayRef inputs, size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + emitUse(inputs[i], false); + } + } + virtual void emitOperator(Node* node) { emitLoadInputs(node->inputs()); const Operator& op = node->getOperator(); @@ -737,13 +745,19 @@ struct MobileCodeImpl : CodeImpl { auto op_schema = node->getOperator().schema(); // skip if schema has vararg if (!op_schema.is_vararg()) { - auto numInclude = - CalculateNecessaryArgs(op_schema.arguments(), node->inputs()); + auto specifiedArgs = CalculateNecessaryArgs( + op_schema.arguments(), node->inputs(), false); + // preserving the old behavior + auto numInclude = specifiedArgs.first; + // TODO uncomment this + // auto numInclude = specifiedArgs.first + specifiedArgs.second; auto unique_name = op_schema.overload_name() != "" ? op_schema.name() + "." + op_schema.overload_name() : op_schema.name(); auto it = op_to_num_specified_args_.insert( std::pair(unique_name, 0)); + op_to_num_out_args_.insert(std::pair( + unique_name, specifiedArgs.second)); auto prev_value = it.first->second; it.first->second = std::max(numInclude, prev_value); } @@ -769,6 +783,13 @@ struct MobileCodeImpl : CodeImpl { num_include = it->second; } emitLoadInputs(node->inputs(), num_include); + // TODO: uncomment this + // auto num_out = op_to_num_out_args_.find(unique_op_name)->second; + // auto num_specified_before_out = num_include - num_out; + // emitLoadInputs(node->inputs(), 0, num_specified_before_out); + // emitLoadInputs(node->inputs(), node->inputs().size() - num_out, + // node->inputs().size()); + insertInstruction(OP, operator_table_.size()); } operator_table_.emplace_back(op.getOperation(node)); diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index 1ab968967392f..80123c625ea65 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -1162,23 +1162,47 @@ struct PythonPrintImpl { // calculate how many args are specified. // see (https://github.com/pytorch/pytorch/pull/56079) for more // details. - size_t necessary_args = - CalculateNecessaryArgs(schema.arguments(), node->inputs()); - for (const auto i : c10::irange(necessary_args)) { - if (i > 0) + size_t num_schema_args = schema.arguments().size(); + + // we only want to do this extra logic only when necessary. + if (num_schema_args > 0) { + // calculate how many args are specified. + // see (https://github.com/pytorch/pytorch/pull/56079) for more + // details. + auto specified_args = + CalculateNecessaryArgs(schema.arguments(), node->inputs(), true); + + auto num_necessary = specified_args.first; + auto num_out = specified_args.second; + + for (size_t i = 0; i < num_necessary; ++i) { + if (i > 0) + stmt << ", "; + auto v = useOf(node->inputs().at(i)); + // print the kwarg name if it is a kwarg only argument. + if (i < num_schema_args) { + auto arg = schema.arguments().at(i); + if (arg.kwarg_only()) { + stmt << arg.name() << "="; + } + } else { + // vararg functions like format can have extra arguments + AT_ASSERT(schema.is_vararg()); + } + stmt << *v; + } + + // print out args + for (size_t i = num_schema_args - num_out; i < num_schema_args; i++) { stmt << ", "; - auto v = useOf(node->inputs().at(i)); - // print the kwarg name if it is a kwarg only argument. - if (i < schema.arguments().size()) { auto arg = schema.arguments().at(i); - if (arg.kwarg_only()) { - stmt << arg.name() << "="; + TORCH_INTERNAL_ASSERT(arg.is_out()); + // figure out the corresponding input at this index + auto input_idx = node->inputs().size() - (num_schema_args - i); + if (input_idx < node->inputs().size()) { + stmt << arg.name() << "=" << *useOf(node->inputs().at(input_idx)); } - } else { - // vararg functions like format can have extra arguments - AT_ASSERT(schema.is_vararg()); } - stmt << *v; } stmt << ")"; } break; From 223f886032978487099da4f54e86e9e0549cde0c Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 28 Aug 2021 11:46:40 -0700 Subject: [PATCH 323/530] Move Parallel[Native|TBB] to GHA (#64123) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64123 Reviewed By: driazati Differential Revision: D30620966 Pulled By: malfet fbshipit-source-id: 9a23e4b3e16870f77bf18df4370cd468603d592d --- .circleci/cimodel/data/pytorch_build_data.py | 2 - .circleci/config.yml | 90 ---- .github/generated-ciflow-ruleset.json | 6 + .github/scripts/generate_ci_workflows.py | 38 +- ...rallelnative-linux-xenial-py3.6-gcc5.4.yml | 430 ++++++++++++++++++ ...-paralleltbb-linux-xenial-py3.6-gcc5.4.yml | 430 ++++++++++++++++++ 6 files changed, 892 insertions(+), 104 deletions(-) create mode 100644 .github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml create mode 100644 .github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index 5a85674d74fe9..156494589831b 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -7,8 +7,6 @@ ("5.4", [ # All this subtree rebases to master and then build ("3.6", [ ("important", [X(True)]), - ("parallel_tbb", [X(True)]), - ("parallel_native", [X(True)]), ("pure_torch", [X(True)]), ]), ]), diff --git a/.circleci/config.yml b/.circleci/config.yml index 1bb32b5cc0a3d..8df67e6fe2bc8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7166,70 +7166,6 @@ workflows: build_environment: "pytorch-linux-pytorch_linux_xenial_py3_6_gcc5_4_distributed-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" resource_class: large - - pytorch_linux_build: - name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build - requires: - - "docker-pytorch-linux-xenial-py3.6-gcc5.4" - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - - pytorch_linux_test: - name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_test - requires: - - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - resource_class: large - - pytorch_linux_test: - name: pytorch_linux_pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed_test - requires: - - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build - build_environment: "pytorch-linux-pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - resource_class: large - - pytorch_linux_build: - name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build - requires: - - "docker-pytorch-linux-xenial-py3.6-gcc5.4" - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - - pytorch_linux_test: - name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test - requires: - - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - resource_class: large - - pytorch_linux_test: - name: pytorch_linux_pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed_test - requires: - - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build - build_environment: "pytorch-linux-pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - resource_class: large - pytorch_linux_build: name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build requires: @@ -9386,32 +9322,6 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-xenial-py3.6-gcc7" image_name: "pytorch-linux-xenial-py3.6-gcc7" - - pytorch_linux_build: - name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build - requires: - - "docker-pytorch-linux-xenial-py3.6-gcc5.4" - build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - - pytorch_linux_test: - name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_test - requires: - - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build - build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - resource_class: large - - pytorch_linux_build: - name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build - requires: - - "docker-pytorch-linux-xenial-py3.6-gcc5.4" - build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - - pytorch_linux_test: - name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test - requires: - - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build - build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - resource_class: large - pytorch_linux_build: name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build requires: diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index d13561190d01f..0fb27af006c85 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -10,6 +10,8 @@ "linux-xenial-cuda11.3-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", + "parallelnative-linux-xenial-py3.6-gcc5.4", + "paralleltbb-linux-xenial-py3.6-gcc5.4", "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", "periodic-linux-xenial-cuda11.1-py3.6-gcc7", "periodic-win-vs2019-cuda11.1-py3", @@ -27,6 +29,8 @@ "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", + "parallelnative-linux-xenial-py3.6-gcc5.4", + "paralleltbb-linux-xenial-py3.6-gcc5.4", "win-vs2019-cpu-py3" ], "ciflow/cuda": [ @@ -63,6 +67,8 @@ "linux-xenial-cuda11.3-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", + "parallelnative-linux-xenial-py3.6-gcc5.4", + "paralleltbb-linux-xenial-py3.6-gcc5.4", "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", "periodic-linux-xenial-cuda11.1-py3.6-gcc7" ], diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index f1819dbac589d..dd115405e03ea 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -272,18 +272,32 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU} ), ), - # CIWorkflow( - # arch="linux", - # build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4", - # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", - # test_runner_type=LINUX_CPU_TEST_RUNNER, - # ), - # CIWorkflow( - # arch="linux", - # build_environment="parallelnative-linux-xenial-py3.6-gcc5.4", - # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", - # test_runner_type=LINUX_CPU_TEST_RUNNER, - # ), + CIWorkflow( + arch="linux", + build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", + test_runner_type=LINUX_CPU_TEST_RUNNER, + # This is a master only job despit on_pull_request is set to True + on_pull_request=True, + ciflow_config=CIFlowConfig( + enabled=True, + trigger_action_only=True, + labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, + ), + ), + CIWorkflow( + arch="linux", + build_environment="parallelnative-linux-xenial-py3.6-gcc5.4", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", + test_runner_type=LINUX_CPU_TEST_RUNNER, + # This is a master only job despit on_pull_request is set to True + on_pull_request=True, + ciflow_config=CIFlowConfig( + enabled=True, + trigger_action_only=True, + labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, + ), + ), # CIWorkflow( # arch="linux", # build_environment="pure_torch-linux-xenial-py3.6-gcc5.4", diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml new file mode 100644 index 0000000000000..402ce38129052 --- /dev/null +++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml @@ -0,0 +1,430 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_ci_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: parallelnative-linux-xenial-py3.6-gcc5.4 + +on: + pull_request: + types: [unassigned] + push: + branches: + - master + - release/* + workflow_dispatch: + +env: + BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.6-gcc5.4 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + TORCH_CUDA_ARCH_LIST: 5.2 + IN_CI: 1 + # This is used for the phase of adding wheel tests only, will be removed once completed + IN_WHEEL_TEST: 1 + # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + +concurrency: + group: parallelnative-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + +jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + steps: + - name: noop + run: echo running ciflow_should_run + calculate-docker-image: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.2xlarge + needs: [ciflow_should_run] + env: + DOCKER_BUILDKIT: 1 + timeout-minutes: 90 + outputs: + docker_image: ${{ steps.calculate-tag.outputs.docker_image }} + steps: + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: Checkout PyTorch + uses: actions/checkout@v2 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh + - name: Calculate docker image tag + id: calculate-tag + run: | + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "::set-output name=docker_tag::${DOCKER_TAG}" + echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" + - name: Check if image should be built + id: check + env: + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} + BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} + run: | + set -x + # Check if image already exists, if it does then skip building it + if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then + exit 0 + fi + if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then + # if we're on the base branch then use the parent commit + MERGE_BASE=$(git rev-parse HEAD~) + else + # otherwise we're on a PR, so use the most recent base commit + MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") + fi + # Covers the case where a previous tag doesn't exist for the tree + # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly + if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then + echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" + exit 1 + fi + PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") + # If no image exists but the hash is the same as the previous hash then we should error out here + if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then + echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" + echo " contact the PyTorch team to restore the original images" + exit 1 + fi + echo ::set-output name=rebuild::yes + - name: Build and push docker image + if: ${{ steps.check.outputs.rebuild }} + env: + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} + DOCKER_SKIP_S3_UPLOAD: 1 + run: | + export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} + cd .circleci/docker && ./build_docker.sh + + build: + runs-on: linux.2xlarge + needs: [calculate-docker-image, ciflow_should_run] + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-build + steps: + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: actions/checkout@v2 + with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Build PyTorch + run: | + docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} + CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} + CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} + CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + run: | + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Archive artifacts into zip + run: | + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json + - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + name: Store PyTorch Build Artifacts on S3 + with: + name: ${{ env.BUILD_ENVIRONMENT }} + retention-days: 14 + if-no-files-found: error + path: + artifacts.zip + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Prune all of the docker images + docker system prune -af + + generate-test-matrix: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: ubuntu-18.04 + needs: [ciflow_should_run] + env: + TEST_RUNNER_TYPE: linux.2xlarge + ENABLE_DISTRIBUTED_TEST: 1 + ENABLE_JIT_LEGACY_TEST: '' + ENABLE_MULTIGPU_TEST: '' + ENABLE_NOGPU_NO_AVX_TEST: '' + ENABLE_NOGPU_NO_AVX2_TEST: '' + ENABLE_SLOW_TEST: '' + NUM_TEST_SHARDS: 1 + MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu + NOGPU_RUNNER_TYPE: linux.2xlarge + PR_BODY: ${{ github.event.pull_request.body }} + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} + ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} + container: + image: python:3.9 + steps: + - name: Install dependencies + run: pip install typing-extensions + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - name: Generating test matrix + id: set-matrix + run: .github/scripts/generate_pytorch_test_matrix.py + + test: + needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run] + strategy: + matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} + steps: + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: actions/checkout@v2 + with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Test PyTorch + env: + BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.6-gcc5.4-${{ matrix.config }} + PR_NUMBER: ${{ github.event.pull_request.number }} + run: | + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + if [[ $NUM_TEST_SHARDS -ne 2 ]]; then + export SHARD_NUMBER=0 + fi + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086 + docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e CONTINUE_THROUGH_ERROR \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Zip test reports for upload + if: always() + env: + COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml' + - uses: actions/upload-artifact@v2 + name: Store PyTorch Test Reports + if: always() + with: + name: test-reports-${{ matrix.config }} + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + name: Store PyTorch Test Reports on S3 + if: always() + with: + name: test-reports-${{ matrix.config }} + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test + CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} + CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} + CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml new file mode 100644 index 0000000000000..59eceb58ea230 --- /dev/null +++ b/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml @@ -0,0 +1,430 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_ci_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: paralleltbb-linux-xenial-py3.6-gcc5.4 + +on: + pull_request: + types: [unassigned] + push: + branches: + - master + - release/* + workflow_dispatch: + +env: + BUILD_ENVIRONMENT: paralleltbb-linux-xenial-py3.6-gcc5.4 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + TORCH_CUDA_ARCH_LIST: 5.2 + IN_CI: 1 + # This is used for the phase of adding wheel tests only, will be removed once completed + IN_WHEEL_TEST: 1 + # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + +concurrency: + group: paralleltbb-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + +jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + steps: + - name: noop + run: echo running ciflow_should_run + calculate-docker-image: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.2xlarge + needs: [ciflow_should_run] + env: + DOCKER_BUILDKIT: 1 + timeout-minutes: 90 + outputs: + docker_image: ${{ steps.calculate-tag.outputs.docker_image }} + steps: + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: Checkout PyTorch + uses: actions/checkout@v2 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh + - name: Calculate docker image tag + id: calculate-tag + run: | + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "::set-output name=docker_tag::${DOCKER_TAG}" + echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" + - name: Check if image should be built + id: check + env: + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} + BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} + run: | + set -x + # Check if image already exists, if it does then skip building it + if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then + exit 0 + fi + if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then + # if we're on the base branch then use the parent commit + MERGE_BASE=$(git rev-parse HEAD~) + else + # otherwise we're on a PR, so use the most recent base commit + MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") + fi + # Covers the case where a previous tag doesn't exist for the tree + # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly + if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then + echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" + exit 1 + fi + PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") + # If no image exists but the hash is the same as the previous hash then we should error out here + if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then + echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" + echo " contact the PyTorch team to restore the original images" + exit 1 + fi + echo ::set-output name=rebuild::yes + - name: Build and push docker image + if: ${{ steps.check.outputs.rebuild }} + env: + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} + DOCKER_SKIP_S3_UPLOAD: 1 + run: | + export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} + cd .circleci/docker && ./build_docker.sh + + build: + runs-on: linux.2xlarge + needs: [calculate-docker-image, ciflow_should_run] + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-build + steps: + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: actions/checkout@v2 + with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Build PyTorch + run: | + docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} + CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} + CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} + CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + run: | + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Archive artifacts into zip + run: | + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json + - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + name: Store PyTorch Build Artifacts on S3 + with: + name: ${{ env.BUILD_ENVIRONMENT }} + retention-days: 14 + if-no-files-found: error + path: + artifacts.zip + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Prune all of the docker images + docker system prune -af + + generate-test-matrix: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: ubuntu-18.04 + needs: [ciflow_should_run] + env: + TEST_RUNNER_TYPE: linux.2xlarge + ENABLE_DISTRIBUTED_TEST: 1 + ENABLE_JIT_LEGACY_TEST: '' + ENABLE_MULTIGPU_TEST: '' + ENABLE_NOGPU_NO_AVX_TEST: '' + ENABLE_NOGPU_NO_AVX2_TEST: '' + ENABLE_SLOW_TEST: '' + NUM_TEST_SHARDS: 1 + MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu + NOGPU_RUNNER_TYPE: linux.2xlarge + PR_BODY: ${{ github.event.pull_request.body }} + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} + ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} + container: + image: python:3.9 + steps: + - name: Install dependencies + run: pip install typing-extensions + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - name: Generating test matrix + id: set-matrix + run: .github/scripts/generate_pytorch_test_matrix.py + + test: + needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run] + strategy: + matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} + steps: + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: actions/checkout@v2 + with: + fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + submodules: recursive + - name: Display EC2 information + shell: bash + run: | + .github/scripts/display_ec2_information.sh + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Test PyTorch + env: + BUILD_ENVIRONMENT: paralleltbb-linux-xenial-py3.6-gcc5.4-${{ matrix.config }} + PR_NUMBER: ${{ github.event.pull_request.number }} + run: | + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + if [[ $NUM_TEST_SHARDS -ne 2 ]]; then + export SHARD_NUMBER=0 + fi + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086 + docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e CONTINUE_THROUGH_ERROR \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Zip test reports for upload + if: always() + env: + COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml' + - uses: actions/upload-artifact@v2 + name: Store PyTorch Test Reports + if: always() + with: + name: test-reports-${{ matrix.config }} + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + name: Store PyTorch Test Reports on S3 + if: always() + with: + name: test-reports-${{ matrix.config }} + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-test + CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} + CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} + CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # Prune all of the docker images + docker system prune -af From 8b6266fe4f2986f4707bcd884e16d50728191214 Mon Sep 17 00:00:00 2001 From: Facebook Community Bot Date: Sat, 28 Aug 2021 11:50:49 -0700 Subject: [PATCH 324/530] Automated submodule update: FBGEMM (#64129) Summary: This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM). New submodule commit: https://github.com/pytorch/FBGEMM/commit/f14e79481460a7c0dedf452a258072231cb343e6 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64129 Test Plan: Ensure that CI jobs succeed on GitHub before landing. Reviewed By: jspark1105 Differential Revision: D30621549 fbshipit-source-id: 34c109e75c96a261bf370f7a06dbb8b9004860ab --- third_party/fbgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/fbgemm b/third_party/fbgemm index d4902e94367b9..e922280540acf 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit d4902e94367b9f074cadd29d7dc5ef6b0c69c6c1 +Subproject commit e922280540acf7920ea21f99b8db064f89bb8c11 From aefa2f3e643ab50fe4d8238ccdeb980143b6b454 Mon Sep 17 00:00:00 2001 From: Ilqar Ramazanli Date: Sat, 28 Aug 2021 15:54:53 -0700 Subject: [PATCH 325/530] To add RMSProp algorithm documentation (#63721) Summary: It has been discussed before that adding description of Optimization algorithms to PyTorch Core documentation may result in a nice Optimization research tutorial. In the following tracking issue we mentioned about all the necessary algorithms and links to the originally published paper https://github.com/pytorch/pytorch/issues/63236. In this PR we are adding description of RMSProp to the documentation. For more details, we refer to the paper https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf RMSProp Pull Request resolved: https://github.com/pytorch/pytorch/pull/63721 Reviewed By: albanD Differential Revision: D30612426 Pulled By: iramazanli fbshipit-source-id: c3ac630a9658d1282866b53c86023ac10cf95398 --- torch/optim/rmsprop.py | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py index 4aab0b3116fdb..dc72181b351f8 100644 --- a/torch/optim/rmsprop.py +++ b/torch/optim/rmsprop.py @@ -6,15 +6,44 @@ class RMSprop(Optimizer): r"""Implements RMSprop algorithm. - Proposed by G. Hinton in his - `course `_. - - The centered version first appears in `Generating Sequences + .. math:: + \begin{aligned} + &\rule{110mm}{0.4pt} \\ + &\textbf{input} : \alpha \text{ (alpha)},\: \gamma \text{ (lr)}, + \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)} \\ + &\hspace{13mm} \lambda \text{ (weight decay)},\: \mu \text{ (momentum)},\: centered\\ + &\textbf{initialize} : v_0 \leftarrow 0 \text{ (square average)}, \: + \textbf{b}_0 \leftarrow 0 \text{ (buffer)}, \: g^{ave}_0 \leftarrow 0 \\[-1.ex] + &\rule{110mm}{0.4pt} \\ + &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do} \\ + &\hspace{5mm}g_t \leftarrow \nabla_{\theta} f_t (\theta_{t-1}) \\ + &\hspace{5mm}if \: \lambda \neq 0 \\ + &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1} \\ + &\hspace{5mm}v_t \leftarrow \alpha v_{t-1} + (1 - \alpha) g^2_t + \hspace{8mm} \\ + &\hspace{5mm} \tilde{v_t} \leftarrow v_t \\ + &\hspace{5mm}if \: centered \\ + &\hspace{10mm} g^{ave}_t \leftarrow g^{ave}_{t-1} \alpha + (1-\alpha) g_t \\ + &\hspace{10mm} \tilde{v_t} \leftarrow \tilde{v_t} - \big(g^{ave}_{t} \big)^2 \\ + &\hspace{5mm}if \: \mu > 0 \\ + &\hspace{10mm} \textbf{b}_t\leftarrow \mu \textbf{b}_{t-1} + + g_t/ \big(\sqrt{\tilde{v_t}} + \epsilon \big) \\ + &\hspace{10mm} \theta_t \leftarrow \theta_{t-1} - \gamma \textbf{b}_t \\ + &\hspace{5mm} else \\ + &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - + \gamma g_t/ \big(\sqrt{\tilde{v_t}} + \epsilon \big) \hspace{3mm} \\ + &\rule{110mm}{0.4pt} \\[-1.ex] + &\bf{return} \: \theta_t \\[-1.ex] + &\rule{110mm}{0.4pt} \\[-1.ex] + \end{aligned} + + For further details regarding the algorithm we refer to + `lecture notes `_ by G. Hinton. + and centered version `Generating Sequences With Recurrent Neural Networks `_. - The implementation here takes the square root of the gradient average before adding epsilon (note that TensorFlow interchanges these two operations). The effective - learning rate is thus :math:`\alpha/(\sqrt{v} + \epsilon)` where :math:`\alpha` + learning rate is thus :math:`\gamma/(\sqrt{v} + \epsilon)` where :math:`\gamma` is the scheduled learning rate and :math:`v` is the weighted moving average of the squared gradient. From 4f969db325a7a70878bd3eae5bbb3fecd598d4ca Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Sat, 28 Aug 2021 19:18:10 -0700 Subject: [PATCH 326/530] [nnc] Fix batchnorm implementation (#64112) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64112 Fixes #64062 Test Plan: Imported from OSS Reviewed By: zhxchen17 Differential Revision: D30622897 Pulled By: bertmaher fbshipit-source-id: 7d7c6131aa786e61fa1d0a517288396a0bdb1d22 --- test/test_jit_fuser_te.py | 25 ++++++++++++++++++++ torch/csrc/jit/tensorexpr/operators/norm.cpp | 19 +++++---------- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 014f142cf1443..6d2432aa151f8 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1912,6 +1912,31 @@ def eager(x): x = torch.ones((8, 1)) torch.testing.assert_close(eager(x), script(x)) + def test_batch_norm(self): + def test(fn, args): + trace = torch.jit.trace(fn, args) + self.assertAllFused(trace.graph_for(*args)) + torch.testing.assert_allclose(fn(*args), trace(*args)) + + def bn(i, x): + return torch.batch_norm(i, x, x, x, x, False, 0.1, 1e-4, False).relu() + + def bn_no_weight(i, x): + return torch.batch_norm(i, None, x, x, x, False, 0.1, 1e-4, False).relu() + + def bn_no_bias(i, x): + return torch.batch_norm(i, x, None, x, x, False, 0.1, 1e-4, False).relu() + + def bn_neither(i, x): + return torch.batch_norm(i, None, None, x, x, False, 0.1, 1e-4, False).relu() + + for device in self.devices: + i = torch.randn(4, 16, 32, 40, device=device) + x = torch.randn(16, device=device) + for fn in [bn, bn_no_weight, bn_no_bias, bn_neither]: + test(fn, (i, x)) + + works_list = [ '__radd__', '__rdiv__', diff --git a/torch/csrc/jit/tensorexpr/operators/norm.cpp b/torch/csrc/jit/tensorexpr/operators/norm.cpp index 610f928d4e0b8..2e19d735d1809 100644 --- a/torch/csrc/jit/tensorexpr/operators/norm.cpp +++ b/torch/csrc/jit/tensorexpr/operators/norm.cpp @@ -38,11 +38,15 @@ Tensor computeBatchNorm( constant(inputs[7]) // eps }; + ExprHandle weight = FloatImm::make(1); + ExprHandle bias = FloatImm::make(0); if (hasWeight) { - exprInputs.push_back(tensorOrConstant(inputs[1], {c})); + weight = tensorOrConstant(inputs[1], {c}); + exprInputs.push_back(weight); } if (hasBias) { - exprInputs.push_back(tensorOrConstant(inputs[2], {c})); + bias = tensorOrConstant(inputs[2], {c}); + exprInputs.push_back(bias); } promoteInputs(exprInputs); @@ -50,18 +54,7 @@ Tensor computeBatchNorm( ExprHandle mean = exprInputs[1]; ExprHandle var = exprInputs[2]; ExprHandle eps = exprInputs[3]; - ExprHandle weight = FloatImm::make(1); - ExprHandle bias = FloatImm::make(0); - - if (hasWeight) { - weight = exprInputs[4]; - } - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - if (hasBias) { - bias = exprInputs[5]; - } - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) auto inv_var = rsqrt(var + eps); auto alpha = inv_var * weight; auto beta = bias - mean * alpha; From 405c15516c3a052fbad33fa8fcb6cde84ebc97d8 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Sat, 28 Aug 2021 19:57:10 -0700 Subject: [PATCH 327/530] Parse int64 sizes/strides (#64076) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64076 We were parsing sizes into int32s, so if you had a tensor with more than 2^32 elements, you couldn't represent it. ghstack-source-id: 136933273 Test Plan: parseIR with size of 4e9 Reviewed By: ZolotukhinM Differential Revision: D30521116 fbshipit-source-id: 1e28e462cba52d648e0e2acb4e234d86aae25a3e --- torch/csrc/jit/frontend/schema_type_parser.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/frontend/schema_type_parser.cpp b/torch/csrc/jit/frontend/schema_type_parser.cpp index db1a1e83bc7ce..b4e6ca880ebce 100644 --- a/torch/csrc/jit/frontend/schema_type_parser.cpp +++ b/torch/csrc/jit/frontend/schema_type_parser.cpp @@ -235,7 +235,7 @@ TypePtr SchemaTypeParser::parseRefinedTensor() { const std::string& num = L.expect(TK_NUMBER).text(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::string::size_type num_len; - size_t stride = c10::stoi(num, &num_len); + auto stride = c10::stoll(num, &num_len); strides.push_back(stride); }); return; @@ -260,7 +260,7 @@ TypePtr SchemaTypeParser::parseRefinedTensor() { const std::string& num = L.expect(TK_NUMBER).text(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::string::size_type num_len; - size_t dim = c10::stoi(num, &num_len); + auto dim = c10::stoll(num, &num_len); dims.emplace_back(dim); }); if (seen_strides) { From 2e6221a232d39917e2736b248c53fa85dfb8986e Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Sat, 28 Aug 2021 19:57:10 -0700 Subject: [PATCH 328/530] [nnc] Make 64-bit dimensions work (#64077) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64077 We were assuming kernel dimensions fit in 32 bits (the old fuser made this assumption too), but we should be able to support 64. ghstack-source-id: 136933272 Test Plan: unit tests; new IR level test with huge sizes Reviewed By: ZolotukhinM Differential Revision: D30596689 fbshipit-source-id: 23b7e393a2ebaecb0c391a6b1f0c4b05a98bcc94 --- test/cpp/tensorexpr/test_kernel.cpp | 40 ++++-- test/cpp/tensorexpr/test_llvm.cpp | 78 ++++++----- test/cpp/tensorexpr/test_loopnest.cpp | 7 +- test/cpp/tensorexpr/test_reductions.cpp | 1 - torch/csrc/jit/tensorexpr/block_codegen.cpp | 11 +- .../csrc/jit/tensorexpr/bounds_inference.cpp | 2 +- torch/csrc/jit/tensorexpr/bounds_overlap.cpp | 13 +- torch/csrc/jit/tensorexpr/cuda_codegen.cpp | 19 +-- torch/csrc/jit/tensorexpr/eval.cpp | 56 ++++++-- torch/csrc/jit/tensorexpr/eval.h | 14 ++ torch/csrc/jit/tensorexpr/expr.h | 2 +- torch/csrc/jit/tensorexpr/ir.cpp | 6 +- torch/csrc/jit/tensorexpr/ir.h | 24 ++++ torch/csrc/jit/tensorexpr/ir_printer.cpp | 8 ++ torch/csrc/jit/tensorexpr/ir_simplifier.cpp | 129 ++++++++---------- torch/csrc/jit/tensorexpr/kernel.cpp | 81 +++++------ torch/csrc/jit/tensorexpr/kernel.h | 4 +- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 39 +++--- torch/csrc/jit/tensorexpr/llvm_jit.h | 4 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 114 +++++++++------- .../jit/tensorexpr/mem_dependency_checker.cpp | 27 ++-- torch/csrc/jit/tensorexpr/registerizer.cpp | 5 +- torch/csrc/jit/tensorexpr/tensor.cpp | 7 +- 23 files changed, 397 insertions(+), 294 deletions(-) diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 625fadb811710..f4d3b16b964f2 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -198,6 +198,22 @@ TEST_F(Kernel, _3) { } } +TEST_F(Kernel, Huge) { + const auto graph_string = R"IR( + graph(%x.1 : Float(4000000000, strides=[1], requires_grad=0, device=cpu)): + %1 : int = prim::Constant[value=0]() + %2 : Float(1, 4000000000, strides=[4000000000, 1], requires_grad=0, device=cpu) = aten::unsqueeze(%x.1, %1) + %3 : Float(1, 4000000000, strides=[4000000000, 1], requires_grad=0, device=cpu) = aten::relu(%2) + return (%3))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + TensorExprKernel k(graph); + std::ostringstream oss; + oss << *k.getCodeGenStmt(); + const std::string& verification_pattern = "# CHECK: 4000000000"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); +} + TEST_F(Kernel, ParallelStrided) { const auto graph_string = R"IR( graph(%0 : Float(5, 3, 40005, strides=[120015, 40005, 1], device=cpu), @@ -786,9 +802,9 @@ TEST_F(Kernel, SumOneAxis) { // Check the IR we produced const std::string& verification_pattern = R"IR( -# CHECK: for (int v = 0; v < +# CHECK: for (int64_t v = 0ll; v < # CHECK-NEXT: sum -# CHECK-NEXT: for (int v_1 = 0; v_1 < +# CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 < # CHECK-NEXT: sum)IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -847,10 +863,10 @@ TEST_F(Kernel, SumMultipleAxes) { // Check the IR we produced const std::string& verification_pattern = R"IR( -# CHECK: int v = 0 -# CHECK: int v_1 = 0 -# CHECK: int v_2 = 0 -# CHECK: int v_3 = 0 +# CHECK: int64_t v = 0 +# CHECK: int64_t v_1 = 0 +# CHECK: int64_t v_2 = 0 +# CHECK: int64_t v_3 = 0 # CHECK: sum)IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -1115,8 +1131,8 @@ TEST_F(Kernel, InlineProducerIntoReduction) { // We should have only one loop in the end. const std::string& verification_pattern = R"IR( - # CHECK: for (int v = 0; v < 5; - # CHECK-NEXT: for (int v_1 = 0; v_1 < 3; + # CHECK: for (int64_t v = 0ll; v < 5 + # CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 < 3 # CHECK-NEXT: sum # CHECK-NOT: for)IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -1154,11 +1170,11 @@ TEST_F(Kernel, InlineReductionIntoConsumer) { // We should have two loops in the end. const std::string& verification_pattern = R"IR( - # CHECK: for (int v = 0; v < 5; - # CHECK-NEXT: for (int v_1 = 0; v_1 < 3; + # CHECK: for (int64_t v = 0ll; v < 5 + # CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 < 3 # CHECK-NEXT: sum - # CHECK: for (int v_2 = 0; v_2 < 5; - # CHECK-NEXT: for (int v_3 = 0; v_3 < 3; + # CHECK: for (int64_t v_2 = 0ll; v_2 < 5 + # CHECK-NEXT: for (int64_t v_3 = 0ll; v_3 < 3 # CHECK-NEXT: aten_mul # CHECK-NOT: for)IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 139763b071317..0e5cf5eb03a3d 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -1501,42 +1501,54 @@ TEST(LLVM, RFactorVectorizedReduction) { ExpectAllNear(b_v, b_ref, 1e-5); } -TEST(LLVM, SimpleParallel) { - for (int test_cfg = 0; test_cfg < 4; test_cfg++) { - // Compute a simple operation, and try all loop-axis combination to be - // parallel or sequential. - const int M = 4; - const int N = 6; - Tensor f = Compute( - "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) { - return cast(m + n); - }); - LoopNest loop_nest({f}); - auto const& loops = loop_nest.getLoopStmtsFor(f); - ForPtr m = loops[0]; - ForPtr n = loops[1]; - if (test_cfg & 0x1) { - m->set_parallel(); - } - if (test_cfg & 0x2) { - n->set_parallel(); - } - loop_nest.prepareForCodegen(); - StmtPtr stmt = loop_nest.root_stmt(); - LLVMCodeGen cg(stmt, {f}); +template +static void testSimpleParallel() { + // Compute a simple operation, and try all loop-axis combination to be + // parallel or sequential. + const int M = 4; + const int N = 6; + Tensor f = Compute( + "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) { + return cast(m + n); + }); + LoopNest loop_nest({f}); + auto const& loops = loop_nest.getLoopStmtsFor(f); + ForPtr m = loops[0]; + ForPtr n = loops[1]; + if (outer) { + m->set_parallel(); + } + if (inner) { + n->set_parallel(); + } + loop_nest.prepareForCodegen(); + StmtPtr stmt = loop_nest.root_stmt(); + LLVMCodeGen cg(stmt, {f}); - PaddedBuffer f_v(M, N, "f_v"); - std::vector args({f_v.data()}); - int value = cg.value(args); - ASSERT_EQ(value, 0); - PaddedBuffer f_ref(M, N, "f_ref"); - for (int m = 0; m < M; m++) { - for (int n = 0; n < N; n++) { - f_ref(m, n) = m + n; - } + PaddedBuffer f_v(M, N, "f_v"); + std::vector args({f_v.data()}); + int value = cg.value(args); + ASSERT_EQ(value, 0); + PaddedBuffer f_ref(M, N, "f_ref"); + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + f_ref(m, n) = m + n; } - ExpectAllNear(f_v, f_ref, 1e-5); } + ExpectAllNear(f_v, f_ref, 1e-5); +} + +TEST(LLVM, SimpleParallelSS) { + testSimpleParallel(); +} +TEST(LLVM, SimpleParallelSP) { + testSimpleParallel(); +} +TEST(LLVM, SimpleParallelPS) { + testSimpleParallel(); +} +TEST(LLVM, SimpleParallelPP) { + testSimpleParallel(); } TEST(LLVM, CompositeParallel) { diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 28934f622d057..c2b33e2a184d2 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -4734,8 +4734,8 @@ TEST(LoopNest, VectorizeUse) { } const char* int64Loop = R"IR( -# CHECK: for (int64_t n = 0; n < 12; n++) { -# CHECK: b[n] = (a[n]) + 1; +# CHECK: for (int64_t n = 0ll; n < 12ll; n++) { +# CHECK: b[n] = (a[n]) + 1ll; # CHECK: } )IR"; @@ -4744,7 +4744,8 @@ TEST(LoopNest, Int64Direct) { Placeholder a("a", kLong, {N}); Placeholder b("b", kLong, {N}); VarHandle n("n", kLong); - StmtPtr s = For::make(n, 0, N, b.store({n}, a.load({n}) + LongImm::make(1l))); + StmtPtr s = For::make( + n, LongImm::make(0l), N, b.store({n}, a.load({n}) + LongImm::make(1l))); s = IRSimplifier::simplify(s); std::ostringstream oss; oss << *s; diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index 411b58db57f9e..3d2c0ecc27bfe 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -1712,7 +1712,6 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) { #CHECK-NOT: tmp )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); - SimpleIREvaluator cg(s, {b, c, m, n, k}); cg.call({in, out, M, N, K}); diff --git a/torch/csrc/jit/tensorexpr/block_codegen.cpp b/torch/csrc/jit/tensorexpr/block_codegen.cpp index 1ae3330799c64..51b7b77f6d39d 100644 --- a/torch/csrc/jit/tensorexpr/block_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/block_codegen.cpp @@ -76,7 +76,7 @@ void BlockAnalysis::visit(ForPtr v) { v->body()->accept(this); } else if (loop_options.is_gpu_thread_index()) { auto block_size = v->stop(); - block_size_ = to(block_size)->value(); + block_size_ = *intValue(block_size); v->body()->accept(this); } else { IRVisitor::visit(v); @@ -185,15 +185,14 @@ void BlockPrinter::PrintArguments(const std::unordered_set& bufs) { // The dims for the multi-dim tensors for (unsigned long d = 0; d < num_dims; d++) { - auto dim_val = to(multidimbuf->dim(d)); - this->dim_values_map.emplace(this->dim_names[d], dim_val->value()); + auto dim_val = *intValue(multidimbuf->dim(d)); + this->dim_values_map.emplace(this->dim_names[d], dim_val); } // The dimensions for the flattened tensors - auto val = to(buf->dim(0)); + auto val = *intValue(buf->dim(0)); if (block_analysis_->is_buf_store_target(buf)) { - this->dim_values_map.emplace( - this->flat_dim_names[num_dims - 1], val->value()); + this->dim_values_map.emplace(this->flat_dim_names[num_dims - 1], val); } } diff --git a/torch/csrc/jit/tensorexpr/bounds_inference.cpp b/torch/csrc/jit/tensorexpr/bounds_inference.cpp index 55dbacf087899..649fd0e69da8e 100644 --- a/torch/csrc/jit/tensorexpr/bounds_inference.cpp +++ b/torch/csrc/jit/tensorexpr/bounds_inference.cpp @@ -185,7 +185,7 @@ std::vector getBoundExtents( std::vector extents; for (size_t i = 0; i < starts.size(); ++i) { ExprPtr dim = IRSimplifier::simplify( - alloc(alloc(stops[i], starts[i]), alloc(1))); + alloc(alloc(stops[i], starts[i]), immLike(stops[i], 1))); extents.push_back(dim); } diff --git a/torch/csrc/jit/tensorexpr/bounds_overlap.cpp b/torch/csrc/jit/tensorexpr/bounds_overlap.cpp index 4ac5c6b96fb9a..fdfff12ad7666 100644 --- a/torch/csrc/jit/tensorexpr/bounds_overlap.cpp +++ b/torch/csrc/jit/tensorexpr/bounds_overlap.cpp @@ -130,8 +130,8 @@ std::vector subtractBound(Bound a, Bound b, OverlapKind overlap) { auto vars = VarFinder::find(lowDiff); if (vars.size() == 1) { lowDiff = IRSimplifier::simplify(alloc( - SubstituteInClone(b.start, {{*vars.begin(), alloc(1)}}), - SubstituteInClone(a.start, {{*vars.begin(), alloc(1)}}))); + SubstituteInClone(b.start, {{*vars.begin(), immLike(b.start, 1)}}), + SubstituteInClone(a.start, {{*vars.begin(), immLike(a.start, 1)}}))); } } @@ -139,8 +139,8 @@ std::vector subtractBound(Bound a, Bound b, OverlapKind overlap) { auto vars = VarFinder::find(highDiff); if (vars.size() == 1) { highDiff = IRSimplifier::simplify(alloc( - SubstituteInClone(b.end, {{*vars.begin(), alloc(1)}}), - SubstituteInClone(a.end, {{*vars.begin(), alloc(1)}}))); + SubstituteInClone(b.end, {{*vars.begin(), immLike(b.end, 1)}}), + SubstituteInClone(a.end, {{*vars.begin(), immLike(a.end, 1)}}))); } } @@ -157,12 +157,13 @@ std::vector subtractBound(Bound a, Bound b, OverlapKind overlap) { if (hasHead) { res.emplace_back( - a.start, IRSimplifier::simplify(alloc(b.start, alloc(1)))); + a.start, + IRSimplifier::simplify(alloc(b.start, immLike(b.start, 1)))); } if (hasTail) { ExprPtr tailStart = - IRSimplifier::simplify(alloc(b.end, alloc(1))); + IRSimplifier::simplify(alloc(b.end, immLike(b.end, 1))); res.emplace_back(tailStart, a.end); } diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index b342f1464b0c2..30d42075189fb 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -45,18 +45,9 @@ class ScopedVarName { VarPtr var_ = nullptr; }; -static int as_int(ExprPtr expr) { - auto v = to(expr); - if (!v) { - throw malformed_input( - "cuda_codegen: non Int expr interpreted as int", expr); - } - - return v->value(); -} - static bool is_zero(ExprPtr expr) { - return as_int(expr) == 0; + auto v = intValue(expr); + return v && *v == 0; } static const at::cuda::NVRTC& nvrtc() { @@ -222,11 +213,11 @@ void CudaPrinter::print_flat_alloc(AllocatePtr alloc) { // TODO: this should be merged with the storage flattener. int64_t flat_size = 1; for (auto dim : dims) { - IntImmPtr dim_i = to(dim); + auto dim_i = intValue(dim); if (dim_i) { - flat_size *= dim_i->value(); + flat_size *= *dim_i; } else { - throw std::runtime_error("Only IntImm dimensions are supported for now"); + throw std::runtime_error("Only integer dimensions are supported for now"); } } os() << dtypeToCppString(alloc->dtype()) << " " << (*alloc->buffer_var()) diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp index 05c3ff8245221..e42ce77820e11 100644 --- a/torch/csrc/jit/tensorexpr/eval.cpp +++ b/torch/csrc/jit/tensorexpr/eval.cpp @@ -10,6 +10,17 @@ namespace tensorexpr { RegisterCodeGen ir_eval_codegen_reg("simple_ir_eval"); +int64_t Value::intValue() const { +#define TYPE_CASE(Type, Name) \ + if (dtype_ == k##Name) { \ + return int64_t{Name##values[0]}; \ + } + AT_FORALL_INT_TYPES(TYPE_CASE); +#undef TYPE_CASE + throw unsupported_dtype(); + return 0; +} + template inline typename std::enable_if::value, T>::type mod_value( T lhs, @@ -537,15 +548,16 @@ class SimpleIREvaluatorImpl : public IRVisitor { TORCH_API void visit(ForPtr v) override { ExprPtr var_node = v->var(); v->start()->accept(this); - int start = value_.as(); + auto dtype = value_.dtype(); + auto start = value_.intValue(); v->stop()->accept(this); - int stop = value_.as(); + auto stop = value_.intValue(); if (eval_context_.count(var_node)) { throw malformed_input("could not find var_node in For context", v); } - for (int i = start; i < stop; i++) { - eval_context_[var_node] = Value(i); + for (auto i = start; i < stop; i++) { + eval_context_[var_node] = Value(dtype, i); if (v->body()) { v->body()->accept(this); } @@ -555,9 +567,9 @@ class SimpleIREvaluatorImpl : public IRVisitor { TORCH_API void visit(RampPtr v) override { v->base()->accept(this); - int base = value().as(); + auto base = value().intValue(); v->stride()->accept(this); - int stride = value().as(); + auto stride = value().intValue(); int lanes = v->lanes(); std::vector values(lanes); @@ -609,6 +621,24 @@ class SimpleIREvaluatorImpl : public IRVisitor { } } + template + std::vector toLongVec(T&& t) { + return std::vector{std::begin(t), std::end(t)}; + } + + std::vector indexVec(const Value& v) { + switch (v.dtype().scalar_type()) { +#define TYPE_CASE(Type, Name) \ + case ScalarType::Name: \ + return toLongVec(v.as_vec()); + AT_FORALL_INT_TYPES(TYPE_CASE); +#undef TYPE_CASE + default: + throw unsupported_dtype(); + } + return {}; + } + TORCH_API void visit(LoadPtr v) override { auto iter = buffer_mapping_.find(v->buf()); if (iter == buffer_mapping_.end()) { @@ -618,7 +648,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { ExprPtr flat_idx = flatten_index(v->buf()->dims(), v->indices()); flat_idx->accept(this); - std::vector index = value().as_vec(); + auto index = indexVec(value()); ScalarType v_sdtype = v->dtype().scalar_type(); switch (v_sdtype) { #define TYPE_CASE(Type, Name) \ @@ -647,7 +677,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { ExprPtr flat_idx = flatten_index(v->buf()->dims(), v->indices()); flat_idx->accept(this); - std::vector index = value().as_vec(); + auto index = indexVec(value()); ScalarType v_sdtype = v->value()->dtype().scalar_type(); switch (v_sdtype) { @@ -696,7 +726,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { buf_dtypes.push_back((int8_t)b->dtype().scalar_type()); for (ExprPtr dim_expr : b->dims()) { dim_expr->accept(this); - buf_dims.push_back(value().as()); + buf_dims.push_back(value().intValue()); } } for (ExprPtr a : v->args()) { @@ -706,7 +736,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { if (value().dtype() == kLong) { val = value().as(); } else if (value().dtype() == kInt) { - val = value().as(); + val = value().intValue(); } else { throw malformed_input( "extra_args in ExternalCalls must have int64 dtype", v); @@ -789,10 +819,10 @@ class SimpleIREvaluatorImpl : public IRVisitor { void visit(AllocatePtr v) override { BufPtr b = v->buf(); std::vector dims = b->dims(); - int total_byte_size = b->dtype().byte_size(); + int64_t total_byte_size = b->dtype().byte_size(); for (auto& dim : dims) { dim->accept(this); - total_byte_size *= value_.as(); + total_byte_size *= value_.intValue(); } auto int_count = (total_byte_size + sizeof(int) - 1) / sizeof(int); std::unique_ptr> buffer(new std::vector(int_count)); @@ -824,7 +854,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { void visit(CondPtr v) override { v->condition()->accept(this); - if (value().as()) { + if (value().intValue()) { if (v->true_stmt()) { v->true_stmt()->accept(this); } diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h index 38ec99bd431cf..494ba283ea902 100644 --- a/torch/csrc/jit/tensorexpr/eval.h +++ b/torch/csrc/jit/tensorexpr/eval.h @@ -29,6 +29,18 @@ class Value { Intvalues.push_back(0); } + template + Value(Dtype dtype, T v) : dtype_(dtype) { +#define TYPE_CASE(Type, Name) \ + if (dtype == k##Name) { \ + Name##values.push_back(v); \ + return; \ + } + AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); +#undef TYPE_CASE + throw unsupported_dtype(); + } + #define VALUE_CTOR(Type, Name) \ Value(Type v) : dtype_(k##Name) { \ Name##values.push_back(v); \ @@ -50,6 +62,8 @@ class Value { template const std::vector& as_vec() const; + int64_t intValue() const; + Dtype dtype() const { return dtype_; } diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index a4f317f48e666..fbbea12387920 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -319,7 +319,7 @@ class TORCH_API BufHandle : public ExprHandle { // object. For example: VarHandle x('x'); ExprHandle x2 = x; class TORCH_API VarHandle : public ExprHandle { public: - VarHandle() : ExprHandle(nullptr) {} + VarHandle() : ExprHandle() {} explicit VarHandle(Dtype dtype) : ExprHandle(Var::make(dtype)) {} VarHandle(const std::string& name_hint, Dtype dtype) : ExprHandle(Var::make(name_hint, dtype)) {} diff --git a/torch/csrc/jit/tensorexpr/ir.cpp b/torch/csrc/jit/tensorexpr/ir.cpp index f66c0c5ba0701..2680f5366b46e 100644 --- a/torch/csrc/jit/tensorexpr/ir.cpp +++ b/torch/csrc/jit/tensorexpr/ir.cpp @@ -88,17 +88,17 @@ ExprPtr flatten_index( throw malformed_input("dimensions mismatch in flatten_index"); } if (ndim == 0) { - return alloc(0); + return alloc(0); } std::vector strides(ndim); // stride[i] = stride[i+1]*dims[i+1], i < ndim-1 // stride[i] = 1, i = ndim-1 - strides[ndim - 1] = alloc(1); + strides[ndim - 1] = immLike(dims[ndim - 1], 1); for (size_t i = 1; i < ndim; i++) { strides[ndim - 1 - i] = alloc(strides[ndim - i], dims[ndim - i]); } - ExprPtr total_index = alloc(0); + ExprPtr total_index = immLike(indices[0], 0); for (const auto i : c10::irange(ndim)) { total_index = alloc(total_index, alloc(indices[i], strides[i])); } diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h index 7fe1fd1a07abb..1218082e6af98 100644 --- a/torch/csrc/jit/tensorexpr/ir.h +++ b/torch/csrc/jit/tensorexpr/ir.h @@ -344,6 +344,30 @@ ExprPtr getImmediateByType(Dtype dtype, T initialVal) { return getImmediateByType(dtype.scalar_type(), initialVal); } +template +ExprPtr immLike(ExprPtr e, T v) { + return getImmediateByType(e->dtype(), v); +} + +template +ExprPtr immLike(ExprHandle e, T v) { + return immLike(e.node(), v); +} + +inline c10::optional intValue(ExprPtr e) { +#define TYPE_CASE(Type, Name) \ + if (auto v = to(e)) { \ + return v->value(); \ + } + AT_FORALL_INT_TYPES(TYPE_CASE); +#undef TYPE_CASE + return c10::nullopt; +} + +inline c10::optional intValue(ExprHandle e) { + return intValue(e.node()); +} + template T immediateAs(ExprPtr e) { #define TYPE_CASE(Type, Name) \ diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index 2e1fc6e6952a7..ca90d9995e0d2 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -206,11 +206,19 @@ static void formatImm(std::ostream& os, T v) { } } +static void formatIntSuffix(std::ostream& os, int64_t v) { + os << "ll"; +} + +template +static void formatIntSuffix(std::ostream& os, T v) {} + template < typename T, std::enable_if_t::value>* = nullptr> static void formatImm(std::ostream& os, T v) { os << +v; + formatIntSuffix(os, v); } // NOLINTNEXTLINE diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp index 23216dd4002f7..6820bbb5748a2 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp @@ -430,8 +430,7 @@ ExprPtr PolynomialTransformer::mutate(AddPtr v) { // Otherwise this is a new polynomial with no scalar and two variable // terms. - return alloc( - hasher_, getImmediateByType(v->dtype(), 0), lhsTerm, rhsTerm); + return alloc(hasher_, immLike(v, 0), lhsTerm, rhsTerm); } // Adds are commutative. @@ -452,19 +451,17 @@ ExprPtr PolynomialTransformer::mutate(AddPtr v) { // Simple Term with a scalar and variable type. if (scalar) { return alloc( - hasher_, - scalar, - alloc(hasher_, getImmediateByType(v->dtype(), 1), variable)); + hasher_, scalar, alloc(hasher_, immLike(v, 1), variable)); } // If LHS is neither Term not Polynomial, wrap it in a Term. if (!lhsTerm && !lhsPoly) { - lhsTerm = alloc(hasher_, getImmediateByType(v->dtype(), 1), lhs_new); + lhsTerm = alloc(hasher_, immLike(v, 1), lhs_new); } // Same for RHS. if (!rhsTerm && !rhsPoly) { - rhsTerm = alloc(hasher_, getImmediateByType(v->dtype(), 1), rhs_new); + rhsTerm = alloc(hasher_, immLike(v, 1), rhs_new); } // If we now have a poly and a term, we can insert. @@ -480,8 +477,7 @@ ExprPtr PolynomialTransformer::mutate(AddPtr v) { } // If all else fails we have a new Polynomial with two new variable Terms. - return alloc( - hasher_, getImmediateByType(v->dtype(), 0), lhsTerm, rhsTerm); + return alloc(hasher_, immLike(v, 0), lhsTerm, rhsTerm); } ExprPtr PolynomialTransformer::subTerms( @@ -490,7 +486,7 @@ ExprPtr PolynomialTransformer::subTerms( bool negated) { // If RHS not already negated, negate it. if (!negated) { - ExprPtr minusOne = getImmediateByType(rhs->dtype(), -1); + ExprPtr minusOne = immLike(rhs, -1); ExprPtr negateScalar = evaluateOp(alloc(minusOne, rhs->scalar())); rhs = alloc(hasher_, negateScalar, rhs->variables()); } @@ -529,8 +525,7 @@ ExprPtr PolynomialTransformer::subPolynomials( for (auto rt : rhs->variables()) { // Polynomials add their terms, so negate the RHS's Terms. - ExprPtr negated = evaluateOp( - alloc(getImmediateByType(rt->dtype(), -1), rt->scalar())); + ExprPtr negated = evaluateOp(alloc(immLike(rt, -1), rt->scalar())); TermPtr newRHS = alloc(hasher_, negated, rt->variables()); addOrUpdateTerm(varmap, newRHS); } @@ -594,7 +589,7 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) { auto ret = subPolynomials(lhsPoly, rhsPoly); if (!ret) { // Cancelled out completely. - return getImmediateByType(v->dtype(), 0); + return immLike(v, 0); } return ret; } @@ -605,8 +600,8 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) { // Polynomial - Term. if (lhsPoly && rhsTerm) { // Negate the term. - ExprPtr negate = evaluateOp(alloc( - getImmediateByType(rhsTerm->dtype(), -1), rhsTerm->scalar())); + ExprPtr negate = + evaluateOp(alloc(immLike(rhsTerm, -1), rhsTerm->scalar())); TermPtr newTerm = alloc(hasher_, negate, rhsTerm->variables()); return insertTerm(lhsPoly, newTerm); } @@ -614,7 +609,7 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) { // Term - Polynomial. if (rhsPoly && lhsTerm) { // Negate every part of the Polynomial. - ExprPtr minusOne = getImmediateByType(lhsTerm->dtype(), -1); + ExprPtr minusOne = immLike(lhsTerm, -1); ExprPtr negateScalar = evaluateOp(alloc(minusOne, rhsPoly->scalar())); std::vector variables; @@ -645,7 +640,7 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) { ExprPtr newScalar = evaluateOp(alloc(lhs_new, rhsPoly->scalar())); // Negate each term in the Polynomial RHS. - ExprPtr minusOne = getImmediateByType(rhsPoly->dtype(), -1); + ExprPtr minusOne = immLike(rhsPoly, -1); std::vector variables; for (auto t : rhsPoly->variables()) { ExprPtr negate = evaluateOp(alloc(minusOne, t->scalar())); @@ -657,15 +652,14 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) { if (lhsTerm && rhsScalar) { // Negate the constant. - ExprPtr negate = evaluateOp( - alloc(getImmediateByType(rhs_new->dtype(), -1), rhs_new)); + ExprPtr negate = evaluateOp(alloc(immLike(rhs_new, -1), rhs_new)); return alloc(hasher_, negate, lhsTerm); } if (lhsScalar && rhsTerm) { // Negate the RHS Term. - ExprPtr negate = evaluateOp(alloc( - getImmediateByType(rhsTerm->scalar()->dtype(), -1), rhsTerm->scalar())); + ExprPtr negate = evaluateOp( + alloc(immLike(rhsTerm->scalar(), -1), rhsTerm->scalar())); return alloc( hasher_, lhs_new, alloc(hasher_, negate, rhsTerm->variables())); @@ -675,29 +669,24 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) { if (lhsScalar) { // Create a negated term. return alloc( - hasher_, - lhs_new, - alloc(hasher_, getImmediateByType(v->dtype(), -1), rhs_new)); + hasher_, lhs_new, alloc(hasher_, immLike(v, -1), rhs_new)); } if (rhsScalar) { // Negate the scalar. - ExprPtr negate = evaluateOp( - alloc(getImmediateByType(rhs_new->dtype(), -1), rhs_new)); + ExprPtr negate = evaluateOp(alloc(immLike(rhs_new, -1), rhs_new)); return alloc( - hasher_, - negate, - alloc(hasher_, getImmediateByType(v->dtype(), 1), lhs_new)); + hasher_, negate, alloc(hasher_, immLike(v, 1), lhs_new)); } // no scalar... if (!lhsTerm && !lhsPoly) { - lhsTerm = alloc(hasher_, getImmediateByType(v->dtype(), 1), lhs_new); + lhsTerm = alloc(hasher_, immLike(v, 1), lhs_new); } bool createdRHSnegated = false; if (!rhsTerm && !rhsPoly) { - rhsTerm = alloc(hasher_, getImmediateByType(v->dtype(), -1), rhs_new); + rhsTerm = alloc(hasher_, immLike(v, -1), rhs_new); createdRHSnegated = true; } @@ -714,7 +703,7 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) { // Insert wrapper Term into negated RHS Poly. if (rhsPoly) { CHECK(lhsTerm); - ExprPtr minusOne = getImmediateByType(rhsPoly->dtype(), -1); + ExprPtr minusOne = immLike(rhsPoly, -1); ExprPtr newScalar = evaluateOp(alloc(minusOne, rhsPoly->scalar())); // Negate each term in the Polynomial RHS. @@ -728,8 +717,7 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) { return insertTerm(poly, lhsTerm); } - return alloc( - hasher_, getImmediateByType(v->dtype(), 0), lhsTerm, rhsTerm); + return alloc(hasher_, immLike(v, 0), lhsTerm, rhsTerm); } // Multiply two terms together, usually creating a new term with the variable @@ -930,7 +918,7 @@ ExprPtr PolynomialTransformer::mutate(MulPtr v) { // Handle special case mul by 0. if (scalar && immediateEquals(scalar, 0)) { - return getImmediateByType(v->dtype(), 0); + return immLike(v, 0); } // Catch cases of rounding (Div(A/B) * B). @@ -994,13 +982,11 @@ ExprPtr PolynomialTransformer::mutate(MulPtr v) { // Multiplying Polynomial by variable can be wrapped in a term and handled // by polyByTerm also. if (lhsPoly) { - auto term = - alloc(hasher_, getImmediateByType(rhs_new->dtype(), 1), rhs_new); + auto term = alloc(hasher_, immLike(rhs_new, 1), rhs_new); return polyByTerm(lhsPoly, term); } if (rhsPoly) { - auto term = - alloc(hasher_, getImmediateByType(lhs_new->dtype(), 1), lhs_new); + auto term = alloc(hasher_, immLike(lhs_new, 1), lhs_new); return polyByTerm(rhsPoly, term); } @@ -1014,8 +1000,7 @@ ExprPtr PolynomialTransformer::mutate(MulPtr v) { } // Two variables, create a new Term. - return alloc( - hasher_, getImmediateByType(v->dtype(), 1), lhs_new, rhs_new); + return alloc(hasher_, immLike(v, 1), lhs_new, rhs_new); } ExprPtr factorizeDivision(ExprPtr lhs_new, ExprPtr rhs_new) { @@ -1048,10 +1033,8 @@ ExprPtr factorizeDivision(ExprPtr lhs_new, ExprPtr rhs_new) { return nullptr; } - leftScalar = evaluateOp( - alloc
    (leftScalar, getImmediateByType(leftScalar->dtype(), GCD))); - rightScalar = evaluateOp( - alloc
    (rightScalar, getImmediateByType(rightScalar->dtype(), GCD))); + leftScalar = evaluateOp(alloc
    (leftScalar, immLike(leftScalar, GCD))); + rightScalar = evaluateOp(alloc
    (rightScalar, immLike(rightScalar, GCD))); if (lhsTerm) { lhs_new = alloc(lhsTerm->hasher(), leftScalar, lhsTerm->variables()); @@ -1127,12 +1110,12 @@ ExprPtr PolynomialTransformer::mutate(ModPtr v) { // x % 1 == 0. if (rhs_new->isConstant() && immediateEquals(rhs_new, 1)) { // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - return getImmediateByType(v->dtype(), 0); + return immLike(v, 0); } // x % x => 0. if (hasher_.hash(lhs_new) == hasher_.hash(rhs_new)) { - return getImmediateByType(v->dtype(), 0); + return immLike(v, 0); } TermPtr lhsTerm = to(lhs_new); @@ -1149,13 +1132,13 @@ ExprPtr PolynomialTransformer::mutate(ModPtr v) { if (rhs_new->isConstant() && immediateEquals( evaluateOp(alloc(lhsTerm->scalar(), rhs_new)), 0)) { - return getImmediateByType(v->dtype(), 0); + return immLike(v, 0); } // (x * y * z) % x => 0. for (auto component : lhsTerm->variables()) { if (hasher_.hash(component) == hasher_.hash(rhs_new)) { - return getImmediateByType(v->dtype(), 0); + return immLike(v, 0); } } @@ -1189,7 +1172,7 @@ ExprPtr PolynomialTransformer::mutate(ModPtr v) { immediateEquals( evaluateOp(alloc(lhsTerm->scalar(), rhsTerm->scalar())), 0)) { - return getImmediateByType(v->dtype(), 0); + return immLike(v, 0); } } } @@ -1862,7 +1845,7 @@ ExprPtr polyGCD(PolynomialPtr poly) { return nullptr; } - return getImmediateByType(poly->dtype(), GCD); + return immLike(poly, GCD); } // A ModRound is a div-mod-mul in which the divisor in div and multiplier in mul @@ -1981,7 +1964,7 @@ c10::optional isModRound(TermPtr e) { } if (!scalar) { - scalar = getImmediateByType(multiplier->dtype(), 1); + scalar = immLike(multiplier, 1); } // TODO: this leaks memory! @@ -2261,23 +2244,23 @@ ExprPtr TermExpander::mutate(PolynomialPtr v) { } // Negate the term back to positive since we'll be subtracting it. - ExprPtr negated = evaluateOp(alloc( - getImmediateByType(node->scalar()->dtype(), -1), node->scalar())); + ExprPtr negated = + evaluateOp(alloc(immLike(node->scalar(), -1), node->scalar())); TermPtr newRHS = alloc(node->hasher(), negated, node->variables()); lastNode = alloc(lastNode, newRHS->accept_mutator(this)); } if (scalarWritten || immediateEquals(v->scalar(), 0)) { if (!lastNode) { - return getImmediateByType(v->dtype(), 0); + return immLike(v, 0); } return lastNode; } if (immediateIsNegative(v->scalar())) { // Negate the scalar and subtract. - ExprPtr negated = evaluateOp( - alloc(getImmediateByType(lastNode->dtype(), -1), v->scalar())); + ExprPtr negated = + evaluateOp(alloc(immLike(lastNode, -1), v->scalar())); lastNode = alloc(lastNode, evaluateOp(negated)); } else { // we want to avoid a cast to the scalar if it would happen. @@ -2344,7 +2327,7 @@ ExprPtr TermExpander::mutate(MinTermPtr v) { ExprPtr TermExpander::mutate(RoundOffPtr v) { TermPtr term = alloc( simplifier_->hasher(), - getImmediateByType(v->dtype(), 1), + immLike(v, 1), alloc
    (v->lhs(), v->rhs()), v->rhs()); return term->accept_mutator(this); @@ -2352,8 +2335,10 @@ ExprPtr TermExpander::mutate(RoundOffPtr v) { ExprPtr buf_flat_size(BufPtr v) { std::vector dims = v->dims(); - - ExprPtr flattened = getImmediateByType(kInt, 1); + if (dims.size() == 0) { + return alloc(1); + } + ExprPtr flattened = immLike(dims[0], 1); for (auto& dim : dims) { flattened = alloc(flattened, dim); } @@ -2684,7 +2669,7 @@ ExprPtr distributeDiv(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) { return nullptr; } ExprPtr check_n_value = IRSimplifier::simplify( - alloc(rhsScalar, alloc(0), kGT)); + alloc(rhsScalar, immLike(rhsScalar, 0), kGT)); if (!immediateEquals(check_n_value, 1)) { return nullptr; } @@ -2719,7 +2704,7 @@ ExprPtr distributeDiv(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) { // range auto end = got->second.second; ExprPtr check_start = IRSimplifier::simplify( - alloc(start, alloc(0), kGE)); + alloc(start, immLike(start, 0), kGE)); ExprPtr check_end = IRSimplifier::simplify(alloc(end, rhsScalar, kLE)); if (!check_start->isConstant() || !check_end->isConstant() || @@ -2731,7 +2716,7 @@ ExprPtr distributeDiv(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) { // simplify type 1) exprs: '(i+x)/n' => 'x/n' ExprPtr sign_check = - IRSimplifier::simplify(alloc(main, alloc(0), kGE)); + IRSimplifier::simplify(alloc(main, immLike(main, 0), kGE)); ExprPtr main_mod = IRSimplifier::simplify(alloc(main, rhsScalar)); ExprPtr mod_check = IRSimplifier::simplify( alloc(alloc(main_mod, end), rhsScalar, kLE)); @@ -2742,6 +2727,7 @@ ExprPtr distributeDiv(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) { // simplify type 2 exprs: '(i+j*n)/n' => 'j' auto ret_var = to(ret); + // FIXME: Allow any integral type. if (ret_var && ret_var->dtype() == kInt) { // retrieve j's range info auto got = var_bound_info.find(ret_var); @@ -2750,8 +2736,8 @@ ExprPtr distributeDiv(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) { } // check if j is not negative - sign_check = IRSimplifier::simplify( - alloc(got->second.first, alloc(0), kGE)); + sign_check = IRSimplifier::simplify(alloc( + got->second.first, immLike(got->second.first, 0), kGE)); if (sign_check->isConstant() && immediateEquals(sign_check, 1)) { return ret_var; } @@ -2801,7 +2787,7 @@ ExprPtr distributeMod(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) { return nullptr; } ExprPtr check_n_value = IRSimplifier::simplify( - alloc(rhsScalar, alloc(0), kGT)); + alloc(rhsScalar, immLike(rhsScalar, 0), kGT)); if (!immediateEquals(check_n_value, 1)) { return nullptr; } @@ -2838,7 +2824,7 @@ ExprPtr distributeMod(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) { // range auto end = got->second.second; ExprPtr check_start = IRSimplifier::simplify( - alloc(start, alloc(0), kGE)); + alloc(start, immLike(start, 0), kGE)); ExprPtr check_end = IRSimplifier::simplify(alloc(end, rhsScalar, kLE)); if (!check_start->isConstant() || !check_end->isConstant() || @@ -2848,7 +2834,7 @@ ExprPtr distributeMod(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) { // simplify type 1) exprs: '(i+x)%n' => 'i+x%n' ExprPtr sign_check = - IRSimplifier::simplify(alloc(main, alloc(0), kGE)); + IRSimplifier::simplify(alloc(main, immLike(main, 0), kGE)); ExprPtr main_mod = IRSimplifier::simplify(alloc(main, rhsScalar)); ExprPtr mod_check = IRSimplifier::simplify( alloc(alloc(main_mod, end), rhsScalar, kLE)); @@ -2860,6 +2846,7 @@ ExprPtr distributeMod(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) { // simplify type 2) exprs: '(i+j*n)%n' => 'i' ExprPtr main_div = IRSimplifier::simplify(alloc
    (main, rhsScalar)); auto j_var = to(main_div); + // FIXME: Allow any integral type. if (j_var && j_var->dtype() == kInt) { // retrieve j's range info auto got = var_bound_info.find(j_var); @@ -2868,8 +2855,8 @@ ExprPtr distributeMod(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) { } // check if j is not negative - sign_check = IRSimplifier::simplify( - alloc(got->second.first, alloc(0), kGE)); + sign_check = IRSimplifier::simplify(alloc( + got->second.first, immLike(got->second.first, 0), kGE)); if (sign_check->isConstant() && immediateEquals(sign_check, 1)) { return var_key; } @@ -2920,7 +2907,7 @@ ExprPtr SimplifierUnderContext::mutate(ModPtr v) { auto start = got->second.first; auto end = got->second.second; ExprPtr check_start = IRSimplifier::simplify( - alloc(start, alloc(0), kGE)); + alloc(start, immLike(start, 0), kGE)); ExprPtr check_end = IRSimplifier::simplify(alloc(end, rhsScalar, kLE)); if (check_start->isConstant() && check_end->isConstant() && diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index f72fbf7c18c37..0d0d19e004981 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -202,11 +202,11 @@ c10::optional getTensorInfoJit(torch::jit::Value* v) { c10::optional getTensorInfo(BufHandle b) { std::vector dims; for (auto dim : b.dims()) { - auto val = to(dim.node()); + auto val = intValue(dim.node()); if (!val) { return c10::nullopt; } - dims.push_back(val->value()); + dims.push_back(*val); } return TensorInfo{dims, static_cast(b.dtype().scalar_type())}; } @@ -396,7 +396,7 @@ ExprHandle tensorOrConstant( return constant(v); } -size_t normalizeAndCheckIndex(int64_t idx, int64_t list_size) { +int64_t normalizeAndCheckIndex(int64_t idx, int64_t list_size) { if (idx < 0) { // Handle negative indexing idx = list_size + idx; @@ -405,7 +405,7 @@ size_t normalizeAndCheckIndex(int64_t idx, int64_t list_size) { if (idx < 0 || idx >= list_size) { AT_ERROR("Invalid index ", idx, " for list_size", list_size); } - return static_cast(idx); + return idx; } ExprHandle broadcast(BufHandle b, const std::vector& axes) { @@ -441,8 +441,8 @@ std::vector computeIndicesToBroadcast( auto axisIt = outputAxes.rbegin(); auto sizeIt = inputSizes.rbegin(); while (sizeIt != inputSizes.rend()) { - auto const& size = sizeIt->AsNode(); - if (size && size->value() == 1) { + auto const& size = intValue(*sizeIt); + if (size && *size == 1) { bcast.emplace_back(0); } else { bcast.emplace_back(*axisIt); @@ -525,7 +525,9 @@ static at::ScalarType tensorType(BufPtr b) { std::vector bufferSizes(BufPtr b) { std::vector sizes; for (size_t i = 0; i < b->ndim(); i++) { - sizes.push_back(to(b->dim(i))->value()); + auto dim = intValue(b->dim(i)); + TORCH_INTERNAL_ASSERT(dim); + sizes.push_back(*dim); } return sizes; } @@ -543,7 +545,8 @@ ExprHandle TensorExprKernel::chunk( std::vector indices; for (size_t i = 0; i < axes.size(); ++i) { if (i == norm_dim) { - indices.push_back(axes[i] + IntImm::make((int)chunkIdx * (int)step)); + indices.push_back( + axes[i] + ExprHandle(immLike(axes[i], chunkIdx * step))); } else { indices.push_back(axes[i]); } @@ -642,7 +645,7 @@ std::vector TensorExprKernel::sizesFromVaryingShape( const c10::VaryingShape& shape) { std::vector dims; for (const auto i : c10::irange(*shape.size())) { - dims.push_back(IntImm::make(*shape[i])); + dims.push_back(*shape[i]); } return dims; } @@ -664,7 +667,7 @@ std::vector TensorExprKernel::sizesForValue( if (v->type()->isSubtypeOf(FloatType::get()) || v->type()->isSubtypeOf(IntType::get())) { - return {1}; + return {int64_t{1}}; } if (v->type()->isSubtypeOf(NoneType::get())) { return {}; @@ -820,7 +823,7 @@ std::vector TensorExprKernel::inferSizesForValue( TORCH_INTERNAL_ASSERT(n->input(1)->node()->kind() == prim::Constant); int64_t dim = n->input(1)->node()->i(attr::value); auto shape = sizesForValue(inputs[0]); - size_t norm_dim = normalizeAndCheckIndex(dim, shape.size()); + auto norm_dim = normalizeAndCheckIndex(dim, shape.size()); ExprHandle concat_dim_size = 0; for (auto input : inputs) { concat_dim_size = concat_dim_size + sizesForValue(input)[norm_dim]; @@ -889,11 +892,11 @@ ExprHandle clamp( } static bool isOne(ExprHandle e) { - auto const& n = e.AsNode(); + auto const& n = intValue(e); if (!n) { return false; } - return n->value() == 1; + return *n == 1; } std::pair, bool> broadcastShapesImpl( @@ -1150,6 +1153,7 @@ std::pair> processCatList( } return {highType, nonEmptyInputs}; } + Tensor computeCatWoConditionals( const std::vector& inputs, const std::vector& outputShape) { @@ -1184,8 +1188,7 @@ Tensor computeCatWoConditionals( } int64_t concat_dim = c10::get(arg_dim); - size_t norm_concat_dim = - normalizeAndCheckIndex(concat_dim, outputShape.size()); + auto norm_concat_dim = normalizeAndCheckIndex(concat_dim, outputShape.size()); auto gen_code_for_input = [&](const BufHandle& inp, size_t inp_pos, @@ -1196,7 +1199,8 @@ Tensor computeCatWoConditionals( std::vector store_indices(dims.size()); for (size_t i = 0; i < dims.size(); ++i) { for_vars[i] = alloc( - "i" + c10::to_string(inp_pos) + "_" + c10::to_string(i), kInt); + "i" + c10::to_string(inp_pos) + "_" + c10::to_string(i), + dims[i].dtype()); load_indices[i] = for_vars[i]; if (i == norm_concat_dim) { store_indices[i] = alloc(for_vars[i], concat_dim_size); @@ -1209,8 +1213,8 @@ Tensor computeCatWoConditionals( auto load_promoted = promoteToDtype(ExprHandle(load_expr), high_type); StmtPtr st = alloc(output_buf, store_indices, load_promoted.node()); for (size_t i = dims.size(); i > 0; --i) { - st = - alloc(for_vars[i - 1], alloc(0), dims[i - 1].node(), st); + st = alloc( + for_vars[i - 1], immLike(dims[i - 1], 0), dims[i - 1].node(), st); } return st; }; @@ -1221,7 +1225,7 @@ Tensor computeCatWoConditionals( auto input_dims = ExprVectorToExprHandleVector(non_empty_inputs[i].node()->dims()); if (concat_dim_size == nullptr) { - concat_dim_size = alloc(0); + concat_dim_size = immLike(input_dims[norm_concat_dim], 0); } block->append_stmt(gen_code_for_input( non_empty_inputs[i], i, concat_dim_size, input_dims)); @@ -1253,7 +1257,7 @@ Tensor computeCat( } int64_t dim_ = c10::get(argDim); - size_t dim = normalizeAndCheckIndex(dim_, axes.size()); + auto dim = normalizeAndCheckIndex(dim_, axes.size()); // Promote input types. // Note that we need to consider all inputs, including empty - they // also affect the resultant dtype. @@ -1273,18 +1277,18 @@ Tensor computeCat( std::vector newAxes(axes.begin(), axes.end()); ExprHandle load = promoteToDtype( tensorOrConstant(nonEmptyInputs[0], newAxes), highType); - size_t offset = to(nonEmptyInputs[0].node()->dim(dim))->value(); - newAxes[dim] = newAxes[dim] - IntImm::make(offset); + auto offset = *intValue(nonEmptyInputs[0].node()->dim(dim)); + newAxes[dim] = newAxes[dim] - ExprHandle(immLike(newAxes[dim], offset)); for (size_t ii = 1; ii < nonEmptyInputs.size(); ++ii) { auto input = nonEmptyInputs[ii]; load = ifThenElse( - CompareSelect::make(axes[dim], IntImm::make(offset), kLT), + CompareSelect::make(axes[dim], offset, kLT), load, promoteToDtype(tensorOrConstant(input, newAxes), highType)); - offset += to(input.node()->dim(dim))->value(); - newAxes[dim] = axes[dim] - IntImm::make(offset); + offset += *intValue(input.node()->dim(dim)); + newAxes[dim] = axes[dim] - ExprHandle(immLike(axes[dim], offset)); } return load; @@ -2334,12 +2338,12 @@ Tensor tensorexpr::computeOperandValue( ExprHandle cur_stride = 1; std::vector dims, indices; for (size_t idx = 0; idx < view_dims.size(); idx++) { - dims.push_back(alloc(view_dims[idx])); + dims.push_back(alloc(view_dims[idx])); indices.push_back(axes[idx].node()); } ExprHandle flat_idx = ExprHandle(flatten_index(dims, indices)); std::vector orig_buf_indexes(A.ndim(), ExprHandle(0)); - ExprHandle stride = IntImm::make(1); + ExprHandle stride = ExprHandle(immLike(flat_idx, 1)); for (size_t idx = 0; idx < A.ndim(); idx++) { size_t dim_idx = A.ndim() - idx - 1; // We don't need to generate mod-div for the first dimension - @@ -2799,7 +2803,7 @@ static std::vector toExprHandles(const std::vector& sizes) { std::vector dims; dims.reserve(sizes.size()); for (auto const& size : sizes) { - dims.emplace_back(IntImm::make(size)); + dims.emplace_back(size); } return dims; } @@ -2831,8 +2835,7 @@ Tensor TensorExprKernel::bindInput(const torch::jit::Value* input) { std::vector inputTensorDims; for (size_t i = 0; i < *tt->sizes().size(); i++) { auto const size = *tt->sizes()[i]; - inputTensorDims.emplace_back( - DimArg(IntImm::make(size), "i" + c10::to_string(i))); + inputTensorDims.emplace_back(DimArg(size, "i" + c10::to_string(i))); } auto const strides = tt->strides(); result = Compute( @@ -2841,12 +2844,11 @@ Tensor TensorExprKernel::bindInput(const torch::jit::Value* input) { [&](const std::vector& axes) { ExprHandle idx = 0; for (size_t i = 0; i < axes.size(); i++) { - idx = idx + axes[i] * IntImm::make(*strides[i]); + idx = idx + axes[i] * *strides[i]; } return inBuffer.load(idx); }); bufs_.emplace(input, result.buf()); - bufferArgs_.emplace_back(inBuffer); break; } @@ -2956,10 +2958,10 @@ Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) { return Compute( "output_1", dims, [&](const std::vector& axes_input) { std::vector axes(axes_input.begin(), axes_input.end()); - auto absolute_position = IntImm::make(0); + auto absolute_position = ExprHandle(immLike(axes[0], 0)); for (size_t i = 0; i < axes.size(); ++i) { - absolute_position = - absolute_position + (IntImm::make(default_strides[i]) * axes[i]); + absolute_position = absolute_position + + (ExprHandle(immLike(axes[i], default_strides[i])) * axes[i]); } std::vector sorted_stride_indices = reverse_sort_indices(strides); @@ -2967,10 +2969,11 @@ Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) { for (size_t stride_index : sorted_stride_indices) { auto stride = strides[stride_index]; auto size = sizes[stride_index]; - auto index = Div::make(absolute_position, IntImm::make(stride)); + auto index = absolute_position / + ExprHandle(immLike(absolute_position, stride)); if (size != 1) { - absolute_position = - Mod::make(absolute_position, IntImm::make(stride)); + absolute_position = absolute_position % + ExprHandle(immLike(absolute_position, stride)); } new_axes[stride_index] = index; } @@ -2992,7 +2995,7 @@ void TensorExprKernel::bindConstant(const torch::jit::Value* v) { std::vector te_sizes; te_sizes.reserve(sizes.size()); for (auto s : sizes) { - te_sizes.push_back(IntImm::make(s)); + te_sizes.push_back(s); } BufPtr buf = alloc( diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index 99a3b123a6816..4b92b020fce31 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -19,7 +19,7 @@ template inline std::vector bufferSizes(const T& t) { std::vector sizes; for (size_t i = 0; i < t->ndim(); i++) { - sizes.push_back(to(t->dim(i))->value()); + sizes.push_back(*intValue(t->dim(i))); } return sizes; } @@ -62,7 +62,7 @@ ExprHandle tensorOrConstant( const ArgValue& v, const std::vector& axes); -size_t normalizeAndCheckIndex(int64_t idx, int64_t list_size); +int64_t normalizeAndCheckIndex(int64_t idx, int64_t list_size); ExprHandle broadcast(BufHandle b, const std::vector& axes); diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index a93fd64df0a68..026d52bfc938c 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -275,17 +275,17 @@ class LLVMCodeGenImpl : public IRVisitor { }; extern "C" { -typedef void (*ParallelCallee)(int index, int8_t* packed_data); +typedef void (*ParallelCallee)(int64_t index, int8_t* packed_data); void DispatchParallel( int8_t* func, - int start, - int stop, + int64_t start, + int64_t stop, int8_t* packed_data) noexcept { // TODO: preserve the func type. try { ParallelCallee callee = reinterpret_cast(func); at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) { - for (int index = f_begin; index < f_end; index++) { + for (int64_t index = f_begin; index < f_end; index++) { callee(index, packed_data); } }); @@ -537,10 +537,6 @@ void LLVMCodeGenImpl::emitKernel( irb_.CreateRet(value_); - if (llvm::verifyFunction(*fn_, &llvm::outs())) { - throw std::runtime_error("Function verification failed"); - } - // print graph debug info before optimization llvm::SmallVector asmBuffer; llvm::raw_svector_ostream asmStream(asmBuffer); @@ -550,6 +546,10 @@ void LLVMCodeGenImpl::emitKernel( GRAPH_DEBUG( "\nLLVM module before optimizations\n\n", asmStream.str().str(), "\n"); + if (llvm::verifyFunction(*fn_, &llvm::outs())) { + throw std::runtime_error("Function verification failed"); + } + optimize(*module_); asmBuffer.set_size(0); @@ -1144,8 +1144,8 @@ void LLVMCodeGenImpl::visit(LoadPtr v) { // Handle the case where the load is contiguous and unmasked efficiently auto idx_ramp = to(v->flat_index()); if (idx_ramp) { - auto stride_imm = to(idx_ramp->stride()); - if (stride_imm && stride_imm->value() == 1) { + auto stride_imm = intValue(idx_ramp->stride()); + if (stride_imm && *stride_imm == 1) { v->base_handle()->accept(this); auto base = this->value_; idx_ramp->base()->accept(this); @@ -1256,7 +1256,7 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) { // Create the new body closure code. auto func_type = - llvm::FunctionType::get(VoidTy_, {IntTy_, Int8PtrTy_}, false); + llvm::FunctionType::get(VoidTy_, {LongTy_, Int8PtrTy_}, false); llvm::Function* func = llvm::Function::Create( func_type, llvm::Function::PrivateLinkage, "func", module_.get()); auto func_body = llvm::BasicBlock::Create(getContext(), "func_body", func); @@ -1268,6 +1268,10 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) { packed_func_args_raw, packed_caller_args->getType()); // Unpack the arguments from the opaque buffer. + if (v->var()->dtype().scalar_type() != c10::kLong) { + index = irb_.CreateIntCast( + index, dtypeToLLVM(v->var()->dtype()), v->var()->dtype().is_signed()); + } body_closure_args = unpackFuncArgs(packed_func_args, body_arg_vars.size()); // Set the codegen to the new func. // TODO: this should be replaced by RAII wrappers. @@ -1290,12 +1294,14 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) { irb_.CreatePointerCast(packed_caller_args, Int8PtrTy_); llvm::Value* func_value = irb_.CreatePointerCast(func, Int8PtrTy_); llvm::FunctionType* dispatcher_fntype = llvm::FunctionType::get( - VoidTy_, {Int8PtrTy_, IntTy_, IntTy_, Int8PtrTy_}, false); + VoidTy_, {Int8PtrTy_, LongTy_, LongTy_, Int8PtrTy_}, false); FunctionCallee dispatcher_callee = module_->getOrInsertFunction("DispatchParallel", dispatcher_fntype); llvm::Function* dispatcher = llvm::cast(dispatcher_callee.getCallee()); dispatcher->addFnAttr(llvm::Attribute::NoUnwind); + start = irb_.CreateIntCast(start, LongTy_, true); + stop = irb_.CreateIntCast(stop, LongTy_, true); irb_.CreateCall( dispatcher, {func_value, start, stop, packed_caller_args_ptr}); value_ = llvm::ConstantInt::get(IntTy_, 0); @@ -1320,7 +1326,7 @@ void LLVMCodeGenImpl::visit(ForPtr v) { irb_.SetInsertPoint(condBlock); // Set up phi node for index variable. - auto idx = irb_.CreatePHI(IntTy_, 2); + auto idx = irb_.CreatePHI(start->getType(), 2); idx->addIncoming(start, preheader); if (!varToVal_.count(v->var())) { varToVal_.emplace(v->var(), idx); @@ -1345,7 +1351,8 @@ void LLVMCodeGenImpl::visit(ForPtr v) { body = irb_.GetInsertBlock(); // Increment the index variable and branch back to loop test. - auto inc = irb_.CreateAdd(idx, llvm::ConstantInt::getSigned(IntTy_, 1)); + auto inc = + irb_.CreateAdd(idx, llvm::ConstantInt::getSigned(start->getType(), 1)); irb_.CreateBr(condBlock); idx->addIncoming(inc, body); @@ -1430,8 +1437,8 @@ void LLVMCodeGenImpl::visit(StorePtr v) { // Handle the case where the store is contiguous and unmasked efficiently auto idx_ramp = to(v->flat_index()); if (idx_ramp) { - auto stride_imm = to(idx_ramp->stride()); - if (stride_imm && stride_imm->value() == 1) { + auto stride_imm = intValue(idx_ramp->stride()); + if (stride_imm && *stride_imm == 1) { idx_ramp->base()->accept(this); auto first_idx = value_; diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.h b/torch/csrc/jit/tensorexpr/llvm_jit.h index 8585900abc8d6..a837899cdce1d 100644 --- a/torch/csrc/jit/tensorexpr/llvm_jit.h +++ b/torch/csrc/jit/tensorexpr/llvm_jit.h @@ -20,8 +20,8 @@ namespace tensorexpr { extern "C" { void DispatchParallel( int8_t* func, - int start, - int stop, + int64_t start, + int64_t stop, int8_t* packed_data) noexcept; } diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index d3a4b919bef33..11020cc2eda08 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -127,8 +127,8 @@ class Vectorizer : public IRMutator { ExprPtr start = v->start(); ExprPtr stop = v->stop(); - IntImmPtr start_imm = to(start); - IntImmPtr stop_imm = to(stop); + auto start_imm = intValue(start); + auto stop_imm = intValue(stop); if (!start_imm) { throw std::runtime_error( "Can't vectorize due to non-constant loop start!"); @@ -140,8 +140,8 @@ class Vectorizer : public IRMutator { } var_ = var; - start_ = start_imm; - lanes_ = stop_imm->value(); + start_ = immLike(start, *start_imm); + lanes_ = *stop_imm; StmtPtr new_body = body->accept_mutator(this); if (new_body == body) { @@ -531,11 +531,11 @@ class FunctionInliner : public IRMutator { if (auto index_var = to(i)) { index_vars_.insert(index_var); producer_index_vars_.push_back(index_var); - } else if (to(i) != nullptr) { + } else if (intValue(i)) { // If the index can be a constant, then that dimension must have size 1 // (since we don't support in-place writes). Resolves issue 52581. TORCH_INTERNAL_ASSERT( - to(i)->value() == 0, + *intValue(i) == 0, "Constant index impression should always be zero"); producer_index_vars_.push_back(nullptr); } else { @@ -553,8 +553,7 @@ class FunctionInliner : public IRMutator { ExprPtr func_caller_param = dims.at(i); if (func_callee_arg == nullptr) { TORCH_INTERNAL_ASSERT( - to(func_caller_param) != nullptr && - to(func_caller_param)->value() == 0, + intValue(func_caller_param) && *intValue(func_caller_param) == 0, "We are implicitly assuming that if you have an index of 0, that must also be inlined into an index of 0"); continue; } @@ -1140,7 +1139,7 @@ bool LoopNest::optimizeConditionals() { // only include the RHS of the conditions in the if-then-else expressions // we need to start with `0` which is the initial bound, given that we // only handle normalized loops (check for this is done below). - std::vector comp_values = {alloc(0)}; + std::vector comp_values; std::vector sub_exprs; auto ifthenelse_exprs = NodeFinder::find(store); if (ifthenelse_exprs.empty()) { @@ -1155,6 +1154,8 @@ bool LoopNest::optimizeConditionals() { ifthenelse_exprs.front(), &cond_var, &comp_values, &sub_exprs)) { continue; } + TORCH_INTERNAL_ASSERT(comp_values.size() >= 1); + comp_values.insert(comp_values.begin(), immLike(comp_values[0], 0)); auto fors = getLoopStmtsFor(store); if (cond_var != fors.back()->var()) { @@ -1290,10 +1291,10 @@ void LoopNest::vectorizeInnerLoops() { } void LoopNest::sliceHead(ForPtr f, int factor, ForPtr* head, ForPtr* tail) { - if (to(f->start()) && to(f->stop())) { - int start_val = to(f->start())->value(); - int stop_val = to(f->stop())->value(); - int size_val = stop_val - start_val; + if (intValue(f->start()) && intValue(f->stop())) { + auto start_val = *intValue(f->start()); + auto stop_val = *intValue(f->stop()); + auto size_val = stop_val - start_val; if (factor >= size_val) { *head = f; *tail = nullptr; @@ -1311,7 +1312,7 @@ void LoopNest::sliceHead(ForPtr f, int factor, ForPtr* head, ForPtr* tail) { } ExprPtr head_end = alloc( - alloc(f->start(), alloc(factor)), f->stop(), true); + alloc(f->start(), immLike(f->stop(), factor)), f->stop(), true); *head = alloc(f->var(), f->start(), head_end, Stmt::clone(f->body())); p->insert_stmt_before(*head, f); @@ -1330,10 +1331,10 @@ void LoopNest::sliceHead(ForPtr f, int factor) { } void LoopNest::sliceTail(ForPtr f, int factor, ForPtr* head, ForPtr* tail) { - if (to(f->start()) && to(f->stop())) { - int start_val = to(f->start())->value(); - int stop_val = to(f->stop())->value(); - int size_val = stop_val - start_val; + if (intValue(f->start()) && intValue(f->stop())) { + auto start_val = *intValue(f->start()); + auto stop_val = *intValue(f->stop()); + auto size_val = stop_val - start_val; if (factor >= size_val) { *head = nullptr; *tail = f; @@ -1351,7 +1352,7 @@ void LoopNest::sliceTail(ForPtr f, int factor, ForPtr* head, ForPtr* tail) { } ExprPtr tail_start = alloc( - f->start(), alloc(f->stop(), alloc(factor)), true); + f->start(), alloc(f->stop(), immLike(f->stop(), factor)), true); *tail = alloc(f->var(), tail_start, f->stop(), Stmt::clone(f->body())); p->insert_stmt_after(*tail, f); @@ -1390,17 +1391,17 @@ void LoopNest::splitWithTail( } bool tail_is_needed = true; - if (to(f->start()) && to(f->stop())) { - int start_val = to(f->start())->value(); - int stop_val = to(f->stop())->value(); - int size_val = stop_val - start_val; - int tail_size = size_val % factor; + if (intValue(f->start()) && intValue(f->stop())) { + auto const start_val = *intValue(f->start()); + auto const stop_val = *intValue(f->stop()); + auto const size_val = stop_val - start_val; + auto const tail_size = size_val % factor; if (tail_size == 0) { tail_is_needed = false; } } - IntImmPtr factor_expr = alloc(factor); + ExprPtr factor_expr = immLike(f->stop(), factor); ExprPtr size = alloc(f->stop(), f->start()); ExprPtr split_count = alloc
    (size, factor_expr); ExprPtr tail_size = alloc(size, factor_expr); @@ -1423,7 +1424,7 @@ void LoopNest::splitWithTail( StmtPtr body_tail = SubstituteInClone(f->body(), {{f->var(), combined_index2}}); - *tail = alloc(i_tail, alloc(0), tail_size, body_tail); + *tail = alloc(i_tail, immLike(tail_size, 0), tail_size, body_tail); p->insert_stmt_after(*tail, f); } else { @@ -1433,10 +1434,11 @@ void LoopNest::splitWithTail( StmtPtr body_inner = Substitute(f->removeBody(), {{f->var(), combined_index1}}); - *inner = alloc(i_inner, alloc(0), factor_expr, body_inner); + *inner = + alloc(i_inner, immLike(factor_expr, 0), factor_expr, body_inner); // The input loop `f` will be the outer loop after split. f->set_var(i_outer); - f->set_start(alloc(0)); + f->set_start(immLike(split_count, 0)); f->set_stop(split_count); f->set_body(*inner); } @@ -1458,20 +1460,20 @@ void LoopNest::splitWithMask(ForPtr f, int factor, ForPtr* inner) { ExprPtr start = IRSimplifier::simplify(f->start()); ExprPtr stop = IRSimplifier::simplify(f->stop()); if (start->isConstant() && stop->isConstant()) { - int start_val = immediateAs(start); - int stop_val = immediateAs(stop); - int size_val = stop_val - start_val; - int tail_size = size_val % factor; + auto start_val = *intValue(start); + auto stop_val = *intValue(stop); + auto size_val = stop_val - start_val; + auto tail_size = size_val % factor; if (tail_size == 0) { tail_is_needed = false; } } - IntImmPtr factor_expr = alloc(factor); + auto factor_expr = immLike(f->stop(), factor); ExprPtr size = alloc(f->stop(), f->start()); // split_count = (size + factor - 1) / factor ExprPtr split_count = alloc
    ( - alloc(alloc(size, factor_expr), alloc(1)), factor_expr); + alloc(alloc(size, factor_expr), immLike(size, 1)), factor_expr); const std::string& loop_var_name = f->var()->name_hint(); Dtype loop_var_dtype = f->var()->dtype(); @@ -1487,8 +1489,8 @@ void LoopNest::splitWithMask(ForPtr f, int factor, ForPtr* inner) { // TODO: is it ok that we're doing it eagerly? In the other implementation we // are only materializing predicates at the last, lowering, step. if (tail_is_needed) { - IntImmPtr start = to(f->start()); - if (!start || start->value() != 0) { + auto start = intValue(f->start()); + if (!start || *start != 0) { throw unimplemented_lowering(); } @@ -1499,10 +1501,11 @@ void LoopNest::splitWithMask(ForPtr f, int factor, ForPtr* inner) { } body_inner = Substitute(body_inner, {{f->var(), combined_index}}); - *inner = alloc(i_inner, alloc(0), factor_expr, body_inner); + *inner = + alloc(i_inner, immLike(factor_expr, 0), factor_expr, body_inner); // The input loop `f` will be the outer loop after split. f->set_var(i_outer); - f->set_start(alloc(0)); + f->set_start(immLike(split_count, 0)); f->set_stop(split_count); f->set_body(*inner); } @@ -2177,7 +2180,7 @@ bool LoopNest::normalize(ForPtr f) { {{f->var(), (VarHandle(f->var()) + ExprHandle(f->start())).node()}}); f->set_body(IRSimplifier::simplify(for_body_normalized)); f->set_stop(IRSimplifier::simplify(alloc(f->stop(), f->start()))); - f->set_start(alloc(0)); + f->set_start(immLike(f->stop(), 0)); return true; } @@ -2242,7 +2245,7 @@ bool LoopNest::flatten(const std::vector& loops, ForPtr* flattened) { normalized_loops[0]->var()->name_hint() + "_flat", normalized_loops[0]->var()->dtype()); VarMapping var_mapping; - ExprPtr stop = alloc(1); + ExprPtr stop = immLike(flat_var, 1); for (size_t i = 0; i < normalized_loops.size(); ++i) { size_t idx = normalized_loops.size() - i - 1; auto curr_loop = normalized_loops[idx]; @@ -2255,7 +2258,7 @@ bool LoopNest::flatten(const std::vector& loops, ForPtr* flattened) { Substitute(normalized_loops.back()->removeBody(), var_mapping); normalized_loops.front()->set_var(flat_var); - normalized_loops.front()->set_start(alloc(0)); + normalized_loops.front()->set_start(immLike(stop, 0)); normalized_loops.front()->set_stop(stop); normalized_loops.front()->set_body(flattened_body); *flattened = normalized_loops.front(); @@ -2357,7 +2360,7 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) { std::vector new_dims(buf->dims()); for (size_t i = 0; i < dims.size(); ++i) { if (dims[i]) { - new_dims[i] = alloc(1); + new_dims[i] = immLike(buf->dims()[i], 1); } } buf->set_dims(new_dims); @@ -2368,7 +2371,7 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) { std::vector new_indices(indices); for (size_t i = 0; i < dims.size(); ++i) { if (dims[i]) { - new_indices[i] = alloc(0); + new_indices[i] = immLike(indices[i], 0); } } return new_indices; @@ -2652,12 +2655,13 @@ LoopNest::AccessResult LoopNest::cacheAccesses( // Determine the size of the cache, and create a loop var for each dimension. for (size_t i = 0; i < info.start.size(); ++i) { - ExprPtr dim = IRSimplifier::simplify( - alloc(alloc(info.stop[i], info.start[i]), alloc(1))); + ExprPtr dim = IRSimplifier::simplify(alloc( + alloc(info.stop[i], info.start[i]), immLike(info.stop[i], 1))); tmp_dims.push_back(dim); - new_loop_vars.push_back(alloc(var_names[i % var_names.size()], kInt)); + new_loop_vars.push_back( + alloc(var_names[i % var_names.size()], info.stop[i]->dtype())); new_loop_vars_expr.push_back(new_loop_vars[i]); } @@ -2708,8 +2712,8 @@ LoopNest::AccessResult LoopNest::cacheAccesses( tmp_buf, new_loop_vars_expr, getImmediateByType(tmp_buf->dtype(), 0)); for (int64_t i = new_loop_vars.size() - 1; i >= 0; --i) { - tmp_init = - alloc(new_loop_vars[i], alloc(0), tmp_dims[i], tmp_init); + tmp_init = alloc( + new_loop_vars[i], immLike(tmp_dims[i], 0), tmp_dims[i], tmp_init); } if (is_block) { @@ -2730,7 +2734,7 @@ LoopNest::AccessResult LoopNest::cacheAccesses( for (int64_t i = new_loop_vars.size() - 1; i >= 0; --i) { tmp_store = alloc( - new_loop_vars[i], alloc(0), tmp_dims[i], tmp_store); + new_loop_vars[i], immLike(tmp_dims[i], 0), tmp_dims[i], tmp_store); } if (is_block) { @@ -2749,7 +2753,7 @@ LoopNest::AccessResult LoopNest::cacheAccesses( for (int64_t i = new_loop_vars.size() - 1; i >= 0; --i) { tmp_store = alloc( - new_loop_vars[i], alloc(0), tmp_dims[i], tmp_store); + new_loop_vars[i], immLike(tmp_dims[i], 0), tmp_dims[i], tmp_store); } if (is_block) { @@ -2766,7 +2770,7 @@ LoopNest::AccessResult LoopNest::cacheAccesses( for (int64_t i = new_loop_vars.size() - 1; i >= 0; --i) { tmp_store = alloc( - new_loop_vars[i], alloc(0), tmp_dims[i], tmp_store); + new_loop_vars[i], immLike(tmp_dims[i], 0), tmp_dims[i], tmp_store); } if (is_block) { @@ -2914,7 +2918,8 @@ void LoopNest::computeAt(StmtPtr s, ForPtr f) { std::vector temp_indices(dims.size()); for (const auto i : c10::irange(dims.size())) { // TODO: Use name-hint of the producer indices instead of 'idx' - temp_indices[i] = alloc(std::string("idx") + c10::to_string(i), kInt); + temp_indices[i] = + alloc(std::string("idx") + c10::to_string(i), dims[i]->dtype()); } // Prepare substitute rules for constructing the temp statement from the prod @@ -2955,7 +2960,10 @@ void LoopNest::computeAt(StmtPtr s, ForPtr f) { // dimensions in reversed order. size_t dim_idx = dims.size() - 1 - i; bd = alloc( - to(temp_indices[dim_idx]), alloc(0), dims[dim_idx], bd); + to(temp_indices[dim_idx]), + immLike(dims[dim_idx], 0), + dims[dim_idx], + bd); } // Add constructed stmts to the consumer loop diff --git a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp index 8f6f2b106b1b2..e1688e37cbe7f 100644 --- a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp +++ b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp @@ -185,13 +185,13 @@ void AccessInfo::dumpDOT(std::ostream& os) const { if (bounds_.size() > 0) { for (size_t i = 0; i < bounds_.size() - 1; ++i) { os << *IRSimplifier::simplify( - alloc(bounds_[i].end, alloc(1))) + alloc(bounds_[i].end, immLike(bounds_[i].end, 1))) << ", "; } size_t i = bounds_.size() - 1; os << *IRSimplifier::simplify( - alloc(bounds_[i].end, alloc(1))); + alloc(bounds_[i].end, immLike(bounds_[i].end, 1))); os << "]\"\n "; } if (isWrite()) { @@ -632,7 +632,7 @@ bool executionSafetyCheck( // Invert the startDiff so mod works. if (diffNegative != strideNegative) { startDiff = - IRSimplifier::simplify(alloc(alloc(0), startDiff)); + IRSimplifier::simplify(alloc(immLike(startDiff, 0), startDiff)); } // If both accesses have the same stride, and the difference in start @@ -650,7 +650,7 @@ bool executionSafetyCheck( CompareSelectOperation op = strideNegative ? kLT : kGT; ExprPtr check = IRSimplifier::simplify( - alloc(startDiff, alloc(0), op)); + alloc(startDiff, immLike(startDiff, 0), op)); // If the start difference modulo the minimum stride is offset from that // stride, then the ranges have distinct strides. @@ -731,7 +731,7 @@ void MemDependencyChecker::visit(ForPtr v) { for (const auto i : c10::irange(indices.size())) { VarFinder vf; if (vf.find(indices[i]).count(var) == 0) { - loopIndicesStride[i] = alloc(0); + loopIndicesStride[i] = immLike(indices[i], 0); } else { // If we've previously swapped the start and end of this bound, we // should apply the substitution to the reverse of the bounds. @@ -740,19 +740,19 @@ void MemDependencyChecker::visit(ForPtr v) { SubstituteInClone(info->bounds()[i].end, {{var, v->start()}})); info->bounds()[i].start = IRSimplifier::simplify(SubstituteInClone( info->bounds()[i].start, - {{var, alloc(v->stop(), alloc(1))}})); + {{var, alloc(v->stop(), immLike(v->stop(), 1))}})); } else { info->bounds()[i].start = IRSimplifier::simplify( SubstituteInClone(info->bounds()[i].start, {{var, v->start()}})); info->bounds()[i].end = IRSimplifier::simplify(SubstituteInClone( info->bounds()[i].end, - {{var, alloc(v->stop(), alloc(1))}})); + {{var, alloc(v->stop(), immLike(v->stop(), 1))}})); } ExprPtr zeroStep = indices[i]; ExprPtr oneStep = SubstituteInClone( - indices[i], {{var, alloc(var, alloc(1))}}); + indices[i], {{var, alloc(var, immLike(var, 1))}}); loopIndicesStride[i] = IRSimplifier::simplify(alloc(oneStep, zeroStep)); @@ -785,7 +785,7 @@ void MemDependencyChecker::visit(ForPtr v) { bound.start = IRSimplifier::simplify( SubstituteInClone(bound.start, {{var, v->start()}})); bound.end = IRSimplifier::simplify(SubstituteInClone( - bound.end, {{var, alloc(v->stop(), alloc(1))}})); + bound.end, {{var, alloc(v->stop(), immLike(v->stop(), 1))}})); // If the start < end then swap the order of the bound. ExprPtr diff = @@ -1037,8 +1037,8 @@ void MemDependencyChecker::insertBuffers( IndexBounds bounds; for (auto d : b->dims()) { bounds.push_back( - {alloc(0), - IRSimplifier::simplify(alloc(d, alloc(1)))}); + {immLike(d, 0), + IRSimplifier::simplify(alloc(d, immLike(d, 1)))}); } auto info = std::make_shared(nextAccess_++, type, nullptr, var, bounds); @@ -1126,8 +1126,9 @@ void MemDependencyChecker::visit(AllocatePtr v) { // avoid failing the bound check. But this is not the correct approach and // should be fixed. ExprPtr flat_size = buf_flat_size(v->buf()); - flat_size = IRSimplifier::simplify(alloc(flat_size, alloc(1))); - bounds.push_back({alloc(0), flat_size}); + flat_size = + IRSimplifier::simplify(alloc(flat_size, immLike(flat_size, 1))); + bounds.push_back({immLike(flat_size, 0), flat_size}); auto info = std::make_shared( nextAccess_++, AccessType::Alloc, nullptr, var, bounds); diff --git a/torch/csrc/jit/tensorexpr/registerizer.cpp b/torch/csrc/jit/tensorexpr/registerizer.cpp index bc26581970383..8684f2aabc810 100644 --- a/torch/csrc/jit/tensorexpr/registerizer.cpp +++ b/torch/csrc/jit/tensorexpr/registerizer.cpp @@ -18,7 +18,7 @@ void AccessInfo::addStore(StorePtr store, const std::shared_ptr& scope) { last_usage_ = store; store_cost_ = - IRSimplifier::simplify(alloc(store_cost_, alloc(1))); + IRSimplifier::simplify(alloc(store_cost_, immLike(store_cost_, 1))); stores_.push_back(store); conditionId_ = scope->conditionId(); @@ -34,7 +34,8 @@ void AccessInfo::addLoad( first_usage_ = first_usage_ ? block_->getEnclosedRoot(first_usage_) : usage; last_usage_ = usage; - load_cost_ = IRSimplifier::simplify(alloc(load_cost_, alloc(1))); + load_cost_ = + IRSimplifier::simplify(alloc(load_cost_, immLike(load_cost_, 1))); loads_.push_back(load); conditionId_ = scope->conditionId(); diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp index ea3902dcf3c0d..7a219fe728757 100644 --- a/torch/csrc/jit/tensorexpr/tensor.cpp +++ b/torch/csrc/jit/tensorexpr/tensor.cpp @@ -31,8 +31,8 @@ StmtPtr Tensor::constructStmt( for (const auto i : c10::irange(reduce_ndim)) { // Going in reverse order: from innermost loop to the outermost size_t dim_index = reduce_ndim - i - 1; - s = alloc( - reduce_args[dim_index], alloc(0), reduce_dims[dim_index], s); + auto const& dim = reduce_dims[dim_index]; + s = alloc(reduce_args[dim_index], immLike(dim, 0), dim, s); } if (init_expr) { StorePtr init_stmt = alloc(buf(), indices, init_expr); @@ -43,7 +43,8 @@ StmtPtr Tensor::constructStmt( for (const auto i : c10::irange(ndim)) { // Going in reverse order: from innermost loop to the outermost size_t dim_index = ndim - i - 1; - s = alloc(args[dim_index], alloc(0), buf()->dim(dim_index), s); + auto const& dim = buf()->dim(dim_index); + s = alloc(args[dim_index], immLike(dim, 0), dim, s); } return s; } From 371c6612b39c1fa18d6c2e22613e292ae87b686f Mon Sep 17 00:00:00 2001 From: Facebook Community Bot Date: Sun, 29 Aug 2021 09:56:34 -0700 Subject: [PATCH 329/530] Automated submodule update: FBGEMM (#64141) Summary: This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM). New submodule commit: https://github.com/pytorch/FBGEMM/commit/9939bac9defab4d18fb7fdded7e1a76c0c2b49b4 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64141 Test Plan: Ensure that CI jobs succeed on GitHub before landing. Reviewed By: jspark1105 Differential Revision: D30629417 fbshipit-source-id: 1b1ad3d4caff925f798b86b358ab193554c9b8e0 --- third_party/fbgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/fbgemm b/third_party/fbgemm index e922280540acf..9939bac9defab 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit e922280540acf7920ea21f99b8db064f89bb8c11 +Subproject commit 9939bac9defab4d18fb7fdded7e1a76c0c2b49b4 From 52d7dd73987dde6bcfe9c672cfb1b36e284becef Mon Sep 17 00:00:00 2001 From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" Date: Sun, 29 Aug 2021 10:19:56 -0700 Subject: [PATCH 330/530] [DOC] improve docstring for Optimizer.state_dict (#63153) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63153 Fixes: https://github.com/pytorch/pytorch/issues/60121 Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D30629462 Pulled By: tugsbayasgalan fbshipit-source-id: a9160e02ac53bb1a6219879747d73aae9ebe4d2f --- torch/optim/optimizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index 02f1cc265937b..79f72f041822b 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -100,7 +100,8 @@ def state_dict(self): * state - a dict holding current optimization state. Its content differs between optimizer classes. - * param_groups - a dict containing all parameter groups + * param_groups - a list containing all parameter groups where each + parameter group is a dict """ # Save order indices instead of Tensors param_mappings = {} From 7ebdbf82dccea370edda161936cc533c012e690a Mon Sep 17 00:00:00 2001 From: Garrett Cramer Date: Sun, 29 Aug 2021 11:33:48 -0700 Subject: [PATCH 331/530] add support for sending cpu sparse tensors over rpc (#62794) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62794 This pr updates jit serialization to support pickling Sparse COO tensors. This pr updates message.cpp to support Sparse COO tensors. A bug was filed a few years ago https://github.com/pytorch/pytorch/issues/30807. I tested the fix by adding sparse tensor tests to rpc_test.py and dist_autograd_test.py. cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23 gmagogsfm Test Plan: Imported from OSS Reviewed By: soulitzer Differential Revision: D30608848 Pulled By: gcramer23 fbshipit-source-id: 629ba8e4a3d8365875a709c9b87447c7a71204fb --- torch/csrc/distributed/rpc/message.cpp | 11 +- torch/csrc/jit/serialization/pickler.cpp | 44 ++ torch/csrc/jit/serialization/pickler.h | 1 + torch/csrc/jit/serialization/unpickler.cpp | 35 + torch/csrc/jit/serialization/unpickler.h | 1 + .../distributed/rpc/dist_autograd_test.py | 653 +++++++++++++---- .../_internal/distributed/rpc/rpc_test.py | 663 +++++++++++++++--- 7 files changed, 1172 insertions(+), 236 deletions(-) diff --git a/torch/csrc/distributed/rpc/message.cpp b/torch/csrc/distributed/rpc/message.cpp index 02771140f69bb..7265ed400b2e9 100644 --- a/torch/csrc/distributed/rpc/message.cpp +++ b/torch/csrc/distributed/rpc/message.cpp @@ -68,10 +68,17 @@ void Message::setId(int64_t id) { std::vector> Message::getStorages() const { + // Sparse tensors do not have storage. Instead, a sparse tensor + // contains two tensors indices and values, and both contain storage. std::vector> storages; - storages.reserve(tensors_.size()); + storages.reserve(2 * tensors_.size()); for (const auto& tensor : tensors_) { - storages.emplace_back(tensor.storage().getWeakStorageImpl()); + if (tensor.is_sparse()) { + storages.emplace_back(tensor._indices().storage().getWeakStorageImpl()); + storages.emplace_back(tensor._values().storage().getWeakStorageImpl()); + } else { + storages.emplace_back(tensor.storage().getWeakStorageImpl()); + } } return storages; } diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 4a4e8663b3838..f465eaf4dff00 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -353,6 +353,44 @@ void Pickler::pushTensor(const IValue& ivalue) { } } +void Pickler::pushLiteralSparseTensor(const at::Tensor& tensor) { + pushGlobal("torch._utils", "_rebuild_sparse_tensor"); + push(PickleOpCode::MARK); + // layout + auto layout = static_cast(tensor.layout()); + pushInt(layout); + switch (layout) { + case static_cast(c10::Layout::Sparse): + // size + push(PickleOpCode::MARK); + for (auto size : tensor.sizes()) { + pushInt(size); + } + push(PickleOpCode::TUPLE); + // requires grad + pushIValue(tensor.requires_grad()); + // indices + pushTensor(tensor._indices()); + // values + pushTensor(tensor._values()); + break; + default: + TORCH_CHECK( + false, + "Unsupported sparse tensor layout type in serialization ", + static_cast(layout)); + break; + } + // backward_hooks + pushGlobal("collections", "OrderedDict"); + push(PickleOpCode::EMPTY_TUPLE); + // Construct the collections.OrderedDict for the backward_hooks + push(PickleOpCode::REDUCE); + push(PickleOpCode::TUPLE); + // Call torch._utils._rebuild_sparse_coo_tensor + push(PickleOpCode::REDUCE); +} + void Pickler::pushLiteralTensor(const IValue& ivalue) { // In contrast to tensor references, literal tensors are included in the // pickle program binary blob. They are written to the file after the STOP @@ -362,6 +400,12 @@ void Pickler::pushLiteralTensor(const IValue& ivalue) { // The format here is the same one used by `torch.save()`. The code for the // format can be found in `torch/serialization.py`. auto& tensor = ivalue.toTensor(); + + if (tensor.is_sparse() || tensor.is_sparse_csr()) { + pushLiteralSparseTensor(tensor); + return; + } + bool quantized = tensor.is_quantized(); // The arguments to this function are: // storage, storage_offset, size, stride, requires_grad, backward_hooks diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index ac54ac45a2886..3dc6bef9d9131 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -172,6 +172,7 @@ class TORCH_API Pickler { void pushTensor(const IValue& ivalue); void pushTensorReference(const IValue& ivalue); void pushLiteralTensor(const IValue& ivalue); + void pushLiteralSparseTensor(const at::Tensor& tensor); void pushTuple(const IValue& ivalue); void pushString(const std::string& string); void pushDevice(const IValue& ivalue); diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index 581b94978c459..f944387465446 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -550,6 +550,9 @@ void Unpickler::readGlobal( // Unpickle a tensor bool quantized = class_name == "_rebuild_qtensor"; rebuildTensor(quantized); + } else if ( + module_name == "torch._utils" && class_name == "_rebuild_sparse_tensor") { + rebuildSparseTensor(); } else if (module_name == "builtins" && class_name == "complex") { globals_.emplace_back([this] { auto elems = pop(stack_).toTuple()->elements(); @@ -647,6 +650,38 @@ void Unpickler::readGlobal( stack_.emplace_back(int64_t(globals_.size() - 1)); } +void Unpickler::rebuildSparseTensor() { + globals_.emplace_back([this] { + auto tup = pop(stack_).toTuple(); + const auto& elements = tup->elements(); + size_t idx = 0; + auto layout = elements.at(idx++).toInt(); + at::Tensor result; + switch (layout) { + case static_cast(c10::Layout::Sparse): { + std::vector size = tupleToIntList(elements.at(idx++)); + bool requires_grad = elements.at(idx++).toBool(); + auto& indices_tensor = elements.at(idx++).toTensor(); + auto& values_tensor = elements.at(idx++).toTensor(); + auto options = values_tensor.options() + .layout(c10::Layout::Sparse) + .requires_grad(requires_grad); + result = at::_sparse_coo_tensor_unsafe( + indices_tensor, values_tensor, size, options); + result = autograd::make_variable(result, options.requires_grad()); + break; + } + default: + TORCH_CHECK( + false, + "Unsupported sparse tensor layout type in serialization ", + static_cast(layout)); + break; + } + stack_.emplace_back(std::move(result)); + }); +} + void Unpickler::rebuildTensor(bool quantized) { globals_.emplace_back([this, quantized] { auto tup = pop(stack_).toTuple(); diff --git a/torch/csrc/jit/serialization/unpickler.h b/torch/csrc/jit/serialization/unpickler.h index f404deee848be..586ff9cc4ae59 100644 --- a/torch/csrc/jit/serialization/unpickler.h +++ b/torch/csrc/jit/serialization/unpickler.h @@ -108,6 +108,7 @@ class TORCH_API Unpickler { const std::string& module_name, const std::string& class_name); void rebuildTensor(bool quantized); + void rebuildSparseTensor(); #ifdef USE_DISTRIBUTED void rebuildRRef(); #endif diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index 017a61b7debf5..fba50303068e7 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -64,13 +64,29 @@ def _torch_ones(sizes, requires_grad=False): # rref tensor equals to the given grad. def _compare_owner_value(context_id, rref, grad): grads = dist_autograd.get_gradients(context_id) - return torch.equal(grads[rref.local_value()], grad) + x = grads[rref.local_value()] + if x.is_sparse: + assert grad.is_sparse + x = x.to_dense() + grad = grad.to_dense() + else: + assert not grad.is_sparse + return torch.equal(x, grad) def create_tensor(): return torch.ones((3, 3), requires_grad=True) +def build_sparse_tensor(coalesce=False, requires_grad=True, dtype=torch.float32): + i = [[0, 1, 1], [2, 0, 2]] + v = [3.2, 4.1, 5.3] + tensor = torch.sparse_coo_tensor(i, v, (3, 3), requires_grad=requires_grad, dtype=dtype) + if coalesce: + tensor = tensor.coalesce() + return tensor + + @torch.jit.script def create_torchscript_tensor() -> torch.Tensor: return torch.ones((3, 3)).requires_grad_() @@ -143,20 +159,28 @@ def _all_contexts_cleaned_up(timeout_seconds=10): # This function creates a dis atugorad context, run rpc_sync on the given ps, # and then blocks until the ps has verified the grads are correctly accumulated. -def _run_trainer(rref_t1, t2, ps, rank_diff): +def _run_trainer(rref_t1, t2, ps, rank_diff, sparse): with dist_autograd.context() as context_id: ret = rpc.rpc_sync(ps, my_rref_add, args=(rref_t1, t2)) - dist_autograd.backward(context_id, [ret.sum()]) + if sparse: + loss = torch.sparse.sum(ret) + else: + loss = ret.sum() + dist_autograd.backward(context_id, [loss]) # prevent deleting dist autograd context rpc.rpc_sync(ps, _set_rpc_done, args=(context_id, rank_diff)) rpc.rpc_sync(ps, _check_rpc_done, args=(0,)) # This function is the same as _run_trainer, except rpc calls torchscript # function "my_script_ref_add" instead of python funciton "my_rref_add" -def _run_trainer_torchscript(rref_t1, t2, ps, rank_diff): +def _run_trainer_torchscript(rref_t1, t2, ps, rank_diff, sparse): with dist_autograd.context() as context_id: ret = rpc.rpc_sync(ps, my_script_ref_add, args=(rref_t1, t2)) - dist_autograd.backward(context_id, [ret.sum()]) + if sparse: + loss = torch.sparse.sum(ret) + else: + loss = ret.sum() + dist_autograd.backward(context_id, [loss]) # prevent deleting dist autograd context rpc.rpc_sync(ps, _set_rpc_done, args=(context_id, rank_diff)) rpc.rpc_sync(ps, _check_rpc_done, args=(0,)) @@ -379,14 +403,18 @@ def _verify_graph_for_nested_rpc_call(self, ctx): "torch::distributed::autograd::RecvRpcBackward", next_funcs[0][0].name() ) - def _test_graph(self, fn, exec_mode): + def _test_graph(self, fn, exec_mode, sparse): dst_rank = (self.rank + 1) % self.world_size initialize_pg(self.file_init_method, self.rank, self.world_size) with dist_autograd.context() as context_id: - t1 = torch.ones(3, 3, requires_grad=True) - t2 = torch.zeros(3, 3, requires_grad=True) + if sparse: + t1 = build_sparse_tensor() + t2 = build_sparse_tensor() + else: + t1 = torch.ones(3, 3, requires_grad=True) + t2 = torch.zeros(3, 3, requires_grad=True) if ExecMode.RPC_SYNC == exec_mode: ret = rpc.rpc_sync(worker_name(dst_rank), fn, args=(t1, t2)) elif ExecMode.REMOTE == exec_mode: @@ -436,29 +464,49 @@ def _test_graph(self, fn, exec_mode): @dist_init def test_graph_for_builtin_call(self): - self._test_graph(torch.add, ExecMode.RPC_SYNC) + self._test_graph(torch.add, ExecMode.RPC_SYNC, False) + + @dist_init + def test_graph_for_builtin_call_sparse(self): + self._test_graph(torch.add, ExecMode.RPC_SYNC, True) @dist_init def test_graph_for_python_call(self): - self._test_graph(my_py_add, ExecMode.RPC_SYNC) + self._test_graph(my_py_add, ExecMode.RPC_SYNC, False) + + @dist_init + def test_graph_for_python_call_sparse(self): + self._test_graph(my_py_add, ExecMode.RPC_SYNC, True) @dist_init def test_graph_for_builtin_remote_call(self): - self._test_graph(torch.add, ExecMode.REMOTE) + self._test_graph(torch.add, ExecMode.REMOTE, False) + + @dist_init + def test_graph_for_builtin_remote_call_sparse(self): + self._test_graph(torch.add, ExecMode.REMOTE, True) @dist_init def test_graph_for_python_remote_call(self): - self._test_graph(my_py_add, ExecMode.REMOTE) + self._test_graph(my_py_add, ExecMode.REMOTE, False) + + @dist_init + def test_graph_for_python_remote_call_sparse(self): + self._test_graph(my_py_add, ExecMode.REMOTE, True) # 3-layer nested calls - def _test_graph_for_py_nested_call(self, exec_mode): + def _test_graph_for_py_nested_call(self, exec_mode, sparse): dst_rank = (self.rank + 1) % self.world_size initialize_pg(self.file_init_method, self.rank, self.world_size) with dist_autograd.context() as context_id: - t1 = torch.ones(3, 3, requires_grad=True) - t2 = torch.zeros(3, 3, requires_grad=True) + if sparse: + t1 = build_sparse_tensor(requires_grad=True) + t2 = build_sparse_tensor(requires_grad=True) + else: + t1 = torch.ones(3, 3, requires_grad=True) + t2 = torch.zeros(3, 3, requires_grad=True) nest_dst_rank = (dst_rank + 1) % self.world_size if ExecMode.RPC_SYNC == exec_mode: ret = rpc.rpc_sync( @@ -531,21 +579,33 @@ def _test_graph_for_py_nested_call(self, exec_mode): @dist_init def test_graph_for_py_nested_call(self): - self._test_graph_for_py_nested_call(ExecMode.RPC_SYNC) + self._test_graph_for_py_nested_call(ExecMode.RPC_SYNC, False) + + @dist_init + def test_graph_for_py_nested_call_sparse(self): + self._test_graph_for_py_nested_call(ExecMode.RPC_SYNC, True) @dist_init def test_graph_for_py_nested_remote_call(self): - self._test_graph_for_py_nested_call(ExecMode.REMOTE) + self._test_graph_for_py_nested_call(ExecMode.REMOTE, False) + + @dist_init + def test_graph_for_py_nested_remote_call_sparse(self): + self._test_graph_for_py_nested_call(ExecMode.REMOTE, True) # Rank0->Rank1->Rank0 - def _test_graph_for_py_nested_call_itself(self, exec_mode): + def _test_graph_for_py_nested_call_itself(self, exec_mode, sparse): dst_rank = (self.rank + 1) % self.world_size initialize_pg(self.file_init_method, self.rank, self.world_size) with dist_autograd.context() as context_id: - t1 = torch.ones(3, 3, requires_grad=True) - t2 = torch.zeros(3, 3, requires_grad=True) + if sparse: + t1 = build_sparse_tensor(requires_grad=True) + t2 = build_sparse_tensor(requires_grad=True) + else: + t1 = torch.ones(3, 3, requires_grad=True) + t2 = torch.zeros(3, 3, requires_grad=True) if ExecMode.RPC_SYNC == exec_mode: ret = rpc.rpc_sync( worker_name(dst_rank), @@ -610,18 +670,30 @@ def _test_graph_for_py_nested_call_itself(self, exec_mode): @dist_init def test_graph_for_py_nested_call_itself(self): - self._test_graph_for_py_nested_call_itself(ExecMode.RPC_SYNC) + self._test_graph_for_py_nested_call_itself(ExecMode.RPC_SYNC, False) + + @dist_init + def test_graph_for_py_nested_call_itself_sparse(self): + self._test_graph_for_py_nested_call_itself(ExecMode.RPC_SYNC, True) @dist_init def test_graph_for_py_nested_remote_call_itself(self): - self._test_graph_for_py_nested_call_itself(ExecMode.REMOTE) + self._test_graph_for_py_nested_call_itself(ExecMode.REMOTE, False) + + @dist_init + def test_graph_for_py_nested_remote_call_itself_sparse(self): + self._test_graph_for_py_nested_call_itself(ExecMode.REMOTE, True) - def _test_no_graph_with_tensors_not_require_grad(self, exec_mode): + def _test_no_graph_with_tensors_not_require_grad(self, exec_mode, sparse): initialize_pg(self.file_init_method, self.rank, self.world_size) dst_rank = (self.rank + 1) % self.world_size with dist_autograd.context() as context_id: - t1 = torch.ones(3, 3, requires_grad=False) - t2 = torch.zeros(3, 3, requires_grad=False) + if sparse: + t1 = build_sparse_tensor(requires_grad=False) + t2 = build_sparse_tensor(requires_grad=False) + else: + t1 = torch.ones(3, 3, requires_grad=False) + t2 = torch.zeros(3, 3, requires_grad=False) if ExecMode.RPC_SYNC == exec_mode: ret = rpc.rpc_sync( worker_name(dst_rank), torch.add, args=(t1, t2) @@ -656,11 +728,19 @@ def _test_no_graph_with_tensors_not_require_grad(self, exec_mode): @dist_init def test_no_graph_with_tensors_not_require_grad(self): - self._test_no_graph_with_tensors_not_require_grad(ExecMode.RPC_SYNC) + self._test_no_graph_with_tensors_not_require_grad(ExecMode.RPC_SYNC, False) + + @dist_init + def test_no_graph_with_tensors_not_require_grad_sparse(self): + self._test_no_graph_with_tensors_not_require_grad(ExecMode.RPC_SYNC, True) @dist_init def test_no_graph_with_tensors_not_require_grad_remote(self): - self._test_no_graph_with_tensors_not_require_grad(ExecMode.REMOTE) + self._test_no_graph_with_tensors_not_require_grad(ExecMode.REMOTE, False) + + @dist_init + def test_no_graph_with_tensors_not_require_grad_remote_sparse(self): + self._test_no_graph_with_tensors_not_require_grad(ExecMode.REMOTE, True) def _test_grad_only_on_return_value(self, exec_mode): initialize_pg(self.file_init_method, self.rank, self.world_size) @@ -699,13 +779,16 @@ def test_grad_only_on_return_value(self): def test_grad_only_on_return_value_remote(self): self._test_grad_only_on_return_value(ExecMode.REMOTE) - def _test_rpc_complex_args(self, exec_mode): + def _test_rpc_complex_args(self, exec_mode, sparse): with dist_autograd.context() as context_id: num_tensors = 10 tensors = [] for i in range(num_tensors): - tensors.append(torch.ones(3, 3, requires_grad=(i % 2 == 0))) - + if sparse: + tensor = build_sparse_tensor(requires_grad=(i % 2 == 0)) + else: + tensor = torch.ones(3, 3, requires_grad=(i % 2 == 0)) + tensors.append(tensor) dst_rank = self._next_rank() if ExecMode.RPC_SYNC == exec_mode: ret = rpc.rpc_sync( @@ -739,11 +822,19 @@ def _test_rpc_complex_args(self, exec_mode): @dist_init def test_rpc_complex_args(self): - self._test_rpc_complex_args(ExecMode.RPC_SYNC) + self._test_rpc_complex_args(ExecMode.RPC_SYNC, False) + + @dist_init + def test_rpc_complex_args_sparse(self): + self._test_rpc_complex_args(ExecMode.RPC_SYNC, True) @dist_init def test_remote_complex_args(self): - self._test_rpc_complex_args(ExecMode.REMOTE) + self._test_rpc_complex_args(ExecMode.REMOTE, False) + + @dist_init + def test_remote_complex_args_sparse(self): + self._test_rpc_complex_args(ExecMode.REMOTE, True) def context_cleanup_test_helper(self, rpc_args, func, nested=False): initialize_pg(self.file_init_method, self.rank, self.world_size) @@ -788,11 +879,22 @@ def test_context_cleanup_tensor_with_grad(self): t2 = torch.zeros(3, 3, requires_grad=True) self.context_cleanup_test_helper(rpc_args=(t1, t2), func=torch.add) + @dist_init + def test_context_cleanup_tensor_with_grad_sparse(self): + t1 = build_sparse_tensor(requires_grad=True) + t2 = build_sparse_tensor(requires_grad=True) + self.context_cleanup_test_helper(rpc_args=(t1, t2), func=torch.add) + @dist_init def test_context_cleanup_tensor_no_grad(self): t1 = torch.ones(3, 3, requires_grad=False) self.context_cleanup_test_helper(rpc_args=(t1, t1), func=torch.add) + @dist_init + def test_context_cleanup_tensor_no_grad_sparse(self): + t1 = build_sparse_tensor(requires_grad=False) + self.context_cleanup_test_helper(rpc_args=(t1, t1), func=torch.add) + @dist_init def test_context_cleanup_no_tensors(self): self.context_cleanup_test_helper(rpc_args=(1, 1), func=my_scalar_add) @@ -807,6 +909,16 @@ def test_context_cleanup_nested_rpc(self): rpc_args=args, func=my_py_nested_call, nested=True ) + @dist_init + def test_context_cleanup_nested_rpc_sparse(self): + t1 = build_sparse_tensor(requires_grad=True) + t2 = build_sparse_tensor(requires_grad=True) + dst_rank = (self.rank + 1) % self.world_size + args = (t1, t2, dst_rank, self.world_size, 0) + self.context_cleanup_test_helper( + rpc_args=args, func=my_py_nested_call, nested=True + ) + @dist_init def test_worker_ids_recorded(self): dst_ranks = {rank for rank in range(self.world_size) if rank != self.rank} @@ -876,23 +988,27 @@ def test_error_in_context(self): worker_name(self._next_rank()), torch.matmul, args=(t1, t2) ) - @dist_init - def test_backward_no_grad_on_tensor(self): - t1 = torch.rand((3, 3), requires_grad=True) - t2 = torch.rand((3, 3), requires_grad=True) + def _backward_no_grad_on_tensor(self, t1, t2, sparse): with dist_autograd.context() as context_id: loss = rpc.rpc_sync( worker_name(self._next_rank()), torch.add, - args=(t1, t2)).sum() - + args=(t1, t2)) + if sparse: + loss = torch.sparse.sum(loss) + else: + loss = loss.sum() dist_autograd.backward(context_id, [loss], retain_graph=True) self.assertIsNone(t1.grad) self.assertIsNone(t2.grad) # Now populate .grad with local autograd engine and # verify dist autograd doesn't mess with it. - loss_local = torch.add(t1, t2).sum() + loss_local = torch.add(t1, t2) + if sparse: + loss_local = torch.sparse.sum(loss_local) + else: + loss_local = loss_local.sum() loss_local.backward() self.assertIsNotNone(t1.grad) self.assertIsNotNone(t2.grad) @@ -903,18 +1019,34 @@ def test_backward_no_grad_on_tensor(self): self.assertEqual(t1_grad_before, t1.grad) self.assertEqual(t2_grad_before, t2.grad) - def _test_backward_simple(self, dst): - # Run the same code locally and with dist autograd and verify gradients - # are same. - local_grads = None - t1 = torch.rand((3, 3), requires_grad=True) - t2 = torch.rand((3, 3), requires_grad=True) + @dist_init + def test_backward_no_grad_on_tensor(self): + self._backward_no_grad_on_tensor( + torch.rand((3, 3), requires_grad=True), + torch.rand((3, 3), requires_grad=True), + False + ) + + @dist_init + def test_backward_no_grad_on_tensor_sparse(self): + self._backward_no_grad_on_tensor( + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=True), + True + ) + + # Run the same code locally and with dist autograd and verify gradients + # are same. + def _backward_simple(self, dst, t1, t2, local_grads, sparse): for exec_mode in [ExecMode.LOCAL, ExecMode.RPC_SYNC, ExecMode.REMOTE]: with dist_autograd.context() as context_id: ret = self._exec_func_with_dst( dst, exec_mode, torch.add, t1, t2 ) - loss = ret.sum() + if sparse: + loss = torch.sparse.sum(ret) + else: + loss = ret.sum() ret = self._verify_backwards( exec_mode, [loss], context_id, local_grads, t1, t2 ) @@ -922,29 +1054,65 @@ def _test_backward_simple(self, dst): @dist_init def test_backward_simple(self): - self._test_backward_simple(self._next_rank()) + self._backward_simple( + self._next_rank(), + torch.rand((3, 3), requires_grad=True), + torch.rand((3, 3), requires_grad=True), + None, + False + ) + + @dist_init + def test_backward_simple_sparse(self): + self._backward_simple( + self._next_rank(), + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=True), + None, + True + ) @dist_init def test_backward_simple_self(self): - self._test_backward_simple(self.rank) + self._backward_simple( + self.rank, + torch.rand((3, 3), requires_grad=True), + torch.rand((3, 3), requires_grad=True), + None, + False + ) + + @dist_init + def test_backward_simple_self_sparse(self): + self._backward_simple( + self.rank, + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=True), + None, + True + ) # The current rank first creates a tensor on the rref_owner, and then passes # the rref with another tensor to the callee to run either my_rref_add or # my_nested_rref_add, depending on whether the callee is the rref owner. # The grad of tensor lives on the current rank, and the grad of the rref # tensor lives on the rref owner. - def _test_backward_rref(self, callee, rref_owner): - local_grads = None - t1 = torch.ones((3, 3), requires_grad=True) - t2 = torch.zeros((3, 3), requires_grad=True) - + def _backward_rref(self, callee, rref_owner, t1, t2, local_grads, sparse): local_ret = torch.add(t1, t2) - local_ret.sum().backward() + if sparse: + local_ret = torch.sparse.sum(local_ret) + else: + local_ret = local_ret.sum() + local_ret.backward() with dist_autograd.context() as context_id: - rref_t1 = rpc.remote( - rref_owner, _torch_ones, args=((3, 3),), kwargs={"requires_grad": True} - ) - + if sparse: + rref_t1 = rpc.remote( + rref_owner, build_sparse_tensor, args=(False, True,) + ) + else: + rref_t1 = rpc.remote( + rref_owner, _torch_ones, args=((3, 3),), kwargs={"requires_grad": True} + ) if callee == rref_owner: rref = rpc.remote(callee, my_rref_add, args=(rref_t1, t2)) else: @@ -952,7 +1120,11 @@ def _test_backward_rref(self, callee, rref_owner): callee, my_nested_rref_add, args=(rref_owner, rref_t1, t2) ) ret = rref.to_here() - dist_autograd.backward(context_id, [ret.sum()]) + if sparse: + ret = torch.sparse.sum(ret) + else: + ret = ret.sum() + dist_autograd.backward(context_id, [ret]) # verify grads on caller grads = dist_autograd.get_gradients(context_id) @@ -972,20 +1144,81 @@ def _test_backward_rref(self, callee, rref_owner): def test_backward_rref(self): callee = worker_name(self._next_rank()) rref_owner = callee - self._test_backward_rref(callee, rref_owner) + self._backward_rref( + callee, + rref_owner, + torch.rand((3, 3), requires_grad=True), + torch.rand((3, 3), requires_grad=True), + None, + False + ) + + @dist_init + def test_backward_rref_sparse(self): + callee = worker_name(self._next_rank()) + rref_owner = callee + self._backward_rref( + callee, + rref_owner, + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=True), + None, + True + ) @dist_init def test_backward_rref_multi(self): if self.rank > 0: callee = "worker0" rref_owner = callee - self._test_backward_rref(callee, rref_owner) + self._backward_rref( + callee, + rref_owner, + torch.rand((3, 3), requires_grad=True), + torch.rand((3, 3), requires_grad=True), + None, + False + ) + + @dist_init + def test_backward_rref_multi_sparse(self): + if self.rank > 0: + callee = "worker0" + rref_owner = callee + self._backward_rref( + callee, + rref_owner, + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=True), + None, + True + ) @dist_init def test_backward_rref_nested(self): callee = worker_name((self.rank + 1) % self.world_size) rref_owner = worker_name((self.rank + 2) % self.world_size) - self._test_backward_rref(callee, rref_owner) + self._backward_rref( + callee, + rref_owner, + torch.rand((3, 3), requires_grad=True), + torch.rand((3, 3), requires_grad=True), + None, + False + ) + + @dist_init + def test_backward_rref_nested_sparse(self): + callee = worker_name((self.rank + 1) % self.world_size) + rref_owner = worker_name((self.rank + 2) % self.world_size) + self._backward_rref( + callee, + rref_owner, + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=True), + None, + True + ) # In this test, every rank will serve as a parameter server (ps) and a # driver, and then kicks off trainers on the other three ranks. So, we have: @@ -996,13 +1229,19 @@ def test_backward_rref_nested(self): # # These four test ps-trainer groups run on completely separate autograd # graphs, but they share the same set of underlying RpcAgents. - def _test_trainer_ps(self, create_ref_fn, trainer_fn): - local_grads = None - t1 = torch.ones((3, 3), requires_grad=True) - t2 = torch.zeros((3, 3), requires_grad=True) + def _test_trainer_ps(self, create_ref_fn, trainer_fn, sparse): + if sparse: + t1 = build_sparse_tensor(requires_grad=True) + t2 = build_sparse_tensor(requires_grad=True) + else: + t1 = torch.ones((3, 3), requires_grad=True) + t2 = torch.zeros((3, 3), requires_grad=True) local_ret = torch.add(t1, t2) - local_ret.sum().backward() + if sparse: + torch.sparse.sum(local_ret).backward() + else: + local_ret.sum().backward() # create rref on self rref_t1 = rpc.remote( @@ -1018,7 +1257,7 @@ def _test_trainer_ps(self, create_ref_fn, trainer_fn): rpc.rpc_async( worker_name((self.rank + rank_diff) % self.world_size), trainer_fn, - args=(rref_t1, t2, worker_name(self.rank), rank_diff), + args=(rref_t1, t2, worker_name(self.rank), rank_diff, sparse), ) ) @@ -1045,7 +1284,19 @@ def _test_trainer_ps(self, create_ref_fn, trainer_fn): @dist_init def test_trainer_ps(self): - self._test_trainer_ps(create_tensor, _run_trainer) + self._test_trainer_ps( + create_tensor, + _run_trainer, + False + ) + + @dist_init + def test_trainer_ps_sparse(self): + self._test_trainer_ps( + build_sparse_tensor, + _run_trainer, + True + ) @dist_init def test_trainer_ps_torchscript_functions(self): @@ -1056,17 +1307,9 @@ def test_trainer_ps_torchscript_functions(self): import torch.distributed.rpc.api as api api._ignore_rref_leak = True - self._test_trainer_ps(create_torchscript_tensor, _run_trainer_torchscript) - - @dist_init - def test_backward_multiple_round_trips(self): - local_grads = None - t1 = torch.rand((3, 3), requires_grad=True) - t2 = torch.rand((3, 3)) - t3 = torch.rand((3, 3), requires_grad=True) - t4 = torch.rand((3, 3)) - t5 = torch.rand((3, 3), requires_grad=True) + self._test_trainer_ps(create_torchscript_tensor, _run_trainer_torchscript, False) + def _backward_multiple_round_trips(self, t1, t2, t3, t4, t5, local_grads, sparse): for exec_mode in [ExecMode.LOCAL, ExecMode.RPC_SYNC, ExecMode.REMOTE]: with dist_autograd.context() as context_id: # Multiple RPCs between different nodes. @@ -1074,15 +1317,44 @@ def test_backward_multiple_round_trips(self): val = self._exec_func(exec_mode, torch.mul, t3, val) s1 = self._exec_func(exec_mode, torch.stack, (t4, val)) s2 = self._exec_func(exec_mode, torch.stack, (t5, val)) - val = self._exec_func(exec_mode, torch.bmm, s1, s2) - val = self._exec_func(exec_mode, torch.matmul, val, val) - loss = val.sum() + if sparse: + val = self._exec_func(exec_mode, torch.mul, s1, s2) + val = self._exec_func(exec_mode, torch.mul, val, val) + loss = torch.sparse.sum(val) + else: + val = self._exec_func(exec_mode, torch.bmm, s1, s2) + val = self._exec_func(exec_mode, torch.matmul, val, val) + loss = val.sum() ret = self._verify_backwards( exec_mode, [loss], context_id, local_grads, t1, t2, t3, t4, t5 ) local_grads = ret if ret else local_grads + @dist_init + def test_backward_multiple_round_trips(self): + self._backward_multiple_round_trips( + torch.rand((3, 3), requires_grad=True), + torch.rand((3, 3)), + torch.rand((3, 3), requires_grad=True), + torch.rand((3, 3)), + torch.rand((3, 3), requires_grad=True), + None, + False + ) + + @dist_init + def test_backward_multiple_round_trips_sparse(self): + self._backward_multiple_round_trips( + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=False), + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=False), + build_sparse_tensor(requires_grad=True), + None, + True + ) + @dist_init def test_backward_different_tensor_dims(self): local_grads = None @@ -1317,41 +1589,70 @@ def test_backward_multiple_roots(self): exec_mode, [r1, r2, r3, r4], context_id, local_grads, t1, t2 ) - @dist_init - def test_backward_different_dtypes(self): + def _backward_different_dtypes(self, t1, t2, sparse): local_grads = None - t1 = torch.rand((3, 3), requires_grad=True, dtype=torch.float32) - t2 = torch.rand((3, 3), requires_grad=True, dtype=torch.float64) for exec_mode in [ExecMode.LOCAL, ExecMode.REMOTE]: with dist_autograd.context() as context_id: - loss = self._exec_func(exec_mode, torch.add, t1, t2).sum() - + loss = self._exec_func(exec_mode, torch.add, t1, t2) + if sparse: + loss = torch.sparse.sum(loss) + else: + loss = loss.sum() local_grads = self._verify_backwards( exec_mode, [loss], context_id, local_grads, t1, t2 ) @dist_init - def test_backward_simple_python_udf(self): - # Run the same code locally and with dist autograd and verify gradients - # are same. + def test_backward_different_dtypes(self): + self._backward_different_dtypes( + torch.rand((3, 3), requires_grad=True, dtype=torch.float32), + torch.rand((3, 3), requires_grad=True, dtype=torch.float64), + False + ) + + @dist_init + def test_backward_different_dtypes_sparse(self): + self._backward_different_dtypes( + build_sparse_tensor(requires_grad=True, dtype=torch.float32), + build_sparse_tensor(requires_grad=True, dtype=torch.float64), + True + ) + + # Run the same code locally and with dist autograd and verify gradients + # are same. + def _backward_simple_python_udf(self, t1, t2, sparse): local_grads = None - t1 = torch.rand((3, 3), requires_grad=True) - t2 = torch.rand((3, 3), requires_grad=True) for exec_mode in [ExecMode.LOCAL, ExecMode.REMOTE]: with dist_autograd.context() as context_id: ret = self._exec_func(exec_mode, my_py_add, t1, t2) - loss = ret.sum() + if sparse: + loss = torch.sparse.sum(ret) + else: + loss = ret.sum() local_grads = self._verify_backwards( exec_mode, [loss], context_id, local_grads, t1, t2 ) @dist_init - def test_backward_simple_script_call(self): - # Run the same code locally and with dist autograd and verify gradients - # are same. + def test_backward_simple_python_udf(self): + self._backward_simple_python_udf( + torch.rand(3, 3, requires_grad=True), + torch.rand(3, 3, requires_grad=True), + False + ) + + @dist_init + def test_backward_simple_python_udf_sparse(self): + self._backward_simple_python_udf( + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=True), + True + ) + + # Run the same code locally and with dist autograd and verify gradients + # are same. + def _backward_simple_script_call(self, t1, t2, sparse): local_grads = None - t1 = torch.rand((3, 3), requires_grad=True) - t2 = torch.rand((3, 3), requires_grad=True) for exec_mode in [ ExecMode.LOCAL, ExecMode.RPC_SYNC, @@ -1360,12 +1661,31 @@ def test_backward_simple_script_call(self): ]: with dist_autograd.context() as context_id: forward_ret = self._exec_func(exec_mode, my_script_add, t1, t2) - loss = forward_ret.sum() + if sparse: + loss = torch.sparse.sum(forward_ret) + else: + loss = forward_ret.sum() ret = self._verify_backwards( exec_mode, [loss], context_id, local_grads, t1, t2 ) local_grads = ret if ret else local_grads + @dist_init + def test_backward_simple_script_call(self): + self._backward_simple_script_call( + torch.rand(3, 3, requires_grad=True), + torch.rand(3, 3, requires_grad=True), + False + ) + + @dist_init + def test_backward_simple_script_call_sparse(self): + self._backward_simple_script_call( + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=True), + True + ) + @staticmethod def _complex_python_udf(t1, t2): t3 = torch.nn.functional.linear(t1, t2) @@ -1474,17 +1794,17 @@ def _nested_python_udf(t1, t2, dst): t3 = t1 * t2 t4 = t1 + t2 res = rpc.rpc_sync(worker_name(dst), my_py_add, args=(t3, t4)) - return torch.linalg.multi_dot([t1, t2, t3, t4, res]) + return t1 * t2 * t3 * t4 * res - @dist_init - def test_backwards_nested_python_udf(self): - # Run equivalent of _nested_python_udf locally. - t1 = torch.rand((3, 3), requires_grad=True) - t2 = torch.rand((3, 3), requires_grad=True) + def _backwards_nested_python_udf(self, t1, t2, sparse): t3 = t1 * t2 t4 = t1 + t2 res = t3 + t4 - loss = torch.linalg.multi_dot([t1, t2, t3, t4, res]).sum() + loss = t1 * t2 * t3 * t4 * res + if sparse: + loss = torch.sparse.sum(loss) + else: + loss = loss.sum() torch.autograd.backward([loss]) # Now run distributed autograd. @@ -1494,12 +1814,33 @@ def test_backwards_nested_python_udf(self): DistAutogradTest._nested_python_udf, args=(t1, t2, self._next_rank()), ) - dist_autograd.backward(context_id, [loss.sum()]) - + if sparse: + loss = torch.sparse.sum(loss) + else: + loss = loss.sum() + dist_autograd.backward(context_id, [loss]) grads = dist_autograd.get_gradients(context_id) self.assertEqual(t1.grad, grads[t1]) self.assertEqual(t2.grad, grads[t2]) + @dist_init + def test_backwards_nested_python_udf(self): + # Run equivalent of _nested_python_udf locally. + self._backwards_nested_python_udf( + torch.rand(3, 3, requires_grad=True), + torch.rand(3, 3, requires_grad=True), + False + ) + + @dist_init + def test_backwards_nested_python_udf_sparse(self): + # Run equivalent of _nested_python_udf locally. + self._backwards_nested_python_udf( + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=True), + True + ) + _test_clean_context_backward_context_id = None class MyBackwardFunc(Function): @@ -1594,8 +1935,7 @@ def _call_remote_embedding(cls, embedding_rref, input, offsets, per_sample_weigh def _get_grad(cls, embedding_rref, context_id): embedding = embedding_rref.local_value() grad_map = dist_autograd.get_gradients(context_id) - # Can't send sparse tensors over RPC: https://github.com/pytorch/pytorch/issues/30807 - return grad_map[embedding.weight].to_dense() + return grad_map[embedding.weight] @dist_init def test_embedding_bag_with_no_grad_tensors(self): @@ -1637,26 +1977,27 @@ def test_embedding_bag_with_no_grad_tensors(self): args=(remote_embedding, context_id), ) - self.assertEqual(local_grad.to_dense(), remote_grad) + self.assertEqual(local_grad, remote_grad) @classmethod - def _mixed_requires_grad(cls, t1, t2): + def _mixed_requires_grad_operaton(cls, t1, t2): if t2.requires_grad: return t1 - t2 else: return t1 * t2 - @dist_init - def test_mixed_requires_grad(self): + def _mixed_requires_grad(self, t1, t2, sparse): for exec_mode in [ExecMode.RPC_SYNC, ExecMode.REMOTE]: - t1 = torch.rand((3, 3), requires_grad=True) - t2 = torch.rand((3, 3), requires_grad=False) with dist_autograd.context() as context_id: ret = self._exec_func( - exec_mode, DistAutogradTest._mixed_requires_grad, t1, t2 + exec_mode, DistAutogradTest._mixed_requires_grad_operaton, t1, t2 ) self.assertEqual(t1 * t2, ret) - dist_autograd.backward(context_id, [ret.sum()]) + if sparse: + loss = torch.sparse.sum(ret) + else: + loss = ret.sum() + dist_autograd.backward(context_id, [loss]) self.assertTrue(t1.requires_grad) self.assertFalse(t2.requires_grad) grads = dist_autograd.get_gradients(context_id) @@ -1664,6 +2005,22 @@ def test_mixed_requires_grad(self): self.assertNotIn(t2, grads) self.assertEqual(t2, grads[t1]) + @dist_init + def test_mixed_requires_grad(self): + self._mixed_requires_grad( + torch.rand(3, 3, requires_grad=True), + torch.rand(3, 3, requires_grad=False), + False + ) + + @dist_init + def test_mixed_requires_grad_sparse(self): + self._mixed_requires_grad( + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=False), + True + ) + class TestDebugInfoFunc(Function): @staticmethod def forward(ctx, input): @@ -1801,37 +2158,69 @@ def test_backward_accumulate_grads(self): @staticmethod def _test_nested_backward_accumulate_grads(t1, t2, dst_rank): - return rpc.rpc_sync(worker_name(dst_rank), torch.matmul, args=(t1, t2)) + return rpc.rpc_sync(worker_name(dst_rank), torch.add, args=(t1, t2)) - @dist_init - def test_nested_backward_accumulate_grads(self): - t1 = torch.rand((3, 3), requires_grad=True) - t2 = torch.rand((3, 3), requires_grad=True) + def _nested_backward_accumulate_grads(self, t1, t2, sparse): with dist_autograd.context() as context_id: - loss = rpc.rpc_sync( + ret = rpc.rpc_sync( worker_name(self._next_rank()), DistAutogradTest._test_nested_backward_accumulate_grads, args=(t1, t2, self._next_rank()), - ).sum() - + ) + if sparse: + loss = torch.sparse.sum(ret) + else: + loss = ret.sum() # Run backward twice. dist_autograd.backward(context_id, [loss], retain_graph=True) dist_autograd.backward(context_id, [loss]) @dist_init - def test_multiple_backward(self): - t1 = torch.rand((3, 3), requires_grad=True) - t2 = torch.rand((3, 3), requires_grad=True) + def test_nested_backward_accumulate_grads(self): + self._nested_backward_accumulate_grads( + torch.rand(3, 3, requires_grad=True), + torch.rand(3, 3, requires_grad=True), + False + ) + + @dist_init + def test_nested_backward_accumulate_grads_sparse(self): + self._nested_backward_accumulate_grads( + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=True), + True + ) + + def _multiple_backward(self, t1, t2, sparse): with dist_autograd.context() as context_id: loss = rpc.rpc_sync( worker_name(self._next_rank()), torch.add, - args=(t1, t2)).sum() - + args=(t1, t2)) + if sparse: + loss = torch.sparse.sum(loss) + else: + loss = loss.sum() # Run backward in a loop multiple times. for i in range(1000): dist_autograd.backward(context_id, [loss], retain_graph=True) + @dist_init + def test_multiple_backward(self): + self._multiple_backward( + torch.rand(3, 3, requires_grad=True), + torch.rand(3, 3, requires_grad=True), + False + ) + + @dist_init + def test_multiple_backward_sparse(self): + self._multiple_backward( + build_sparse_tensor(requires_grad=True), + build_sparse_tensor(requires_grad=True), + True + ) + @dist_init(clean_shutdown=False) def test_multiple_backward_with_errors(self): initialize_pg(self.file_init_method, self.rank, self.world_size) diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 1a44ef6e63b65..e0ef915ee8937 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -209,10 +209,13 @@ def add_rref_to_value(rref, value): def run_nested_pickle(pickle_cls_instance, tensor): return pickle_cls_instance.t + tensor -def build_sparse_tensor(): +def build_sparse_tensor(coalesce=False): i = [[0, 1, 1], [2, 0, 2]] v = [3, 4, 5] - return torch.sparse_coo_tensor(i, v, (2, 3)) + tensor = torch.sparse_coo_tensor(i, v, (2, 3)) + if coalesce: + tensor = tensor.coalesce() + return tensor def build_complex_tensors(): a = torch.ones(3, 3) @@ -238,6 +241,12 @@ def my_function(a, b, c): def my_tensor_function(a, b): return a + b +def my_container_sum(a): + result = a[0] + for tensor in a[1:]: + result += tensor + return result + def my_sleep_func(seconds=1): time.sleep(seconds) @@ -275,6 +284,14 @@ def nested_rpc(dst): return rpc.rpc_sync(dst, torch.add, args=(torch.ones(2, 2), 1)) +def nested_rpc_sparse(dst): + return rpc.rpc_sync( + dst, + torch.add, + args=(build_sparse_tensor(), build_sparse_tensor()) + ) + + def multi_layer_nested_async_rpc(dst, world_size, ttl): # this method returns immediately without blocking the callee, but will # generate additional requests. @@ -296,10 +313,29 @@ def nested_rref(dst): ) +def nested_rref_sparse(dst): + return ( + rpc.remote( + dst, + torch.add, + args=(build_sparse_tensor(), build_sparse_tensor()) + ), + rpc.remote( + dst, + torch.add, + args=(build_sparse_tensor(), build_sparse_tensor()) + ), + ) + + def nested_remote(dst): rref = rpc.remote(dst, torch.add, args=(torch.ones(2, 2), 3)) return rref.to_here() +def nested_remote_sparse(dst): + rref = rpc.remote(dst, torch.add, args=(build_sparse_tensor(), build_sparse_tensor())) + return rref.to_here() + def rref_forward_chain(dst, world_size, rref, ttl): if ttl > 0: @@ -328,6 +364,12 @@ def heavy_rpc(tensor): return 0 +def heavy_rpc_sparse(tensor): + for i in range(1, 100): + tensor *= i + tensor = tensor / (i + 1) + return 0 + @torch.jit.script def heavy_rpc_torchscript(tensor): for i in range(1, 100): @@ -600,6 +642,57 @@ def __init__(self, init_method): load_tests = load_tests +class MyEmbeddingBagModel(torch.nn.Module): + def __init__(self, sparse): + super().__init__() + self.eb = torch.nn.EmbeddingBag( + 10, + 10, + sparse=sparse + ) + + def forward(self, x): + return self.eb(x) + + +class MyParameterServer: + def __init__(self, trainers): + self.lock = Lock() + self.trainers = trainers + self.iteration = 0 + self.updates = 0 + self.futures = [] + self.total = None + self.gradient = None + + @staticmethod + def get_gradient(rref): + return rref.local_value().gradient + + @staticmethod + @rpc.functions.async_execution + def average(rref, riteration, tensor): + self = rref.local_value() + fut = torch.futures.Future() + with self.lock: + if riteration > self.iteration: + self.iteration = riteration + self.updates = 0 + self.futures.clear() + self.futures.append(fut) + if self.total is None: + self.total = tensor + else: + self.total += tensor + self.updates += 1 + if self.trainers == self.updates: + self.gradient = self.total / float(self.trainers) + for fut in self.futures: + result = self.total / float(self.trainers) + fut.set_result(result) + return fut + + class RpcTest(RpcAgentTestFixture): @dist_init def test_worker_id(self): @@ -641,10 +734,26 @@ def test_self_add(self): def test_send_to_rank(self): dst_rank = (self.rank + 1) % self.world_size + # Test dense tensor for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) self.assertEqual(ret, torch.ones(2, 2) + 1) + # Test sparse tensor + for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: + x = build_sparse_tensor() + y = build_sparse_tensor() + expected_tensor = (x + y) + ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y)) + self.assertEqual(expected_tensor, ret) + + for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: + x = build_sparse_tensor(coalesce=True) + y = build_sparse_tensor(coalesce=True) + expected_tensor = (x + y) + ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y)) + self.assertEqual(expected_tensor, ret) + # Test invalid ranks for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: with self.assertRaises(RuntimeError): @@ -662,41 +771,120 @@ def test_send_to_rank(self): with self.assertRaises(ValueError): self._run_func_in_mode(dst_rank - 0.5, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) + def _self_py_udf_remote(self, worker_info, x, y, z): + rref = rpc.remote(worker_info, my_function, args=(x, y, z)) + self.assertEqual(rref.to_here(), x + y + z) + @dist_init def test_self_py_udf_remote(self): - self_worker_info = rpc.get_worker_info() - rref = rpc.remote(self_worker_info, my_function, args=(torch.ones(2, 2), 1, 3)) - self.assertEqual(rref.to_here(), torch.ones(2, 2) + 1 + 3) + self._self_py_udf_remote( + rpc.get_worker_info(), + torch.ones(2, 2), + 1, + 3 + ) + + @dist_init + def test_self_py_udf_remote_sparse(self): + self._self_py_udf_remote( + rpc.get_worker_info(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor() + ) + - def _test_self_remote_rref_as_rpc_arg(self, dst): + def _self_remote_rref_as_rpc_arg(self, dst, x, y, z): self_worker_info = rpc.get_worker_info() - rref = rpc.remote(self_worker_info, my_function, args=(torch.ones(2, 2), 1, 3)) - fut = rpc.rpc_async(dst, add_rref_to_value, args=(rref, torch.ones(2, 2))) - ret = rpc.rpc_sync(dst, add_rref_to_value, args=(rref, torch.ones(2, 2) + 1)) - self.assertEqual(ret, torch.ones(2, 2) + 1 + 3 + torch.ones(2, 2) + 1) - self.assertEqual(fut.wait(), torch.ones(2, 2) + 1 + 3 + torch.ones(2, 2)) + rref = rpc.remote(self_worker_info, my_function, args=(x, y, z)) + fut = rpc.rpc_async(dst, add_rref_to_value, args=(rref, x)) + ret = rpc.rpc_sync(dst, add_rref_to_value, args=(rref, x + y)) + self.assertEqual(ret, x + y + z + x + y) + self.assertEqual(fut.wait(), x + y + z + x) @dist_init def test_self_remote_rref_as_rpc_arg(self): dst = worker_name((self.rank + 1) % self.world_size) - self._test_self_remote_rref_as_rpc_arg(dst) + self._self_remote_rref_as_rpc_arg( + dst, + torch.ones(2, 2), + 1, + 3 + ) + + @dist_init + def test_self_remote_rref_as_rpc_arg_sparse(self): + dst = worker_name((self.rank + 1) % self.world_size) + self._self_remote_rref_as_rpc_arg( + dst, + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor() + ) @dist_init def test_self_remote_rref_as_self_rpc_arg(self): - self._test_self_remote_rref_as_rpc_arg(rpc.get_worker_info()) + self._self_remote_rref_as_rpc_arg( + rpc.get_worker_info(), + torch.ones(2, 2), + 1, + 3 + ) - def _test_self_remote_rref_as_remote_arg(self, dst): + @dist_init + def test_self_remote_rref_as_self_rpc_arg_sparse(self): + self._self_remote_rref_as_rpc_arg( + rpc.get_worker_info(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor() + ) + + def _self_remote_rref_as_remote_arg(self, dst, x, y, z): self_worker_info = rpc.get_worker_info() - rref = rpc.remote(self_worker_info, my_function, args=(torch.ones(2, 2), 1, 3)) - ret_rref = rpc.remote(dst, add_rref_to_value, args=(rref, torch.ones(2, 2))) + rref = rpc.remote(self_worker_info, my_function, args=(x, y, z)) + ret_rref = rpc.remote(dst, add_rref_to_value, args=(rref, x)) self.assertEqual( - ret_rref.to_here(), torch.ones(2, 2) + 1 + 3 + torch.ones(2, 2) + ret_rref.to_here(), x + y + z + x ) @dist_init def test_self_remote_rref_as_remote_arg(self): dst = worker_name((self.rank + 1) % self.world_size) - self._test_self_remote_rref_as_remote_arg(dst) + self._self_remote_rref_as_remote_arg( + dst, + torch.ones(2, 2), + 1, + 3 + ) + + @dist_init + def test_self_remote_rref_as_remote_arg_sparse(self): + dst = worker_name((self.rank + 1) % self.world_size) + self._self_remote_rref_as_remote_arg( + dst, + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor() + ) + + @dist_init + def test_self_remote_rref_as_self_remote_arg(self): + self._self_remote_rref_as_remote_arg( + rpc.get_worker_info(), + torch.ones(2, 2), + 1, + 3 + ) + + @dist_init + def test_self_remote_rref_as_self_remote_arg_sparse(self): + self._self_remote_rref_as_remote_arg( + rpc.get_worker_info(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor() + ) @dist_init def test_rref_proxy_non_exist(self): @@ -816,10 +1004,6 @@ def test_rref_proxy_class(self): def test_rref_proxy_class_self(self): self._test_rref_proxy_class(rpc.get_worker_info()) - @dist_init - def test_self_remote_rref_as_self_remote_arg(self): - self._test_self_remote_rref_as_remote_arg(rpc.get_worker_info()) - @mock.patch.object(torch.distributed.autograd, "_init") @mock.patch.object(torch.distributed.rpc.api, "_set_and_start_rpc_agent") @dist_init(setup_rpc=False) @@ -911,7 +1095,7 @@ def test_reinit(self): ) rpc.shutdown() - def test_world_size_one(self): + def _world_size_one(self, a, b): if self.rank == 0: rpc.init_rpc( name="me", @@ -921,32 +1105,51 @@ def test_world_size_one(self): rpc_backend_options=self.rpc_backend_options, ) - expect = torch.ones(2, 2) * 2 - result = rpc.rpc_sync( - "me", - my_tensor_function, - args=(torch.ones(2, 2), torch.ones(2, 2)) - ) - self.assertEqual(expect, result) - - expect = torch.ones(3, 3) * 2 - result = rpc.rpc_async( - "me", - my_tensor_function, - args=(torch.ones(3, 3), torch.ones(3, 3)) - ).wait() - self.assertEqual(expect, result) + def _rpc_sync(x, y): + expect = x * 2 + result = rpc.rpc_sync( + "me", + my_tensor_function, + args=(x, y) + ) + self.assertEqual(expect, result) + + def _rpc_async(x, y): + expect = x * 2 + result = rpc.rpc_async( + "me", + my_tensor_function, + args=(x, y) + ).wait() + self.assertEqual(expect, result) + + def _remote(x, y): + expect = x * 2 + result = rpc.remote( + "me", + my_tensor_function, + args=(x, y) + ).to_here() + self.assertEqual(expect, result) - expect = torch.ones(4, 4) * 2 - result = rpc.remote( - "me", - my_tensor_function, - args=(torch.ones(4, 4), torch.ones(4, 4)) - ).to_here() - self.assertEqual(expect, result) + _rpc_sync(a, b) + _rpc_async(a, b) + _remote(a, b) rpc.shutdown() + def test_world_size_one(self): + self._world_size_one( + torch.ones(2, 2), + torch.ones(2, 2) + ) + + def test_world_size_one_sparse(self): + self._world_size_one( + build_sparse_tensor(), + build_sparse_tensor() + ) + @dist_init(setup_rpc=False) def test_invalid_names(self): from torch.distributed.rpc import WorkerInfo @@ -1027,17 +1230,30 @@ def test_nonzero(self): ret = rpc.rpc_sync(worker_name(dst_rank), torch.nonzero, args=(x,)) self.assertEqual(ret, x.nonzero()) - @dist_init - def test_multi_rpc(self): + def _multi_rpc(self, sparse): dst_rank = (self.rank + 1) % self.world_size for i in range(20): n = i + self.rank + 1 + if sparse: + x = build_sparse_tensor() * n + y = build_sparse_tensor() * n + else: + x = torch.ones(2, 2) + y = torch.ones(2, 2) ret = rpc.rpc_sync( worker_name(dst_rank), torch.add, - args=(torch.ones(n, n), torch.ones(n, n)), + args=(x, y), ) - self.assertEqual(ret, torch.ones(n, n) * 2) + self.assertEqual(ret, x * 2) + + @dist_init + def test_multi_rpc(self): + self._multi_rpc(False) + + @dist_init + def test_multi_rpc_sparse(self): + self._multi_rpc(True) @dist_init def test_future_wait_twice(self): @@ -1053,7 +1269,7 @@ def test_future_wait_twice(self): with self.assertRaisesRegex(ValueError, "Expected error"): fut.wait() - def _run_uneven_workload(self, num_repeat=30): + def _run_uneven_workload(self, f, x, num_repeat=30): # worker0 drives and waits for worker1 and worker2 # throughout the test. if self.rank == 0: @@ -1063,7 +1279,7 @@ def _run_uneven_workload(self, num_repeat=30): dst = "worker1" futs = [] for _ in range(num_repeat): - fut = rpc.rpc_async(dst, heavy_rpc, args=(torch.ones(100, 100),)) + fut = rpc.rpc_async(dst, f, args=(x,)) futs.append(fut) for fut in torch.futures.collect_all(futs).wait(): @@ -1075,13 +1291,13 @@ def _run_uneven_workload(self, num_repeat=30): dst = "worker2" futs = [] for _ in range(num_repeat): - fut = rpc.rpc_async(dst, heavy_rpc, args=(torch.ones(100, 100),)) + fut = rpc.rpc_async(dst, f, args=(x,)) futs.append(fut) for val in torch.futures.wait_all(futs): self.assertEqual(val, 0) - def test_wait_all_workers(self): + def _wait_all_workers(self, f, x): initialize_pg(self.file_init_method, self.rank, self.world_size) rpc.init_rpc( name="worker%d" % self.rank, @@ -1091,7 +1307,7 @@ def test_wait_all_workers(self): rpc_backend_options=self.rpc_backend_options, ) - self._run_uneven_workload() + self._run_uneven_workload(f, x) # worker0 calls this at the end after waiting for RPC responses. # worker1/2 calls this immediately and has some works after it. @@ -1103,7 +1319,13 @@ def test_wait_all_workers(self): dist.barrier() rpc.shutdown(graceful=False) - def test_wait_all_workers_twice(self): + def test_wait_all_workers_dense(self): + self._wait_all_workers(heavy_rpc, torch.ones(100, 100)) + + def test_wait_all_workers_sparse(self): + self._wait_all_workers(heavy_rpc_sparse, build_sparse_tensor()) + + def _wait_all_workers_twice(self, f, x): initialize_pg(self.file_init_method, self.rank, self.world_size) rpc.init_rpc( name="worker%d" % self.rank, @@ -1113,7 +1335,7 @@ def test_wait_all_workers_twice(self): rpc_backend_options=self.rpc_backend_options, ) - self._run_uneven_workload() + self._run_uneven_workload(f, x) # worker0 calls this at the end after waiting for RPC responses. # worker1/2 calls this immediately and has some works after it. @@ -1126,6 +1348,12 @@ def test_wait_all_workers_twice(self): dist.barrier() rpc.shutdown(graceful=False) + def test_wait_all_workers_twice_dense(self): + self._wait_all_workers_twice(heavy_rpc, torch.ones(100, 100)) + + def test_wait_all_workers_twice_sparse(self): + self._wait_all_workers_twice(heavy_rpc_sparse, build_sparse_tensor()) + @dist_init def test_all_gather(self): info = rpc.get_worker_info() @@ -1211,7 +1439,7 @@ def test_rpc_barrier_multithreaded(self): @dist_init def test_graceful_shutdown_with_uneven_workload(self): """Test graceful termination.""" - self._run_uneven_workload() + self._run_uneven_workload(heavy_rpc, torch.ones(100, 100)) @dist_init(setup_rpc=False) def test_shutdown_followed_by_rpc(self): @@ -2081,6 +2309,16 @@ def test_py_tensors_in_container(self): ) self.assertEqual(ret, my_complex_tensor_function(a, b, c)) + @dist_init + def test_py_sparse_tensors_in_container(self): + n = self.rank + 1 + dst_rank = n % self.world_size + a = [build_sparse_tensor(), build_sparse_tensor()] + ret = rpc.rpc_sync( + worker_name(dst_rank), my_container_sum, args=(a,) + ) + self.assertEqual(ret, my_container_sum(a)) + @dist_init def test_py_nested_pickle(self): n = self.rank + 1 @@ -2137,16 +2375,23 @@ def test_py_raise_in_user_func_escaped_str(self): else: self.assertTrue(False, "expected raise_func_escape to raise ValueError.") - @dist_init - def test_nested_rpc(self): + def _nested_rpc(self, f, expected): n = self.rank + 1 dst_rank = n % self.world_size ret = rpc.rpc_sync( worker_name(dst_rank), - nested_rpc, + f, args=(worker_name(self.rank),), ) - self.assertEqual(ret, torch.ones(2, 2) + 1) + self.assertEqual(ret, expected) + + @dist_init + def test_nested_rpc(self): + self._nested_rpc(nested_rpc, torch.ones(2, 2) + 1) + + @dist_init + def test_nested_rpc_sparse(self): + self._nested_rpc(nested_rpc_sparse, build_sparse_tensor() * 2) def _stress_test_rpc(self, f, repeat=1000, args=()): n = self.rank + 1 @@ -2174,31 +2419,65 @@ def test_stress_light_rpc(self): def test_stress_heavy_rpc(self): self._stress_test_rpc(heavy_rpc, repeat=20, args=(torch.ones(100, 100),)) + @dist_init + def test_stress_heavy_rpc_sparse(self): + self._stress_test_rpc(heavy_rpc_sparse, repeat=20, args=(build_sparse_tensor(),)) + @dist_init def test_stress_heavy_rpc_torchscript(self): self._stress_test_rpc(heavy_rpc_torchscript, repeat=20, args=(torch.ones(100, 100),)) - @dist_init - def test_builtin_remote_ret(self): + def _builtin_remote_ret(self, x, y, expected): n = self.rank + 1 dst_rank = n % self.world_size rref = rpc.remote( worker_name(dst_rank), torch.add, - args=(torch.ones(n, n), torch.ones(n, n)), + args=(x, y), ) - self.assertEqual(rref.to_here(), torch.ones(n, n) * 2) + self.assertEqual(rref.to_here(), expected) @dist_init - def test_builtin_remote_self(self): + def test_builtin_remote_ret(self): + self._builtin_remote_ret( + torch.ones(2, 2), + torch.ones(2, 2), + torch.ones(2, 2) * 2 + ) + + @dist_init + def test_builtin_remote_ret_sparse(self): + self._builtin_remote_ret( + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor() * 2 + ) + + def _builtin_remote_self(self, x, y, expected): rref = rpc.remote( worker_name(self.rank), torch.add, - args=(torch.ones(2, 2), torch.ones(2, 2)), + args=(x, y), + ) + self.assertEqual(rref.local_value(), expected) + + @dist_init + def test_builtin_remote_self(self): + self._builtin_remote_self( + torch.ones(2, 2), + torch.ones(2, 2), + torch.ones(2, 2) * 2 + ) + + @dist_init + def test_builtin_remote_self_sparse(self): + self._builtin_remote_self( + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor() * 2 ) - self.assertEqual(rref.local_value(), torch.ones(2, 2) * 2) - def _test_multi_remote_call(self, fn, args_fn=lambda x: (), kwargs_fn=lambda x: {}): + def _test_multi_remote_call(self, fn, sparse, args_fn=lambda x, y: (), kwargs_fn=lambda x, y: {}): m = 10 n = self.rank + 1 dst_rank = n % self.world_size @@ -2210,21 +2489,35 @@ def _test_multi_remote_call(self, fn, args_fn=lambda x: (), kwargs_fn=lambda x: rpc.remote( worker_name(dst_rank), fn, - args=args_fn(n), - kwargs=kwargs_fn(n), + args=args_fn(n, sparse), + kwargs=kwargs_fn(n, sparse), ) ) - expected.append(fn(*args_fn(n), **kwargs_fn(n))) + expected.append(fn(*args_fn(n, sparse), **kwargs_fn(n, sparse))) for i in range(m): self.assertEqual(rrefs[i].to_here(), expected[i]) + @staticmethod + def _multi_args_fn(n, sparse=False): + if sparse: + return (build_sparse_tensor(), build_sparse_tensor()) + else: + return (torch.ones(n, n), torch.ones(n, n)) + @dist_init def test_multi_builtin_remote_ret(self): - def args_fn(n): - return (torch.ones(n, n), torch.ones(n, n)) + self._test_multi_remote_call( + torch.add, False, + args_fn=RpcTest._multi_args_fn + ) - self._test_multi_remote_call(torch.add, args_fn=args_fn) + @dist_init + def test_multi_builtin_remote_ret_sparse(self): + self._test_multi_remote_call( + torch.add, True, + args_fn=RpcTest._multi_args_fn + ) @dist_init def test_py_udf_remote(self): @@ -2237,82 +2530,177 @@ def test_py_udf_remote(self): ) self.assertEqual(rref.to_here(), my_function(n, n + 1, n + 2)) - @dist_init - def test_multi_py_udf_remote(self): - def kwargs_fn(n): + @staticmethod + def _multi_kwargs_fn(n, sparse=False): + if sparse: + return { + "a": build_sparse_tensor(), + "b": build_sparse_tensor(), + "c": build_sparse_tensor() + } + else: return {"a": torch.ones(n, n), "b": torch.ones(n, n), "c": torch.ones(n, n)} - self._test_multi_remote_call(my_function, kwargs_fn=kwargs_fn) + @dist_init + def test_multi_py_udf_remote(self): + self._test_multi_remote_call( + my_function, + False, + kwargs_fn=RpcTest._multi_kwargs_fn + ) @dist_init - def test_py_rref_args(self): + def test_multi_py_udf_remote_sparse(self): + self._test_multi_remote_call( + my_function, + True, + kwargs_fn=RpcTest._multi_kwargs_fn + ) + + def _py_rref_args(self, a, b, x, y, expected): n = self.rank + 1 dst_rank = n % self.world_size rref_a = rpc.remote( - worker_name(dst_rank), torch.add, args=(torch.ones(n, n), 2) + worker_name(dst_rank), torch.add, args=(a, b) ) rref_b = rpc.remote( - worker_name(dst_rank), torch.add, args=(torch.ones(n, n), 1) + worker_name(dst_rank), torch.add, args=(x, y) ) rref_c = rpc.remote( worker_name(dst_rank), my_rref_function, args=(rref_a, rref_b) ) - self.assertEqual(rref_c.to_here(), torch.ones(n, n) + 4) + self.assertEqual(rref_c.to_here(), expected) @dist_init - def test_py_rref_args_user_share(self): + def test_py_rref_args(self): + self._py_rref_args( + torch.ones(2, 2), + 1, + torch.ones(2, 2), + 2, + torch.ones(2, 2) * 2 + 3) + + @dist_init + def test_py_rref_args_sparse(self): + self._py_rref_args( + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor() * 4 + ) + + def _py_rref_args_user_share(self, a, b, c, x, y, z, expected): n = self.rank + 1 owner_rank = n % self.world_size user_rank = (n + 1) % self.world_size rref_a = rpc.remote( - worker_name(owner_rank), my_function, args=(torch.ones(n, n), 2, 0) + worker_name(owner_rank), my_function, args=(a, b, c) ) rref_b = rpc.remote( - worker_name(owner_rank), my_function, args=(torch.ones(n, n), 1, 0) + worker_name(owner_rank), my_function, args=(x, y, z) ) rref_c = rpc.remote( worker_name(user_rank), my_rref_function, args=(rref_a, rref_b) ) - self.assertEqual(rref_c.to_here(), torch.ones(n, n) + 4) + self.assertEqual(rref_c.to_here(), expected) @dist_init - def test_py_rpc_rref_args(self): + def test_py_rref_args_user_share(self): + self._py_rref_args_user_share( + torch.ones(2, 2), + 1, + 2, + torch.ones(2, 2), + 3, + 4, + torch.ones(2, 2) * 2 + 10 + ) + + @dist_init + def test_py_rref_args_user_share_sparse(self): + self._py_rref_args_user_share( + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor() * 6 + ) + + def _py_rpc_rref_args(self, a, b, c, x, y, z, expected): n = self.rank + 1 dst_rank = n % self.world_size rref_a = rpc.remote( - worker_name(dst_rank), my_function, args=(torch.ones(n, n), 2, 0) + worker_name(dst_rank), my_function, args=(a, b, c) ) rref_b = rpc.remote( - worker_name(dst_rank), my_function, args=(torch.ones(n, n), 1, 0) + worker_name(dst_rank), my_function, args=(x, y, z) ) c = rpc.rpc_sync( worker_name(dst_rank), my_rref_function, args=(rref_a, rref_b) ) + self.assertEqual(c, expected) - self.assertEqual(c, torch.ones(n, n) + 4) + @dist_init + def test_py_rpc_rref_args(self): + self._py_rpc_rref_args( + torch.ones(2, 2), + 1, + 2, + torch.ones(2, 2), + 3, + 4, + torch.ones(2, 2) * 2 + 10 + ) @dist_init - def test_nested_remote(self): + def test_py_rpc_rref_args_sparse(self): + self._py_rpc_rref_args( + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor() * 6 + ) + + def _nested_remote(self, f, expected): n = self.rank + 1 dst_rank1 = n % self.world_size dst_rank2 = (n + 1) % self.world_size rref = rpc.remote( worker_name(dst_rank1), - nested_remote, + f, args=(worker_name(dst_rank2),), ) - self.assertEqual(rref.to_here(), torch.ones(2, 2) + 3) + self.assertEqual(rref.to_here(), expected) @dist_init - def test_nested_rref(self): + def test_nested_remote(self): + self._nested_remote( + nested_remote, + torch.ones(2, 2) + 3 + ) + + @dist_init + def test_nested_remote_sparse(self): + self._nested_remote( + nested_remote_sparse, + build_sparse_tensor() + build_sparse_tensor() + ) + + def _nested_rref(self, f, expected1, expected2): n = self.rank + 1 dst_rank1 = n % self.world_size dst_rank2 = (n + 1) % self.world_size rref_of_rrefs = rpc.remote( worker_name(dst_rank1), - nested_rref, + f, args=(worker_name(dst_rank2),), ) @@ -2322,11 +2710,26 @@ def test_nested_rref(self): rrefs = rref_of_rrefs.to_here() self.assertEqual(len(rrefs), 2) - self.assertEqual(rrefs[0].to_here(), torch.ones(2, 2) + 1) - self.assertEqual(rrefs[1].to_here(), torch.ones(2, 2) + 2) + self.assertEqual(rrefs[0].to_here(), expected1) + self.assertEqual(rrefs[1].to_here(), expected2) @dist_init - def test_nested_rref_stress(self): + def test_nested_rref(self): + self._nested_rref( + nested_rref, + torch.ones(2, 2) + 1, + torch.ones(2, 2) + 2 + ) + + @dist_init + def test_nested_rref_sparse(self): + self._nested_rref( + nested_rref_sparse, + build_sparse_tensor() * 2, + build_sparse_tensor() * 2 + ) + + def _nested_rref_stress(self, f, expected1, expected2): n = self.rank + 1 dst_rank1 = n % self.world_size dst_rank2 = (n + 1) % self.world_size @@ -2335,7 +2738,7 @@ def test_nested_rref_stress(self): all_rrefs.append( rpc.remote( worker_name(dst_rank1), - nested_rref, + f, args=(worker_name(dst_rank2),), ) ) @@ -2344,8 +2747,24 @@ def test_nested_rref_stress(self): rref_of_rrefs = all_rrefs[i] rrefs = rref_of_rrefs.to_here() self.assertEqual(len(rrefs), 2) - self.assertEqual(rrefs[0].to_here(), torch.ones(2, 2) + 1) - self.assertEqual(rrefs[1].to_here(), torch.ones(2, 2) + 2) + self.assertEqual(rrefs[0].to_here(), expected1) + self.assertEqual(rrefs[1].to_here(), expected2) + + @dist_init + def test_nested_rref_stress(self): + self._nested_rref_stress( + nested_rref, + torch.ones(2, 2) + 1, + torch.ones(2, 2) + 2 + ) + + @dist_init + def test_nested_rref_stress_sparse(self): + self._nested_rref_stress( + nested_rref_sparse, + build_sparse_tensor() * 2, + build_sparse_tensor() * 2 + ) @dist_init def test_multi_layer_nested_async_rpc(self): @@ -4110,6 +4529,46 @@ def rref_error(): dist.barrier() + def _trainer_func(self, rref, sparse): + m = MyEmbeddingBagModel(sparse=sparse) + loss_fn = nn.MSELoss() + for i in range(10): + outputs = m(torch.rand(10, 10).long()) + loss_fn(outputs, torch.rand(10, 10)).backward() + gradient = list(m.parameters())[0].grad + fut = rref.rpc_async().average(rref, i, gradient) + gradient = fut.wait() + if gradient.is_sparse: + gradient = gradient.to_dense().double() + ps_gradient = rref.rpc_sync().get_gradient(rref) + if ps_gradient.is_sparse: + ps_gradient = ps_gradient.to_dense().double() + self.assertTrue(torch.equal(gradient, ps_gradient)) + + def _my_parameter_server(self, sparse): + ps_rref = RRef(MyParameterServer(self.world_size - 1)) + futures = [] + for index in range(1, self.world_size): + futures.append( + rpc.rpc_async( + worker_name((self.rank + index) % self.world_size), + self._trainer_func, + args=( + ps_rref, + sparse + ), + ) + ) + torch.futures.wait_all(futures) + + @dist_init + def test_my_parameter_server(self): + self._my_parameter_server(False) + + @dist_init + def test_my_parameter_server_sparse(self): + self._my_parameter_server(True) + class CudaRpcTest(RpcAgentTestFixture): From 710a2e933f33145e33fdf669ef9fd5fb3cb50d18 Mon Sep 17 00:00:00 2001 From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" Date: Sun, 29 Aug 2021 14:17:54 -0700 Subject: [PATCH 332/530] [DOC] Add doc for maybe_wrap_dim (#63161) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63161 Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D30629451 Pulled By: tugsbayasgalan fbshipit-source-id: b03f030f197e10393a8ff223b240d23c30858028 --- aten/src/ATen/WrapDimUtils.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index 2768efe6e683b..13e605c920ec1 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -7,6 +7,9 @@ namespace at { static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar=true) { + // if dim_post_expr is 0 and wrap_scalar is true, then dim must be in the range [-1, 0]. + // This is a special case for scalar tensors and manifests in e.g. torch.sum(scalar_tensor, 0) + // Otherwise, dim should be in the range [-dim_post_expr, dim_post_expr-1]. return c10::maybe_wrap_dim(dim, dim_post_expr, wrap_scalar); } From 9db56531f708188cf59d9d4db60871405df9df69 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sun, 29 Aug 2021 15:49:59 -0700 Subject: [PATCH 333/530] Revert D30620966: [pytorch][PR] Move Parallel[Native|TBB] to GHA Test Plan: revert-hammer Differential Revision: D30620966 (https://github.com/pytorch/pytorch/commit/223f886032978487099da4f54e86e9e0549cde0c) Original commit changeset: 9a23e4b3e168 fbshipit-source-id: b9248d377b9a7b850dfb3f10f3350fbc9855acfe --- .circleci/cimodel/data/pytorch_build_data.py | 2 + .circleci/config.yml | 90 ++++ .github/generated-ciflow-ruleset.json | 6 - .github/scripts/generate_ci_workflows.py | 38 +- ...rallelnative-linux-xenial-py3.6-gcc5.4.yml | 430 ------------------ ...-paralleltbb-linux-xenial-py3.6-gcc5.4.yml | 430 ------------------ 6 files changed, 104 insertions(+), 892 deletions(-) delete mode 100644 .github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml delete mode 100644 .github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index 156494589831b..5a85674d74fe9 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -7,6 +7,8 @@ ("5.4", [ # All this subtree rebases to master and then build ("3.6", [ ("important", [X(True)]), + ("parallel_tbb", [X(True)]), + ("parallel_native", [X(True)]), ("pure_torch", [X(True)]), ]), ]), diff --git a/.circleci/config.yml b/.circleci/config.yml index 8df67e6fe2bc8..1bb32b5cc0a3d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7166,6 +7166,70 @@ workflows: build_environment: "pytorch-linux-pytorch_linux_xenial_py3_6_gcc5_4_distributed-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" resource_class: large + - pytorch_linux_build: + name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build + requires: + - "docker-pytorch-linux-xenial-py3.6-gcc5.4" + filters: + branches: + only: + - master + - /ci-all\/.*/ + - /release\/.*/ + build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + - pytorch_linux_test: + name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_test + requires: + - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build + filters: + branches: + only: + - master + - /ci-all\/.*/ + - /release\/.*/ + build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + resource_class: large + - pytorch_linux_test: + name: pytorch_linux_pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed_test + requires: + - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build + build_environment: "pytorch-linux-pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + resource_class: large + - pytorch_linux_build: + name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build + requires: + - "docker-pytorch-linux-xenial-py3.6-gcc5.4" + filters: + branches: + only: + - master + - /ci-all\/.*/ + - /release\/.*/ + build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + - pytorch_linux_test: + name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test + requires: + - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build + filters: + branches: + only: + - master + - /ci-all\/.*/ + - /release\/.*/ + build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + resource_class: large + - pytorch_linux_test: + name: pytorch_linux_pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed_test + requires: + - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build + build_environment: "pytorch-linux-pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + resource_class: large - pytorch_linux_build: name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build requires: @@ -9322,6 +9386,32 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-xenial-py3.6-gcc7" image_name: "pytorch-linux-xenial-py3.6-gcc7" + - pytorch_linux_build: + name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build + requires: + - "docker-pytorch-linux-xenial-py3.6-gcc5.4" + build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + - pytorch_linux_test: + name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_test + requires: + - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build + build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + resource_class: large + - pytorch_linux_build: + name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build + requires: + - "docker-pytorch-linux-xenial-py3.6-gcc5.4" + build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + - pytorch_linux_test: + name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test + requires: + - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build + build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + resource_class: large - pytorch_linux_build: name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build requires: diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index 0fb27af006c85..d13561190d01f 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -10,8 +10,6 @@ "linux-xenial-cuda11.3-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", - "parallelnative-linux-xenial-py3.6-gcc5.4", - "paralleltbb-linux-xenial-py3.6-gcc5.4", "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", "periodic-linux-xenial-cuda11.1-py3.6-gcc7", "periodic-win-vs2019-cuda11.1-py3", @@ -29,8 +27,6 @@ "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", - "parallelnative-linux-xenial-py3.6-gcc5.4", - "paralleltbb-linux-xenial-py3.6-gcc5.4", "win-vs2019-cpu-py3" ], "ciflow/cuda": [ @@ -67,8 +63,6 @@ "linux-xenial-cuda11.3-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", - "parallelnative-linux-xenial-py3.6-gcc5.4", - "paralleltbb-linux-xenial-py3.6-gcc5.4", "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", "periodic-linux-xenial-cuda11.1-py3.6-gcc7" ], diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index dd115405e03ea..f1819dbac589d 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -272,32 +272,18 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU} ), ), - CIWorkflow( - arch="linux", - build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - # This is a master only job despit on_pull_request is set to True - on_pull_request=True, - ciflow_config=CIFlowConfig( - enabled=True, - trigger_action_only=True, - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="parallelnative-linux-xenial-py3.6-gcc5.4", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - # This is a master only job despit on_pull_request is set to True - on_pull_request=True, - ciflow_config=CIFlowConfig( - enabled=True, - trigger_action_only=True, - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, - ), - ), + # CIWorkflow( + # arch="linux", + # build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", + # test_runner_type=LINUX_CPU_TEST_RUNNER, + # ), + # CIWorkflow( + # arch="linux", + # build_environment="parallelnative-linux-xenial-py3.6-gcc5.4", + # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", + # test_runner_type=LINUX_CPU_TEST_RUNNER, + # ), # CIWorkflow( # arch="linux", # build_environment="pure_torch-linux-xenial-py3.6-gcc5.4", diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml deleted file mode 100644 index 402ce38129052..0000000000000 --- a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml +++ /dev/null @@ -1,430 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: parallelnative-linux-xenial-py3.6-gcc5.4 - -on: - pull_request: - types: [unassigned] - push: - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.6-gcc5.4 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - -concurrency: - group: parallelnative-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }} - cancel-in-progress: true - -jobs: - ciflow_should_run: - runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} - steps: - - name: noop - run: echo running ciflow_should_run - calculate-docker-image: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.2xlarge - needs: [ciflow_should_run] - env: - DOCKER_BUILDKIT: 1 - timeout-minutes: 90 - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - rm -f ~/.ssh/authorized_keys - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} - DOCKER_SKIP_S3_UPLOAD: 1 - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - cd .circleci/docker && ./build_docker.sh - - build: - runs-on: linux.2xlarge - needs: [calculate-docker-image, ciflow_should_run] - env: - DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} - JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-build - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - rm -f ~/.ssh/authorized_keys - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list - submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - name: Pull docker image - run: | - docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build PyTorch - run: | - docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" \ - sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} - CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} - CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: ubuntu-18.04 - needs: [ciflow_should_run] - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - NUM_TEST_SHARDS: 1 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - env: - DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} - JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - rm -f ~/.ssh/authorized_keys - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list - submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - name: Pull docker image - run: | - docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Test PyTorch - env: - BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.6-gcc5.4-${{ matrix.config }} - PR_NUMBER: ${{ github.event.pull_request.number }} - run: | - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - if [[ $NUM_TEST_SHARDS -ne 2 ]]; then - export SHARD_NUMBER=0 - fi - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086 - docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e CONTINUE_THROUGH_ERROR \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" \ - sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Zip test reports for upload - if: always() - env: - COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml' - - uses: actions/upload-artifact@v2 - name: Store PyTorch Test Reports - if: always() - with: - name: test-reports-${{ matrix.config }} - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 - name: Store PyTorch Test Reports on S3 - if: always() - with: - name: test-reports-${{ matrix.config }} - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test - CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} - CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} - CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.16.34 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml deleted file mode 100644 index 59eceb58ea230..0000000000000 --- a/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml +++ /dev/null @@ -1,430 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: paralleltbb-linux-xenial-py3.6-gcc5.4 - -on: - pull_request: - types: [unassigned] - push: - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: paralleltbb-linux-xenial-py3.6-gcc5.4 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - -concurrency: - group: paralleltbb-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }} - cancel-in-progress: true - -jobs: - ciflow_should_run: - runs-on: ubuntu-18.04 - if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} - steps: - - name: noop - run: echo running ciflow_should_run - calculate-docker-image: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.2xlarge - needs: [ciflow_should_run] - env: - DOCKER_BUILDKIT: 1 - timeout-minutes: 90 - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - rm -f ~/.ssh/authorized_keys - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} - DOCKER_SKIP_S3_UPLOAD: 1 - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - cd .circleci/docker && ./build_docker.sh - - build: - runs-on: linux.2xlarge - needs: [calculate-docker-image, ciflow_should_run] - env: - DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} - JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-build - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - rm -f ~/.ssh/authorized_keys - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list - submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - name: Pull docker image - run: | - docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build PyTorch - run: | - docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" \ - sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} - CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} - CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: ubuntu-18.04 - needs: [ciflow_should_run] - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - NUM_TEST_SHARDS: 1 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - env: - DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} - JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} - steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - rm -f ~/.ssh/authorized_keys - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: actions/checkout@v2 - with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list - submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - - name: Pull docker image - run: | - docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Test PyTorch - env: - BUILD_ENVIRONMENT: paralleltbb-linux-xenial-py3.6-gcc5.4-${{ matrix.config }} - PR_NUMBER: ${{ github.event.pull_request.number }} - run: | - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - if [[ $NUM_TEST_SHARDS -ne 2 ]]; then - export SHARD_NUMBER=0 - fi - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086 - docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e CONTINUE_THROUGH_ERROR \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" \ - sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Zip test reports for upload - if: always() - env: - COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml' - - uses: actions/upload-artifact@v2 - name: Store PyTorch Test Reports - if: always() - with: - name: test-reports-${{ matrix.config }} - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 - name: Store PyTorch Test Reports on S3 - if: always() - with: - name: test-reports-${{ matrix.config }} - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-test - CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} - CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} - CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.16.34 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - # Prune all of the docker images - docker system prune -af From c5ed31e4a7550bfe5a4893b3803ee7fdf1b31f53 Mon Sep 17 00:00:00 2001 From: mingfeima Date: Sun, 29 Aug 2021 18:35:37 -0700 Subject: [PATCH 334/530] add channel last support for MaxUnpool2d (#49984) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49984 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D26007051 Pulled By: VitalyFedyunin fbshipit-source-id: 6c54751ade4092e03c1651aaa60380f7d6e92f6b --- aten/src/ATen/native/MaxUnpooling.cpp | 434 +++---------------- aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp | 385 ++++++++++++++++ aten/src/ATen/native/cpu/MaxUnpoolKernel.h | 16 + test/test_nn.py | 31 ++ tools/build_variables.bzl | 1 + 5 files changed, 486 insertions(+), 381 deletions(-) create mode 100644 aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp create mode 100644 aten/src/ATen/native/cpu/MaxUnpoolKernel.h diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp index b3c01941c73de..99874084470f4 100644 --- a/aten/src/ATen/native/MaxUnpooling.cpp +++ b/aten/src/ATen/native/MaxUnpooling.cpp @@ -1,90 +1,17 @@ #include #include -#include -#include +#include namespace at { namespace native { -template -Tensor max_unpooling2d_forward_out_cpu_frame( - Tensor& output, - const Tensor& input, - const Tensor& indices, - int64_t oheight, - int64_t owidth) { - int64_t numBatch = 1; - int64_t dimc = 0; - int64_t dimh = 1; - int64_t dimw = 2; - if (input.ndimension() == 4) { - numBatch = input.size(0); - dimc++; - dimh++; - dimw++; - } - int64_t numChannels = input.size(dimc); - int64_t inputHeight = input.size(dimh); - int64_t inputWidth = input.size(dimw); - - auto* rawInput = input.data_ptr(); - auto* rawIndices = indices.data_ptr(); - auto* rawOutput = output.data_ptr(); - - at::internal::lazy_init_num_threads(); - - for (int64_t n = 0; n < numBatch; n++) { - int64_t nOutputOffset = n * numChannels * owidth * oheight; - int64_t nInputOffset = n * numChannels * inputWidth * inputHeight; - int64_t k = 0; - bool has_error = false; - int64_t error_index = 0; -#pragma omp parallel for private(k) - for (k = 0; k < numChannels; k++) { - int64_t finalOutputOffset = nOutputOffset + k * owidth * oheight; - int64_t finalInputOffset = nInputOffset + k * inputWidth * inputHeight; - scalar_t* output_p_k = rawOutput + finalOutputOffset; - scalar_t* input_p_k = rawInput + finalInputOffset; - int64_t* ind_p_k = rawIndices + finalInputOffset; - - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t maxp; - for (int64_t i = 0; i < inputHeight; i++) { - for (int64_t j = 0; j < inputWidth; j++) { - maxp = ind_p_k[i * inputWidth + j]; - if (maxp < 0 || maxp >= owidth * oheight) { -#pragma omp critical - { - has_error = true; - error_index = maxp; - } - } else { - output_p_k[maxp] = input_p_k[i * inputWidth + j]; - } - } - } - } - if (has_error) { - AT_ERROR( - "Found an invalid max index: ", - error_index, - " (output volumes are of size ", - oheight, - "x", - owidth); - (void)error_index; - } - } - return output; -} - -Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_, +Tensor& max_unpooling2d_forward_out_cpu( + const Tensor& self_, const Tensor& indices_, IntArrayRef output_size, Tensor& output) { auto oheight = output_size[0]; auto owidth = output_size[1]; - TORCH_CHECK(output.is_contiguous(), "output must be contiguous"); TORCH_CHECK( indices_.scalar_type() == at::ScalarType::Long, "elements in indices should be type int64"); @@ -100,8 +27,9 @@ Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_, TORCH_CHECK(self_.numel() > 0, "Input must be non-empty"); - auto self = self_.contiguous(); - auto indices = indices_.contiguous(); + auto memory_format = self_.suggest_memory_format(); + auto self = self_.contiguous(memory_format); + auto indices = indices_.contiguous(memory_format); if (self.ndimension() == 3) { int64_t numChannels = self.size(0); @@ -109,15 +37,11 @@ Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_, } else { int64_t numBatch = self.size(0); int64_t numChannels = self.size(1); - output.resize_({numBatch, numChannels, oheight, owidth}); + output.resize_({numBatch, numChannels, oheight, owidth}, memory_format); } output.zero_(); - AT_DISPATCH_FLOATING_TYPES( - self.scalar_type(), "max_unpooling2d_forward_out_cpu_frame", ([&] { - max_unpooling2d_forward_out_cpu_frame( - output, self, indices, oheight, owidth); - })); + max_unpool2d_kernel(kCPU, output, self, indices); return output; }; @@ -130,87 +54,6 @@ Tensor max_unpooling2d_forward_cpu( return output; } -template -Tensor max_unpooling3d_forward_out_cpu_frame( - Tensor& output, - const Tensor& input, - const Tensor& indices, - int64_t oT, - int64_t oH, - int64_t oW) { - int64_t nBatch = 1; - int64_t dimw = 3; - int64_t dimh = 2; - int64_t dimt = 1; - - if (input.ndimension() == 5) { - nBatch = input.size(0); - dimw++; - dimh++; - dimt++; - } - - int64_t nSlices = input.size(dimt - 1); - int64_t iT = input.size(dimt); - int64_t iH = input.size(dimh); - int64_t iW = input.size(dimw); - - scalar_t* input_data = input.data_ptr(); - scalar_t* output_data = output.data_ptr(); - int64_t* indices_data = indices.data_ptr(); - - at::internal::lazy_init_num_threads(); - - for (int64_t p = 0; p < nBatch; p++) { - int64_t inputOffset = p * nSlices * iT * iW * iH; - int64_t outputOffset = p * nSlices * oT * oW * oH; - int64_t k = 0; - bool has_error = false; - int error_index = 0; -#pragma omp parallel for private(k) - for (k = 0; k < nSlices; k++) { - int64_t finalInputOffset = inputOffset + k * iT * iW * iH; - int64_t finalOutputOffset = outputOffset + k * oT * oW * oH; - - scalar_t* output_p_k = output_data + finalOutputOffset; - scalar_t* input_p_k = input_data + finalInputOffset; - int64_t* ind_p_k = indices_data + finalInputOffset; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int maxp; - for (int64_t t = 0; t < iT; t++) { - for (int64_t i = 0; i < iH; i++) { - for (int64_t j = 0; j < iW; j++) { - int64_t index = t * iH * iW + i * iW + j; - maxp = ind_p_k[index]; - if (maxp < 0 || maxp >= oT * oW * oH) { -#pragma omp critical - { - has_error = true; - error_index = maxp; - } - } else { - output_p_k[maxp] = input_p_k[index]; - } - } - } - } - if (has_error) { - AT_ERROR( - "found an invalid max index ", - error_index, - " (output volumes are of size ", - oT, - "x", - oH, - "x", - oW); - (void)error_index; - } - } - } - return output; -} - static void max_unpooling3d_shape_check( const Tensor& input, const Tensor& gradOutput, @@ -310,16 +153,7 @@ Tensor& max_unpooling3d_forward_out_cpu(const Tensor& self_, } output.zero_(); - AT_DISPATCH_FLOATING_TYPES( - self.scalar_type(), "max_unpooling3d_forward_out_cpu_frame", ([&] { - max_unpooling3d_forward_out_cpu_frame( - output, - self, - indices, - oT, - oH, - oW); - })); + max_unpool3d_kernel(kCPU, output, self, indices); return output; } @@ -335,59 +169,6 @@ Tensor max_unpooling3d_forward_cpu( return output; } -template -static void max_unpooling2d_backward_out_cpu_frame( - scalar_t* gradInput_p, - scalar_t* gradOutput_p, - int64_t* ind_p, - int64_t nslices, - int64_t iheight, - int64_t iwidth, - int64_t oheight, - int64_t owidth) { - bool has_error = false; - int64_t error_index = 0; - int64_t k = 0; - - at::internal::lazy_init_num_threads(); -#pragma omp parallel for private(k) - for (k = 0; k < nslices; k++) { - scalar_t* gradInput_p_k = gradInput_p + k * iwidth * iheight; - scalar_t* gradOutput_p_k = gradOutput_p + k * owidth * oheight; - int64_t* ind_p_k = ind_p + k * iwidth * iheight; - - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t i, j; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t maxp; - - for (i = 0; i < iheight; i++) { - for (j = 0; j < iwidth; j++) { - maxp = ind_p_k[i * iwidth + j]; /* retrieve position of max */ - if (maxp < 0 || maxp >= owidth * oheight) { -#pragma omp critical - { - has_error = true; - error_index = maxp; - } - } - gradInput_p_k[i * iwidth + j] = - gradOutput_p_k[maxp]; /* update gradient */ - } - } - } - if (has_error) { - AT_ERROR( - "invalid max index ", - error_index, - ", owidth= ", - owidth, - ", oheight= ", - oheight); - (void)error_index; - } -} - Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_, const Tensor& self, const Tensor& indices_, @@ -396,42 +177,24 @@ Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_, TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); int64_t oheight = output_size[0]; int64_t owidth = output_size[1]; - int dimw = 2; - int dimh = 1; - int nbatch = 1; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int nslices; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int iheight; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int iwidth; + int64_t ndim = self.ndimension(); + int64_t dimh = ndim == 3 ? 1 : 2; + int64_t dimw = ndim == 3 ? 2 : 3; + TORCH_CHECK( indices_.scalar_type() == at::ScalarType::Long, "elements in indices should be type int64"); TORCH_CHECK( self.sizes() == indices_.sizes(), "Input shape must match indices shape"); - TORCH_CHECK(output_size.size() == 2, "Output size must be 2"); - /* get contiguous gradOutput and indices */ - auto grad_output = grad_output_.contiguous(); - auto indices = indices_.contiguous(); + auto memory_format = self.suggest_memory_format(); + auto grad_output = grad_output_.contiguous(memory_format); + auto indices = indices_.contiguous(memory_format); - /* resize */ - grad_input.resize_as_(self); + grad_input.resize_(self.sizes(), memory_format); grad_input.zero_(); - if (self.ndimension() == 4) { - nbatch = self.size(0); - dimw++; - dimh++; - } - - /* sizes */ - nslices = self.size(dimh - 1); - iheight = self.size(dimh); - iwidth = self.size(dimw); - if (owidth != grad_output.size(dimw) || oheight != grad_output.size(dimh)) { AT_ERROR( "Inconsistent gradOutput size. output height = ", @@ -443,23 +206,8 @@ Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_, "x", grad_output.size(dimw)); } - AT_DISPATCH_FLOATING_TYPES( - self.scalar_type(), "max_unpooling2d_backward_out_cpu_frame", ([&] { - int p; - for (p = 0; p < nbatch; p++) { - auto inputOffset = p * nslices * iheight * iwidth; - auto outputOffset = p * nslices * oheight * owidth; - max_unpooling2d_backward_out_cpu_frame( - grad_input.data_ptr() + inputOffset, - grad_output.data_ptr() + outputOffset, - indices.data_ptr() + inputOffset, - nslices, - iheight, - iwidth, - oheight, - owidth); - } - })); + + max_unpool2d_backward_kernel(kCPU, grad_input, grad_output, indices); return grad_input; } @@ -468,72 +216,14 @@ Tensor max_unpooling2d_backward_cpu( const Tensor& self, const Tensor& indices, IntArrayRef output_size) { - auto grad_input = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - at::native::max_unpooling2d_backward_out_cpu( + auto grad_input = at::empty({0}, self.options()); + max_unpooling2d_backward_out_cpu( grad_output, self, indices, output_size, grad_input); return grad_input; } -template -static void max_unpooling3d_backward_out_cpu_frame( - scalar_t* gradInput_p, - scalar_t* gradOutput_p, - int64_t* ind_p, - int64_t nslices, - int64_t iT, - int64_t iH, - int64_t iW, - int64_t oT, - int64_t oH, - int64_t oW) { - int64_t k = 0; - bool has_error = false; - int error_index = 0; - - at::internal::lazy_init_num_threads(); - -#pragma omp parallel for private(k) - for (k = 0; k < nslices; k++) { - scalar_t* gradInput_p_k = gradInput_p + k * iT * iH * iW; - scalar_t* gradOutput_p_k = gradOutput_p + k * oT * oH * oW; - int64_t* ind_p_k = ind_p + k * iT * iH * iW; - - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t t, i, j, index; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t maxp; - for (t = 0; t < iT; t++) { - for (i = 0; i < iH; i++) { - for (j = 0; j < iW; j++) { - index = t * iH * iW + i * iW + j; - maxp = ind_p_k[index]; /* retrieve position of max */ - if (maxp < 0 || maxp >= oT * oH * oW) { -#pragma omp critical - { - has_error = true; - error_index = maxp; - } - } - gradInput_p_k[index] = gradOutput_p_k[maxp]; /* update gradient */ - } - } - } - } - if (has_error) { - AT_ERROR( - "invalid max index ", - error_index, - ", oT= ", - oT, - ", oW= ", - oW, - ",oH= ", - oH); - (void)error_index; - } -} - -Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_, +Tensor& max_unpooling3d_backward_out_cpu( + const Tensor& grad_output_, const Tensor& self, const Tensor& indices_, IntArrayRef output_size, @@ -541,26 +231,17 @@ Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_, IntArrayRef padding, Tensor& grad_input) { TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); - auto oT = output_size[0]; - auto oH = output_size[1]; - auto oW = output_size[2]; - int dimw = 3; - int dimh = 2; - int dimt = 1; - int nbatch = 1; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int nslices; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int iT; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int iH; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int iW; + int64_t oT = output_size[0]; + int64_t oH = output_size[1]; + int64_t oW = output_size[2]; + int64_t ndim = self.ndimension(); + int64_t dimt = ndim == 4 ? 1 : 2; + int64_t dimh = ndim == 4 ? 2 : 3; + int64_t dimw = ndim == 4 ? 3 : 4; max_unpooling3d_shape_check( self, grad_output_, indices_, output_size, stride, padding); - // TODO (from THNN): check gradOutput shape /* get contiguous gradOutput */ auto grad_output = grad_output_.contiguous(); auto indices = indices_.contiguous(); @@ -568,39 +249,24 @@ Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_, /* resize */ grad_input.resize_as_(self); grad_input.zero_(); - if (self.ndimension() == 5) { - nbatch = self.size(0); - dimt++; - dimw++; - dimh++; + + if (oW != grad_output.size(dimw) || oH != grad_output.size(dimh) || oT != grad_output.size(dimt)) { + AT_ERROR( + "Inconsistent gradOutput size. output depth = ", + oT, + ", output height = ", + oH, + ", output width = ", + oW, + ", gradOutput: ", + grad_output.size(dimt), + "x", + grad_output.size(dimh), + "x", + grad_output.size(dimw)); } - /* sizes */ - nslices = self.size(dimt - 1); - iT = self.size(dimt); - iH = self.size(dimh); - iW = self.size(dimw); - - /* backprop */ - AT_DISPATCH_FLOATING_TYPES( - self.scalar_type(), "max_unpooling3d_backward_out_cpu_frame", ([&] { - int p; - for (p = 0; p < nbatch; p++) { - int inputOffset = p * nslices * iT * iH * iW; - int outputOffset = p * nslices * oT * oH * oW; - max_unpooling3d_backward_out_cpu_frame( - grad_input.data_ptr() + inputOffset, - grad_output.data_ptr() + outputOffset, - indices.data_ptr() + inputOffset, - nslices, - iT, - iH, - iW, - oT, - oH, - oW); - } - })); + max_unpool3d_backward_kernel(kCPU, grad_input, grad_output, indices); return grad_input; } @@ -611,10 +277,16 @@ Tensor max_unpooling3d_backward_cpu( IntArrayRef output_size, IntArrayRef stride, IntArrayRef padding) { - auto grad_input = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto grad_input = at::empty({0}, self.options()); at::native::max_unpooling3d_backward_out_cpu( grad_output, self, indices, output_size, stride, padding, grad_input); return grad_input; } + +DEFINE_DISPATCH(max_unpool2d_kernel); +DEFINE_DISPATCH(max_unpool2d_backward_kernel); +DEFINE_DISPATCH(max_unpool3d_kernel); +DEFINE_DISPATCH(max_unpool3d_backward_kernel); + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp new file mode 100644 index 0000000000000..5a7b03128766b --- /dev/null +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -0,0 +1,385 @@ +#include + +#include +#include +#include +#include + +namespace at { namespace native { + +namespace { + +template +void cpu_max_unpool( + Tensor& output_, + const Tensor& input, + const Tensor& indices) { + auto output = output_.contiguous(); + + auto input_data = input.data_ptr(); + auto indices_data = indices.data_ptr(); + auto output_data = output.data_ptr(); + + // NB: input tensor dimensions: + // MaxUnpool2d: + // dim = 3: CHW + // dim = 4: NCHW + // MaxUnpool3d: + // dim = 4: CDHW + // dim = 5: NCDHW + + int64_t numel = input.numel(); + int64_t ndim = input.ndimension(); + + // treat batch size and channels as one dimension + // and the feature map as another dimension + int64_t channels, output_depth, output_height, output_width; + if (is_3d) { + TORCH_CHECK(ndim == 4 || ndim == 5, "MaxUnpool3d: expect input to be 4d or 5d tensor."); + channels = ndim == 4 ? input.size(0) : input.size(0) * input.size(1); + output_depth = output.size(-3); + output_height = output.size(-2); + output_width = output.size(-1); + } else { + TORCH_CHECK(ndim == 3 || ndim == 4, "MaxUnpool2d: expect input to be 3d or 4d tensor."); + channels = ndim == 3 ? input.size(0) : input.size(0) * input.size(1); + output_depth = 1; + output_height = output.size(-2); + output_width = output.size(-1); + } + int64_t input_image_size = numel / channels; + int64_t output_image_size = output.numel() / channels; + + bool has_error = false; + int64_t error_index = 0; + + // parallel on dim N, C, D, H, W: [channels, input_image_size] + at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { + int64_t c = 0; + int64_t ip = 0; + data_index_init(begin, c, channels, ip, input_image_size); + + for (int64_t i = begin; i < end; i++) { + scalar_t* output_ptr = output_data + c * output_image_size; + + int64_t maxp = indices_data[i]; + if (maxp < 0 || maxp >= output_image_size) { + #pragma omp critical + { + has_error = true; + error_index = maxp; + } + } else { + output_ptr[maxp] = input_data[i]; + } + + // move on to next input index + data_index_step(c, channels, ip, input_image_size); + } + }); + + if (has_error) { + if (is_3d) { + AT_ERROR("Found an invalid max index: ", error_index, + " (output volumes are of size ", output_depth, + "x", output_height, "x", output_width); + (void)error_index; + } else { + AT_ERROR("Found an invalid max index: ", error_index, + " (output volumes are of size ", output_height, + "x", output_width); + (void)error_index; + } + } + + if (!output_.is_contiguous()) { + output_.copy_(output); + } +} + +template +void cpu_max_unpool_channels_last( + Tensor& output_, + const Tensor& input, + const Tensor& indices) { + TORCH_CHECK(input.ndimension() == 4, + "max_unpool2d with channels last format supports tensors with 4 dims"); + auto memory_format = at::MemoryFormat::ChannelsLast; + auto output = output_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto indices_data = indices.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_height = input.size(2); + int64_t input_width = input.size(3); + int64_t output_height = output.size(2); + int64_t output_width = output.size(3); + int64_t input_image_size = input_height * input_width; + int64_t output_image_size = output_height * output_width; + + bool has_error = false; + int64_t error_index = 0; + + // parallel on dim N, H, W + at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t ip = 0; + data_index_init(begin, n, nbatch, ip, input_image_size); + + for (int64_t i = begin; i < end; i++) { + scalar_t* input_ptr = input_data + i * channels; + int64_t* indices_ptr = indices_data + i * channels; + scalar_t* output_ptr = output_data + n * output_image_size * channels; + + // can't do scatter on avx2 (only available on avx512) + for (int64_t c = 0; c < channels; c++) { + int64_t maxp = indices_ptr[c]; + if (maxp < 0 || maxp >= output_image_size) { + #pragma omp critical + { + has_error = true; + error_index = maxp; + } + } else { + output_ptr[maxp * channels + c] = input_ptr[c]; + } + } + + // move on to next input index + data_index_step(n, nbatch, ip, input_image_size); + } + }); + + if (has_error) { + AT_ERROR("Found an invalid max index: ", error_index, + " (output volumes are of size ", output_height, + "x", output_width); + (void)error_index; + } + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } +} + +template +void cpu_max_unpool_backward( + Tensor& grad_input_, + const Tensor& grad_output, + const Tensor& indices) { + auto grad_input = grad_input_.contiguous(); + + auto grad_output_data = grad_output.data_ptr(); + auto indices_data = indices.data_ptr(); + auto grad_input_data = grad_input.data_ptr(); + + int64_t numel = grad_input.numel(); + int64_t ndim = grad_output.ndimension(); + + // treat batch size and channels as one dimension + // and the feature map as another dimension + int64_t channels, output_depth, output_height, output_width; + if (is_3d) { + TORCH_CHECK(ndim == 4 || ndim == 5, "MaxUnpool3d_backward: expect grad_output to be 4d or 5d tensor."); + channels = ndim == 4 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1); + output_depth = grad_output.size(-3); + output_height = grad_output.size(-2); + output_width = grad_output.size(-1); + } else { + TORCH_CHECK(ndim == 3 || ndim == 4, "MaxUnpool2d_backward: expect grad_output to be 3d or 4d tensor."); + channels = ndim == 3 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1); + output_depth = 1; + output_height = grad_output.size(-2); + output_width = grad_output.size(-1); + } + int64_t input_image_size = numel / channels; + int64_t output_image_size = grad_output.numel() / channels; + + bool has_error = false; + int64_t error_index = 0; + + // parallel on dim N, C, D, H, W + at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { + int64_t c = 0; + int64_t ip = 0; + data_index_init(begin, c, channels, ip, input_image_size); + + for (int64_t i = begin; i < end; i++) { + scalar_t* grad_output_ptr = grad_output_data + c * output_image_size; + + int64_t maxp = indices_data[i]; + if (maxp < 0 || maxp >= output_image_size) { + #pragma omp critical + { + has_error = true; + error_index = maxp; + } + } else { + grad_input_data[i] = grad_output_ptr[maxp]; + } + + // move on to next input index + data_index_step(c, channels, ip, input_image_size); + } + }); + + if (has_error) { + if (is_3d) { + AT_ERROR("invalid max index ", error_index, + ", odepth= ", output_depth, + ", owidth= ", output_width, + ", oheight= ", output_height); + (void)error_index; + } else { + AT_ERROR("invalid max index ", error_index, + ", owidth= ", output_width, + ", oheight= ", output_height); + (void)error_index; + } + } + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } +} + +template +void cpu_max_unpool_backward_channels_last( + Tensor& grad_input_, + const Tensor& grad_output, + const Tensor& indices) { + TORCH_CHECK(grad_output.ndimension() == 4, + "max_unpool2d backward with channels last format supports tensors with 4 dims."); + auto memory_format = at::MemoryFormat::ChannelsLast; + auto grad_input = grad_input_.contiguous(memory_format); + + auto grad_input_data = grad_input.data_ptr(); + auto grad_output_data = grad_output.data_ptr(); + auto indices_data = indices.data_ptr(); + + int64_t nbatch = grad_input.size(0); + int64_t channels = grad_input.size(1); + int64_t input_height = grad_input.size(2); + int64_t input_width = grad_input.size(3); + int64_t output_height = grad_output.size(2); + int64_t output_width = grad_output.size(3); + int64_t input_image_size = input_height * input_width; + int64_t output_image_size = output_height * output_width; + + bool has_error = false; + int64_t error_index = 0; + + // parallel on dim N, H, W + at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t ip = 0; + data_index_init(begin, n, nbatch, ip, input_image_size); + + for (int64_t i = begin; i < end; i++) { + scalar_t* grad_output_ptr = grad_output_data + n * output_image_size * channels; + scalar_t* grad_input_ptr = grad_input_data + i * channels; + int64_t* indices_ptr = indices_data + i * channels; + + for (int64_t c = 0; c < channels; c++) { + int64_t maxp = indices_ptr[c]; + if (maxp < 0 || maxp >= output_image_size) { + #pragma omp critical + { + has_error = true; + error_index = maxp; + } + } else { + grad_input_ptr[c] = grad_output_ptr[maxp * channels + c]; + } + } + + // move on to next input index + data_index_step(n, nbatch, ip, input_image_size); + } + }); + + if (has_error) { + AT_ERROR("invalid max index ", error_index, + ", owidth= ", output_width, + ", oheight= ", output_height); + (void)error_index; + } + + if (!grad_input_.is_contiguous(memory_format)) { + grad_input_.copy_(grad_input); + } +} + +void max_unpool2d_kernel_impl( + Tensor& output, + const Tensor& input, + const Tensor& indices) { + switch(input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool2d", [&] { + cpu_max_unpool(output, input, indices); + }); + break; + } + case at::MemoryFormat::ChannelsLast: { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool2d_channels_last", [&] { + cpu_max_unpool_channels_last(output, input, indices); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +void max_unpool3d_kernel_impl( + Tensor& output, + const Tensor& input, + const Tensor& indices) { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool3d", [&] { + cpu_max_unpool(output, input, indices); + }); +} + +void max_unpool2d_backward_kernel_impl( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& indices) { + switch(grad_output.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward", [&] { + cpu_max_unpool_backward(grad_input, grad_output, indices); + }); + break; + } + case at::MemoryFormat::ChannelsLast: { + AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward_channels_last", [&] { + cpu_max_unpool_backward_channels_last(grad_input, grad_output, indices); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +void max_unpool3d_backward_kernel_impl( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& indices) { + AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool3d_backward", [&] { + cpu_max_unpool_backward(grad_input, grad_output, indices); + }); +} + +} // anonymous namespace + +REGISTER_DISPATCH(max_unpool2d_kernel, &max_unpool2d_kernel_impl); +REGISTER_DISPATCH(max_unpool2d_backward_kernel, &max_unpool2d_backward_kernel_impl); +REGISTER_DISPATCH(max_unpool3d_kernel, &max_unpool3d_kernel_impl); +REGISTER_DISPATCH(max_unpool3d_backward_kernel, &max_unpool3d_backward_kernel_impl); + +}} // at::native diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.h b/aten/src/ATen/native/cpu/MaxUnpoolKernel.h new file mode 100644 index 0000000000000..00fbeb64213d6 --- /dev/null +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.h @@ -0,0 +1,16 @@ +#include +#include +#include + +#pragma once + +namespace at { namespace native { + +using max_unpooling_fn = void(*)(Tensor&, const Tensor&, const Tensor&); + +DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_kernel); +DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_backward_kernel); +DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_kernel); +DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_backward_kernel); + +}} // at::native diff --git a/test/test_nn.py b/test/test_nn.py index 4e01c94d4c971..7d26246786c6c 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -6186,6 +6186,37 @@ def test_MaxUnpool2d_output_size(self): else: self.assertRaises(ValueError, lambda: mu(output_small, indices_small, (h, w))) + def test_max_unpool2d_nhwc_cpu(self): + input = torch.randn(2, 10, 9, 9).float().cpu() + input = input.contiguous(memory_format=torch.channels_last) + ref_input = input.clone().contiguous() + + pool = nn.MaxPool2d(3, stride=2, return_indices=True).cpu() + ref_pool = nn.MaxPool2d(3, stride=2, return_indices=True).cpu() + + out, ind = pool(input) + ref_out, ref_ind = ref_pool(ref_input) + out.requires_grad_() + ref_out.requires_grad_() + + unpool = nn.MaxUnpool2d(3, stride=2).cpu() + ref_unpool = nn.MaxUnpool2d(3, stride=2).cpu() + + upout = unpool(out, ind) + ref_upout = ref_unpool(ref_out, ref_ind) + + grad = torch.randn(upout.size()).float().cpu() + grad = grad.contiguous(memory_format=torch.channels_last) + ref_grad = grad.clone().contiguous() + + upout.backward(grad) + ref_upout.backward(ref_grad) + + self.assertTrue(upout.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_upout.is_contiguous()) + self.assertTrue(torch.allclose(upout, ref_upout)) + self.assertTrue(torch.allclose(out.grad, ref_out.grad)) + def test_container_copy(self): class Model(nn.Module): def __init__(self): diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index b2a1016118d28..34846b5d6c7b3 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -907,6 +907,7 @@ aten_native_source_codegen_list = [ "aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp", "aten/src/ATen/native/cpu/MaxPooling.cpp", "aten/src/ATen/native/cpu/MaxPoolKernel.cpp", + "aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp", "aten/src/ATen/native/cpu/MultinomialKernel.cpp", "aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp", "aten/src/ATen/native/cpu/PowKernel.cpp", From 29ad84f2523346b4b03ac99fa04203fe81d2c4e3 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Sun, 29 Aug 2021 19:37:06 -0700 Subject: [PATCH 335/530] Removes beta warning from the special module documentation (#64148) Summary: Updates documentation per feature review. torch.special is now stable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64148 Reviewed By: ngimel Differential Revision: D30632049 Pulled By: mruberry fbshipit-source-id: 8f6148ec7737e7b3a90644eeca23eb217eda513d --- docs/source/special.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/source/special.rst b/docs/source/special.rst index 06961dbeaaab6..b74d833c96324 100644 --- a/docs/source/special.rst +++ b/docs/source/special.rst @@ -6,10 +6,6 @@ torch.special The torch.special module, modeled after SciPy's `special `_ module. -This module is in BETA. New functions are still being added, and some -functions may change in future PyTorch releases. See the documentation of each -function for details. - .. automodule:: torch.special :noindex: From 44e3ed88c9a1bd9ee6b0168ba5271a2c6b006cc8 Mon Sep 17 00:00:00 2001 From: Zafar Takhirov Date: Sun, 29 Aug 2021 20:28:32 -0700 Subject: [PATCH 336/530] [quant] AO migration of the `quantize.py` (#64086) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64086 AO Team is migrating the existing torch.quantization into torch.ao.quantization. We are doing it one file at a time to make sure that the internal callsites are updated properly. This migrates the `quantize.py` from torch.quantization to `torch.ao.quantization`. At this point both locations will be supported. Eventually the torch.quantization will be deprecated. Test Plan: `buck test mode/opt //caffe2/test:quantization` Reviewed By: jerryzh168, raghuramank100 Differential Revision: D30055886 fbshipit-source-id: 8ef7470f9fa640c0042bef5bb843e7a05ecd0b9f --- test/quantization/ao_migration/__init__.py | 0 .../ao_migration/test_quantize_py.py | 63 ++ test/test_quantization.py | 2 + torch/ao/quantization/__init__.py | 0 torch/ao/quantization/quantize.py | 580 +++++++++++++++++ torch/quantization/fx/convert.py | 2 +- torch/quantization/fx/prepare.py | 2 +- .../quantization/fx/quantization_patterns.py | 2 +- torch/quantization/fx/utils.py | 2 +- torch/quantization/quantize.py | 604 +----------------- 10 files changed, 676 insertions(+), 581 deletions(-) create mode 100644 test/quantization/ao_migration/__init__.py create mode 100644 test/quantization/ao_migration/test_quantize_py.py create mode 100644 torch/ao/quantization/__init__.py create mode 100644 torch/ao/quantization/quantize.py diff --git a/test/quantization/ao_migration/__init__.py b/test/quantization/ao_migration/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/test/quantization/ao_migration/test_quantize_py.py b/test/quantization/ao_migration/test_quantize_py.py new file mode 100644 index 0000000000000..086364bef9d56 --- /dev/null +++ b/test/quantization/ao_migration/test_quantize_py.py @@ -0,0 +1,63 @@ +from torch.testing._internal.common_utils import TestCase + +import importlib +from typing import List + + +class AOMigrationTestCase(TestCase): + def _test_package_import(self, package_name: str): + r"""Tests the module import by making sure that all the internals match + (except the dunder methods).""" + old_module = importlib.import_module(f'torch.quantization.{package_name}') + new_module = importlib.import_module(f'torch.ao.quantization.{package_name}') + old_module_dir = set(dir(old_module)) + new_module_dir = set(dir(new_module)) + # Remove magic modules from checking in subsets + for el in list(old_module_dir): + if el[:2] == '__' and el[-2:] == '__': + old_module_dir.remove(el) + assert (old_module_dir <= new_module_dir), \ + f"Importing {old_module} vs. {new_module} does not match: " \ + f"{old_module_dir - new_module_dir}" + + def _test_function_import(self, package_name: str, function_list: List[str]): + r"""Tests individual function list import by comparing the functions + and their hashes.""" + old_location = importlib.import_module(f'torch.quantization.{package_name}') + new_location = importlib.import_module(f'torch.ao.quantization.{package_name}') + for fn_name in function_list: + old_function = getattr(old_location, fn_name) + new_function = getattr(new_location, fn_name) + assert old_function == new_function, f"Functions don't match: {fn_name}" + assert hash(old_function) == hash(new_function), \ + f"Hashes don't match: {old_function}({hash(old_function)}) vs. " \ + f"{new_function}({hash(new_function)})" + + +class TestAOMigrationQuantizePy(AOMigrationTestCase): + def test_package_import(self): + self._test_package_import('quantize') + + def test_function_import(self): + function_list = [ + '_convert', + '_observer_forward_hook', + '_propagate_qconfig_helper', + '_remove_activation_post_process', + '_remove_qconfig', + 'add_observer_', + 'add_quant_dequant', + 'convert', + 'get_observer_dict', + 'get_unique_devices_', + 'is_activation_post_process', + 'prepare', + 'prepare_qat', + 'propagate_qconfig_', + 'quantize', + 'quantize_dynamic', + 'quantize_qat', + 'register_activation_post_process_hook', + 'swap_module', + ] + self._test_function_import('quantize', function_list) diff --git a/test/test_quantization.py b/test/test_quantization.py index 867151373a5b6..ffc242ed77e33 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -100,6 +100,8 @@ from quantization.jit.test_fusion_passes import TestFusionPasses # noqa: F401 from quantization.jit.test_deprecated_jit_quant import TestDeprecatedJitQuantized # noqa: F401 +# AO Migration tests +from quantization.ao_migration.test_quantize_py import TestAOMigrationQuantizePy # noqa: F401 if __name__ == '__main__': run_tests() diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py new file mode 100644 index 0000000000000..92a794ed7b631 --- /dev/null +++ b/torch/ao/quantization/quantize.py @@ -0,0 +1,580 @@ +import copy +import itertools +import warnings + +import torch +import torch.nn as nn +import torch.nn.quantized as nnq +from torch.nn.intrinsic import _FusedModule + +# Import the duplicated API +from torch.quantization.quantization_mappings import ( + get_default_dynamic_quant_module_mappings, + get_default_static_quant_module_mappings, + get_default_qat_module_mappings, + get_default_qconfig_propagation_list, + no_observer_set, + _has_special_act_post_process, + _get_special_act_post_process, +) +from torch.quantization.stubs import DeQuantStub, QuantWrapper +from torch.quantization.qconfig import ( + add_module_to_qconfig_obs_ctr, + default_dynamic_qconfig, + float16_dynamic_qconfig, + float_qparams_weight_only_qconfig) + +def is_activation_post_process(module): + return (isinstance(module, torch.quantization.ObserverBase) or + isinstance(module, torch.quantization.FakeQuantizeBase)) + +def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None, + qconfig_parent=None, prefix=''): + r"""This is a helper function for `propagate_qconfig_` + + Args: + module: input module + qconfig_dict: dictionary that maps from name of submodule to quantization + configuration + allow_list: list of quantizable modules + qconfig_parent: quantization config of parent module, we will fallback to + this config when there is no specified config for current + module + prefix: corresponding prefix of the current module, used as key in + qconfig_dict + + Return: + None, module is modified inplace with qconfig attached + """ + # TODO: Add test + if allow_list is None: + allow_list = get_default_qconfig_propagation_list() + + module_qconfig = qconfig_dict.get(type(module), qconfig_parent) + module_qconfig = qconfig_dict.get(prefix, module_qconfig) + module_qconfig = getattr(module, 'qconfig', module_qconfig) + + torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module) + + qconfig_with_device_check = add_module_to_qconfig_obs_ctr(module_qconfig, module) + module.qconfig = qconfig_with_device_check + + for name, child in module.named_children(): + module_prefix = prefix + '.' + name if prefix else name + _propagate_qconfig_helper(child, qconfig_dict, allow_list, + qconfig_with_device_check, module_prefix) + +# TODO(jerryzh): expose allow_list +def propagate_qconfig_(module, qconfig_dict=None, allow_list=None): + r"""Propagate qconfig through the module hierarchy and assign `qconfig` + attribute on each leaf module + + Args: + module: input module + qconfig_dict: dictionary that maps from name or type of submodule to + quantization configuration, qconfig applies to all submodules of a + given module unless qconfig for the submodules are specified (when + the submodule already has qconfig attribute) + + Return: + None, module is modified inplace with qconfig attached + """ + if qconfig_dict is None: + qconfig_dict = {} + _propagate_qconfig_helper(module, qconfig_dict, allow_list) + +def _observer_forward_hook(self, input, output): + r"""Forward hook that calls observer on the output + """ + return self.activation_post_process(output) + +def register_activation_post_process_hook(module): + assert hasattr(module, 'activation_post_process'), \ + 'Expect activation_post_process attribut already attached to the module' + return module.register_forward_hook(_observer_forward_hook) + +def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=None, device=None, custom_module_class_mapping=None): + r"""Add observer for the leaf child of the module. + + This function insert observer module to all leaf child module that + has a valid qconfig attribute. + + Args: + module: input module with qconfig attributes for all the leaf modules that we want to quantize + device: parent device, if any + non_leaf_module_list: list of non-leaf modules we want to add observer + + Return: + None, module is modified inplace with added observer modules and forward_hooks + """ + if qconfig_propagation_list is None: + qconfig_propagation_list = get_default_qconfig_propagation_list() + + if custom_module_class_mapping is None: + custom_module_class_mapping = {} + + # respect device affinity when adding observers + if device is None: + devices = get_unique_devices_(module) + assert len(devices) <= 1, ( + "add_observer_ only works with cpu or single-device CUDA modules, " + "but got devices {}".format(devices) + ) + device = next(iter(devices)) if len(devices) > 0 else None + + def get_activation_post_process(qconfig, device, special_act_post_process=None): + activation = qconfig.activation() if special_act_post_process is None else special_act_post_process() + if device is not None: + activation.to(device) + return activation + + def needs_observation(m): + return hasattr(m, 'qconfig') and m.qconfig is not None + + def insert_activation_post_process(m, special_act_post_process=None): + """ Adds an activation post process module and register + a post hook that calls the module + """ + # We don't insert observer/fake_quantize for DeQuantStub + if needs_observation(m) and not isinstance(m, DeQuantStub): + # observer and hook will be gone after we swap the module + m.add_module('activation_post_process', get_activation_post_process( + m.qconfig, device, special_act_post_process)) + # Register observer as the first entry in the hook list + # All post forward hooks are preserved and will be executed after the observer before convert + handle = register_activation_post_process_hook(m) + m._forward_hooks.move_to_end(handle.id, last=False) + + for name, child in module.named_children(): + if type(child) in [nnq.FloatFunctional, nnq.QFunctional]: + if needs_observation(child): + child.activation_post_process = get_activation_post_process(child.qconfig, device) + elif isinstance(child, _FusedModule): + # activation_post_process are now added directly to nn.Sequentail/_FusedModule + if needs_observation(child): + insert_activation_post_process(child) + elif _has_special_act_post_process(child): + special_act_post_process = _get_special_act_post_process(child) + insert_activation_post_process(child, special_act_post_process) + elif non_leaf_module_list is not None and type(child) in non_leaf_module_list: + if needs_observation(child): + insert_activation_post_process(child) + elif needs_observation(child) and type(child) in custom_module_class_mapping: + observed_child = custom_module_class_mapping[type(child)].from_float(child) + setattr(module, name, observed_child) + # TODO: These are the modules that cannot be observed + # Once there are more, we should move them to a separate list + if custom_module_class_mapping[type(child)] not in no_observer_set(): + insert_activation_post_process(observed_child) + else: + add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping) + + # Insert observers only for leaf nodes, note that this observer is for + # the output of the module, for input QuantStub will observe them + if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ + and type(module) in qconfig_propagation_list: + insert_activation_post_process(module) + +def get_unique_devices_(module): + return {p.device for p in module.parameters()} | \ + {p.device for p in module.buffers()} + +def add_quant_dequant(module): + r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig + Note that this function will modify the children of module inplace and it + can return a new module which wraps the input module as well. + + Args: + module: input module with qconfig attributes for all the leaf modules + that we want to quantize + + Return: + Either the inplace modified module with submodules wrapped in + `QuantWrapper` based on qconfig or a new `QuantWrapper` module which + wraps the input module, the latter case only happens when the input + module is a leaf module and we want to quantize it. + """ + if len(module._modules) == 0 and hasattr(module, 'qconfig') and module.qconfig: + return QuantWrapper(module) + + for name, child in module.named_children(): + module._modules[name] = add_quant_dequant(child) + return module + +def prepare(model, inplace=False, allow_list=None, + observer_non_leaf_module_list=None, + prepare_custom_config_dict=None): + r"""Prepares a copy of the model for quantization calibration or quantization-aware training. + + Quantization configuration should be assigned preemptively + to individual submodules in `.qconfig` attribute. + + The model will be attached with observer or fake quant modules, and qconfig + will be propagated. + + Args: + `model`: input model to be modified in-place + `inplace`: carry out model transformations in-place, the original module is mutated + `allow_list`: list of quantizable modules + `observer_non_leaf_module_list`: list of non-leaf modules we want to add observer + `prepare_custom_config_dict`: customization configuration dictionary for prepare function + + .. code-block:: python + + # Example of prepare_custom_config_dict: + prepare_custom_config_dict = { + # user will manually define the corresponding observed + # module class which has a from_float class method that converts + # float custom module to observed custom module + "float_to_observed_custom_module_class": { + CustomModule: ObservedCustomModule + } + } + + """ + torch._C._log_api_usage_once("quantization_api.quantize.prepare") + if prepare_custom_config_dict is None: + prepare_custom_config_dict = {} + custom_module_class_mapping = prepare_custom_config_dict.get("float_to_observed_custom_module_class", {}) + + if not inplace: + model = copy.deepcopy(model) + + # TODO: remove allow_list + qconfig_propagation_list = allow_list + if qconfig_propagation_list is None: + qconfig_propagation_list = get_default_qconfig_propagation_list() + propagate_qconfig_(model, qconfig_dict=None) + + # sanity check common API misusage + if not any(hasattr(m, 'qconfig') and m.qconfig for m in model.modules()): + warnings.warn("None of the submodule got qconfig applied. Make sure you " + "passed correct configuration through `qconfig_dict` or " + "by assigning the `.qconfig` attribute directly on submodules") + + add_observer_( + model, qconfig_propagation_list, observer_non_leaf_module_list, + custom_module_class_mapping=custom_module_class_mapping) + return model + +def _remove_activation_post_process(module): + # TODO: maybe we should change activation_post_process to _activation_post_process + # to prevent it from being used by user + if hasattr(module, 'activation_post_process') and \ + is_activation_post_process(module.activation_post_process): + delattr(module, 'activation_post_process') + + # remove activation_post_proceess hook + handle_ids_to_remove = set() + for handle_id, hook_fn in module._forward_hooks.items(): + if hook_fn is _observer_forward_hook: + handle_ids_to_remove.add(handle_id) + for handle_id in handle_ids_to_remove: + module._forward_hooks.pop(handle_id) + +# TODO: rename to something more general +def _remove_qconfig(module): + r"""Clean up the qconfig left in the module so that new qconfig can be + propagated. + + Args: + module: module to be cleaned up + """ + for child in module.children(): + _remove_qconfig(child) + + if hasattr(module, "qconfig"): + del module.qconfig + + _remove_activation_post_process(module) + +def quantize(model, run_fn, run_args, mapping=None, inplace=False): + r"""Quantize the input float model with post training static quantization. + + First it will prepare the model for calibration, then it calls + `run_fn` which will run the calibration step, after that we will + convert the model to a quantized model. + + Args: + model: input float model + run_fn: a calibration function for calibrating the prepared model + run_args: positional arguments for `run_fn` + inplace: carry out model transformations in-place, the original module is mutated + mapping: correspondence between original module types and quantized counterparts + + Return: + Quantized model. + """ + torch._C._log_api_usage_once("quantization_api.quantize.quantize") + if mapping is None: + mapping = get_default_static_quant_module_mappings() + if not inplace: + model = copy.deepcopy(model) + model.eval() + prepare(model, inplace=True) + run_fn(model, *run_args) + convert(model, mapping, inplace=True) + return model + +def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8, + mapping=None, inplace=False): + r"""Converts a float model to dynamic (i.e. weights-only) quantized model. + + Replaces specified modules with dynamic weight-only quantized versions and output the quantized model. + + For simplest usage provide `dtype` argument that can be float16 or qint8. Weight-only quantization + by default is performed for layers with large weights size - i.e. Linear and RNN variants. + + Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`. + If `qconfig` is provided, the `dtype` argument is ignored. + + Args: + model: input model + qconfig_spec: Either: + + - A dictionary that maps from name or type of submodule to quantization + configuration, qconfig applies to all submodules of a given + module unless qconfig for the submodules are specified (when the + submodule already has qconfig attribute). Entries in the dictionary + need to be QConfigDynamic instances. + + - A set of types and/or submodule names to apply dynamic quantization to, + in which case the `dtype` argument is used to specify the bit-width + + inplace: carry out model transformations in-place, the original module is mutated + mapping: maps type of a submodule to a type of corresponding dynamically quantized version + with which the submodule needs to be replaced + + """ + torch._C._log_api_usage_once("quantization_api.quantize.quantize_dynamic") + if qconfig_spec is None: + if dtype == torch.qint8: + qconfig_spec = { + nn.Linear : default_dynamic_qconfig, + nn.LSTM : default_dynamic_qconfig, + nn.GRU : default_dynamic_qconfig, + nn.LSTMCell : default_dynamic_qconfig, + nn.RNNCell : default_dynamic_qconfig, + nn.GRUCell : default_dynamic_qconfig, + } + elif dtype == torch.float16: + qconfig_spec = { + nn.Linear : float16_dynamic_qconfig, + nn.LSTM : float16_dynamic_qconfig, + nn.GRU : float16_dynamic_qconfig, + nn.LSTMCell : float16_dynamic_qconfig, + nn.RNNCell : float16_dynamic_qconfig, + nn.GRUCell : float16_dynamic_qconfig, + } + elif dtype == torch.quint8: + qconfig_spec = { + nn.EmbeddingBag : float_qparams_weight_only_qconfig, + } + else: + raise ValueError( + "Don't know how to quantize with default settings for {}. Provide full qconfig please".format(dtype)) + elif isinstance(qconfig_spec, set): + if dtype is torch.qint8: + default_qconfig = default_dynamic_qconfig + elif dtype is torch.float16: + default_qconfig = float16_dynamic_qconfig + elif dtype is torch.quint8: + default_qconfig = float_qparams_weight_only_qconfig + else: + raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype)) + qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig))) + + if mapping is None: + mapping = get_default_dynamic_quant_module_mappings() + + if not inplace: + model = copy.deepcopy(model) + model.eval() + propagate_qconfig_(model, qconfig_spec) + convert(model, mapping, inplace=True) + return model + +def prepare_qat(model, mapping=None, inplace=False): + r""" + Prepares a copy of the model for quantization calibration or + quantization-aware training and converts it to quantized version. + + Quantization configuration should be assigned preemptively + to individual submodules in `.qconfig` attribute. + + Args: + model: input model to be modified in-place + mapping: dictionary that maps float modules to quantized modules to be + replaced. + inplace: carry out model transformations in-place, the original module + is mutated + """ + torch._C._log_api_usage_once("quantization_api.quantize.prepare_qat") + if mapping is None: + mapping = get_default_qat_module_mappings() + + if not inplace: + model = copy.deepcopy(model) + + propagate_qconfig_(model, qconfig_dict=None) + convert(model, mapping=mapping, inplace=True, remove_qconfig=False) + prepare(model, observer_non_leaf_module_list=set(mapping.values()), inplace=True) + return model + +def quantize_qat(model, run_fn, run_args, inplace=False): + r"""Do quantization aware training and output a quantized model + + Args: + model: input model + run_fn: a function for evaluating the prepared model, can be a + function that simply runs the prepared model or a training + loop + run_args: positional arguments for `run_fn` + + Return: + Quantized model. + """ + torch._C._log_api_usage_once("quantization_api.quantize.quantize_qat") + if not inplace: + model = copy.deepcopy(model) + model.train() + prepare_qat(model, inplace=True) + run_fn(model, *run_args) + convert(model, inplace=True) + return model + +def convert( + module, mapping=None, inplace=False, remove_qconfig=True, + convert_custom_config_dict=None): + r"""Converts submodules in input module to a different module according to `mapping` + by calling `from_float` method on the target module class. And remove qconfig at the + end if remove_qconfig is set to True. + + Args: + `module`: prepared and calibrated module + `mapping`: a dictionary that maps from source module type to target + module type, can be overwritten to allow swapping user defined + Modules + `inplace`: carry out model transformations in-place, the original module + is mutated + `convert_custom_config_dict`: custom configuration dictionary for convert function + + .. code-block:: python + + # Example of convert_custom_config_dict: + convert_custom_config_dict = { + # user will manually define the corresponding quantized + # module class which has a from_observed class method that converts + # observed custom module to quantized custom module + "observed_to_quantized_custom_module_class": { + ObservedCustomModule: QuantizedCustomModule + } + } + + """ + torch._C._log_api_usage_once("quantization_api.quantize.convert") + if not inplace: + module = copy.deepcopy(module) + _convert( + module, mapping, inplace=True, + convert_custom_config_dict=convert_custom_config_dict) + if remove_qconfig: + _remove_qconfig(module) + return module + +def _convert( + module, mapping=None, inplace=False, + convert_custom_config_dict=None): + r"""Converts submodules in input module to a different module according to `mapping` + by calling `from_float` method on the target module class + + Args: + module: input module + mapping: a dictionary that maps from source module type to target + module type, can be overwritten to allow swapping user defined + Modules + inplace: carry out model transformations in-place, the original module + is mutated + + """ + if mapping is None: + mapping = get_default_static_quant_module_mappings() + if convert_custom_config_dict is None: + convert_custom_config_dict = {} + custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {}) + + if not inplace: + module = copy.deepcopy(module) + reassign = {} + for name, mod in module.named_children(): + # both fused modules and observed custom modules are + # swapped as one unit + if not isinstance(mod, _FusedModule) and \ + type(mod) not in custom_module_class_mapping: + _convert(mod, mapping, True, # inplace + convert_custom_config_dict) + reassign[name] = swap_module(mod, mapping, custom_module_class_mapping) + + for key, value in reassign.items(): + module._modules[key] = value + + return module + +def swap_module(mod, mapping, custom_module_class_mapping): + r"""Swaps the module if it has a quantized counterpart and it has an + `observer` attached. + + Args: + mod: input module + mapping: a dictionary that maps from nn module to nnq module + + Return: + The corresponding quantized module of `mod` + """ + new_mod = mod + if hasattr(mod, 'qconfig') and mod.qconfig is not None: + swapped = False + if type(mod) in custom_module_class_mapping: + new_mod = custom_module_class_mapping[type(mod)].from_observed(mod) + swapped = True + elif type(mod) in mapping: + new_mod = mapping[type(mod)].from_float(mod) + swapped = True + + if swapped: + # Preserve module's pre forward hooks. They'll be called on quantized input + for pre_hook_fn in mod._forward_pre_hooks.values(): + new_mod.register_forward_pre_hook(pre_hook_fn) + # Preserve module's post forward hooks except _observer_forward_hook + # After convert they'll work with quantized output + for hook_fn in mod._forward_hooks.values(): + if hook_fn is not _observer_forward_hook: + new_mod.register_forward_hook(hook_fn) + + # respect device affinity when swapping modules + devices = get_unique_devices_(mod) + assert len(devices) <= 1, ( + "swap_module only works with cpu or single-device CUDA modules, " + "but got devices {}".format(devices) + ) + device = next(iter(devices)) if len(devices) > 0 else None + if device: + new_mod.to(device) + return new_mod + +def get_observer_dict(mod, target_dict, prefix=""): + r"""Traverse the modules and save all observers into dict. + This is mainly used for quantization accuracy debug + Args: + mod: the top module we want to save all observers + prefix: the prefix for the current module + target_dict: the dictionary used to save all the observers + """ + def get_prefix(prefix): + return prefix if prefix == "" else prefix + '.' + + if hasattr(mod, 'activation_post_process'): + target_dict[get_prefix(prefix) + 'activation_post_process'] = mod.activation_post_process + for name, child in mod.named_children(): + module_prefix = get_prefix(prefix) + name if prefix else name + get_observer_dict(child, target_dict, module_prefix) diff --git a/torch/quantization/fx/convert.py b/torch/quantization/fx/convert.py index 867b0b24cf7ad..e00e4aaad1b68 100644 --- a/torch/quantization/fx/convert.py +++ b/torch/quantization/fx/convert.py @@ -36,7 +36,7 @@ WEIGHT_INDEX_DICT, ) -from ..quantize import ( +from torch.ao.quantization.quantize import ( _remove_qconfig, is_activation_post_process, ) diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py index 86abac2d20991..a6fd660e5e84c 100644 --- a/torch/quantization/fx/prepare.py +++ b/torch/quantization/fx/prepare.py @@ -68,7 +68,7 @@ get_default_qat_module_mappings, ) -from ..quantize import ( +from torch.ao.quantization.quantize import ( is_activation_post_process, convert ) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index e8b873658b504..779dfcf07aece 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -24,7 +24,7 @@ get_qparam_dict, ) -from ..quantize import ( +from torch.ao.quantization.quantize import ( is_activation_post_process, ) diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 10f8b06b6dfed..3c9adc2bc311b 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -2,7 +2,7 @@ import torch import torch.nn as nn from ..utils import is_per_tensor, is_per_channel -from ..quantize import is_activation_post_process +from torch.ao.quantization.quantize import is_activation_post_process from torch.fx import GraphModule, map_arg diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index 674ed59ac86ed..5b0f4ed8779ab 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -1,580 +1,30 @@ -import copy -import itertools -import warnings +# flake8: noqa: F401 +r""" +This file is in the process of migration to `torch/ao/quantization`, and +is kept here for compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +`torch/ao/quantization/quantize.py`, while adding an import statement +here. +""" import torch -import torch.nn as nn -import torch.nn.quantized as nnq -from torch.nn.intrinsic import _FusedModule -from .quantization_mappings import ( - get_default_dynamic_quant_module_mappings, - get_default_static_quant_module_mappings, - get_default_qat_module_mappings, - get_default_qconfig_propagation_list, - no_observer_set, - _has_special_act_post_process, - _get_special_act_post_process, -) - -from .stubs import DeQuantStub, QuantWrapper -from .qconfig import ( - add_module_to_qconfig_obs_ctr, - default_dynamic_qconfig, - float16_dynamic_qconfig, - float_qparams_weight_only_qconfig) - -def is_activation_post_process(module): - return (isinstance(module, torch.quantization.ObserverBase) or - isinstance(module, torch.quantization.FakeQuantizeBase)) - -def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None, - qconfig_parent=None, prefix=''): - r"""This is a helper function for `propagate_qconfig_` - - Args: - module: input module - qconfig_dict: dictionary that maps from name of submodule to quantization - configuration - allow_list: list of quantizable modules - qconfig_parent: quantization config of parent module, we will fallback to - this config when there is no specified config for current - module - prefix: corresponding prefix of the current module, used as key in - qconfig_dict - - Return: - None, module is modified inplace with qconfig attached - """ - # TODO: Add test - if allow_list is None: - allow_list = get_default_qconfig_propagation_list() - - module_qconfig = qconfig_dict.get(type(module), qconfig_parent) - module_qconfig = qconfig_dict.get(prefix, module_qconfig) - module_qconfig = getattr(module, 'qconfig', module_qconfig) - - torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module) - - qconfig_with_device_check = add_module_to_qconfig_obs_ctr(module_qconfig, module) - module.qconfig = qconfig_with_device_check - - for name, child in module.named_children(): - module_prefix = prefix + '.' + name if prefix else name - _propagate_qconfig_helper(child, qconfig_dict, allow_list, - qconfig_with_device_check, module_prefix) - -# TODO(jerryzh): expose allow_list -def propagate_qconfig_(module, qconfig_dict=None, allow_list=None): - r"""Propagate qconfig through the module hierarchy and assign `qconfig` - attribute on each leaf module - - Args: - module: input module - qconfig_dict: dictionary that maps from name or type of submodule to - quantization configuration, qconfig applies to all submodules of a - given module unless qconfig for the submodules are specified (when - the submodule already has qconfig attribute) - - Return: - None, module is modified inplace with qconfig attached - """ - if qconfig_dict is None: - qconfig_dict = {} - _propagate_qconfig_helper(module, qconfig_dict, allow_list) - -def _observer_forward_hook(self, input, output): - r"""Forward hook that calls observer on the output - """ - return self.activation_post_process(output) - -def register_activation_post_process_hook(module): - assert hasattr(module, 'activation_post_process'), \ - 'Expect activation_post_process attribut already attached to the module' - return module.register_forward_hook(_observer_forward_hook) - -def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=None, device=None, custom_module_class_mapping=None): - r"""Add observer for the leaf child of the module. - - This function insert observer module to all leaf child module that - has a valid qconfig attribute. - - Args: - module: input module with qconfig attributes for all the leaf modules that we want to quantize - device: parent device, if any - non_leaf_module_list: list of non-leaf modules we want to add observer - - Return: - None, module is modified inplace with added observer modules and forward_hooks - """ - if qconfig_propagation_list is None: - qconfig_propagation_list = get_default_qconfig_propagation_list() - - if custom_module_class_mapping is None: - custom_module_class_mapping = {} - - # respect device affinity when adding observers - if device is None: - devices = get_unique_devices_(module) - assert len(devices) <= 1, ( - "add_observer_ only works with cpu or single-device CUDA modules, " - "but got devices {}".format(devices) - ) - device = next(iter(devices)) if len(devices) > 0 else None - - def get_activation_post_process(qconfig, device, special_act_post_process=None): - activation = qconfig.activation() if special_act_post_process is None else special_act_post_process() - if device is not None: - activation.to(device) - return activation - - def needs_observation(m): - return hasattr(m, 'qconfig') and m.qconfig is not None - - def insert_activation_post_process(m, special_act_post_process=None): - """ Adds an activation post process module and register - a post hook that calls the module - """ - # We don't insert observer/fake_quantize for DeQuantStub - if needs_observation(m) and not isinstance(m, DeQuantStub): - # observer and hook will be gone after we swap the module - m.add_module('activation_post_process', get_activation_post_process( - m.qconfig, device, special_act_post_process)) - # Register observer as the first entry in the hook list - # All post forward hooks are preserved and will be executed after the observer before convert - handle = register_activation_post_process_hook(m) - m._forward_hooks.move_to_end(handle.id, last=False) - - for name, child in module.named_children(): - if type(child) in [nnq.FloatFunctional, nnq.QFunctional]: - if needs_observation(child): - child.activation_post_process = get_activation_post_process(child.qconfig, device) - elif isinstance(child, _FusedModule): - # activation_post_process are now added directly to nn.Sequentail/_FusedModule - if needs_observation(child): - insert_activation_post_process(child) - elif _has_special_act_post_process(child): - special_act_post_process = _get_special_act_post_process(child) - insert_activation_post_process(child, special_act_post_process) - elif non_leaf_module_list is not None and type(child) in non_leaf_module_list: - if needs_observation(child): - insert_activation_post_process(child) - elif needs_observation(child) and type(child) in custom_module_class_mapping: - observed_child = custom_module_class_mapping[type(child)].from_float(child) - setattr(module, name, observed_child) - # TODO: These are the modules that cannot be observed - # Once there are more, we should move them to a separate list - if custom_module_class_mapping[type(child)] not in no_observer_set(): - insert_activation_post_process(observed_child) - else: - add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping) - - # Insert observers only for leaf nodes, note that this observer is for - # the output of the module, for input QuantStub will observe them - if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ - and type(module) in qconfig_propagation_list: - insert_activation_post_process(module) - -def get_unique_devices_(module): - return {p.device for p in module.parameters()} | \ - {p.device for p in module.buffers()} - -def add_quant_dequant(module): - r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig - Note that this function will modify the children of module inplace and it - can return a new module which wraps the input module as well. - - Args: - module: input module with qconfig attributes for all the leaf modules - that we want to quantize - - Return: - Either the inplace modified module with submodules wrapped in - `QuantWrapper` based on qconfig or a new `QuantWrapper` module which - wraps the input module, the latter case only happens when the input - module is a leaf module and we want to quantize it. - """ - if len(module._modules) == 0 and hasattr(module, 'qconfig') and module.qconfig: - return QuantWrapper(module) - - for name, child in module.named_children(): - module._modules[name] = add_quant_dequant(child) - return module - -def prepare(model, inplace=False, allow_list=None, - observer_non_leaf_module_list=None, - prepare_custom_config_dict=None): - r"""Prepares a copy of the model for quantization calibration or quantization-aware training. - - Quantization configuration should be assigned preemptively - to individual submodules in `.qconfig` attribute. - - The model will be attached with observer or fake quant modules, and qconfig - will be propagated. - - Args: - `model`: input model to be modified in-place - `inplace`: carry out model transformations in-place, the original module is mutated - `allow_list`: list of quantizable modules - `observer_non_leaf_module_list`: list of non-leaf modules we want to add observer - `prepare_custom_config_dict`: customization configuration dictionary for prepare function - - .. code-block:: python - - # Example of prepare_custom_config_dict: - prepare_custom_config_dict = { - # user will manually define the corresponding observed - # module class which has a from_float class method that converts - # float custom module to observed custom module - "float_to_observed_custom_module_class": { - CustomModule: ObservedCustomModule - } - } - - """ - torch._C._log_api_usage_once("quantization_api.quantize.prepare") - if prepare_custom_config_dict is None: - prepare_custom_config_dict = {} - custom_module_class_mapping = prepare_custom_config_dict.get("float_to_observed_custom_module_class", {}) - - if not inplace: - model = copy.deepcopy(model) - - # TODO: remove allow_list - qconfig_propagation_list = allow_list - if qconfig_propagation_list is None: - qconfig_propagation_list = get_default_qconfig_propagation_list() - propagate_qconfig_(model, qconfig_dict=None) - - # sanity check common API misusage - if not any(hasattr(m, 'qconfig') and m.qconfig for m in model.modules()): - warnings.warn("None of the submodule got qconfig applied. Make sure you " - "passed correct configuration through `qconfig_dict` or " - "by assigning the `.qconfig` attribute directly on submodules") - - add_observer_( - model, qconfig_propagation_list, observer_non_leaf_module_list, - custom_module_class_mapping=custom_module_class_mapping) - return model - -def _remove_activation_post_process(module): - # TODO: maybe we should change activation_post_process to _activation_post_process - # to prevent it from being used by user - if hasattr(module, 'activation_post_process') and \ - is_activation_post_process(module.activation_post_process): - delattr(module, 'activation_post_process') - - # remove activation_post_proceess hook - handle_ids_to_remove = set() - for handle_id, hook_fn in module._forward_hooks.items(): - if hook_fn is _observer_forward_hook: - handle_ids_to_remove.add(handle_id) - for handle_id in handle_ids_to_remove: - module._forward_hooks.pop(handle_id) - -# TODO: rename to something more general -def _remove_qconfig(module): - r"""Clean up the qconfig left in the module so that new qconfig can be - propagated. - - Args: - module: module to be cleaned up - """ - for child in module.children(): - _remove_qconfig(child) - - if hasattr(module, "qconfig"): - del module.qconfig - - _remove_activation_post_process(module) - -def quantize(model, run_fn, run_args, mapping=None, inplace=False): - r"""Quantize the input float model with post training static quantization. - - First it will prepare the model for calibration, then it calls - `run_fn` which will run the calibration step, after that we will - convert the model to a quantized model. - - Args: - model: input float model - run_fn: a calibration function for calibrating the prepared model - run_args: positional arguments for `run_fn` - inplace: carry out model transformations in-place, the original module is mutated - mapping: correspondence between original module types and quantized counterparts - - Return: - Quantized model. - """ - torch._C._log_api_usage_once("quantization_api.quantize.quantize") - if mapping is None: - mapping = get_default_static_quant_module_mappings() - if not inplace: - model = copy.deepcopy(model) - model.eval() - prepare(model, inplace=True) - run_fn(model, *run_args) - convert(model, mapping, inplace=True) - return model - -def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8, - mapping=None, inplace=False): - r"""Converts a float model to dynamic (i.e. weights-only) quantized model. - - Replaces specified modules with dynamic weight-only quantized versions and output the quantized model. - - For simplest usage provide `dtype` argument that can be float16 or qint8. Weight-only quantization - by default is performed for layers with large weights size - i.e. Linear and RNN variants. - - Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`. - If `qconfig` is provided, the `dtype` argument is ignored. - - Args: - model: input model - qconfig_spec: Either: - - - A dictionary that maps from name or type of submodule to quantization - configuration, qconfig applies to all submodules of a given - module unless qconfig for the submodules are specified (when the - submodule already has qconfig attribute). Entries in the dictionary - need to be QConfigDynamic instances. - - - A set of types and/or submodule names to apply dynamic quantization to, - in which case the `dtype` argument is used to specify the bit-width - - inplace: carry out model transformations in-place, the original module is mutated - mapping: maps type of a submodule to a type of corresponding dynamically quantized version - with which the submodule needs to be replaced - - """ - torch._C._log_api_usage_once("quantization_api.quantize.quantize_dynamic") - if qconfig_spec is None: - if dtype == torch.qint8: - qconfig_spec = { - nn.Linear : default_dynamic_qconfig, - nn.LSTM : default_dynamic_qconfig, - nn.GRU : default_dynamic_qconfig, - nn.LSTMCell : default_dynamic_qconfig, - nn.RNNCell : default_dynamic_qconfig, - nn.GRUCell : default_dynamic_qconfig, - } - elif dtype == torch.float16: - qconfig_spec = { - nn.Linear : float16_dynamic_qconfig, - nn.LSTM : float16_dynamic_qconfig, - nn.GRU : float16_dynamic_qconfig, - nn.LSTMCell : float16_dynamic_qconfig, - nn.RNNCell : float16_dynamic_qconfig, - nn.GRUCell : float16_dynamic_qconfig, - } - elif dtype == torch.quint8: - qconfig_spec = { - nn.EmbeddingBag : float_qparams_weight_only_qconfig, - } - else: - raise ValueError( - "Don't know how to quantize with default settings for {}. Provide full qconfig please".format(dtype)) - elif isinstance(qconfig_spec, set): - if dtype is torch.qint8: - default_qconfig = default_dynamic_qconfig - elif dtype is torch.float16: - default_qconfig = float16_dynamic_qconfig - elif dtype is torch.quint8: - default_qconfig = float_qparams_weight_only_qconfig - else: - raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype)) - qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig))) - - if mapping is None: - mapping = get_default_dynamic_quant_module_mappings() - - if not inplace: - model = copy.deepcopy(model) - model.eval() - propagate_qconfig_(model, qconfig_spec) - convert(model, mapping, inplace=True) - return model - -def prepare_qat(model, mapping=None, inplace=False): - r""" - Prepares a copy of the model for quantization calibration or - quantization-aware training and converts it to quantized version. - - Quantization configuration should be assigned preemptively - to individual submodules in `.qconfig` attribute. - - Args: - model: input model to be modified in-place - mapping: dictionary that maps float modules to quantized modules to be - replaced. - inplace: carry out model transformations in-place, the original module - is mutated - """ - torch._C._log_api_usage_once("quantization_api.quantize.prepare_qat") - if mapping is None: - mapping = get_default_qat_module_mappings() - - if not inplace: - model = copy.deepcopy(model) - - propagate_qconfig_(model, qconfig_dict=None) - convert(model, mapping=mapping, inplace=True, remove_qconfig=False) - prepare(model, observer_non_leaf_module_list=set(mapping.values()), inplace=True) - return model - -def quantize_qat(model, run_fn, run_args, inplace=False): - r"""Do quantization aware training and output a quantized model - - Args: - model: input model - run_fn: a function for evaluating the prepared model, can be a - function that simply runs the prepared model or a training - loop - run_args: positional arguments for `run_fn` - - Return: - Quantized model. - """ - torch._C._log_api_usage_once("quantization_api.quantize.quantize_qat") - if not inplace: - model = copy.deepcopy(model) - model.train() - prepare_qat(model, inplace=True) - run_fn(model, *run_args) - convert(model, inplace=True) - return model - -def convert( - module, mapping=None, inplace=False, remove_qconfig=True, - convert_custom_config_dict=None): - r"""Converts submodules in input module to a different module according to `mapping` - by calling `from_float` method on the target module class. And remove qconfig at the - end if remove_qconfig is set to True. - - Args: - `module`: prepared and calibrated module - `mapping`: a dictionary that maps from source module type to target - module type, can be overwritten to allow swapping user defined - Modules - `inplace`: carry out model transformations in-place, the original module - is mutated - `convert_custom_config_dict`: custom configuration dictionary for convert function - - .. code-block:: python - - # Example of convert_custom_config_dict: - convert_custom_config_dict = { - # user will manually define the corresponding quantized - # module class which has a from_observed class method that converts - # observed custom module to quantized custom module - "observed_to_quantized_custom_module_class": { - ObservedCustomModule: QuantizedCustomModule - } - } - - """ - torch._C._log_api_usage_once("quantization_api.quantize.convert") - if not inplace: - module = copy.deepcopy(module) - _convert( - module, mapping, inplace=True, - convert_custom_config_dict=convert_custom_config_dict) - if remove_qconfig: - _remove_qconfig(module) - return module - -def _convert( - module, mapping=None, inplace=False, - convert_custom_config_dict=None): - r"""Converts submodules in input module to a different module according to `mapping` - by calling `from_float` method on the target module class - - Args: - module: input module - mapping: a dictionary that maps from source module type to target - module type, can be overwritten to allow swapping user defined - Modules - inplace: carry out model transformations in-place, the original module - is mutated - - """ - if mapping is None: - mapping = get_default_static_quant_module_mappings() - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {}) - - if not inplace: - module = copy.deepcopy(module) - reassign = {} - for name, mod in module.named_children(): - # both fused modules and observed custom modules are - # swapped as one unit - if not isinstance(mod, _FusedModule) and \ - type(mod) not in custom_module_class_mapping: - _convert(mod, mapping, True, # inplace - convert_custom_config_dict) - reassign[name] = swap_module(mod, mapping, custom_module_class_mapping) - - for key, value in reassign.items(): - module._modules[key] = value - - return module - -def swap_module(mod, mapping, custom_module_class_mapping): - r"""Swaps the module if it has a quantized counterpart and it has an - `observer` attached. - - Args: - mod: input module - mapping: a dictionary that maps from nn module to nnq module - - Return: - The corresponding quantized module of `mod` - """ - new_mod = mod - if hasattr(mod, 'qconfig') and mod.qconfig is not None: - swapped = False - if type(mod) in custom_module_class_mapping: - new_mod = custom_module_class_mapping[type(mod)].from_observed(mod) - swapped = True - elif type(mod) in mapping: - new_mod = mapping[type(mod)].from_float(mod) - swapped = True - - if swapped: - # Preserve module's pre forward hooks. They'll be called on quantized input - for pre_hook_fn in mod._forward_pre_hooks.values(): - new_mod.register_forward_pre_hook(pre_hook_fn) - # Preserve module's post forward hooks except _observer_forward_hook - # After convert they'll work with quantized output - for hook_fn in mod._forward_hooks.values(): - if hook_fn is not _observer_forward_hook: - new_mod.register_forward_hook(hook_fn) - - # respect device affinity when swapping modules - devices = get_unique_devices_(mod) - assert len(devices) <= 1, ( - "swap_module only works with cpu or single-device CUDA modules, " - "but got devices {}".format(devices) - ) - device = next(iter(devices)) if len(devices) > 0 else None - if device: - new_mod.to(device) - return new_mod - -def get_observer_dict(mod, target_dict, prefix=""): - r"""Traverse the modules and save all observers into dict. - This is mainly used for quantization accuracy debug - Args: - mod: the top module we want to save all observers - prefix: the prefix for the current module - target_dict: the dictionary used to save all the observers - """ - def get_prefix(prefix): - return prefix if prefix == "" else prefix + '.' - - if hasattr(mod, 'activation_post_process'): - target_dict[get_prefix(prefix) + 'activation_post_process'] = mod.activation_post_process - for name, child in mod.named_children(): - module_prefix = get_prefix(prefix) + name if prefix else name - get_observer_dict(child, target_dict, module_prefix) +from torch.ao.quantization.quantize import _convert +from torch.ao.quantization.quantize import _observer_forward_hook +from torch.ao.quantization.quantize import _propagate_qconfig_helper +from torch.ao.quantization.quantize import _remove_activation_post_process +from torch.ao.quantization.quantize import _remove_qconfig +from torch.ao.quantization.quantize import add_observer_ +from torch.ao.quantization.quantize import add_quant_dequant +from torch.ao.quantization.quantize import convert +from torch.ao.quantization.quantize import get_observer_dict +from torch.ao.quantization.quantize import get_unique_devices_ +from torch.ao.quantization.quantize import is_activation_post_process +from torch.ao.quantization.quantize import prepare +from torch.ao.quantization.quantize import prepare_qat +from torch.ao.quantization.quantize import propagate_qconfig_ +from torch.ao.quantization.quantize import quantize +from torch.ao.quantization.quantize import quantize_dynamic +from torch.ao.quantization.quantize import quantize_qat +from torch.ao.quantization.quantize import register_activation_post_process_hook +from torch.ao.quantization.quantize import swap_module From 8af1407eab140a3abf12ea99883fea529791883e Mon Sep 17 00:00:00 2001 From: Harut Movsisyan Date: Sun, 29 Aug 2021 20:58:45 -0700 Subject: [PATCH 337/530] [Static Runtime] Out version for torch.linalg.norm (#64070) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64070 Test Plan: Confirm out variant is called for both versions: ``` > buck run //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1 ``` Reviewed By: d1jang Differential Revision: D30595816 fbshipit-source-id: e88d88d4fc698774e83a98efce66b8fa4e281563 --- benchmarks/static_runtime/test_scripts.h | 10 ++++ .../static_runtime/test_static_runtime.cc | 26 ++++++++++ torch/csrc/jit/runtime/static/ops.cpp | 47 +++++++++++++++++++ 3 files changed, 83 insertions(+) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index bcc975b79cf25..004319ca550f9 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -780,3 +780,13 @@ const std::string embedding_bag_byte_prepack_script = R"IR( %res: Tensor = aten::clone(%output, %none) return (%res) )IR"; + +const auto linalg_norm_ord_scalar = R"JIT( + def forward(self, a: Tensor, ord: int, dim: List[int], keepdim: bool, dtype: int): + return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone() +)JIT"; + +const auto linalg_norm_ord_str = R"JIT( + def forward(self, a: Tensor, ord: str, dim: List[int], keepdim: bool, dtype: int): + return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone() +)JIT"; diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 1e987a9fab58e..f6e3680e0be38 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1265,3 +1265,29 @@ TEST(StaticRuntime, QEmbeddingBagByteUnpack) { testStaticRuntime(embedding_bag_byte_prepack_script, {a}); testStaticRuntime(embedding_bag_byte_prepack_script, {a},{b}); } + +TEST(StaticRuntime, IndividualOps_LinalgNorm_ScalarOrd) { + auto a = at::randn({2, 3}); + auto dim = std::vector({1}); + auto dtype = at::ScalarType::Float; + + std::vector args0{a, 4, dim, true, dtype}; + testStaticRuntime(linalg_norm_ord_scalar, args0); + + auto b = at::randn({4, 5}); + std::vector args1{b, 4, dim, true, dtype}; + testStaticRuntime(linalg_norm_ord_scalar, args0, args1); +} + +TEST(StaticRuntime, IndividualOps_LinalgNorm_StringOrd) { + auto a = at::randn({2, 3}); + auto dim = std::vector({0, 1}); + auto dtype = at::ScalarType::Float; + + std::vector args0{a, "fro", dim, true, dtype}; + testStaticRuntime(linalg_norm_ord_str, args0); + + auto b = at::randn({4, 5}); + std::vector args1{b, "fro", dim, true, dtype}; + testStaticRuntime(linalg_norm_ord_str, args0, args1); +} diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 3b586689a6c5c..12339301e0433 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -1666,6 +1666,53 @@ REGISTER_OPERATOR_FUNCTOR(aten::fmod, aten_fmod, [](Node* n) -> SROperator { }; }); +REGISTER_OPERATOR_FUNCTOR(aten::linalg_norm, aten_linalg_norm, [](Node* n) -> SROperator { + if (!n->matches(torch::schema( + "aten::linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor")) && + !n->matches(torch::schema( + "aten::linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto& input = p_node->Input(0).toTensor(); + const auto dim = p_node->Input(2).toIntVector(); + const auto keepdim = p_node->Input(3).toBool(); + const auto dtype = p_node->Input(4).toOptional(); + + if (p_node->Output(0).isNone()) { + if (p_node->Input(1).isScalar()) { + p_node->Output(0) = at::native::linalg_norm( + input, + p_node->Input(1).toOptional(), + dim, + keepdim, + dtype); + } else { + p_node->Output(0) = at::native::linalg_norm( + input, p_node->Input(1).toStringView(), dim, keepdim, dtype); + } + return; + } + + auto& output = p_node->Output(0).toTensor(); + fastResizeToZero(output); + + if (p_node->Input(1).isScalar()) { + at::native::linalg_norm_out( + input, + p_node->Input(1).toOptional(), + dim, + keepdim, + dtype, + output); + } else { + at::native::linalg_norm_out( + input, p_node->Input(1).toStringRef(), dim, keepdim, dtype, output); + } + }; +}); + namespace { void check_cat_no_zero_dim(const std::vector& tensors) { From d3bcba5f85f97ef273109924c695f33bf739e115 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 29 Aug 2021 23:31:42 -0700 Subject: [PATCH 338/530] ENH Adds label_smoothing to cross entropy loss (#63122) Summary: Fixes https://github.com/pytorch/pytorch/issues/7455 Partially resolves pytorch/vision#4281 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63122 Reviewed By: iramazanli Differential Revision: D30586076 Pulled By: jbschlosser fbshipit-source-id: 06afc3aa1f8b9edb07fe9ed68c58968ad1926924 --- aten/src/ATen/native/LossNLL.cpp | 79 ++++++- aten/src/ATen/native/native_functions.yaml | 2 +- test/cpp/api/functional.cpp | 14 ++ test/cpp/api/modules.cpp | 25 +++ test/test_nn.py | 72 +++++++ .../api/include/torch/nn/functional/loss.h | 9 +- .../csrc/api/include/torch/nn/options/loss.h | 2 + torch/csrc/api/src/nn/modules/loss.cpp | 3 +- torch/nn/functional.py | 8 +- torch/nn/functional.pyi.in | 3 +- torch/nn/modules/loss.py | 13 +- torch/onnx/symbolic_opset12.py | 6 +- torch/overrides.py | 2 +- torch/testing/_internal/common_nn.py | 199 +++++++++++++++++- 14 files changed, 412 insertions(+), 25 deletions(-) diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index c7c65f7b8cc22..83f169972942f 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -459,9 +459,10 @@ TORCH_IMPL_FUNC(nll_loss_backward_out_cpu) Tensor cross_entropy_loss_prob_target( const Tensor& self, - const Tensor& target, + const Tensor& target_, const Tensor& weight, - int64_t reduction) { + int64_t reduction, + double label_smoothing) { const auto n_classes = self.size(1); TORCH_CHECK( !weight.defined() || (weight.dim() == 1 && weight.numel() == n_classes), @@ -472,6 +473,15 @@ Tensor cross_entropy_loss_prob_target( weight.sizes()); auto input = at::log_softmax(self, 1, self.scalar_type()); + Tensor target; + + if (label_smoothing > 0.0) { + TORCH_CHECK(label_smoothing <= 1.0, "label_smoothing must be between 0.0 and 1.0. Got: ", label_smoothing); + target = target_ * (1 - label_smoothing) + label_smoothing / n_classes; + } else { + target = target_; + } + if (weight.defined()) { // Expand weight to the correct number of dims for broadcasting with input / target auto weight_broadcast_shape = SmallBuffer(input.dim()); @@ -503,12 +513,66 @@ Tensor cross_entropy_loss_prob_target( } } +Tensor cross_entropy_loss_label_smoothing( + const Tensor& self, + const Tensor& target, + const Tensor& weight, + int64_t reduction, + int64_t ignore_index, + double label_smoothing) { + + auto input = at::log_softmax(self, 1, self.scalar_type()); + auto nllloss = at::nll_loss_nd(input, target, weight, reduction, ignore_index); + + auto n_classes = input.size(1); + + Tensor smooth_loss; + if (weight.defined()) { + // Expand weight to the correct number of dims for broadcasting with input / target + auto weight_broadcast_shape = SmallBuffer(input.dim()); + std::fill(weight_broadcast_shape.begin(), weight_broadcast_shape.end(), 1); + weight_broadcast_shape[1] = weight.size(0); + Tensor weight_ = weight.view(weight_broadcast_shape); + + smooth_loss = -(input * weight_).sum(1); + } else { + smooth_loss = -input.sum(1); + } + + if (ignore_index >= 0) { + smooth_loss.index_put_({target == ignore_index}, 0.0); + } + + Tensor ret; + switch (reduction) { + case Reduction::Mean: + if (weight.defined()) { + // TODO: This code can path can be removed if #61309 is resolved + // loss is normalized by the weights to be consistent with nll_loss_nd + ret = smooth_loss.sum() / weight.gather(0, target.flatten()).sum(); + } else { + ret = smooth_loss.mean(); + } + break; + case Reduction::Sum: + ret = smooth_loss.sum(); + break; + case Reduction::None: + ret = smooth_loss; + break; + default: + TORCH_CHECK(false, "Invalid reduction type encountered in cross_entropy: ", reduction); + } + return (1 - label_smoothing) * nllloss + ret * (label_smoothing / n_classes); +} + Tensor cross_entropy_loss( const Tensor& self, const Tensor& target, const c10::optional& weight, int64_t reduction, - int64_t ignore_index) { + int64_t ignore_index, + double label_smoothing) { Tensor ret; if (self.sizes() == target.sizes()) { // Assume soft targets when input and target shapes are the same @@ -519,7 +583,14 @@ Tensor cross_entropy_loss( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight); const Tensor& weight_ = *weight_maybe_owned; - ret = cross_entropy_loss_prob_target(self, target, weight_, reduction); + ret = cross_entropy_loss_prob_target(self, target, weight_, reduction, label_smoothing); + } else if (label_smoothing > 0.0) { + TORCH_CHECK(label_smoothing <= 1.0, "label_smoothing must be between 0.0 and 1.0. Got: ", label_smoothing); + + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight); + const Tensor& weight_ = *weight_maybe_owned; + ret = cross_entropy_loss_label_smoothing(self, target, weight_, reduction, ignore_index, label_smoothing); } else { ret = at::nll_loss_nd( at::log_softmax(self, 1, self.scalar_type()), diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 224d850c8004c..688763ea39c13 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6652,7 +6652,7 @@ device_check: NoCheck # TensorIterator variants: method -- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor +- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, float label_smoothing=0.0) -> Tensor python_module: nn - func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index 2ecb84189c55a..8b7889f1841ef 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -792,6 +792,20 @@ TEST_F(FunctionalTest, CrossEntropy) { ASSERT_TRUE(output.allclose(expected, 1e-04)); ASSERT_TRUE(F::cross_entropy(input, target).allclose(expected, 1e-04)); + + // label smoothing with class indices + input = torch::tensor({{3., 1.}, {1., 2.}}, torch::kFloat); + output = F::cross_entropy( + input, target, F::CrossEntropyFuncOptions().label_smoothing(0.15).reduction(torch::kMean)); + expected = torch::tensor(0.3326, torch::kFloat); + ASSERT_TRUE(output.allclose(expected, 1e-04)); + + // label smoothing with target probabilities + target = torch::tensor({{0.8, 0.2}, {0.1, 0.9}}, torch::kFloat); + output = F::cross_entropy( + input, target, F::CrossEntropyFuncOptions().label_smoothing(0.2).reduction(torch::kMean)); + expected = torch::tensor(0.5701, torch::kFloat); + ASSERT_TRUE(output.allclose(expected, 1e-04)); } TEST_F(FunctionalTest, MaxUnpool1d) { diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 23d75efeee21f..927d884709200 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -2315,6 +2315,31 @@ TEST_F(ModulesTest, CrossEntropyLoss) { ASSERT_TRUE( CrossEntropyLoss(CrossEntropyLossOptions().ignore_index(-100).reduction(torch::kMean)) ->forward(input, target).allclose(expected, 1e-04)); + + // label smoothing with class indices + loss = CrossEntropyLoss(CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kMean)); + input = torch::tensor({{3., 1.}, {1., 2.}}, torch::dtype(torch::kFloat).requires_grad(true)); + target = torch::tensor({0, 1}, torch::kLong); + output = loss->forward(input, target); + expected = torch::tensor(0.3326, torch::kFloat); + s = output.sum(); + s.backward(); + + ASSERT_TRUE(output.allclose(expected, 1e-04)); + ASSERT_EQ(input.sizes(), input.grad().sizes()); + + // label smoothing with with target probabilities + loss = CrossEntropyLoss(CrossEntropyLossOptions().label_smoothing(0.2).reduction(torch::kMean)); + input = torch::tensor({{3., 1.}, {1., 2.}}, torch::dtype(torch::kFloat).requires_grad(true)); + target = torch::tensor({{0.8, 0.2}, {0.1, 0.9}}, torch::kFloat); + output = loss->forward(input, target); + expected = torch::tensor(0.5701, torch::kFloat); + s = output.sum(); + s.backward(); + + ASSERT_TRUE(output.allclose(expected, 1e-04)); + ASSERT_EQ(input.sizes(), input.grad().sizes()); + } TEST_F(ModulesTest, CosineSimilarity) { diff --git a/test/test_nn.py b/test/test_nn.py index 7d26246786c6c..bb4dd59be5271 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -17183,6 +17183,78 @@ def test_cross_entropy_loss_one_hot_target(self, device): output_one_hot = m(input, target_one_hot) self.assertEqual(output, output_one_hot) + def test_cross_entropy_label_smoothing_errors(self, device): + N, C = 3, 4 + input_args = [ + (torch.randn((N, C), device=device), torch.arange(0, C, device=device)), + (torch.randn((N, C), device=device), torch.randn(N, C, device=device)) + ] + for input_arg in input_args: + loss = nn.CrossEntropyLoss(label_smoothing=1.2) + with self.assertRaisesRegex(RuntimeError, + r"label_smoothing must be between 0\.0"): + loss(*input_arg) + + def test_cross_entropy_label_smoothing_consistent_index_target_and_probs(self, device): + N, C = 10, 4 + ks = range(5) + reductions = ['none', 'mean', 'sum'] + label_smoothings = [0.05, 0.15] + + for k, reduction, label_smoothing in product(ks, reductions, label_smoothings): + other_dims = [torch.randint(2, 5, size=(1,)).item() for _ in range(k)] + input = torch.randn(N, C, *other_dims, device=device, requires_grad=True) + target = torch.empty(N, *other_dims, dtype=torch.long, device=device).random_(0, C) + + # construct target probablity that should have the same result as label_smoothing + target_proba = F.one_hot(target, num_classes=C) + # Need to put the C dim at index 1. + target_proba = target_proba.permute(0, -1, *range(1, target_proba.dim() - 1)) + target_mask = (target_proba == 1) + target_proba = target_proba.to(dtype=input.dtype) + + # y_k^ls = y_k * (1 - label_smoothing) + label_smoothing / n_classes + # Get one-hot representation of the target. + target_proba.masked_fill_(target_mask, 1 - label_smoothing + label_smoothing / C) + target_proba.masked_fill_(~target_mask, label_smoothing / C) + + loss = nn.CrossEntropyLoss(reduction=reduction) + output_with_prob = loss(input, target_proba) + + loss = nn.CrossEntropyLoss( + reduction=reduction, label_smoothing=label_smoothing) + output_with_index = loss(input, target) + + self.assertEqual(output_with_prob, output_with_index, + rtol=1e-07, atol=1e-05) + + def test_cross_entropy_label_smoothing_with_probs(self, device): + N, C = 10, 4 + ks = range(5) + reductions = ['none', 'mean', 'sum'] + label_smoothings = [0.05, 0.15] + + # Test with k-dimensional loss. + for k, label_smoothing in product(ks, label_smoothings): + other_dims = [torch.randint(2, 5, size=(1,)).item() for _ in range(k)] + input = torch.randn(N, C, *other_dims, device=device, requires_grad=True) + target = F.log_softmax(torch.randn(N, C, *other_dims, device=device), dim=1) + + for reduction in reductions: + # use with label_smoothing + loss = nn.CrossEntropyLoss(reduction=reduction, label_smoothing=label_smoothing) + output_with_smoothing = loss(input, target) + + # manually smoothing target + # class_proba^ls = class_proba * (1 - label_smoothing) + + # label_smoothing / n_classes + target_with_smoothing = target * (1 - label_smoothing) + label_smoothing / C + loss = nn.CrossEntropyLoss(reduction=reduction) + output_with_manual_smoothing = loss(input, target_with_smoothing) + + self.assertEqual(output_with_smoothing, output_with_manual_smoothing) + + def test_softshrink_negative(self, device): input = torch.randn(5, device=device, requires_grad=True) m = torch.nn.Softshrink(-1) diff --git a/torch/csrc/api/include/torch/nn/functional/loss.h b/torch/csrc/api/include/torch/nn/functional/loss.h index ea2f6066ddf15..1fa91ad6deb1f 100644 --- a/torch/csrc/api/include/torch/nn/functional/loss.h +++ b/torch/csrc/api/include/torch/nn/functional/loss.h @@ -824,13 +824,15 @@ inline Tensor cross_entropy( const Tensor& target, const Tensor& weight, int64_t ignore_index, - CrossEntropyFuncOptions::reduction_t reduction) { + CrossEntropyFuncOptions::reduction_t reduction, + double label_smoothing) { return torch::cross_entropy_loss( input, target, weight, enumtype::reduction_get_enum(reduction), - ignore_index); + ignore_index, + label_smoothing); } } // namespace detail #endif /* DOXYGEN_SHOULD_SKIP_THIS */ @@ -855,7 +857,8 @@ inline Tensor cross_entropy( target, options.weight(), options.ignore_index(), - options.reduction()); + options.reduction(), + options.label_smoothing()); } // ============================================================================ diff --git a/torch/csrc/api/include/torch/nn/options/loss.h b/torch/csrc/api/include/torch/nn/options/loss.h index d8ffd15c8660a..1479de571d13e 100644 --- a/torch/csrc/api/include/torch/nn/options/loss.h +++ b/torch/csrc/api/include/torch/nn/options/loss.h @@ -662,6 +662,8 @@ struct TORCH_API CrossEntropyLossOptions { TORCH_ARG(int64_t, ignore_index) = -100; /// Specifies the reduction to apply to the output. Default: Mean TORCH_ARG(reduction_t, reduction) = torch::kMean; + /// Specifies the amount of smoothing when computing the loss. Default: 0.0 + TORCH_ARG(double, label_smoothing) = 0.0; }; namespace functional { diff --git a/torch/csrc/api/src/nn/modules/loss.cpp b/torch/csrc/api/src/nn/modules/loss.cpp index d5d8c687168e8..dda67fe9c728e 100644 --- a/torch/csrc/api/src/nn/modules/loss.cpp +++ b/torch/csrc/api/src/nn/modules/loss.cpp @@ -378,7 +378,8 @@ Tensor CrossEntropyLossImpl::forward( target, weight, options.ignore_index(), - options.reduction()); + options.reduction(), + options.label_smoothing()); } // ============================================================================ diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 52125864000f1..c11e261d9b85f 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -2772,6 +2772,7 @@ def cross_entropy( ignore_index: int = -100, reduce: Optional[bool] = None, reduction: str = "mean", + label_smoothing: float = 0.0, ) -> Tensor: r"""This criterion computes the cross entropy loss between input and target. @@ -2808,6 +2809,10 @@ def cross_entropy( elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` and :attr:`reduce` are in the process of being deprecated, and in the meantime, specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount + of smoothing when computing the loss, where 0.0 means no smoothing. The targets + become a mixture of the original ground truth and a uniform distribution as described in + `Rethinking the Inception Architecture for Computer Vision `__. Default: :math:`0.0`. Examples:: @@ -2834,10 +2839,11 @@ def cross_entropy( ignore_index=ignore_index, reduce=reduce, reduction=reduction, + label_smoothing=label_smoothing, ) if size_average is not None or reduce is not None: reduction = _Reduction.legacy_get_string(size_average, reduce) - return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index) + return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing) def binary_cross_entropy( diff --git a/torch/nn/functional.pyi.in b/torch/nn/functional.pyi.in index 828f8df2185b5..cbd05d7e3dedb 100644 --- a/torch/nn/functional.pyi.in +++ b/torch/nn/functional.pyi.in @@ -239,7 +239,8 @@ def kl_div(input: Tensor, target: Tensor, size_average: Optional[bool] = ..., re def cross_entropy(input: Tensor, target: Tensor, weight: Optional[Tensor] = ..., size_average: Optional[bool] = ..., - ignore_index: int = ..., reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ... + ignore_index: int = ..., reduce: Optional[bool] = ..., reduction: str = ..., + label_smoothing: float = ...) -> Tensor: ... def binary_cross_entropy(input: Tensor, target: Tensor, weight: Optional[Tensor] = ..., diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index af1da83eeef5b..d72c614c88048 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1104,6 +1104,10 @@ class probabilities only when a single class label per minibatch item is too res and :attr:`reduce` are in the process of being deprecated, and in the meantime, specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` + label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount + of smoothing when computing the loss, where 0.0 means no smoothing. The targets + become a mixture of the original ground truth and a uniform distribution as described in + `Rethinking the Inception Architecture for Computer Vision `__. Default: :math:`0.0`. Shape: - Input: :math:`(N, C)` where `C = number of classes`, or @@ -1132,17 +1136,20 @@ class probabilities only when a single class label per minibatch item is too res >>> output = loss(input, target) >>> output.backward() """ - __constants__ = ['ignore_index', 'reduction'] + __constants__ = ['ignore_index', 'reduction', 'label_smoothing'] ignore_index: int + label_smoothing: float def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100, - reduce=None, reduction: str = 'mean') -> None: + reduce=None, reduction: str = 'mean', label_smoothing: float = 0.0) -> None: super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction) self.ignore_index = ignore_index + self.label_smoothing = label_smoothing def forward(self, input: Tensor, target: Tensor) -> Tensor: return F.cross_entropy(input, target, weight=self.weight, - ignore_index=self.ignore_index, reduction=self.reduction) + ignore_index=self.ignore_index, reduction=self.reduction, + label_smoothing=self.label_smoothing) class MultiLabelSoftMarginLoss(_WeightedLoss): diff --git a/torch/onnx/symbolic_opset12.py b/torch/onnx/symbolic_opset12.py index d8f954148a1ee..ab39325709ea9 100644 --- a/torch/onnx/symbolic_opset12.py +++ b/torch/onnx/symbolic_opset12.py @@ -65,7 +65,7 @@ def nll_loss_nd(g, self, target, weight, reduction, ignore_index): return nll_loss(g, self, target, weight, reduction, ignore_index) -def cross_entropy_loss(g, self, target, weight, reduction, ignore_index): +def cross_entropy_loss(g, self, target, weight, reduction, ignore_index, label_smoothing): # none reduction : onnx::Constant[value={0}] # mean reduction : onnx::Constant[value={1}] # sum reduction : onnx::Constant[value={2}] @@ -73,6 +73,10 @@ def cross_entropy_loss(g, self, target, weight, reduction, ignore_index): reduction_vals = ["none", "mean", "sum"] reduction = reduction_vals[reduction] + label_smoothing = sym_help._maybe_get_const(label_smoothing, "f") + if label_smoothing > 0.0: + raise RuntimeError("Unsupported: ONNX does not support label_smoothing") + # in onnx SoftmaxCrossEntropyLoss specification, ignore_index is optional without default value. # therefore we need to set ignore_index attribute even if it is not specified (e.g. ignore_index=-100). ignore_index = sym_help._maybe_get_const(ignore_index, "i") diff --git a/torch/overrides.py b/torch/overrides.py index 09748b982b428..64b18b89eb401 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -677,7 +677,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.nn.functional.cosine_embedding_loss: (lambda input1, input2, target, margin=0, size_average=None, reduce=None, reduction='mean': -1), torch.nn.functional.cross_entropy: (lambda input, target, weight=None, size_average=None, ignore_index=-100, - reduce=None, reduction="mean": -1), + reduce=None, reduction="mean", label_smoothing=0.0: -1), torch.nn.functional.ctc_loss: (lambda log_probs, targets, input_lengths, target_lengths, blank=0, reduction='mean', zero_infinity=False: -1), torch.nn.functional.dropout: lambda input, p=0.5, training=True, inplace=False: -1, diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index e0d09b7ba03fc..73233df8cc5bb 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -4103,7 +4103,8 @@ def nlllossNd_reference(input, target, weight=None, ignore_index=-100, return output -def cross_entropy_loss_prob_target_reference(input, target, weight=None, reduction='mean'): +def cross_entropy_loss_prob_target_reference(input, target, weight=None, reduction='mean', + label_smoothing=0.0): assert input.dim() >= 2 input = torch.log_softmax(input, 1) @@ -4112,6 +4113,10 @@ def cross_entropy_loss_prob_target_reference(input, target, weight=None, reducti weight = torch.ones(C).type_as(input) weight = weight.view(1, C, *(1 for _ in input.shape[2:])) + if label_smoothing > 0.0: + assert label_smoothing <= 1.0 + target = (target * (1 - label_smoothing) + label_smoothing / C) + output = -(input * target * weight).sum(dim=1) if reduction == 'mean': return output.mean() @@ -4120,20 +4125,61 @@ def cross_entropy_loss_prob_target_reference(input, target, weight=None, reducti return output -def cross_entropy_loss_reference(input, target, weight=None, ignore_index=-100, reduction='mean'): +def cross_entropy_loss_indices_target_reference(input, target, weight=None, ignore_index=-100, + reduction='mean', label_smoothing=0.0): + log_softmax_input = torch.log_softmax(input, 1) + nllloss = F.nll_loss( + log_softmax_input, + target, + weight, + ignore_index=ignore_index, + reduction=reduction) + + if label_smoothing == 0.0: + return nllloss + + assert 0.0 < label_smoothing <= 1.0 + + input = torch.log_softmax(input, 1) + C = input.size(1) + if weight is not None: + input = input * weight.view(1, C, *(1 for _ in input.shape[2:])) + + smooth_loss = -torch.sum(input, 1) + + if ignore_index >= 0: + ignore_mask = target == ignore_index + smooth_loss.masked_fill_(ignore_mask, 0.0) + + if reduction == 'mean': + if weight is not None: + # TODO: This code can path can be removed if #61309 is resolved + # loss is normalized by the weights to be consistent with nll_loss_nd + ret = torch.sum(smooth_loss) / weight.gather(0, target.flatten()).sum() + else: + ret = torch.mean(smooth_loss) + elif reduction == 'sum': + ret = torch.sum(smooth_loss) + else: + ret = smooth_loss + + return (1 - label_smoothing) * nllloss + ret * (label_smoothing / C) + + +def cross_entropy_loss_reference(input, target, weight=None, ignore_index=-100, reduction='mean', + label_smoothing=0.0): if input.shape == target.shape: return cross_entropy_loss_prob_target_reference( input, target, weight=weight, - reduction=reduction) + reduction=reduction, + label_smoothing=label_smoothing) else: - return nlllossNd_reference( - torch.log_softmax(input, 1), - target, - weight, - ignore_index=ignore_index, - reduction=reduction) + return cross_entropy_loss_indices_target_reference( + input, target, weight=weight, reduction=reduction, + ignore_index=ignore_index, label_smoothing=label_smoothing + ) def nllloss_reference(input, target, weight=None, ignore_index=-100, @@ -4893,6 +4939,141 @@ def padding3d_circular(input, pad): desc='4d_prob_target', check_bfloat16=False, ), + dict( + fullname='CrossEntropyLoss_2d_prob_target_smoothing_sum_reduction', + constructor=lambda *args, **kwargs: nn.CrossEntropyLoss(reduction='sum', + label_smoothing=0.15), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kSum)', + input_size=(5, 3), + target_fn=lambda: torch.rand(5, 3).softmax(dim=1), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_2d_prob_target_smoothing', + constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15)', + input_size=(5, 3), + target_fn=lambda: torch.rand(5, 3).softmax(dim=1), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_2d_prob_target_smoothing_weight', + constructor_args_fn=lambda: (torch.rand(3).abs(),), + constructor=lambda weight: nn.CrossEntropyLoss(weight, label_smoothing=0.15), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).weight(torch::rand(3).abs())', + input_size=(5, 3), + target_fn=lambda: torch.rand(5, 3).softmax(dim=1), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), weight=get_weight(m), label_smoothing=0.15), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_3d_prob_target_smoothing_sum_reduction', + constructor=lambda *args: nn.CrossEntropyLoss(reduction='sum', + label_smoothing=0.15), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kSum)', + input_size=(5, 3, 4), + target_fn=lambda: torch.rand(5, 3, 4).softmax(dim=1), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_3d_prob_target_smoothing', + constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15)', + input_size=(5, 3, 4), + target_fn=lambda: torch.rand(5, 3, 4).softmax(dim=1), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_3d_indices_target_smoothing', + constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15)', + input_size=(2, 3, 5), + target_fn=lambda: torch.rand(2, 5).mul(3).floor().long(), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_3d_indices_target_smoothing_ignore_index', + constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15, ignore_index=1), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).ignore_index(1)', + input_size=(2, 3, 5), + target_fn=lambda: torch.rand(2, 5).mul(3).floor().long(), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=1), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_3d_indices_target_smoothing_sum_reduction', + constructor=lambda *args: nn.CrossEntropyLoss(reduction='sum', label_smoothing=0.15), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kSum)', + input_size=(2, 3, 5), + target_fn=lambda: torch.rand(2, 5).mul(3).floor().long(), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_3d_indices_target_smoothing_sum_reduction_ignore_index', + constructor=lambda *args: nn.CrossEntropyLoss(reduction='sum', label_smoothing=0.15, + ignore_index=1), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kSum).ignore_index(1)', + input_size=(2, 3, 5), + target_fn=lambda: torch.rand(2, 5).mul(3).floor().long(), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=1), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_2d_indices_target_smoothing', + constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15)', + input_size=(15, 10), + target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_2d_indices_target_smoothing_sum_reduction', + constructor=lambda *args: nn.CrossEntropyLoss(reduction='sum', label_smoothing=0.15), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kSum)', + input_size=(15, 10), + target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_2d_indices_target_smoothing_ignore_index', + constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15, ignore_index=3), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).ignore_index(3)', + input_size=(15, 10), + target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=3), + check_bfloat16=False, + ), + dict( + fullname='CrossEntropyLoss_2d_indices_target_smoothing_weight', + constructor_args_fn=lambda: (torch.rand(10).abs(),), + constructor=lambda weight: nn.CrossEntropyLoss(weight, label_smoothing=0.15), + cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).weight(torch::rand(10).abs())', + input_size=(15, 10), + target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(), + reference_fn=lambda i, t, m: + loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), weight=get_weight(m), label_smoothing=0.15), + check_bfloat16=False, + ), dict( module_name='CrossEntropyLoss', constructor_args_fn=lambda: (torch.rand(3),), From a836d83957a526a69b3d3de7094e005aa988eb51 Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Mon, 30 Aug 2021 04:38:00 -0700 Subject: [PATCH 339/530] [nnc] Fixed warning due to implicit parameter conversion (#64117) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64117 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D30616945 Pulled By: navahgar fbshipit-source-id: eaf69232ac4a684ab5f97a54a514971655f86ef3 --- torch/csrc/jit/tensorexpr/expr.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index fbbea12387920..4947bfdc36be9 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -319,11 +319,16 @@ class TORCH_API BufHandle : public ExprHandle { // object. For example: VarHandle x('x'); ExprHandle x2 = x; class TORCH_API VarHandle : public ExprHandle { public: + // Creates an empty VarHandle whose base Var is set to nullptr. VarHandle() : ExprHandle() {} + explicit VarHandle(Dtype dtype) : ExprHandle(Var::make(dtype)) {} + VarHandle(const std::string& name_hint, Dtype dtype) : ExprHandle(Var::make(name_hint, dtype)) {} + explicit VarHandle(VarPtr node) : ExprHandle(node) {} + VarPtr node() const { return static_to(ExprHandle::node()); } From 093a12aaa984bd4a7768bb306157067f7c95b0ec Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Mon, 30 Aug 2021 04:38:00 -0700 Subject: [PATCH 340/530] [nnc] Updated internal asserts to include more detailed error messages (#64118) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64118 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D30616944 Pulled By: navahgar fbshipit-source-id: 35289696cc0e7faa01599304243b86f0febc6daf --- torch/csrc/jit/tensorexpr/kernel.cpp | 36 +++++++++++++++++++++++----- torch/csrc/jit/tensorexpr/kernel.h | 2 ++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 0d0d19e004981..e4136d85c0a50 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -34,7 +34,10 @@ static bool checkTypes(const ScalarType highType, const int typeConstraints) { } // assume JIT not supporting complex and qint yet - TORCH_INTERNAL_ASSERT((typeConstraints & (kQintTypes | kComplexTypes)) == 0); + TORCH_INTERNAL_ASSERT( + (typeConstraints & (kQintTypes | kComplexTypes)) == 0, + buildErrorMessage( + "Qint and Complex types are not supported in the fuser.")); return false; } @@ -63,6 +66,13 @@ namespace torch { namespace jit { namespace tensorexpr { +std::string buildErrorMessage(const std::string& s) { + // TODO: Update this generic error message to include details regarding + // turning off the fuser. + static const std::string generic_error_message = ""; + return s + " " + generic_error_message; +} + static int te_cuda_pointwise_loop_levels = -1; static int te_cuda_pointwise_block_count = -1; static int te_cuda_pointwise_block_size = -1; @@ -164,13 +174,18 @@ c10::optional pickDeviceType(const std::shared_ptr& graph) { for (auto const& input : node->inputs()) { if (auto tt = input->type()->cast()) { if (auto inputDevice = tt->device()) { - TORCH_INTERNAL_ASSERT(!device || *device == *inputDevice); + TORCH_INTERNAL_ASSERT( + !device || *device == *inputDevice, + buildErrorMessage( + "Different devices specified for inputs to the fuser.")); device = inputDevice; } } } } - TORCH_INTERNAL_ASSERT(device); + TORCH_INTERNAL_ASSERT( + device, + buildErrorMessage("Could not find device in fuser graph inputs.")); return device; } @@ -356,7 +371,9 @@ bool matmulIsSupported(const torch::jit::Node* node) { void annotateInputShapes( const std::shared_ptr& graph, const std::vector>& example_inputs) { - TORCH_INTERNAL_ASSERT(graph->inputs().size() == example_inputs.size()); + TORCH_INTERNAL_ASSERT( + graph->inputs().size() == example_inputs.size(), + buildErrorMessage("Given inputs do not match the fuser graph inputs.")); for (size_t idx = 0; idx < example_inputs.size(); idx++) { if (auto t = example_inputs[idx]) { auto concrete_tensor_type = tensorTypeInCurrentExecutionContext(*t); @@ -820,7 +837,10 @@ std::vector TensorExprKernel::inferSizesForValue( throw std::runtime_error("Empty input list is passed to aten::cat"); } - TORCH_INTERNAL_ASSERT(n->input(1)->node()->kind() == prim::Constant); + TORCH_INTERNAL_ASSERT( + n->input(1)->node()->kind() == prim::Constant, + buildErrorMessage( + "aten::cat op's dim input is not constant in fuser.")); int64_t dim = n->input(1)->node()->i(attr::value); auto shape = sizesForValue(inputs[0]); auto norm_dim = normalizeAndCheckIndex(dim, shape.size()); @@ -2689,7 +2709,11 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { blockSize = default_uint8_blocksize; } std::vector loops = l.getLoopStmtsFor(buf); - TORCH_INTERNAL_ASSERT(!loops.empty(), "loops should not be empty"); + TORCH_INTERNAL_ASSERT( + !loops.empty(), + buildErrorMessage( + "No loops found for the buffer " + buf->name_hint() + + " in the fuser.")); ForPtr flattened = nullptr; LoopNest::flatten(loops, &flattened); assert(flattened); diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index 4b92b020fce31..bdb9802ccdc3a 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -300,6 +300,8 @@ TORCH_API void annotateInputShapes( TORCH_API std::shared_ptr removeUnusedSelfArgument( const std::shared_ptr& graph); +TORCH_API std::string buildErrorMessage(const std::string& s); + } // namespace tensorexpr } // namespace jit } // namespace torch From 7701ea48be276f5058cc6247ec53fb6a5789445a Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Mon, 30 Aug 2021 07:49:27 -0700 Subject: [PATCH 341/530] remove one more distributed test (#64108) Summary: Follow up on https://github.com/pytorch/pytorch/issues/62896. one more place we should remove distributed test Pull Request resolved: https://github.com/pytorch/pytorch/pull/64108 Reviewed By: janeyx99, soulitzer Differential Revision: D30614062 Pulled By: walterddr fbshipit-source-id: 6576415dc2d481d65419da19c5aa0afc37a86cff --- .jenkins/pytorch/test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 4eb1b35253c91..5014f603e4bb9 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -517,7 +517,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || "$ test_without_numpy install_torchvision test_python_shard1 - test_distributed test_aten elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 || "${SHARD_NUMBER}" == 2 ]]; then install_torchvision From 82174330d0bae4e2356295e16e261052f1d0ff8c Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Mon, 30 Aug 2021 07:54:11 -0700 Subject: [PATCH 342/530] [DataLoader2] Adding Messages, Protocols, Loop wrappers (#63882) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63882 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30627452 Pulled By: VitalyFedyunin fbshipit-source-id: 561ea2df07f3572e04401171946154024126387b --- test/test_dataloader.py | 42 ++++- torch/utils/data/__init__.py | 3 +- torch/utils/data/communication/__init__.py | 5 + torch/utils/data/communication/eventloop.py | 41 +++++ torch/utils/data/communication/iter.py | 173 ++++++++++++++++++++ torch/utils/data/communication/messages.py | 75 +++++++++ torch/utils/data/communication/protocol.py | 159 ++++++++++++++++++ torch/utils/data/communication/queue.py | 50 ++++++ torch/utils/data/dataloader_experimental.py | 124 ++++++++++---- 9 files changed, 638 insertions(+), 34 deletions(-) create mode 100644 torch/utils/data/communication/__init__.py create mode 100644 torch/utils/data/communication/eventloop.py create mode 100644 torch/utils/data/communication/iter.py create mode 100644 torch/utils/data/communication/messages.py create mode 100644 torch/utils/data/communication/protocol.py create mode 100644 torch/utils/data/communication/queue.py diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 65554632fd30f..c768246ff477c 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -22,6 +22,7 @@ IterableDataset, Subset, TensorDataset, + communication, _utils ) from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL @@ -32,6 +33,7 @@ IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest, load_tests, TEST_WITH_TSAN, IS_SANDCASTLE) + try: import psutil HAS_PSUTIL = True @@ -730,7 +732,7 @@ def __getitem__(self, idx): # Should be used as worker_init_fn with TestWorkerInfoDataset. # See _test_get_worker_info below for usage. -def test_worker_info_init_fn(worker_id): +def _test_worker_info_init_fn(worker_id): worker_info = torch.utils.data.get_worker_info() assert worker_id == worker_info.id, "worker_init_fn and worker_info should have consistent id" assert worker_id < worker_info.num_workers, "worker_init_fn and worker_info should have valid id" @@ -760,7 +762,7 @@ def _test_get_worker_info(): dataset = TestWorkerInfoDataset(6, batch_size, num_workers) dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, - worker_init_fn=test_worker_info_init_fn) + worker_init_fn=_test_worker_info_init_fn) it = iter(dataloader) data = [] for d in it: @@ -769,7 +771,7 @@ def _test_get_worker_info(): data = torch.cat(data, 0) for d in data: # each `d` is a [worker_id, worker_pid] pair, which is set in - # test_worker_info_init_fn + # _test_worker_info_init_fn assert d[1] == worker_pids[d[0]] # get_worker_info returns None in main proc after data loading assert torch.utils.data.get_worker_info() is None @@ -1963,11 +1965,41 @@ def test_excessive_thread_creation_warning(self): class TestDataLoader2(TestCase): @skipIfNoDill def test_basics(self): - dp = IterableWrapper(list(range(10))) + # TODO(VitalyFedyunin): This test will start breaking if we remove guaranteed order + # of traversing workers + dp = IterableWrapper(list(range(1000))) dl = DataLoader(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2) dl2 = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2) - self.assertEquals(list(dl), list(dl2)) + dl2_threading = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2, parallelism_mode='thread') + self.assertEqual(list(dl), list(dl2)) + self.assertEqual(list(dl), list(dl2_threading)) + + + +@unittest.skipIf( + TEST_WITH_TSAN, + "Fails with TSAN with the following error: starting new threads after multi-threaded " + "fork is not supported. Dying (set die_after_fork=0 to override)") +class TestDataLoader2_EventLoop(TestCase): + @skipIfNoDill + def test_basic_threading(self): + def clean_me(process, req_queue, res_queue): + req_queue.put(communication.messages.TerminateRequest()) + _ = res_queue.get() + process.join() + + it = list(range(100)) + numbers_dp = IterableWrapper(it) + (process, req_queue, res_queue, _thread_local_datapipe) = communication.eventloop.SpawnThreadForDataPipeline(numbers_dp) + + process.start() + local_datapipe = communication.iter.QueueWrapper( + communication.protocol.IterDataPipeQueueProtocolClient(req_queue, res_queue)) + + actual = list(local_datapipe) + clean_me(process, req_queue, res_queue) + self.assertEqual(list(range(100)), actual) class StringDataset(Dataset): def __init__(self): diff --git a/torch/utils/data/__init__.py b/torch/utils/data/__init__.py index 0af9e6193af3d..ac0c763fe3854 100644 --- a/torch/utils/data/__init__.py +++ b/torch/utils/data/__init__.py @@ -35,7 +35,7 @@ runtime_validation_disabled, ) from torch.utils.data.dataloader_experimental import DataLoader2 - +from torch.utils.data import communication __all__ = ['BatchSampler', 'ChainDataset', @@ -56,6 +56,7 @@ 'WeightedRandomSampler', '_DatasetKind', 'argument_validation', + 'communication', 'functional_datapipe', 'get_worker_info', 'guaranteed_datapipes_determinism', diff --git a/torch/utils/data/communication/__init__.py b/torch/utils/data/communication/__init__.py new file mode 100644 index 0000000000000..88a395e2bddcf --- /dev/null +++ b/torch/utils/data/communication/__init__.py @@ -0,0 +1,5 @@ +from . import eventloop +from . import iter +from . import messages +from . import protocol +from . import queue diff --git a/torch/utils/data/communication/eventloop.py b/torch/utils/data/communication/eventloop.py new file mode 100644 index 0000000000000..75c44c5192313 --- /dev/null +++ b/torch/utils/data/communication/eventloop.py @@ -0,0 +1,41 @@ +import torch +import threading +import pickle + +from torch.utils.data import IterDataPipe, communication + + +def DataPipeToQueuesLoop(source_datapipe, req_queue, res_queue): + if isinstance(source_datapipe, IterDataPipe): + pipe_type = communication.iter + protocol_type = communication.protocol.IterDataPipeQueueProtocolServer + else: + raise Exception('Only supports IterDataPipe, got', source_datapipe) + # pipe_type = communication.map + # protocol_type = communication.protocol.MapDataPipeQueueProtocolServer + + torch.set_num_threads(1) + for _ in pipe_type.DataPipeBehindQueues(source_datapipe, protocol_type(req_queue, res_queue), blocking_request_get=True): + pass + + +def SpawnProcessForDataPipeline(multiprocessing_ctx, datapipe): + req_queue = multiprocessing_ctx.Queue() + res_queue = multiprocessing_ctx.Queue() + process = multiprocessing_ctx.Process( + target=DataPipeToQueuesLoop, args=(datapipe, req_queue, res_queue)) + return process, req_queue, res_queue + + +def SpawnThreadForDataPipeline(datapipe): + req_queue = communication.queue.ThreadingQueue() + res_queue = communication.queue.ThreadingQueue() + + try: + new_datapipe = pickle.loads(pickle.dumps(datapipe)) + except Exception as e: + raise Exception('Unable to pickle DataPipe to make thread local copy', e) + + process = threading.Thread(target=DataPipeToQueuesLoop, args=( + new_datapipe, req_queue, res_queue), daemon=True) + return process, req_queue, res_queue, new_datapipe diff --git a/torch/utils/data/communication/iter.py b/torch/utils/data/communication/iter.py new file mode 100644 index 0000000000000..594a466295a5f --- /dev/null +++ b/torch/utils/data/communication/iter.py @@ -0,0 +1,173 @@ +import time +import types + +from torch.utils.data import IterDataPipe, communication + +DEFAULT_NON_BLOCKING_SLEEP = 0.001 + + +def default_not_available_hook(): + time.sleep(DEFAULT_NON_BLOCKING_SLEEP) + + +class NotAvailable(Exception): + pass + + +class InvalidStateResetRequired(Exception): + """ + Returned by DataPipe when it is expecting to get reset request, + for example RouterDataPipe expecting all workers to request reset' + """ + pass + + +class NonBlocking(IterDataPipe): + not_available_hook = default_not_available_hook + + def __iter__(self): + self.reset_iterator() + return self + + def __next__(self): + while True: + try: + return self.nonblocking_next() + except StopIteration: + raise StopIteration + except NotAvailable: + if NonBlocking.not_available_hook is not None: + NonBlocking.not_available_hook() + + def nonblocking_next(self): + raise NotImplementedError( + "nonblocking_next is not implemented for %s" % self.__class__) + + def reset_iterator(self): + raise NotImplementedError( + "reset_iterator is not implemented for %s" % self.__class__) + + @staticmethod + def register_not_available_hook(hook_function): + NonBlocking.not_available_hook = hook_function + + +def EnsureNonBlockingDataPipe(validated_datapipe): + if not isinstance(validated_datapipe, IterDataPipe): + raise Exception('Not Iterable DataPipe ' + + str(validated_datapipe.__class__)) + if isinstance(validated_datapipe, NonBlocking): + return validated_datapipe + if not hasattr(validated_datapipe, '_as_iterator'): + validated_datapipe._as_iterator = None # type: ignore[attr-defined] + if not hasattr(validated_datapipe, 'nonblocking_next'): + def nonblocking_next(self): + if self._as_iterator is None: + self._as_iterator = iter(self) + return next(self._as_iterator) + validated_datapipe.nonblocking_next = types.MethodType( # type: ignore[attr-defined] + nonblocking_next, validated_datapipe) + if not hasattr(validated_datapipe, 'reset_iterator'): + def reset_iterator(self): + self._as_iterator = None + validated_datapipe.reset_iterator = types.MethodType( # type: ignore[attr-defined] + reset_iterator, validated_datapipe) + return validated_datapipe + + +def DataPipeBehindQueues(source_datapipe, protocol, full_stop=False, blocking_request_get=False): + """ + Indefinitely iterates over req_queue and passing values from source_datapipe to res_queue + If raise_stop is true, raises exception when StopIteration received from the source_datapipe + """ + if not isinstance(protocol, communication.protocol.IterDataPipeQueueProtocolServer): + raise Exception('Expecting IterDataPipeQueueProtocolServer, got', protocol) + source_datapipe = EnsureNonBlockingDataPipe(source_datapipe) + forever = True + while forever: + + try: + # Non-blocking call is Extremely slow here for python.mp, need to figureout good workaround + request = protocol.get_new_request(block=blocking_request_get) + except communication.protocol.EmptyQueue: + yield True + continue + + if isinstance(request, communication.messages.ResetIteratorRequest): + source_datapipe.reset_iterator() + protocol.response_reset() + + elif isinstance(request, communication.messages.TerminateRequest): + forever = False + protocol.response_terminate() + + elif isinstance(request, communication.messages.GetNextRequest): + while forever: + try: + value = source_datapipe.nonblocking_next() + except NotAvailable: + yield True + continue + except StopIteration: + protocol.response_stop() + if full_stop: + forever = False + else: + yield True + break + except InvalidStateResetRequired: + protocol.response_invalid() + if full_stop: + forever = False + else: + yield True + break + protocol.response_next(value) + yield True # Returns control + break + else: + raise Exception('Unrecognized type of request received', request) + + +class QueueWrapper(NonBlocking): + """ + Creates iter.DataPipe which reads data from the DataLoader.Queue + """ + + def __init__(self, protocol, response_wait_time=0.00001): + if not isinstance(protocol, communication.protocol.IterDataPipeQueueProtocolClient): + raise Exception('Got', protocol) + + self.protocol = protocol + self.counter = 0 + self._stop_iteration = False + self._response_wait_time = response_wait_time + + def reset_iterator(self): + self._stop_iteration = False + self.counter = 0 + self.protocol.request_reset() + while True: + try: + self.protocol.get_response_reset() + break + except communication.protocol.EmptyQueue: + if NonBlocking.not_available_hook is not None: + NonBlocking.not_available_hook() + + def nonblocking_next(self): + if self._stop_iteration: + raise Exception( + '`next` or `nonblocking_next` called after receiving StopIteration') + if self.protocol.can_take_request(): + self.protocol.request_next() + try: + response = self.protocol.get_response_next(block=True, timeout=self._response_wait_time) + except communication.protocol.EmptyQueue: + raise NotAvailable + if isinstance(response, communication.messages.StopIterationResponse): + self._stop_iteration = True + raise StopIteration + if isinstance(response, communication.messages.InvalidStateResponse): + raise NotAvailable + return response.value diff --git a/torch/utils/data/communication/messages.py b/torch/utils/data/communication/messages.py new file mode 100644 index 0000000000000..449cf23cfc01c --- /dev/null +++ b/torch/utils/data/communication/messages.py @@ -0,0 +1,75 @@ +class DataLoaderQueueMessage(object): + pass + + +class Request(DataLoaderQueueMessage): + pass + + +class Response(DataLoaderQueueMessage): + pass + + +class ResetIteratorRequest(Request): + pass + + +class ResetIteratorResponse(Response): + pass + + +class TerminateRequest(Request): + pass + + +class TerminateResponse(Response): + pass + + +class LenRequest(Request): + pass + + +class LenResponse(Response): + __slots__ = ('len') + + def __init__(self, len): + self.len = len + + +class GetItemRequest(Request): + __slots__ = ('key') + + def __init__(self, key): + self.key = key + + +class GetItemResponse(Response): + __slots__ = ('key', 'value') + + def __init__(self, key, value): + self.key = key + self.value = value + + +class GetNextRequest(Request): + pass + + +class GetNextResponse(Response): + __slots__ = ('value') + + def __init__(self, value): + self.value = value + + +class StopIterationResponse(Response): + pass + + +class InvalidStateResponse(Response): + """ + Returned by DataPipe when it is expecting to get reset request, + for example RouterDataPipe expecting all workers to request reset' + """ + pass diff --git a/torch/utils/data/communication/protocol.py b/torch/utils/data/communication/protocol.py new file mode 100644 index 0000000000000..68ff335714d3f --- /dev/null +++ b/torch/utils/data/communication/protocol.py @@ -0,0 +1,159 @@ +from torch.utils.data import communication + + +class Protocol(object): + __slots__ = ('request_queue', 'response_queue') + + def __init__(self, request_queue, response_queue): + self.request_queue = request_queue + self.response_queue = response_queue + + +class ProtocolClient(Protocol): + """ + ProtocolClient takes charge of putting requests into req_queue and returning results from res_queue. + """ + _req_sent = None + + def __init__(self, request_queue, response_queue): + self.request_queue = request_queue + self.response_queue = response_queue + self._req_sent = None + + def can_take_request(self): + return self._req_sent is None + + def waiting_for_response(self): + return self._req_sent is not None + + def request_sent(self, request=True): + if not self.can_take_request(): + raise Exception('Protocol only supports one request in the Queue') + self._req_sent = request + + def request_served(self, result=None): + if not self.waiting_for_response(): + raise Exception( + 'Expected no peding requests, but something got served', result) + self._req_sent = None + + +class ProtocolServer(Protocol): + """ + ProtocolServer takes charge of getting requests from req_queue and fetching data from source datapipe. + """ + _req_received = None + + def __init__(self, request_queue, response_queue): + self.request_queue = request_queue + self.response_queue = response_queue + self._req_received = None + + def have_pending_request(self): + return self._req_received is not None + + def get_new_request(self, block=False): + if self.have_pending_request(): + raise Exception( + 'Trying to get next request, while having one unserved') + try: + response = self.request_queue.get(block=block) + except Exception as e: # TODO: Catch only timeout exceptions + raise EmptyQueue('queue is empty') + self._req_received = response + return response + + # TODO: Validate supported requests + + def response_reset(self): + if not self.have_pending_request(): + raise Exception("Attempting to reply with pending request") + if not isinstance(self._req_received, communication.messages.ResetIteratorRequest): + raise Exception( + "Replaying with reset status to other type of message") + self.response_queue.put(communication.messages.ResetIteratorResponse()) + self._req_received = None + + def response_next(self, value): + if not self.have_pending_request(): + raise Exception("Attempting to reply with pending request") + self.response_queue.put(communication.messages.GetNextResponse(value)) + self._req_received = None + + def response_stop(self): + if not self.have_pending_request(): + raise Exception("Attempting to reply with pending request") + self.response_queue.put(communication.messages.StopIterationResponse()) + self._req_received = None + + def response_invalid(self): + if not self.have_pending_request(): + raise Exception("Attempting to reply with pending request") + self.response_queue.put(communication.messages.InvalidStateResponse()) + self._req_received = None + + def response_terminate(self): + if not self.have_pending_request(): + raise Exception("Attempting to reply with pending request") + if not isinstance(self._req_received, communication.messages.TerminateRequest): + raise Exception( + "Replaying with terminate status to other type of message") + self.response_queue.put(communication.messages.TerminateResponse()) + self._req_received = None + + +class MapDataPipeQueueProtocolClient(ProtocolClient): + pass + + +class MapDataPipeQueueProtocolServer(ProtocolServer): + pass + + +class EmptyQueue(Exception): + pass + + +class IterDataPipeQueueProtocolServer(ProtocolServer): + pass + + +class IterDataPipeQueueProtocolClient(ProtocolClient): + def request_reset(self): + if not self.can_take_request(): + raise Exception( + 'Can not reset while we are still waiting response for previous request') + request = communication.messages.ResetIteratorRequest() + self.request_queue.put(request) + self.request_sent(request) + + def request_next(self): + if not self.can_take_request(): + raise Exception( + 'Can not request next item while we are still waiting response for previous request') + request = communication.messages.GetNextRequest() + self.request_queue.put(request) + self.request_sent(request) + + def get_response_reset(self, block=False): + try: + response = self.response_queue.get(block=block) + except Exception as e: # TODO: Catch only timeout exceptions + raise EmptyQueue('queue is empty') + self.request_served(response) + + if not isinstance(response, communication.messages.ResetIteratorResponse): + raise Exception('Invalid response received') + + def get_response_next(self, block=False, timeout=None): + if not self.waiting_for_response(): + raise Exception( + 'Can not expect any response without submitted request') + try: + response = self.response_queue.get(block=block, timeout=timeout) + except Exception as e: # TODO: Catch only timeout exceptions + raise EmptyQueue('queue is empty') + self.request_served(response) + + # TODO(VitalyFedyunin): Add possible response types validation here + return response diff --git a/torch/utils/data/communication/queue.py b/torch/utils/data/communication/queue.py new file mode 100644 index 0000000000000..7717697b0f75d --- /dev/null +++ b/torch/utils/data/communication/queue.py @@ -0,0 +1,50 @@ +import threading +import time + +class LocalQueue(): + ops = 0 + stored = 0 + uid = 0 + empty = 0 + + def __init__(self, name='unnamed'): + self.items = [] + self.name = name + self.uid = LocalQueue.uid + LocalQueue.uid += 1 + + def put(self, item, block=True): + LocalQueue.ops += 1 + LocalQueue.stored += 1 + self.items.append(item) + + def get(self, block=True, timeout=0): + # TODO(VitalyFedyunin): Add support of block and timeout arguments + LocalQueue.ops += 1 + if not len(self.items): + LocalQueue.empty += 1 + raise Exception('LocalQueue is empty') + LocalQueue.stored -= 1 + return self.items.pop() + + +class ThreadingQueue(): + def __init__(self, name='unnamed'): + self.lock = threading.Lock() + self.items = [] + self.name = name + + def put(self, item, block=True): + with self.lock: + self.items.append(item) + + def get(self, block=True, timeout=0): + # TODO(VitalyFedyunin): Add support of block and timeout arguments + while True: + with self.lock: + if len(self.items) > 0: + return self.items.pop() + if not block: + raise Exception("Not available") + # TODO(VitalyFedyunin): Figure out what to do if nothing in the queue + time.sleep(0.000001) diff --git a/torch/utils/data/dataloader_experimental.py b/torch/utils/data/dataloader_experimental.py index ea085298bf00f..a74c75cd75122 100644 --- a/torch/utils/data/dataloader_experimental.py +++ b/torch/utils/data/dataloader_experimental.py @@ -1,10 +1,60 @@ import functools +import time + +from typing import Any, List import torch.utils.data.backward_compatibility -from torch.utils.data import DataLoader, IterDataPipe + +import torch.utils.data.sharding +from torch.utils.data import DataLoader, IterDataPipe, communication from torch.utils.data.datapipes.iter import IterableWrapper +class _ThreadingDataLoader2: + + def __init__(self, datapipe, num_workers=0, collate_fn=None): + self.threads = [] + self.datapipes = [] + self.collate_fn = collate_fn + for worker_id in range(num_workers): + (thread, req_queue, res_queue, thread_localdatapipe) = communication.eventloop.SpawnThreadForDataPipeline(datapipe) + torch.utils.data.sharding.apply_sharding(thread_localdatapipe, num_workers, worker_id) + thread.start() + self.threads.append((thread, req_queue, res_queue)) + local_datapipe = communication.iter.QueueWrapper( + communication.protocol.IterDataPipeQueueProtocolClient(req_queue, res_queue)) + self.datapipes.append(local_datapipe) + + def __iter__(self): + not_available = False + forever = True + exclude_datapipes: List[Any] = [] + while len(exclude_datapipes) < len(self.datapipes): + for dp in self.datapipes: + if dp not in exclude_datapipes: + try: + value = dp.nonblocking_next() + yield value + except StopIteration: + exclude_datapipes.append(dp) + except communication.iter.NotAvailable: + not_available = True + if not_available: + time.sleep(0.001) + + def __del__(self): + self._cleanup_all_threads() + + def _cleanup_all_threads(self): + def clean_me(thread, req_queue, res_queue): + req_queue.put(communication.messages.TerminateRequest()) + _ = res_queue.get() + thread.join() + + for thread, req_queue, res_queue in self.threads: + clean_me(thread, req_queue, res_queue) + + class DataLoader2: def __new__(cls, dataset, @@ -21,15 +71,17 @@ def __new__(cls, *, prefetch_factor=2, persistent_workers=False, - batch_outside_worker=False): + batch_outside_worker=False, + parallelism_mode='mp'): if isinstance(dataset, IterDataPipe): - datapipe = dataset + data_loader: Any = None if batch_sampler is not None: raise Exception( - 'batch_sampler is not yet supported for DataPipes') + 'batch_sampler is not yet supported by DataPipes') if sampler is not None: raise Exception( - 'sampler is not yet supported for DataPipes') + 'sampler is not yet supported by DataPipes') + datapipe = dataset if shuffle: datapipe = datapipe.shuffle() if batch_outside_worker and pin_memory: @@ -40,30 +92,43 @@ def __new__(cls, datapipe = datapipe.batch(batch_size, drop_last=drop_last) if collate_fn is None: collate_fn = torch.utils.data._utils.collate.default_collate + if parallelism_mode == 'mp' or num_workers == 0: + def sharding_worker_init_fn(worker_init_fn, worker_id): + if worker_init_fn is not None: + worker_init_fn(worker_id) + torch.utils.data.backward_compatibility.worker_init_fn( + worker_id) - def sharding_worker_init_fn(worker_init_fn, worker_id): - if worker_init_fn is not None: - worker_init_fn(worker_id) - torch.utils.data.backward_compatibility.worker_init_fn( - worker_id) - - my_worker_init_fn = functools.partial( - sharding_worker_init_fn, worker_init_fn) - - data_loader = DataLoader(datapipe, - batch_size=None, # Replaced by .batch DataPipe - shuffle=False, # Replaced by .shuffle DataPipe - sampler=None, - batch_sampler=None, - num_workers=num_workers, - collate_fn=collate_fn, - pin_memory=pin_memory, - drop_last=False, # Replaced by .batch DataPipe - timeout=timeout, - worker_init_fn=my_worker_init_fn, - prefetch_factor=prefetch_factor, - persistent_workers=persistent_workers) + my_worker_init_fn = functools.partial( + sharding_worker_init_fn, worker_init_fn) + data_loader = DataLoader(datapipe, + batch_size=None, # Replaced by .batch DataPipe + shuffle=False, # Replaced by .shuffle DataPipe + sampler=None, + batch_sampler=None, + num_workers=num_workers, + collate_fn=collate_fn, + pin_memory=pin_memory, + drop_last=False, # Replaced by .batch DataPipe + timeout=timeout, + worker_init_fn=my_worker_init_fn, + prefetch_factor=prefetch_factor, + persistent_workers=persistent_workers) + elif parallelism_mode == 'thread': + if collate_fn is not None and not batch_outside_worker: + datapipe = datapipe.map(collate_fn) + if pin_memory: + raise Exception( + 'pin_memory is not yet supported by DataPipes with Threading') + if worker_init_fn is not None: + raise Exception( + 'worker_init_fn is not yet supported by DataPipes with Threading') + data_loader = _ThreadingDataLoader2(datapipe, + num_workers=num_workers, + collate_fn=collate_fn) + else: + raise Exception('Unsupported parallelism mode', parallelism_mode) if not batch_outside_worker: return data_loader else: @@ -72,8 +137,11 @@ def sharding_worker_init_fn(worker_init_fn, worker_id): datapipe = IterableWrapper(data_loader).batch( batch_size, drop_last=drop_last).map(collate_fn) return datapipe - else: + if parallelism_mode != 'thread': + raise Exception( + 'thread parallelism mode is not supported for old DataSets') + return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, From f79df24859125fca8e8af799fb2be44dc3293752 Mon Sep 17 00:00:00 2001 From: Facebook Community Bot Date: Mon, 30 Aug 2021 08:27:36 -0700 Subject: [PATCH 343/530] Automated submodule update: FBGEMM (#64149) Summary: This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM). New submodule commit: https://github.com/pytorch/FBGEMM/commit/f6dfed87a10ed5729bce83e98788e437a94cbda0 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64149 Test Plan: Ensure that CI jobs succeed on GitHub before landing. Reviewed By: jspark1105 Differential Revision: D30632209 fbshipit-source-id: aa1cebaf50169c3a93dbcb994fa47e29d6b6a0d7 --- third_party/fbgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/fbgemm b/third_party/fbgemm index 9939bac9defab..9f4078a7bb92b 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 9939bac9defab4d18fb7fdded7e1a76c0c2b49b4 +Subproject commit 9f4078a7bb92b88cdcfc913398ffade158160c91 From dc4fd3bddab51970e53060f8cb1a3c316a28b042 Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Mon, 30 Aug 2021 09:26:20 -0700 Subject: [PATCH 344/530] [MicroBench] Added a micro benchmark for a signed log1p kernel. (#64032) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64032 Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D30579198 Pulled By: navahgar fbshipit-source-id: a53d68225fba768b26491d14b535f8f2dcf50c0e --- benchmarks/cpp/tensorexpr/CMakeLists.txt | 1 + .../cpp/tensorexpr/bench_signed_log1p.cpp | 120 ++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp diff --git a/benchmarks/cpp/tensorexpr/CMakeLists.txt b/benchmarks/cpp/tensorexpr/CMakeLists.txt index 789c81fcf6526..a06502eb29053 100644 --- a/benchmarks/cpp/tensorexpr/CMakeLists.txt +++ b/benchmarks/cpp/tensorexpr/CMakeLists.txt @@ -6,6 +6,7 @@ add_executable( bench_batchnorm.cpp bench_concat.cpp bench_compile.cpp + bench_signed_log1p.cpp bench_fuser_overhead.cpp bench_gemm.cpp bench_parallel.cpp diff --git a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp new file mode 100644 index 0000000000000..44781f58c9027 --- /dev/null +++ b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp @@ -0,0 +1,120 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace torch::jit::tensorexpr; + +namespace { + +class SignedLog1pBench : public benchmark::Fixture { + public: + void SetUp(const benchmark::State& state) override { + input_size_ = {state.range(0), state.range(1)}; + input_size_int_ = {state.range(0), state.range(1)}; + input_ = torch::rand(input_size_); + ref_ = signedLog1p(input_); + } + + void TearDown(benchmark::State& state) override { + TORCH_CHECK(at::allclose(ref_, output_)); + state.counters["GB/s"] = benchmark::Counter( + uint64_t(state.iterations()) * 2 * output_.nbytes(), + benchmark::Counter::kIsRate); + } + + at::Tensor signedLog1p(const at::Tensor& inp) { + auto sign = at::sign(inp); + auto log1p = at::log1p(at::abs(inp)); + return sign * log1p; + } + + void runATen(benchmark::State& state) { + for (auto _ : state) { + output_ = signedLog1p(input_); + } + } + + void runNNC(benchmark::State& state) { + Placeholder input_ph( + "input", kFloat, {input_size_int_[0], input_size_int_[1]}); + Tensor abs_result = Compute( + "aten_abs", + {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + [&](const VarHandle& m, const VarHandle& n) { + return abs(input_ph.load(m, n)); + }); + Tensor log1p_result = Compute( + "aten_log1p", + {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + [&](const VarHandle& m, const VarHandle& n) { + return log1p(abs_result.load(m, n)); + }); + Tensor sign = Compute( + "aten_sign", + {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + [&](const VarHandle& m, const VarHandle& n) { + return CompareSelect::make( + input_ph.load(m, n), + ExprHandle(0.0f), + ExprHandle(-1), + ExprHandle(1), + kLT); + }); + Tensor output = Compute( + "aten_mul", + {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + [&](const VarHandle& m, const VarHandle& n) { + return sign.load(m, n) * log1p_result.load(m, n); + }); + LoopNest nest({output}, {abs_result, log1p_result, sign, output}); + GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt()); + nest.inlineIntermediateBufs(true); + nest.prepareForCodegen(); + nest.simplify(); + nest.vectorizeInnerLoops(); + nest.simplify(); + GRAPH_DEBUG("Final stmt: ", *nest.root_stmt()); + + // StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); + std::vector buf_args; + buf_args.push_back(input_ph); + buf_args.push_back(output); + LLVMCodeGen cg(nest.root_stmt(), buf_args); + + std::vector call_args; + for (auto _ : state) { + output_ = at::empty_like(ref_); + call_args.clear(); + call_args.push_back(input_.data_ptr()); + call_args.push_back(output_.data_ptr()); + cg.call(call_args); + } + } + + private: + std::vector input_size_; + std::vector input_size_int_; + at::Tensor input_; + at::Tensor output_; + at::Tensor ref_; +}; + +} // namespace + +BENCHMARK_DEFINE_F(SignedLog1pBench, ATen)(benchmark::State& state) { + runATen(state); +} + +BENCHMARK_DEFINE_F(SignedLog1pBench, NNC)(benchmark::State& state) { + runNNC(state); +} + +BENCHMARK_REGISTER_F(SignedLog1pBench, ATen)->Args({10, 1467}); + +BENCHMARK_REGISTER_F(SignedLog1pBench, NNC)->Args({10, 1467}); From 9777887f0ebe6403e19205eb1cafb81fe24606fb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 30 Aug 2021 09:34:24 -0700 Subject: [PATCH 345/530] [PyTorch] Reduce copies/refcount bumps in BytecodeDeserializer::parseMethods (#63961) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63961 Saw a report that this function was slow and was doing unexplained vector copies. First pass to remove a bunch of copying. ghstack-source-id: 136760976 Test Plan: Pixel 3 before: https://our.intern.facebook.com/intern/aibench/details/461850118893980 after: https://www.internalfb.com/intern/aibench/details/48965886029524 MilanBoard failed to return data from simpleperf Reviewed By: dhruvbird Differential Revision: D30544551 fbshipit-source-id: 0e2b5471a10c0803d52c923e6fb5625f5542b99d --- torch/csrc/jit/mobile/import.cpp | 44 +++++++++++++------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index d2865d071ea72..6a548103f6965 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -85,8 +85,8 @@ using caffe2::serialize::ReadAdapterInterface; OpCode parseOpCode(const char* str); -IValue expect_field( - IValue tup, +const IValue& expect_field( + const IValue& tup, const std::string& expected_name, size_t entry) { auto row = tup.toTuple()->elements().at(entry).toTuple(); @@ -325,8 +325,7 @@ void BytecodeDeserializer::parseMethods( " but the model version is ", model_version); - bool has_debug_handles = debug_handles.has_value(); - if (has_debug_handles) { + if (debug_handles) { TORCH_CHECK( debug_handles->size() == vals.size(), "The numbers of bytecode values and debug info values do not match."); @@ -340,12 +339,11 @@ void BytecodeDeserializer::parseMethods( const auto& element = vals[i]; const auto& m_tuple = element.toTuple()->elements(); const std::string& function_name = m_tuple[0].toStringRef(); - IValue codeTable = m_tuple[1]; - auto schemaTable = // older files do not store function schema + const IValue& codeTable = m_tuple[1]; + const IValue* schemaTable = // older files do not store function schema (model_version > 0x4L || (model_version == 0x4L && m_tuple.size() >= 3)) - ? at::optional{m_tuple[2]} - : at::nullopt; - + ? &m_tuple[2] + : nullptr; auto function = std::make_unique(c10::QualifiedName(function_name)); @@ -369,8 +367,8 @@ void BytecodeDeserializer::parseMethods( expect_field(codeTable, "register_size", BYTECODE_INDEX_REGISTER_SIZE) .toInt(); - std::vector debug_handles_list; - if (has_debug_handles) { + c10::List debug_handles_list; + if (debug_handles) { const auto& debug_handles_element = (*debug_handles)[i]; const auto& debug_handles_m_tuple = debug_handles_element.toTuple()->elements(); @@ -379,22 +377,21 @@ void BytecodeDeserializer::parseMethods( TORCH_CHECK( debug_info_function_name == function_name, "The function names in the bytecode table and the debug info table do not match."); - IValue debug_handles_table = debug_handles_m_tuple[1]; + const IValue& debug_handles_table = debug_handles_m_tuple[1]; debug_handles_list = (expect_field( debug_handles_table, "function_debug_handles", BYTECODE_INDEX_MODULE_DEBUG_HANDLES) .toTuple() ->elements())[0] - .toList() - .vec(); + .toIntList(); TORCH_CHECK( debug_handles_list.size() == ins_list.size(), "The numbers of instructions and debug handles strings do not match."); } for (const auto j : c10::irange(ins_list.size())) { - auto ins_item = ins_list[j].toTuple()->elements(); + const auto& ins_item = ins_list[j].toTuple()->elements(); TORCH_CHECK( ins_item.size() == 3, "There should be three parts in an instruction. The function name is ", @@ -402,8 +399,8 @@ void BytecodeDeserializer::parseMethods( OpCode op_code = parseOpCode(ins_item[0].toString()->string().c_str()); int X = ins_item[1].toInt(); int N = ins_item[2].toInt(); - if (has_debug_handles) { - int64_t debug_handle = debug_handles_list[j].toInt(); + if (debug_handles) { + int64_t debug_handle = debug_handles_list[j]; function->append_instruction(op_code, X, N, debug_handle); } else { function->append_instruction(op_code, X, N); @@ -451,14 +448,9 @@ void BytecodeDeserializer::parseMethods( const auto& type = resolveTypeName( (expect_field(argTable, "type", BYTECODE_INDEX_ARGUMENT_TYPE)) .toStringRef()); - auto default_value = expect_field( - argTable, - "default_value", - BYTECODE_INDEX_ARGUMENT_DEFAULT_VALUE) - .toIValue(); - auto arg = - c10::Argument(name, type, c10::nullopt /*N*/, default_value); - args.emplace_back(std::move(arg)); + const IValue& default_value = expect_field( + argTable, "default_value", BYTECODE_INDEX_ARGUMENT_DEFAULT_VALUE); + args.emplace_back(name, type, c10::nullopt /*N*/, default_value); } return args; }; @@ -522,7 +514,7 @@ mobile::Module BytecodeDeserializer::deserialize( // being a Tuple (int, table), and the integer stands for the bytecode version // number. The rest of the elements are the same as before. // - auto bvals = readArchive("bytecode", mcu).toTuple()->elements(); + auto bvals = std::move(*readArchive("bytecode", mcu).toTuple()).elements(); c10::optional> debug_handles; if (reader_->hasRecord("mobile_debug_handles.pkl")) { From 16ecdbbaa2a24debae1c80b441bbea945d61d02d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 30 Aug 2021 09:34:24 -0700 Subject: [PATCH 346/530] [PyTorch] Fix missing move in unpickler (#63974) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63974 Saw some time spent in this for model loading, no reason not to move here. ghstack-source-id: 136760979 Test Plan: Re-profile model loading on devserver; IValue copy ctor time has gone down Reviewed By: dhruvbird Differential Revision: D30548923 fbshipit-source-id: 42000f2e18582762b43353cca10ae094833de3b3 --- torch/csrc/jit/serialization/unpickler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index f944387465446..b521dc88a12ba 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -318,7 +318,7 @@ PickleOpCode Unpickler::readInstruction() { tuple->elements().reserve(stack_.size() - start); auto start_it = stack_.begin() + start; for (auto it = start_it; it != stack_.end(); ++it) { - tuple->elements().emplace_back(*it); + tuple->elements().emplace_back(std::move(*it)); } stack_.erase(start_it, stack_.end()); stack_.emplace_back(std::move(tuple)); From e24c3644d87acfb0359cb14bde4afcd62a9255ba Mon Sep 17 00:00:00 2001 From: Harut Movsisyan Date: Mon, 30 Aug 2021 09:36:46 -0700 Subject: [PATCH 347/530] [Static Runtime] aten::cat out version when it is not being replaced by prim::VarConcat (#64157) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64157 UseVariadicCat optimization is not applied to aten::cat if list input to the op can not be moved to the position before op (https://fburl.com/diffusion/l6kweimu). For these cases we will need out version for SR. Test Plan: Confirm out variant is called: ``` > buck run //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1 ``` Reviewed By: d1jang Differential Revision: D30598574 fbshipit-source-id: 74cfa8291dc8b5df4aef58adfb1ab2a16f10d90a --- benchmarks/static_runtime/test_scripts.h | 11 ++++++++++ .../static_runtime/test_static_runtime.cc | 20 +++++++++++++++++++ torch/csrc/jit/runtime/static/ops.cpp | 20 +++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 004319ca550f9..7fdb113c4ed45 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -790,3 +790,14 @@ const auto linalg_norm_ord_str = R"JIT( def forward(self, a: Tensor, ord: str, dim: List[int], keepdim: bool, dtype: int): return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone() )JIT"; + +const std::string cat_script = R"IR( + graph(%a: Tensor, %b: Tensor, %dim: int): + %ten_list: Tensor[] = prim::ListConstruct(%a, %b) + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=1]() + %3 : int = prim::Constant[value=1]() + %ten_list2 : Tensor[] = aten::slice(%ten_list, %1, %2, %3) + %ret: Tensor = aten::cat(%ten_list2, %dim) + return (%ret) +)IR"; diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index f6e3680e0be38..b7201baa1e182 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -1291,3 +1292,22 @@ TEST(StaticRuntime, IndividualOps_LinalgNorm_StringOrd) { std::vector args1{b, "fro", dim, true, dtype}; testStaticRuntime(linalg_norm_ord_str, args0, args1); } + +TEST(StaticRuntime, IndividualOps_Cat) { + auto graph = std::make_shared(); + std::unordered_map vmap; + parseIR(cat_script, graph.get(), vmap); + torch::jit::StaticModule smodule(graph); + ASSERT_TRUE(getNodeWithKind(smodule, "aten::cat")); + + auto a = at::randn({2, 4}); + auto b = at::randn({3, 4}); + std::vector args0{a, b, 0}; + + testStaticRuntime(cat_script, args0); + + auto c = at::randn({3, 4}); + auto d = at::randn({3, 5}); + std::vector args1{c, d, 1}; + testStaticRuntime(cat_script, args0, args1); +} diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 12339301e0433..cf91f33a28c26 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -1713,6 +1713,26 @@ REGISTER_OPERATOR_FUNCTOR(aten::linalg_norm, aten_linalg_norm, [](Node* n) -> SR }; }); +REGISTER_OPERATOR_FUNCTOR(aten::cat, aten_cat, [](Node* n) -> SROperator { + if (!n->matches( + torch::schema("aten::cat(Tensor[] tensors, int dim=0) -> Tensor"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto inputs = p_node->Input(0).toTensorVector(); + const auto dim = p_node->Input(1).toInt(); + if (p_node->Output(0).isNone()) { + p_node->Output(0) = at::native::_cat_cpu(inputs, dim); + return; + } + + auto& output = p_node->Output(0).toTensor(); + fastResizeToZero(output); + at::native::_cat_out_cpu(inputs, dim, output); + }; +}); + namespace { void check_cat_no_zero_dim(const std::vector& tensors) { From 93d2e5090f9823102debab3845117c8e8208995b Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 30 Aug 2021 09:43:25 -0700 Subject: [PATCH 348/530] Improve performance of index_select by avoiding item (#63008) Summary: Partially fixes https://github.com/pytorch/pytorch/issues/61788 From a CUDA perspective: item already pulls all Tensor content onto the host (albeit one-by-one), which incurs very expensive memory transfers. This way we'll do it all at once. From a CPU perspective: item has a lot of overhead as a native function in comparison to simply using a pointer. Overall there's still lots of performance gains to be had, but this is a small change that should take us into a more usable landscape. This doesn't land a separate benchmark, but I postulate that's not necessary to decide on the benefit of this (we'll also see if it shows up indirectly), however is still a good follow-up item. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63008 Reviewed By: zou3519 Differential Revision: D30211160 Pulled By: cpuhrsch fbshipit-source-id: 70b752be5df51afc66b5aa1c77135d1205520cdd --- aten/src/ATen/native/TensorShape.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 2545ec4c1e035..1dc2a270c44c2 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1209,12 +1209,15 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index) if (dim < sparse_dim) { - auto dim_indices = indices[dim]; + auto cpu_dim_indices = indices[dim].to(c10::kCPU).contiguous(); + int64_t* cpu_dim_indices_ptr = cpu_dim_indices.data_ptr(); + auto cpu_index = index.to(c10::kCPU).contiguous(); + int64_t* cpu_index_ptr = cpu_index.data_ptr(); std::vector zindices; std::vector iindices; int64_t new_nnz = 0; - for (const auto i : c10::irange(new_sizes[dim])) { - auto idx = index[i].item(); + for (int64_t i = 0; i < new_sizes[dim]; i++) { + int64_t idx = cpu_index_ptr[i]; if (idx < -size || idx >= size) { TORCH_CHECK_INDEX(false, "index_select(): index contains ", idx, " that is out of range for tensor of size ", self.sizes(), " at dimension ", dim); @@ -1222,8 +1225,8 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index) if (idx < 0) { idx += size; } - for (const auto j : c10::irange(nnz)) { - auto jdx = dim_indices[j].item(); + for (int64_t j = 0; j < nnz; j++) { + int64_t jdx = cpu_dim_indices_ptr[j]; if (idx == jdx) { new_nnz++; iindices.push_back(i); From ac99d63f83ceaee4a95e7baa8a52fba09903d00b Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Mon, 30 Aug 2021 11:46:14 -0700 Subject: [PATCH 349/530] [jit] Make operation call accept Stack& instead Stack* (#63414) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63414 Misuse of raw pointer in here where stack is never nullable. ghstack-source-id: 136938318 Test Plan: compiles. Imported from OSS Reviewed By: ejguan Differential Revision: D30375410 fbshipit-source-id: 9d65b620bb76d90d886c800f54308520095d58ee --- aten/src/ATen/core/dispatch/Dispatcher.h | 4 + aten/src/ATen/core/stack.h | 40 +- test/cpp/jit/test_alias_analysis.cpp | 14 +- test/cpp/jit/test_custom_operators.cpp | 14 +- test/cpp/jit/test_misc.cpp | 4 +- test/cpp/jit/test_schema_matching.cpp | 4 +- test/cpp/jit/test_utils.cpp | 2 +- test/custom_operator/test_custom_ops.cpp | 2 +- torch/csrc/autograd/record_function_ops.cpp | 2 +- .../rpc/request_callback_no_python.cpp | 2 +- torch/csrc/jit/codegen/cuda/interface.cpp | 6 +- torch/csrc/jit/codegen/fuser/fallback.cpp | 2 +- torch/csrc/jit/mobile/function.cpp | 2 +- torch/csrc/jit/passes/batch_mm.cpp | 16 +- .../csrc/jit/passes/constant_propagation.cpp | 2 +- torch/csrc/jit/passes/decompose_ops.cpp | 4 +- .../csrc/jit/passes/frozen_ops_to_mkldnn.cpp | 18 +- torch/csrc/jit/passes/shape_analysis.cpp | 2 +- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 4 +- .../passes/utils/check_alias_annotation.cpp | 2 +- torch/csrc/jit/python/pybind_utils.h | 2 +- torch/csrc/jit/python/python_interpreter.cpp | 4 +- torch/csrc/jit/runtime/graph_executor.cpp | 12 +- torch/csrc/jit/runtime/interpreter.cpp | 4 +- torch/csrc/jit/runtime/register_c10_ops.cpp | 2 +- torch/csrc/jit/runtime/register_cuda_ops.cpp | 24 +- .../jit/runtime/register_distributed_ops.cpp | 30 +- torch/csrc/jit/runtime/register_ops_utils.cpp | 52 +-- torch/csrc/jit/runtime/register_ops_utils.h | 138 +++---- torch/csrc/jit/runtime/register_prim_ops.cpp | 342 +++++++++--------- .../jit/runtime/register_prim_ops_fulljit.cpp | 62 ++-- .../csrc/jit/runtime/register_special_ops.cpp | 34 +- torch/csrc/jit/runtime/static/fusion.cpp | 6 +- torch/csrc/jit/runtime/static/impl.cpp | 2 +- 34 files changed, 451 insertions(+), 409 deletions(-) diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index fd32a72c75102..cfa6b740f8877 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -344,6 +344,10 @@ class TORCH_API OperatorHandle { c10::Dispatcher::singleton().callBoxed(*this, stack); } + void callBoxed(Stack& stack) const { + callBoxed(&stack); + } + void redispatchBoxed(DispatchKeySet ks, Stack* stack) const { c10::Dispatcher::singleton().redispatchBoxed(*this, ks, stack); } diff --git a/aten/src/ATen/core/stack.h b/aten/src/ATen/core/stack.h index ffc0e8fd9037d..021e8a02104f2 100644 --- a/aten/src/ATen/core/stack.h +++ b/aten/src/ATen/core/stack.h @@ -1,6 +1,9 @@ #pragma once +#include + #include +#include // TODO move this to c10 namespace @@ -9,7 +12,42 @@ namespace jit { using c10::IValue; using Stack = std::vector; -using Operation = std::function; + +class Operation { + template + using accepts = std::is_constructible, F&&>; + + public: + template ::value, int> = 0> + C10_DEPRECATED_MESSAGE("Please use void(Stack&) to register operator instead.") + Operation(F&& raw): op_([raw = std::forward(raw)](Stack& stack) { + raw(&stack); + }) {} + + template ::value && + !std::is_same, Operation>::value, int> = 0> + Operation(F&& op): op_(std::forward(op)) {} + + Operation(std::nullptr_t) noexcept {} + + explicit operator bool() const noexcept { + return op_ ? true : false; + } + + void operator()(Stack& stack) { + op_(stack); + } + + template + T* target() noexcept { + return op_.target(); + } + + private: + std::function op_; +}; // An operation with N inputs and M outputs pops the last N inputs off // the stack and pushes its M inputs onto the stack diff --git a/test/cpp/jit/test_alias_analysis.cpp b/test/cpp/jit/test_alias_analysis.cpp index 1bd556a8980b7..eef529d8d5d33 100644 --- a/test/cpp/jit/test_alias_analysis.cpp +++ b/test/cpp/jit/test_alias_analysis.cpp @@ -1,11 +1,11 @@ #include #include +#include +#include #include -#include "torch/csrc/jit/frontend/ir_emitter.h" -#include "torch/csrc/jit/ir/alias_analysis.h" -#include "torch/csrc/jit/runtime/custom_operator.h" -#include "torch/csrc/utils/memory.h" +#include +#include namespace torch { namespace jit { @@ -484,7 +484,7 @@ TEST(AliasAnalysisTest, SafeToChangeAliasingRelationship) { TEST(WriteTrackingTest, Basic) { RegisterOperators reg({Operator( "prim::creates_alias(Tensor(a) x) -> Tensor(a)", - [](Stack* s) {}, + [](Stack&) {}, aliasAnalysisFromSchema())}); const auto creates_alias = Symbol::fromQualString("prim::creates_alias"); auto graph = std::make_shared(); @@ -949,11 +949,11 @@ TEST(WildcardsTest, Basic) { RegisterOperators reg( {Operator( "prim::returns_wildcard(Tensor a) -> Tensor(*)", - [](Stack* stack) {}, + [](Stack&) {}, aliasAnalysisFromSchema()), Operator( "prim::writes(Tensor(z!) a) -> Tensor(a)", - [](Stack* stack) {}, + [](Stack&) {}, aliasAnalysisFromSchema())}); const auto returns_wildcard = Symbol::fromQualString("prim::returns_wildcard"); diff --git a/test/cpp/jit/test_custom_operators.cpp b/test/cpp/jit/test_custom_operators.cpp index a34ca33672c7b..39be82ea23430 100644 --- a/test/cpp/jit/test_custom_operators.cpp +++ b/test/cpp/jit/test_custom_operators.cpp @@ -31,7 +31,7 @@ TEST(CustomOperatorTest, InferredSchema) { Stack stack; push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); + op->getOperation()(stack); at::Tensor output; pop(stack, output); @@ -61,7 +61,7 @@ TEST(CustomOperatorTest, ExplicitSchema) { Stack stack; push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); + op->getOperation()(stack); at::Tensor output; pop(stack, output); @@ -109,7 +109,7 @@ TEST(CustomOperatorTest, ListParameters) { c10::List>( {c10::complex(2.4, -5.5), c10::complex(-1.3, 2)})); push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); + op->getOperation()(stack); c10::List output; pop(stack, output); @@ -140,7 +140,7 @@ TEST(CustomOperatorTest, ListParameters2) { Stack stack; push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); + op->getOperation()(stack); c10::List output; pop(stack, output); @@ -204,7 +204,7 @@ TEST(TestCustomOperator, OperatorGeneratorUndeclared) { torch::jit::RegisterOperators reg({OperatorGenerator( TORCH_SELECTIVE_NAME_IN_SCHEMA( op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double a; at::Tensor b; @@ -223,7 +223,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) { torch::jit::RegisterOperators reg({OperatorGenerator( TORCH_SELECTIVE_NAME_IN_SCHEMA( op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double a; at::Tensor b; @@ -249,7 +249,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) { Stack stack; push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); + op->getOperation()(stack); at::Tensor output; pop(stack, output); diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 9f8a732f550f4..305d36a476213 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -1493,11 +1493,11 @@ TEST(NoneSchemaMatchTest, Basic) { RegisterOperators reg({ Operator( "prim::test_none() -> int?", - [](Stack* stack) { push(stack, IValue()); }, + [](Stack& stack) { push(stack, IValue()); }, aliasAnalysisFromSchema()), Operator( "prim::is_none(int? a) -> bool", - [](Stack* stack) { + [](Stack& stack) { IValue a = pop(stack); if (a.isNone()) { push(stack, true); diff --git a/test/cpp/jit/test_schema_matching.cpp b/test/cpp/jit/test_schema_matching.cpp index 31d332b718f53..c56d0bc28fe99 100644 --- a/test/cpp/jit/test_schema_matching.cpp +++ b/test/cpp/jit/test_schema_matching.cpp @@ -15,7 +15,7 @@ TEST(SchemaMatchingTest, VarType) { RegisterOperators reg({ Operator( "aten::test_vartype(t[] a, t b) -> (t)", - [](Stack* stack) { + [](Stack& stack) { c10::List list; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double a; @@ -54,7 +54,7 @@ TEST(SchemaMatchingTest, VarType2) { RegisterOperators reg({ Operator( "aten::test_vartype2(t a, t[] b) -> (t[])", - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double a; c10::List list; diff --git a/test/cpp/jit/test_utils.cpp b/test/cpp/jit/test_utils.cpp index f2fb9e1fb0606..8da101e99bbdf 100644 --- a/test/cpp/jit/test_utils.cpp +++ b/test/cpp/jit/test_utils.cpp @@ -273,7 +273,7 @@ RegisterOperators reg({ // because it always produces empty Tensors. Operator( "prim::MakeTestTensor() -> Tensor", - [](Stack* stack) { push(stack, at::Tensor()); }, + [](Stack& stack) { push(stack, at::Tensor()); }, aliasAnalysisFromSchema()), }); } // namespace diff --git a/test/custom_operator/test_custom_ops.cpp b/test/custom_operator/test_custom_ops.cpp index 7c6a187df1465..ec22568c5a3ea 100644 --- a/test/custom_operator/test_custom_ops.cpp +++ b/test/custom_operator/test_custom_ops.cpp @@ -30,7 +30,7 @@ Result get_operator_from_registry_and_execute(const char* op_name, Args&&... arg torch::jit::Stack stack; torch::jit::push(stack, std::forward(args)...); - op->getOperation()(&stack); + op->getOperation()(stack); TORCH_INTERNAL_ASSERT(1 == stack.size()); return torch::jit::pop(stack).to(); diff --git a/torch/csrc/autograd/record_function_ops.cpp b/torch/csrc/autograd/record_function_ops.cpp index 7e621f9e8b62e..9650c354c5868 100644 --- a/torch/csrc/autograd/record_function_ops.cpp +++ b/torch/csrc/autograd/record_function_ops.cpp @@ -79,7 +79,7 @@ c10::AliasAnalysisKind aliasAnalysisFromSchema() { jit::RegisterOperators reg_fut_ops({ jit::Operator( "profiler::_call_end_callbacks_on_jit_fut(Tensor x, Future(t) y) -> Future(t)", - [](jit::Stack* stack) { + [](jit::Stack& stack) { // Pop inputs, which should be a future and a tensor auto fut = jit::pop(stack).toFuture(); auto tensor = jit::pop(stack).toTensor(); diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp index 07d5c61e0c53c..5eada8d573f2f 100644 --- a/torch/csrc/distributed/rpc/request_callback_no_python.cpp +++ b/torch/csrc/distributed/rpc/request_callback_no_python.cpp @@ -582,7 +582,7 @@ c10::intrusive_ptr RequestCallbackNoPython::runJitOperator( std::vector streams) const { c10::MultiStreamGuard guard(streams); try { - op.getOperation()(&stack); + op.getOperation()(stack); } catch (const std::exception&) { return asFuture(std::current_exception()); } diff --git a/torch/csrc/jit/codegen/cuda/interface.cpp b/torch/csrc/jit/codegen/cuda/interface.cpp index 009ae21dad6d0..cf8f3787229ce 100644 --- a/torch/csrc/jit/codegen/cuda/interface.cpp +++ b/torch/csrc/jit/codegen/cuda/interface.cpp @@ -182,8 +182,8 @@ RegisterOperators reg_fusion({ Operator( prim::CudaFusionGroup, [](const Node* node) -> Operation { - return [node](Stack* stack) { - fuser::cuda::runFusionGroup(node, *stack); + return [node](Stack& stack) { + fuser::cuda::runFusionGroup(node, stack); }; }, aliasAnalysisSpecialCase()), @@ -196,7 +196,7 @@ RegisterOperators reg_guard({ // if we would ever return refined tensor, which would change aliasing // analysis, we should update aliasdb pass. [](const Node* node) -> Operation { - return [node](Stack* stack) { + return [node](Stack& stack) { // TODO: check latency here!!!! std::vector types = node->tys(attr::types); const auto num_inputs = types.size(); diff --git a/torch/csrc/jit/codegen/fuser/fallback.cpp b/torch/csrc/jit/codegen/fuser/fallback.cpp index 59fe7e6f4fd25..60a5d72f3c439 100644 --- a/torch/csrc/jit/codegen/fuser/fallback.cpp +++ b/torch/csrc/jit/codegen/fuser/fallback.cpp @@ -26,7 +26,7 @@ RegisterOperators reg_fused_operators({Operator( [](const Node* node) -> Operation { int64_t dim = node->i(attr::dim); int64_t num_inputs = node->inputs().size(); - return [dim, num_inputs](Stack* stack) { + return [dim, num_inputs](Stack& stack) { auto result = at::cat( fmap( last(stack, num_inputs), diff --git a/torch/csrc/jit/mobile/function.cpp b/torch/csrc/jit/mobile/function.cpp index 0775a550d2a79..127bd5f9418d4 100644 --- a/torch/csrc/jit/mobile/function.cpp +++ b/torch/csrc/jit/mobile/function.cpp @@ -67,7 +67,7 @@ bool Function::append_operator( auto jit_op = findOperatorFor(opname); std::vector args; if (jit_op) { - fn = [jit_op](Stack& stack) { jit_op->getOperation()(&stack); }; + fn = [jit_op](Stack& stack) { jit_op->getOperation()(stack); }; args = jit_op->schema().arguments(); } else { auto op = c10::Dispatcher::singleton().findSchema(opname_c10); diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp index 815a1bc0ea649..944e27805cf18 100644 --- a/torch/csrc/jit/passes/batch_mm.cpp +++ b/torch/csrc/jit/passes/batch_mm.cpp @@ -109,11 +109,11 @@ bool shape_is_fast_for_reduce(const at::Tensor& lhs, const at::Tensor& rhs) { RegisterOperators mm_tree_reduction_reg({Operator( "prim::MMTreeReduce(...) -> Tensor", - [](Stack* stack) { + [](Stack& stack) { auto num_inputs = pop(stack).toInt(); std::vector inputs; inputs.reserve(num_inputs); - for (auto it = stack->end() - num_inputs; it != stack->end(); ++it) { + for (auto it = stack.end() - num_inputs; it != stack.end(); ++it) { inputs.push_back(std::move(*it).toTensor()); } drop(stack, num_inputs); @@ -320,11 +320,11 @@ RegisterOperators mm_batch_side_reg({Operator( [](const Node* node) -> Operation { size_t num_other_side_inputs = node->inputs().size() - 1; Side single_side = static_cast(node->i(Symbol::attr("side"))); - return [num_other_side_inputs, single_side](Stack* stack) { + return [num_other_side_inputs, single_side](Stack& stack) { at::Tensor side_input; std::vector other_side_inputs; other_side_inputs.reserve(num_other_side_inputs); - for (auto it = stack->end() - num_other_side_inputs; it != stack->end(); + for (auto it = stack.end() - num_other_side_inputs; it != stack.end(); ++it) { other_side_inputs.push_back(std::move(*it).toTensor()); } @@ -343,18 +343,18 @@ RegisterOperators mm_batch_side_reg({Operator( mm_out, num_other_side_inputs, /*dim=*/single_side == Side::LHS ? 1 : 0); - stack->insert( - stack->end(), + stack.insert( + stack.end(), std::make_move_iterator(outputs.begin()), std::make_move_iterator(outputs.end())); } else { if (single_side == Side::LHS) { for (at::Tensor& other : other_side_inputs) { - stack->emplace_back(side_input.mm(other)); + stack.emplace_back(side_input.mm(other)); } } else { for (at::Tensor& other : other_side_inputs) { - stack->emplace_back(other.mm(side_input)); + stack.emplace_back(other.mm(side_input)); } } } diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index a7f831abd88f6..3a28eaeba46e6 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -78,7 +78,7 @@ c10::optional> runNodeIfInputsAreConstant( try { auto op = n->getOperation(); - op(&stack); + op(stack); } catch (...) { return c10::nullopt; } diff --git a/torch/csrc/jit/passes/decompose_ops.cpp b/torch/csrc/jit/passes/decompose_ops.cpp index 7f935a1c1cbd5..0706c9c14ae98 100644 --- a/torch/csrc/jit/passes/decompose_ops.cpp +++ b/torch/csrc/jit/passes/decompose_ops.cpp @@ -59,7 +59,7 @@ bool isDecomposableNorm(Node* normalize_op) { RegisterOperators reg_ops( {Operator( "aten::_ncf_unsqueeze(Tensor(a) self, int ndim) -> Tensor(a)", - [](Stack* stack) { + [](Stack& stack) { const int64_t ndim = pop(stack).toInt(); auto self = pop(stack).toTensor(); c10::SmallVector sizes(ndim, 1); @@ -70,7 +70,7 @@ RegisterOperators reg_ops( aliasAnalysisFromSchema()), Operator( "aten::_ncf_view(Tensor(a) self, int[] input_shape, int normalized_ndim) -> Tensor(a)", - [](Stack* stack) { + [](Stack& stack) { const int64_t normalized_ndim = pop(stack).toInt(); auto input_shape = pop(stack).toIntList(); auto self = pop(stack).toTensor(); diff --git a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp index 6d218af06e34c..542e136280520 100644 --- a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp +++ b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp @@ -235,7 +235,7 @@ void InplaceMKLDNNSubgraph(std::shared_ptr graph) { Operation createUnaryOp( std::function aten_op, bool inplace = false) { - return [aten_op, inplace](Stack* stack) { + return [aten_op, inplace](Stack& stack) { auto a = pop(stack).toTensor(); c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset); // we cast `a` to an `ideep::tensor`, so we can get at its descriptor @@ -275,7 +275,7 @@ Operation createUnaryOp( }; } -void MKLDNNLayerNormOp(Stack* stack, bool inplace) { +void MKLDNNLayerNormOp(Stack& stack, bool inplace) { c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset); // enable_cudnn not used @@ -303,7 +303,7 @@ void MKLDNNLayerNormOp(Stack* stack, bool inplace) { }; Operation BroadOp(const Node* node) { - return [](Stack* stack) { + return [](Stack& stack) { auto b = pop(stack).toTensor(); auto a = pop(stack).toTensor(); auto b_size = b.sizes(); @@ -471,17 +471,17 @@ const RegisterOperators BroadOpReg({ const RegisterOperators MKLDNNLayerNormOpReg({ torch::jit::Operator( "prim::MKLDNNLayerNorm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor", - [](Stack* stack) { MKLDNNLayerNormOp(stack, false); }, + [](Stack& stack) { MKLDNNLayerNormOp(stack, false); }, AliasAnalysisKind::FROM_SCHEMA), torch::jit::Operator( "prim::MKLDNNLayerNorm_(Tensor(a!) input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor(a!)", - [](Stack* stack) { MKLDNNLayerNormOp(stack, true); }, + [](Stack& stack) { MKLDNNLayerNormOp(stack, true); }, AliasAnalysisKind::FROM_SCHEMA), }); Operation ConstantMKLDNNTensorOp(const Node* node) { const auto& t = node->t(attr::value); - return [t](Stack* stack) { + return [t](Stack& stack) { push(stack, t); return 0; }; @@ -509,7 +509,7 @@ jit::RegisterOperators reg_fut_ops({ // XXX: this follows the schema convention of conv2d/conv3d, not // aten::mkldnn_convolution, which is different for some reason! "prim::mkldnn_convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor", - [](jit::Stack* stack) { + [](jit::Stack& stack) { int64_t groups = pop(stack).toInt(); auto dilation = pop(stack).toIntVector(); auto padding = pop(stack).toIntVector(); @@ -558,7 +558,7 @@ jit::RegisterOperators reg_fut_ops({ // in default bindings jit::Operator( "prim::MKLDNNScalarMul(Tensor self, Scalar other) -> Tensor", - [](jit::Stack* stack) { + [](jit::Stack& stack) { c10::impl::ExcludeDispatchKeyGuard edkg( c10::autograd_dispatch_keyset); float other = pop(stack).toScalar().toFloat(); @@ -576,7 +576,7 @@ jit::RegisterOperators reg_fut_ops({ aliasAnalysisFromSchema()), jit::Operator( "prim::MKLDNNScalarMul_(Tensor(a!) self, Scalar other) -> Tensor(a!)", - [](jit::Stack* stack) { + [](jit::Stack& stack) { c10::impl::ExcludeDispatchKeyGuard edkg( c10::autograd_dispatch_keyset); float other = pop(stack).toScalar().toFloat(); diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 47cd30b3d43ac..5e13829a8ce6d 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -410,7 +410,7 @@ class ShapePropagator { // is to uncover any mistakes we could make when editing this code, // and eventually it shouldn't matter, because this phase should be // preceded by schema checking. - op(&stack); + op(stack); AT_ASSERT(stack.size() == node->outputs().size()); for (const auto i : c10::irange(stack.size())) { diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 833c338578616..1d5128c7e71e2 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -1299,9 +1299,9 @@ void FuseTensorExprs( Operation createTensorExprOp(const Node* node) { auto kernel = std::make_shared(node->g(attr::Subgraph)); - return [kernel](Stack* stack) { + return [kernel](Stack& stack) { RECORD_FUNCTION("TensorExpr", std::vector()); - kernel->run(*stack); + kernel->run(stack); return 0; }; } diff --git a/torch/csrc/jit/passes/utils/check_alias_annotation.cpp b/torch/csrc/jit/passes/utils/check_alias_annotation.cpp index cd894b46ff69b..ae3a962509994 100644 --- a/torch/csrc/jit/passes/utils/check_alias_annotation.cpp +++ b/torch/csrc/jit/passes/utils/check_alias_annotation.cpp @@ -253,7 +253,7 @@ void checkAliasAnnotation( const auto inputsDeepCopy = deepCopy(stack); // Run the op - node->getOperation()(&stack); + node->getOperation()(stack); const auto outputs = std::move(stack); diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 0138231d3bc3f..eff1ddc243999 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -1151,7 +1151,7 @@ inline py::object invokeOperatorFromPython( Stack stack = std::get<1>(opWithStack); { pybind11::gil_scoped_release no_gil_guard; - found_op->getOperation()(&stack); + found_op->getOperation()(stack); } return createPyObjectForStack(std::move(stack)); diff --git a/torch/csrc/jit/python/python_interpreter.cpp b/torch/csrc/jit/python/python_interpreter.cpp index 82a0d22c54fa2..29b7929fcd690 100644 --- a/torch/csrc/jit/python/python_interpreter.cpp +++ b/torch/csrc/jit/python/python_interpreter.cpp @@ -43,7 +43,7 @@ Operation createPythonOperation(const Node* op_) { AT_ASSERT(op->outputs().size() == 1); - return [=](Stack* stack) { + return [=](Stack& stack) { pybind11::gil_scoped_acquire gil; py::tuple py_inputs(op->cconv.size()); size_t i = 0; @@ -66,7 +66,7 @@ Operation createPythonOperation(const Node* op_) { drop(stack, num_inputs); try { py::object py_output(func(*py_inputs)); - stack->push_back(returnToIValue(op->output()->type(), py_output)); + stack.push_back(returnToIValue(op->output()->type(), py_output)); } catch (py::error_already_set& e) { throw std::runtime_error(e.what()); } diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp index 476882650a1dd..0187988680e80 100644 --- a/torch/csrc/jit/runtime/graph_executor.cpp +++ b/torch/csrc/jit/runtime/graph_executor.cpp @@ -377,7 +377,7 @@ struct DifferentiableGraphOp { num_outputs(this->grad.f->outputs().size()) {} // XXX: keep in mind that stack can be larger than the inputs we need! - void operator()(Stack* stack) const { + void operator()(Stack& stack) const { auto grad_fn = std::make_shared( grad_executor, grad.df_input_vjps.size(), @@ -394,13 +394,13 @@ struct DifferentiableGraphOp { captureInputs(*grad_fn, inputs); } - detachVariables(*stack); + detachVariables(stack); if (IsNewExecutorEnabled()) { ExecutionPlan plan = - f_ptr->getPlanFor(*stack, GraphExecutor::getDefaultNumBailOuts()); - InterpreterState(plan.code).run(*stack); + f_ptr->getPlanFor(stack, GraphExecutor::getDefaultNumBailOuts()); + InterpreterState(plan.code).run(stack); } else { - InterpreterState(legacy_f).run(*stack); + InterpreterState(legacy_f).run(stack); } { @@ -419,7 +419,7 @@ struct DifferentiableGraphOp { // drop the temporary outputs so that we return the same number of // outputs as if we were not also calculating gradient const size_t num_temporary_outputs = num_outputs - grad.f_real_outputs; - stack->erase(stack->end() - num_temporary_outputs, stack->end()); + stack.erase(stack.end() - num_temporary_outputs, stack.end()); } } diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index be2019e532f98..70c9c6c653326 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -297,13 +297,13 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { } case INST(OP): { INST_GUARD; - frame.function->operator_table_[inst.X](&stack); + frame.function->operator_table_[inst.X](stack); } INST_NEXT; case INST(OPN): { INST_GUARD; stack.push_back(inst.N); - frame.function->operator_table_[inst.X](&stack); + frame.function->operator_table_[inst.X](stack); } INST_NEXT; case INST(LOAD): { diff --git a/torch/csrc/jit/runtime/register_c10_ops.cpp b/torch/csrc/jit/runtime/register_c10_ops.cpp index 993d41194e84b..4d541ec46bbbf 100644 --- a/torch/csrc/jit/runtime/register_c10_ops.cpp +++ b/torch/csrc/jit/runtime/register_c10_ops.cpp @@ -12,7 +12,7 @@ namespace jit { namespace { Operator createOperatorFromC10(const c10::OperatorHandle& op) { - return Operator(op, [op](Stack* stack) { op.callBoxed(stack); }); + return Operator(op, [op](Stack& stack) { op.callBoxed(stack); }); } class RegistrationListener final : public c10::OpRegistrationListener { diff --git a/torch/csrc/jit/runtime/register_cuda_ops.cpp b/torch/csrc/jit/runtime/register_cuda_ops.cpp index f7a989d7acef9..599fd5398c370 100644 --- a/torch/csrc/jit/runtime/register_cuda_ops.cpp +++ b/torch/csrc/jit/runtime/register_cuda_ops.cpp @@ -38,7 +38,7 @@ void _device_synchronize(int64_t device_index) { RegisterOperators const reg({ Operator( "cuda::current_stream.device(Device? device) -> __torch__.torch.classes.cuda.Stream", - [](Stack* stack) { + [](Stack& stack) { auto device = pop(stack).toOptional(); c10::DeviceIndex device_index = device.has_value() ? device->index() @@ -50,7 +50,7 @@ RegisterOperators const reg({ aliasAnalysisFromSchema()), Operator( "cuda::current_stream.int(int? val) -> __torch__.torch.classes.cuda.Stream", - [](Stack* stack) { + [](Stack& stack) { auto idx = pop(stack).toOptional(); c10::DeviceIndex device_index = idx.has_value() ? static_cast(idx.value()) @@ -62,7 +62,7 @@ RegisterOperators const reg({ aliasAnalysisFromSchema()), Operator( "cuda::default_stream.device(Device? device) -> __torch__.torch.classes.cuda.Stream", - [](Stack* stack) { + [](Stack& stack) { auto device = pop(stack).toOptional(); c10::DeviceIndex device_index = device.has_value() ? device->index() @@ -74,7 +74,7 @@ RegisterOperators const reg({ aliasAnalysisFromSchema()), Operator( "cuda::default_stream.int(int? val) -> __torch__.torch.classes.cuda.Stream", - [](Stack* stack) { + [](Stack& stack) { auto idx = pop(stack).toOptional(); c10::DeviceIndex device_index = idx.has_value() ? static_cast(idx.value()) @@ -86,14 +86,14 @@ RegisterOperators const reg({ aliasAnalysisFromSchema()), Operator( "cuda::_current_device() -> int", - [](Stack* stack) { + [](Stack& stack) { auto v = c10::cuda::current_device(); push(stack, static_cast(v)); }, aliasAnalysisFromSchema()), Operator( "cuda::_set_device(int64_t val) -> ()", - [](Stack* stack) { + [](Stack& stack) { int64_t idx = -1; pop(stack, idx); c10::cuda::set_device(static_cast(idx)); @@ -101,7 +101,7 @@ RegisterOperators const reg({ aliasAnalysisFromSchema()), Operator( "cuda::device_index(Device device) -> int", - [](Stack* stack) { + [](Stack& stack) { auto device = pop(stack); auto idx = device.toDevice().index(); push(stack, idx); @@ -109,11 +109,11 @@ RegisterOperators const reg({ aliasAnalysisFromSchema()), Operator( "cuda::device_count() -> int", - [](Stack* stack) { push(stack, at::cuda::device_count()); }, + [](Stack& stack) { push(stack, at::cuda::device_count()); }, aliasAnalysisFromSchema()), Operator( "cuda::set_stream(__torch__.torch.classes.cuda.Stream stream) -> ()", - [](Stack* stack) { + [](Stack& stack) { auto v = pop(stack); auto s = v.toCustomClass(); auto stream_device_idx = static_cast(s->device_index()); @@ -141,11 +141,11 @@ RegisterOperators const reg({ aliasAnalysisFromSchema()), Operator( "cuda::synchronize() -> ()", - [](Stack* stack) { c10::cuda::device_synchronize(); }, + [](Stack& stack) { c10::cuda::device_synchronize(); }, aliasAnalysisFromSchema()), Operator( "cuda::synchronize.device(Device? device) -> ()", - [](Stack* stack) { + [](Stack& stack) { auto device = pop(stack).toOptional(); c10::DeviceIndex device_index = device.has_value() ? device->index() @@ -155,7 +155,7 @@ RegisterOperators const reg({ aliasAnalysisFromSchema()), Operator( "cuda::synchronize.int(int? val) -> ()", - [](Stack* stack) { + [](Stack& stack) { auto idx = pop(stack).toOptional(); c10::DeviceIndex device_index = idx.has_value() ? static_cast(idx.value()) diff --git a/torch/csrc/jit/runtime/register_distributed_ops.cpp b/torch/csrc/jit/runtime/register_distributed_ops.cpp index 2c8277d106f3c..edf7a0ccff23a 100644 --- a/torch/csrc/jit/runtime/register_distributed_ops.cpp +++ b/torch/csrc/jit/runtime/register_distributed_ops.cpp @@ -29,11 +29,11 @@ static auto workerInfo = // prepare the rpc input arguments and call the C++ impls void prepare_and_call_rpc_op( - Stack* stack, + Stack& stack, int num_inputs, const std::string& rpc_op) { // Get inputs from the stack. - auto stackIter = stack->end() - num_inputs; + auto stackIter = stack.end() - num_inputs; auto& dstWorkerIValue = *stackIter++; auto& qualifiedNameIValue = *stackIter++; IValue emptyTuple(c10::ivalue::Tuple::create({})); @@ -137,7 +137,7 @@ void prepare_and_call_rpc_op( rpcTimeout); // Push output to the stack. drop(stack, num_inputs); - stack->emplace_back(std::move(futureIValuePtr)); + stack.emplace_back(std::move(futureIValuePtr)); } else if (rpc_op == "rpc_sync") { // Send RPC request. auto futureIValuePtr = dist_rpc::rpcTorchscript( @@ -154,7 +154,7 @@ void prepare_and_call_rpc_op( auto res = futureIValuePtr->value(); // Push output to the stack. drop(stack, num_inputs); - stack->emplace_back(std::move(res)); + stack.emplace_back(std::move(res)); } } else if (rpc_op == "rpc_remote") { auto rrefPtr = dist_rpc::remoteTorchscript( @@ -165,7 +165,7 @@ void prepare_and_call_rpc_op( rpcTimeout); // Push output to the stack. drop(stack, num_inputs); - stack->emplace_back( + stack.emplace_back( c10::static_intrusive_pointer_cast(rrefPtr)); } else { throw std::runtime_error( @@ -178,7 +178,7 @@ RegisterOperators reg_rpc_ops( fmt::format( "aten::to_here(RRef(t) self, float timeout = {}) -> t(*)", torch::distributed::rpc::kDefaultRpcTimeoutSeconds), - [](Stack* stack) { + [](Stack& stack) { auto timeout = pop(stack).toDouble(); auto rref = pop(stack).toRRef(); IValue res; @@ -195,7 +195,7 @@ RegisterOperators reg_rpc_ops( aliasAnalysisFromSchema()), Operator( "aten::local_value(RRef(t) self) -> t(*)", - [](Stack* stack) { + [](Stack& stack) { auto rref = pop(stack).toRRef(); TORCH_CHECK( rref->isOwner(), @@ -208,14 +208,14 @@ RegisterOperators reg_rpc_ops( aliasAnalysisFromSchema()), Operator( "aten::is_owner(RRef(t) self) -> bool", - [](Stack* stack) { + [](Stack& stack) { auto rref = pop(stack).toRRef(); push(stack, rref->isOwner()); }, aliasAnalysisFromSchema()), Operator( "aten::owner(RRef(t) self) -> __torch__.torch.classes.dist_rpc.WorkerInfo", - [](Stack* stack) { + [](Stack& stack) { auto rref = pop(stack).toRRef(); push( stack, @@ -225,21 +225,21 @@ RegisterOperators reg_rpc_ops( aliasAnalysisFromSchema()), Operator( "aten::owner_name(RRef(t) self) -> str", - [](Stack* stack) { + [](Stack& stack) { auto rref = pop(stack).toRRef(); push(stack, rref->ownerName()); }, aliasAnalysisFromSchema()), Operator( "aten::confirmed_by_owner(RRef(t) self) -> bool", - [](Stack* stack) { + [](Stack& stack) { auto rref = pop(stack).toRRef(); push(stack, rref->confirmedByOwner()); }, aliasAnalysisFromSchema()), Operator( "aten::dist_backward(int context_id, Tensor[] roots, bool retain_graph=False) -> ()", - [](Stack* stack) { + [](Stack& stack) { bool retain_graph = pop(stack).toBool(); auto roots_list = pop(stack).toTensorList(); int64_t context_id = pop(stack).toInt(); @@ -252,7 +252,7 @@ RegisterOperators reg_rpc_ops( prim::rpc_sync, [](const Node* node) -> Operation { int num_inputs = node->inputs().size(); - return [num_inputs](Stack* stack) { + return [num_inputs](Stack& stack) { prepare_and_call_rpc_op(stack, num_inputs, "rpc_sync"); }; }, @@ -261,7 +261,7 @@ RegisterOperators reg_rpc_ops( prim::rpc_remote, [](const Node* node) -> Operation { int num_inputs = node->inputs().size(); - return [num_inputs](Stack* stack) { + return [num_inputs](Stack& stack) { prepare_and_call_rpc_op(stack, num_inputs, "rpc_remote"); }; }, @@ -270,7 +270,7 @@ RegisterOperators reg_rpc_ops( prim::rpc_async, [](const Node* node) -> Operation { int num_inputs = node->inputs().size(); - return [num_inputs](Stack* stack) { + return [num_inputs](Stack& stack) { prepare_and_call_rpc_op(stack, num_inputs, "rpc_async"); }; }, diff --git a/torch/csrc/jit/runtime/register_ops_utils.cpp b/torch/csrc/jit/runtime/register_ops_utils.cpp index 91ff2c738a1bf..64bb3abc57584 100644 --- a/torch/csrc/jit/runtime/register_ops_utils.cpp +++ b/torch/csrc/jit/runtime/register_ops_utils.cpp @@ -13,7 +13,7 @@ c10::impl::GenericList make_result_list(const TypePtr& elemType) { } template <> -void listIndex(Stack* stack) { +void listIndex(Stack& stack) { at::Tensor elem = pop(stack).to(); c10::List list = pop(stack).to>(); @@ -31,7 +31,7 @@ void listIndex(Stack* stack) { } template <> -void listCount(Stack* stack) { +void listCount(Stack& stack) { at::Tensor elem = pop(stack).to(); c10::List list = pop(stack).to>(); @@ -44,21 +44,21 @@ void listCount(Stack* stack) { } template <> -void listEq(Stack* stack) { +void listEq(Stack& stack) { c10::List b = pop(stack).to>(); c10::List a = pop(stack).to>(); push(stack, tensor_list_equal(a, b)); } template <> -void listNe(Stack* stack) { +void listNe(Stack& stack) { c10::List b = pop(stack).to>(); c10::List a = pop(stack).to>(); push(stack, !tensor_list_equal(a, b)); } template <> -void listSort(Stack* stack) { +void listSort(Stack& stack) { bool reverse = pop(stack).toBool(); c10::List list = pop(stack).toTensorList(); std::sort( @@ -74,7 +74,7 @@ void listSort(Stack* stack) { } template <> -void listCopyAndSort(Stack* stack) { +void listCopyAndSort(Stack& stack) { c10::List list = pop(stack).toTensorList(); auto list_copied = list.copy(); std::sort( @@ -87,7 +87,7 @@ void listCopyAndSort(Stack* stack) { } template <> -void listRemove(Stack* stack) { +void listRemove(Stack& stack) { at::Tensor elem = pop(stack).to(); c10::List list = pop(stack).to>(); @@ -268,7 +268,7 @@ int64_t normalizeIndex(int64_t idx, int64_t list_size) { return idx; } -void listAppend(Stack* stack) { +void listAppend(Stack& stack) { IValue el = pop(stack).to(); c10::List list = pop(stack).to>(); @@ -276,13 +276,13 @@ void listAppend(Stack* stack) { push(stack, std::move(list)); } -void listReverse(Stack* stack) { +void listReverse(Stack& stack) { c10::List list = pop(stack).to>(); std::reverse(list.begin(), list.end()); } -void listPopImpl(Stack* stack, const char* empty_message) { +void listPopImpl(Stack& stack, const char* empty_message) { int64_t idx = pop(stack).to(); c10::List list = pop(stack).to>(); @@ -297,22 +297,22 @@ void listPopImpl(Stack* stack, const char* empty_message) { list.erase(list.begin() + normalized_idx); } -void listPop(Stack* stack) { +void listPop(Stack& stack) { return listPopImpl(stack, "pop from empty list"); } -void listClear(Stack* stack) { +void listClear(Stack& stack) { c10::List list = pop(stack).to>(); list.clear(); } -void listDelete(Stack* stack) { +void listDelete(Stack& stack) { listPopImpl(stack, "pop index out of range"); pop(stack); } -void listInsert(Stack* stack) { +void listInsert(Stack& stack) { IValue elem = pop(stack).to(); int64_t idx = pop(stack).to(); c10::List list = pop(stack).to>(); @@ -331,7 +331,7 @@ void listInsert(Stack* stack) { } } -void listExtend(Stack* stack) { +void listExtend(Stack& stack) { c10::List b = pop(stack).to>(); c10::List a = pop(stack).to>(); @@ -341,12 +341,12 @@ void listExtend(Stack* stack) { } } -void listCopy(Stack* stack) { +void listCopy(Stack& stack) { c10::List list = pop(stack).to>(); push(stack, list.copy()); } -void listSelect(Stack* stack) { +void listSelect(Stack& stack) { int64_t idx = pop(stack).to(); c10::List list = pop(stack).to>(); @@ -354,19 +354,19 @@ void listSelect(Stack* stack) { push(stack, std::move(element)); } -void listLen(Stack* stack) { +void listLen(Stack& stack) { c10::List a = pop(stack).to>(); const int64_t size = a.size(); push(stack, size); } -void listList(Stack* stack) { +void listList(Stack& stack) { c10::List a = pop(stack).to>(); push(stack, a.copy()); } -void listAdd(Stack* stack) { +void listAdd(Stack& stack) { c10::List b = pop(stack).to>(); c10::List a = pop(stack).to>(); @@ -383,14 +383,14 @@ void listAdd(Stack* stack) { push(stack, std::move(ret)); } -void listInplaceAdd(Stack* stack) { +void listInplaceAdd(Stack& stack) { c10::List b = pop(stack).to>(); c10::List a = pop(stack).to>(); a.append(std::move(b)); push(stack, std::move(a)); } -void listMulIntLeftInPlace(Stack* stack) { +void listMulIntLeftInPlace(Stack& stack) { int64_t n = pop(stack).to(); c10::List list = pop(stack).to>(); if (n <= 0) { @@ -408,7 +408,7 @@ void listMulIntLeftInPlace(Stack* stack) { push(stack, std::move(list)); } -void listMulIntLeft(Stack* stack) { +void listMulIntLeft(Stack& stack) { int64_t n = pop(stack).to(); c10::List list = pop(stack).to>(); @@ -426,7 +426,7 @@ void listMulIntLeft(Stack* stack) { push(stack, std::move(ret)); } -void listMulIntRight(Stack* stack) { +void listMulIntRight(Stack& stack) { c10::List list = pop(stack).to>(); int64_t n = pop(stack).to(); @@ -444,7 +444,7 @@ void listMulIntRight(Stack* stack) { push(stack, std::move(ret)); } -void listSlice(Stack* stack) { +void listSlice(Stack& stack) { auto step_val = pop(stack); auto end_val = pop(stack); auto start_val = pop(stack); @@ -477,7 +477,7 @@ void listSlice(Stack* stack) { push(stack, std::move(sliced_list)); } -void listSetItem(Stack* stack) { +void listSetItem(Stack& stack) { IValue value = pop(stack).to(); int64_t idx = pop(stack).to(); c10::List list = pop(stack).to>(); diff --git a/torch/csrc/jit/runtime/register_ops_utils.h b/torch/csrc/jit/runtime/register_ops_utils.h index 5d00872d9ca7e..a4efb67943569 100644 --- a/torch/csrc/jit/runtime/register_ops_utils.h +++ b/torch/csrc/jit/runtime/register_ops_utils.h @@ -55,7 +55,7 @@ c10::List make_result_list(const TypePtr& elemType) { template <> c10::impl::GenericList make_result_list(const TypePtr& elemType); -inline void noop(Stack* n) {} +inline void noop(Stack& n) {} // As described in https://docs.python.org/3/library/functions.html#round // When a number is exactly halfway between two integers, python builtin round @@ -181,12 +181,12 @@ void setItem(const c10::List& list, int64_t idx, T&& value) { list.set(normalized_idx, std::forward(value)); } -void listAppend(Stack* stack); +void listAppend(Stack& stack); -void listReverse(Stack* stack); +void listReverse(Stack& stack); template -void minList(Stack* stack) { +void minList(Stack& stack) { c10::List a = pop(stack).to>(); c10::List b = pop(stack).to>(); @@ -204,7 +204,7 @@ void minList(Stack* stack) { } template -void maxList(Stack* stack) { +void maxList(Stack& stack) { c10::List a = pop(stack).to>(); c10::List b = pop(stack).to>(); @@ -221,18 +221,18 @@ void maxList(Stack* stack) { push(stack, b.size() > a.size() ? b : a); } -void listPopImpl(Stack* stack, const char* empty_message); +void listPopImpl(Stack& stack, const char* empty_message); -void listPop(Stack* stack); +void listPop(Stack& stack); -void listClear(Stack* stack); +void listClear(Stack& stack); -void listDelete(Stack* stack); +void listDelete(Stack& stack); -void listInsert(Stack* stack); +void listInsert(Stack& stack); template -void listRemove(Stack* stack) { +void listRemove(Stack& stack) { T elem = pop(stack).to(); c10::List list = pop(stack).to>(); @@ -246,7 +246,7 @@ void listRemove(Stack* stack) { } template -void listMin(Stack* stack) { +void listMin(Stack& stack) { c10::List list = pop(stack).to>(); size_t list_size = list.size(); if (list_size == 0) { @@ -259,11 +259,11 @@ void listMin(Stack* stack) { min_elem = elem < min_elem ? elem : min_elem; } - stack->push_back(min_elem); + stack.push_back(min_elem); } template -void listMax(Stack* stack) { +void listMax(Stack& stack) { c10::List list = pop(stack).to>(); size_t list_size = list.size(); if (list_size == 0) { @@ -276,14 +276,14 @@ void listMax(Stack* stack) { max_elem = elem > max_elem ? elem : max_elem; } - stack->push_back(max_elem); + stack.push_back(max_elem); } template <> -void listRemove(Stack* stack); +void listRemove(Stack& stack); template -void listIndex(Stack* stack) { +void listIndex(Stack& stack) { T elem = pop(stack).to(); c10::List list = pop(stack).to>(); @@ -297,10 +297,10 @@ void listIndex(Stack* stack) { } template <> -void listIndex(Stack* stack); +void listIndex(Stack& stack); template -void listCount(Stack* stack) { +void listCount(Stack& stack) { T elem = pop(stack).to(); c10::List list = pop(stack).to>(); @@ -309,25 +309,25 @@ void listCount(Stack* stack) { } template <> -void listCount(Stack* stack); +void listCount(Stack& stack); -void listExtend(Stack* stack); +void listExtend(Stack& stack); -void listCopy(Stack* stack); +void listCopy(Stack& stack); -void listSelect(Stack* stack); +void listSelect(Stack& stack); -void listLen(Stack* stack); +void listLen(Stack& stack); template -void listEq(Stack* stack) { +void listEq(Stack& stack) { c10::List b = pop(stack).to>(); c10::List a = pop(stack).to>(); push(stack, a == b); } template -void listNe(Stack* stack) { +void listNe(Stack& stack) { c10::List b = pop(stack).to>(); c10::List a = pop(stack).to>(); push(stack, a != b); @@ -357,16 +357,16 @@ inline bool tensor_list_equal( // Specialization for at::Tensor, since it doesn't define operator== template <> -void listEq(Stack* stack); +void listEq(Stack& stack); // Specialization for at::Tensor, since it doesn't define operator== template <> -void listNe(Stack* stack); +void listNe(Stack& stack); -void listList(Stack* stack); +void listList(Stack& stack); template -void listContains(Stack* stack) { +void listContains(Stack& stack) { auto key = pop(stack).to(); auto list = pop(stack).to>(); // NOLINTNEXTLINE(performance-implicit-conversion-in-loop) @@ -379,20 +379,20 @@ void listContains(Stack* stack) { push(stack, false); } -void listAdd(Stack* stack); +void listAdd(Stack& stack); -void listInplaceAdd(Stack* stack); +void listInplaceAdd(Stack& stack); -void listMulIntLeftInPlace(Stack* stack); +void listMulIntLeftInPlace(Stack& stack); -void listMulIntLeft(Stack* stack); +void listMulIntLeft(Stack& stack); -void listMulIntRight(Stack* stack); +void listMulIntRight(Stack& stack); -void listSlice(Stack* stack); +void listSlice(Stack& stack); template -void listSort(Stack* stack) { +void listSort(Stack& stack) { bool reverse = pop(stack).toBool(); c10::List list = pop(stack).to>(); std::sort(list.begin(), list.end(), [reverse](const T& a, const T& b) { @@ -408,10 +408,10 @@ void listSort(Stack* stack) { // Specialization for at::Tensor template <> -void listSort(Stack* stack); +void listSort(Stack& stack); template -void listCopyAndSort(Stack* stack) { +void listCopyAndSort(Stack& stack) { c10::List list = pop(stack).to>(); auto list_copied = list.copy(); std::sort(list_copied.begin(), list_copied.end(), [](const T& a, const T& b) { @@ -426,22 +426,22 @@ void listCopyAndSort(Stack* stack) { // Specialization for at::Tensor template <> -void listCopyAndSort(Stack* stack); +void listCopyAndSort(Stack& stack); -void listSetItem(Stack* stack); +void listSetItem(Stack& stack); struct OperatorGeneratorArgs { const char* schema_str; bool isOperationCreator; union { - void (*operation)(Stack*); + void (*operation)(Stack&); OperationCreator operationCreator; }; AliasAnalysisKind aliasAnalysis; explicit constexpr OperatorGeneratorArgs( torch::detail::SelectiveStr schema_str, - void (*op)(Stack*), + void (*op)(Stack&), AliasAnalysisKind aa) : schema_str(schema_str), isOperationCreator(false), @@ -472,7 +472,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op \ ".int_int(int a, int b) -> " #int_float_result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ int64_t a, b; \ pop(stack, a, b); \ push(stack, op); \ @@ -482,7 +482,7 @@ struct OperatorGeneratorArgs { TORCH_SELECTIVE_SCHEMA( \ #aten_op \ ".float_float(float a, float b) -> " #int_float_result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ double a, b; \ pop(stack, a, b); \ push(stack, op); \ @@ -492,7 +492,7 @@ struct OperatorGeneratorArgs { TORCH_SELECTIVE_SCHEMA( \ #aten_op \ ".complex_complex(complex a, complex b) -> " #complex_result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ c10::complex a, b; \ pop(stack, a, b); \ push(stack, op); \ @@ -503,7 +503,7 @@ struct OperatorGeneratorArgs { #define DEFINE_GENERIC_OP(aten_op, int_op, float_op, int_result, float_result) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> " #int_result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ int64_t a, b; \ pop(stack, a, b); \ push(stack, int_op); \ @@ -512,7 +512,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".float(float a, float b) -> " #float_result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ double a, b; \ pop(stack, a, b); \ push(stack, float_op); \ @@ -523,7 +523,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op \ ".int_float(int a, float b) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ int64_t a; \ double b; \ pop(stack, a, b); \ @@ -533,7 +533,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op \ ".float_int(float a, int b) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ double a; \ int64_t b; \ pop(stack, a, b); \ @@ -544,7 +544,7 @@ struct OperatorGeneratorArgs { #define DEFINE_INT_OP(aten_op, op) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> int"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ int64_t a, b; \ pop(stack, a, b); \ push(stack, op); /* NOLINT(hicpp-signed-bitwise) */ \ @@ -554,7 +554,7 @@ struct OperatorGeneratorArgs { #define DEFINE_STR_CMP_OP(aten_op, op) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".str(str a, str b) -> bool"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ auto b = pop(stack).toStringRef(); \ auto a = pop(stack).toStringRef(); \ push(stack, op); \ @@ -570,7 +570,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op string_val \ "(Scalar a, Scalar b) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ IValue x, y; \ pop(stack, x, y); \ if (x.isDouble()) { \ @@ -625,7 +625,7 @@ struct OperatorGeneratorArgs { #define DEFINE_UNARY_INT_OP(aten_op, op, result) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ int64_t a; \ pop(stack, a); \ push(stack, op); \ @@ -635,7 +635,7 @@ struct OperatorGeneratorArgs { #define DEFINE_UNARY_FLOAT_OP(aten_op, op, result) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".float(float a) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ double a; \ pop(stack, a); \ push(stack, op); \ @@ -647,7 +647,7 @@ struct OperatorGeneratorArgs { DEFINE_UNARY_FLOAT_OP(aten_op, op, float_result), \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".Scalar(Scalar a) -> Scalar"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ IValue x; \ pop(stack, x); \ if (x.isDouble()) { \ @@ -662,7 +662,7 @@ struct OperatorGeneratorArgs { #define DEFINE_BOOL_OP(aten_op, op) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".bool(bool a, bool b) -> bool"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ bool a, b; \ pop(stack, a, b); \ push(stack, op); \ @@ -671,7 +671,7 @@ struct OperatorGeneratorArgs { #define DEFINE_STRING_OP(op_name, string_op, result) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#op_name ".str(str a, str b) ->" #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ auto b = pop(stack).toStringRef(); \ auto a = pop(stack).toStringRef(); \ push(stack, string_op); \ @@ -685,7 +685,7 @@ struct OperatorGeneratorArgs { #define DEFINE_UNARY_COMPLEX_OP(aten_op, op, result) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".complex(complex a) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ c10::complex a; \ pop(stack, a); \ push(stack, op); \ @@ -709,7 +709,7 @@ struct OperatorGeneratorArgs { DEFINE_UNARY_COMPLEX_OP(aten_op, op, complex_result), \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".Scalar(Scalar a) -> Scalar"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ IValue x; \ pop(stack, x); \ if (x.isDouble()) { \ @@ -739,7 +739,7 @@ struct OperatorGeneratorArgs { complex_result) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> " #int_result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ int64_t a, b; \ pop(stack, a, b); \ push(stack, int_op); \ @@ -748,7 +748,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".complex(complex a, complex b) -> " #complex_result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ c10::complex a, b; \ pop(stack, a, b); \ push(stack, complex_op); \ @@ -757,7 +757,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".float(float a, float b) -> " #float_result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ double a, b; \ pop(stack, a, b); \ push(stack, float_op); \ @@ -768,7 +768,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op \ ".int_complex(int a, complex b) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ int64_t a; \ c10::complex b; \ pop(stack, a, b); \ @@ -778,7 +778,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".complex_int(complex a, int b) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ c10::complex a; \ int64_t b; \ pop(stack, a, b); \ @@ -790,7 +790,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".float_complex(float a, complex b) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ double a; \ c10::complex b; \ pop(stack, a, b); \ @@ -800,7 +800,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA( \ #aten_op ".complex_float(complex a, float b) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ c10::complex a; \ double b; \ pop(stack, a, b); \ @@ -813,7 +813,7 @@ struct OperatorGeneratorArgs { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op string_val \ "(Scalar a, Scalar b) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ IValue x, y; \ pop(stack, x, y); \ if (x.isComplexDouble()) { \ @@ -860,7 +860,7 @@ struct OperatorGeneratorArgs { aten_op, int_op, float_op, complex_op, result) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#aten_op "(Scalar a, Scalar b) -> " #result), \ - [](Stack* stack) { \ + [](Stack& stack) { \ IValue x, y; \ pop(stack, x, y); \ if (x.isComplexDouble()) { \ diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index 2953b686ee379..9164471dfddf7 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -89,7 +89,7 @@ auto powWrapper(T a, U b) { static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::str(t elem) -> str"), - [](Stack* stack) { + [](Stack& stack) { std::stringstream ss; ss << pop(stack); push(stack, ss.str()); @@ -97,7 +97,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::list(str t) -> str[]"), - [](Stack* stack) { + [](Stack& stack) { auto str = pop(stack).toStringRef(); c10::List chars; chars.reserve(str.size()); @@ -109,7 +109,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::cpu(Tensor(a) self) -> Tensor(a|b)"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.cpu()); @@ -117,7 +117,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::layout(Tensor a) -> int"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.layout()); @@ -128,7 +128,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::__range_length(int lo, int hi, int step) -> int"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t lo, hi, step; pop(stack, lo, hi, step); @@ -148,7 +148,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::__derive_index(int index, int start, int step) -> int"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t index, start, step; pop(stack, index, start, step); @@ -157,7 +157,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::TupleUnpack(Any tup) -> ..."), - [](Stack* stack) { tupleUnpack(*stack); }, + [](Stack& stack) { tupleUnpack(stack); }, aliasAnalysisSpecialCase()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::unchecked_cast(t x) -> t"), @@ -165,7 +165,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisSpecialCase()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::IntImplicit(Tensor a) -> int"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); checkImplicitTensorToNum(a, /*to int*/ true); @@ -174,7 +174,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::ComplexImplicit(Tensor a) -> complex"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); checkImplicitTensorToNum(a, /*to int*/ false); @@ -183,7 +183,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::FloatImplicit(Tensor a) -> float"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); checkImplicitTensorToNum(a, /*to int*/ false); @@ -192,7 +192,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::ScalarImplicit(Tensor a) -> Scalar"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); checkImplicitTensorToNum(a, /*to int*/ false); @@ -201,7 +201,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Bool.Tensor(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.is_nonzero()); @@ -209,7 +209,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Bool.int(int a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t i; pop(stack, i); @@ -218,7 +218,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Bool.float(float a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double d; pop(stack, d); @@ -227,7 +227,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Int.Tensor(Tensor a) -> int"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.item()); @@ -235,7 +235,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Int.bool(bool a) -> int"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool b; pop(stack, b); @@ -244,7 +244,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Int.float(float a) -> int"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double d; pop(stack, d); @@ -253,7 +253,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Int.Scalar(Scalar a) -> int"), - [](Stack* stack) { + [](Stack& stack) { IValue scalar; pop(stack, scalar); if (scalar.isInt()) { @@ -266,7 +266,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Int.str(str a) -> int"), - [](Stack* stack) { + [](Stack& stack) { auto s = pop(stack).toString(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::string::size_type sz; @@ -283,7 +283,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Float.Tensor(Tensor a) -> float"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.item()); @@ -291,7 +291,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Float.Scalar(Scalar a) -> float"), - [](Stack* stack) { + [](Stack& stack) { IValue scalar; pop(stack, scalar); if (scalar.isDouble()) { @@ -305,7 +305,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Float.int(int a) -> float"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t i; pop(stack, i); @@ -314,7 +314,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Float.bool(bool a) -> float"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool b; pop(stack, b); @@ -323,7 +323,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Float.str(str a) -> float"), - [](Stack* stack) { + [](Stack& stack) { auto s = pop(stack).toString(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::string::size_type sz; @@ -340,7 +340,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Complex.Scalar(Scalar a) -> complex"), - [](Stack* stack) { + [](Stack& stack) { IValue scalar; pop(stack, scalar); if (scalar.isComplexDouble()) { @@ -355,7 +355,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::Complex.Tensor_Tensor(Tensor a, Tensor b) -> complex"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a, b; pop(stack, a, b); push(stack, c10::complex(a.item(), b.item())); @@ -363,21 +363,21 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::format(str self, ...) -> str"), - [](Stack* stack) { + [](Stack& stack) { size_t num_inputs = pop(stack).toInt(); - format(*stack, num_inputs); + format(stack, num_inputs); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::einsum.sublist(Tensor a, ...) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { size_t num_inputs = pop(stack).toInt(); - einsum(*stack, num_inputs); + einsum(stack, num_inputs); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.Scalar(Scalar a) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { at::Scalar s; pop(stack, s); push(stack, at::scalar_to_tensor(s)); @@ -385,29 +385,29 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::RaiseException(str msg) -> ()"), - [](Stack* stack) { throw JITException(pop(stack).toStringRef()); }, + [](Stack& stack) { throw JITException(pop(stack).toStringRef()); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::Size(int[] sizes) -> int[]"), - [](Stack* stack) {}, + [](Stack& stack) {}, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::size(Tensor self) -> int[]"), - [](Stack* stack) { + [](Stack& stack) { auto t = std::move(pop(stack)).toTensor(); pack(stack, t.sizes().vec()); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::EnumName(AnyEnumType enum) -> str"), - [](Stack* stack) { + [](Stack& stack) { IValue e = pop(stack); push(stack, e.toEnumHolder()->name()); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::EnumValue.int(AnyEnumType enum) -> int"), - [](Stack* stack) { + [](Stack& stack) { IValue e = pop(stack); push(stack, e.toEnumHolder()->value()); }, @@ -415,14 +415,14 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "prim::EnumValue.float(AnyEnumType enum) -> float"), - [](Stack* stack) { + [](Stack& stack) { IValue e = pop(stack); push(stack, e.toEnumHolder()->value()); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::EnumValue.str(AnyEnumType enum) -> str"), - [](Stack* stack) { + [](Stack& stack) { IValue e = pop(stack); push(stack, e.toEnumHolder()->value()); }, @@ -431,7 +431,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { // note the compiler knows to type TupleIndex more accurately than it // is listed here. TORCH_SELECTIVE_SCHEMA("prim::TupleIndex(Any tup, int i) -> Any"), - [](Stack* stack) { + [](Stack& stack) { int64_t index = pop(stack).toInt(); auto tuple = pop(stack).toTuple(); auto norm_index = normalizeIndex(index, tuple->elements().size()); @@ -439,7 +439,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { norm_index > static_cast(tuple->elements().size())) { throw std::out_of_range("Tuple list index out of range"); } - stack->emplace_back(tuple->elements()[norm_index]); + stack.emplace_back(tuple->elements()[norm_index]); }, aliasAnalysisSpecialCase()), OperatorGeneratorArgs( @@ -453,11 +453,11 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::device(Tensor a) -> Device"), - [](Stack* stack) { push(stack, pop(stack).toTensor().device()); }, + [](Stack& stack) { push(stack, pop(stack).toTensor().device()); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::dtype(Tensor a) -> int"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, static_cast(a.scalar_type())); @@ -465,11 +465,11 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::__not__(bool self) -> bool"), - [](Stack* stack) { push(stack, !pop(stack).toBool()); }, + [](Stack& stack) { push(stack, !pop(stack).toBool()); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::__is__(t1 self, t2 obj) -> bool"), - [](Stack* stack) { + [](Stack& stack) { IValue self, obj; pop(stack, self, obj); push(stack, self.is(obj)); @@ -477,7 +477,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::__isnot__(t1 self, t2 obj) -> bool"), - [](Stack* stack) { + [](Stack& stack) { IValue self, obj; pop(stack, self, obj); push(stack, !self.is(obj)); @@ -485,28 +485,28 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::element_size(Tensor self) -> int"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor arg = pop(stack).toTensor(); push(stack, arg.element_size()); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::numel(Tensor self) -> int"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor arg = pop(stack).toTensor(); push(stack, arg.numel()); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::dim(Tensor self) -> int"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor arg = pop(stack).toTensor(); push(stack, arg.dim()); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::get_device(Tensor self) -> int"), - [](Stack* stack) { + [](Stack& stack) { RECORD_FUNCTION("get_device", std::vector()); auto result = at::get_device((std::move(peek(stack, 0, 1))).toTensor()); @@ -516,7 +516,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::storage_offset(Tensor self) -> int"), - [](Stack* stack) { + [](Stack& stack) { RECORD_FUNCTION("storage_offset", std::vector()); auto result = ((std::move(peek(stack, 0, 1))).toTensor()).storage_offset(); @@ -526,7 +526,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::is_contiguous(Tensor self) -> bool"), - [](Stack* stack) { + [](Stack& stack) { RECORD_FUNCTION("is_contiguous", std::vector()); auto result = ((std::move(peek(stack, 0, 1))).toTensor()).is_contiguous(); @@ -623,7 +623,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::eq.device(Device a, Device b) -> bool"), - [](Stack* stack) { + [](Stack& stack) { auto a = pop(stack).toDevice(); auto b = pop(stack).toDevice(); push(stack, a == b); @@ -631,7 +631,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::ne.device(Device a, Device b) -> bool"), - [](Stack* stack) { + [](Stack& stack) { auto a = pop(stack).toDevice(); auto b = pop(stack).toDevice(); push(stack, a != b); @@ -639,7 +639,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::eq.bool(bool a, bool b) -> bool"), - [](Stack* stack) { + [](Stack& stack) { auto a = pop(stack); auto b = pop(stack); push(stack, a == b); @@ -647,7 +647,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::ne.bool(bool a, bool b) -> bool"), - [](Stack* stack) { + [](Stack& stack) { auto a = pop(stack); auto b = pop(stack); push(stack, a != b); @@ -655,11 +655,11 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::Uninitialized() -> Any"), - [](Stack* stack) { push(stack, IValue::uninitialized()); }, + [](Stack& stack) { push(stack, IValue::uninitialized()); }, aliasAnalysisSpecialCase()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::Print(...) -> ()"), - [](Stack* stack) { + [](Stack& stack) { auto num_inputs = pop(stack).toInt(); std::stringstream ss; bool first = true; @@ -682,7 +682,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { // prim::VarConcat(Tensors..., dim) -> Tensor OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::VarConcat(...) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { auto num_inputs = pop(stack).toInt(); auto dim = pop(stack).toInt(); std::vector inputs(num_inputs - 1); @@ -694,7 +694,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::VarStack(...) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { auto num_inputs = pop(stack).toInt(); auto dim = pop(stack).toInt(); std::vector inputs(num_inputs - 1); @@ -707,7 +707,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::eq.enum(AnyEnumType a, AnyEnumType b) -> bool"), - [](Stack* stack) { + [](Stack& stack) { IValue x = pop(stack); IValue y = pop(stack); push(stack, x == y); @@ -716,7 +716,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::ne.enum(AnyEnumType a, AnyEnumType b) -> bool"), - [](Stack* stack) { + [](Stack& stack) { IValue x = pop(stack); IValue y = pop(stack); push(stack, x != y); @@ -731,7 +731,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::dequantize.tensor(Tensor qtensor) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor qtensor; pop(stack, qtensor); push(stack, at::dequantize(qtensor)); @@ -740,14 +740,14 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::dequantize.list(Tensor[] qtensors) -> Tensor[]"), - [](Stack* stack) { + [](Stack& stack) { auto qtensors = pop(stack).toTensorVector(); push(stack, at::dequantize(qtensors)); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::dequantize.any(Any tensors) -> Any"), - [](Stack* stack) { dequantize(*stack); }, + [](Stack& stack) { dequantize(stack); }, aliasAnalysisFromSchema()), DEFINE_UNARY_OP_WITH_COMPLEX(aten::log, std::log(a), float, float), DEFINE_STRING_OP(aten::add, a + b, str), @@ -847,7 +847,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { float), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::pow.int_to_int(int a, int b) -> int"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t a, b; pop(stack, a, b); @@ -860,7 +860,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { DEFINE_BINARY_OP(prim::max, a > b ? a : b), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::type(Device self) -> str"), - [](Stack* stack) { + [](Stack& stack) { auto d = pop(stack); push( stack, DeviceTypeName(d.toDevice().type(), /* lower_case=*/true)); @@ -869,7 +869,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { // tensor length op (size of 1st dimension) OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::len.Tensor(Tensor t) -> int"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor t = pop(stack).toTensor(); if (t.dim() == 0) { AT_ERROR("len() of a 0-d tensor"); @@ -879,7 +879,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::ord(str string) -> int"), - [](Stack* stack) { + [](Stack& stack) { auto string = pop(stack).toStringRef(); TORCH_CHECK( string.size() == 1, @@ -891,7 +891,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::lower(str self) -> str"), - [](Stack* stack) { + [](Stack& stack) { auto string = pop(stack).toStringRef(); std::stringstream ss; for (char c : string) { @@ -912,14 +912,14 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::len.str(str s) -> int"), - [](Stack* stack) { + [](Stack& stack) { auto string = pop(stack).toStringRef(); push(stack, static_cast(string.size())); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::dict() -> Dict(str, Tensor)"), - [](Stack* stack) { + [](Stack& stack) { auto dict = c10::impl::GenericDict(StringType::get(), TensorType::get()); push(stack, dict); @@ -928,7 +928,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::__getitem__.str(str s, int index) -> str"), - [](Stack* stack) { + [](Stack& stack) { auto index = pop(stack).toInt(); auto string = pop(stack).toStringRef(); auto norm_index = normalizeIndex(index, string.size()); @@ -941,7 +941,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { TORCH_SELECTIVE_SCHEMA("aten::copy_." #other_type \ "(Tensor(a!) self, " #other_type \ " other) -> Tensor(a!)"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ at::Tensor t; \ c_type other; \ pop(stack, t, other); \ @@ -957,7 +957,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::backward(Tensor self, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()"), - [](Stack* stack) { + [](Stack& stack) { bool create_graph = pop(stack).toBool(); auto retain_graph = pop(stack).toOptional(); IValue gradient_ivalue = pop(stack); @@ -977,7 +977,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::index(self, indices); @@ -987,7 +987,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::_index_put_impl_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)"), - [](Stack* stack) { + [](Stack& stack) { auto unsafe = pop(stack).toBool(); auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); @@ -1001,7 +1001,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::index_put_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)"), - [](Stack* stack) { + [](Stack& stack) { auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); auto indices = pop(stack).to>>(); @@ -1013,7 +1013,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::index_put.hacked_twin(Tensor self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); auto indices = pop(stack).to>>(); @@ -1026,7 +1026,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::to.prim_Device(Tensor(a) self, Device? device, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool non_blocking; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -1044,7 +1044,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::to.prim_dtype(Tensor(a) self, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool non_blocking; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -1060,7 +1060,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::is_cuda(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.is_cuda()); @@ -1068,7 +1068,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::is_xpu(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.is_xpu()); @@ -1076,7 +1076,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::data(Tensor(a) a) -> Tensor(a)"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, autograd::Variable(a).variable_data()); @@ -1113,7 +1113,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { #define DEFINE_STRING_IS_OP(op_name, char_op) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#op_name "(str self) -> bool"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ auto string = pop(stack).toStringRef(); \ push( \ stack, \ @@ -1134,7 +1134,7 @@ static const OperatorGeneratorArgs opGenArgs[] = { #define DEFINE_STRING_CHAR_MAP_OP(op_name, char_op) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#op_name "(str self) -> str"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ auto string = pop(stack).toStringRef(); \ std::stringstream ss; \ for (char c : string) { \ @@ -1183,7 +1183,7 @@ RegisterOperators reg(([]() { // operator below is intended to be as close to the Python // implementation in torch/csrc/utils/tensor_list.cpp as possible. [](const Node* /*node*/) -> Operation { - return [](Stack* stack) { + return [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int elem_ty_val; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -1260,19 +1260,19 @@ RegisterOperators reg(([]() { return v; })()); -void dictSetItem(Stack* stack) { +void dictSetItem(Stack& stack) { auto value = pop(stack); auto idx = pop(stack); auto dict = pop(stack).toGenericDict(); dict.insert_or_assign(std::move(idx), std::move(value)); } -void dictLen(Stack* stack) { +void dictLen(Stack& stack) { auto dict = pop(stack).toGenericDict(); push(stack, int64_t(dict.size())); } -void dictValues(Stack* stack) { +void dictValues(Stack& stack) { auto dict = pop(stack).toGenericDict(); auto values = c10::impl::GenericList(dict.valueType()); for (const auto& entry : dict) { @@ -1281,7 +1281,7 @@ void dictValues(Stack* stack) { push(stack, values); } -void dictKeys(Stack* stack) { +void dictKeys(Stack& stack) { auto dict = pop(stack).toGenericDict(); auto keys = c10::impl::GenericList(dict.keyType()); for (const auto& entry : dict) { @@ -1290,7 +1290,7 @@ void dictKeys(Stack* stack) { push(stack, keys); } -void dictIndex(Stack* stack) { +void dictIndex(Stack& stack) { auto key = pop(stack); auto dict = pop(stack).toGenericDict(); auto value = dict.find(key); @@ -1301,7 +1301,7 @@ void dictIndex(Stack* stack) { } template -void dictGet(Stack* stack) { +void dictGet(Stack& stack) { IValue default_value; if (has_default) { default_value = pop(stack); @@ -1318,7 +1318,7 @@ void dictGet(Stack* stack) { // If the key is in the dict, return it. Else set it to the default value and // return that. -void dictSetDefault(Stack* stack) { +void dictSetDefault(Stack& stack) { auto default_value = pop(stack); auto key = pop(stack); auto dict = pop(stack).toGenericDict(); @@ -1332,7 +1332,7 @@ void dictSetDefault(Stack* stack) { } template -void dictPop(Stack* stack) { +void dictPop(Stack& stack) { IValue default_value; if (has_default) { default_value = pop(stack); @@ -1355,13 +1355,13 @@ void dictPop(Stack* stack) { } } -void dictDelete(Stack* stack) { +void dictDelete(Stack& stack) { dictPop(stack); // pop pushes an item on the stack but delete does not, so get rid of it pop(stack); } -void dictPopItem(Stack* stack) { +void dictPopItem(Stack& stack) { auto dict = pop(stack).toGenericDict(); if (dict.size() == 0) { AT_ERROR("popitem(): dictionary is empty"); @@ -1376,18 +1376,18 @@ void dictPopItem(Stack* stack) { push(stack, tuple); } -void dictContains(Stack* stack) { +void dictContains(Stack& stack) { auto key = pop(stack); auto dict = pop(stack).toGenericDict(); push(stack, dict.contains(key)); } -void dictClear(Stack* stack) { +void dictClear(Stack& stack) { auto dict = pop(stack).toGenericDict(); dict.clear(); } -void dictUpdate(Stack* stack) { +void dictUpdate(Stack& stack) { auto to_add = pop(stack).toGenericDict(); auto dict = pop(stack).toGenericDict(); @@ -1396,7 +1396,7 @@ void dictUpdate(Stack* stack) { } } -void dictItems(Stack* stack) { +void dictItems(Stack& stack) { auto dict = pop(stack).toGenericDict(); auto key_type = dict.keyType(); auto value_type = dict.valueType(); @@ -1409,11 +1409,11 @@ void dictItems(Stack* stack) { push(stack, std::move(items)); } -void dictCopy(Stack* stack) { +void dictCopy(Stack& stack) { push(stack, pop(stack).toGenericDict().copy()); } -void dictConstructFromList(Stack* stack) { +void dictConstructFromList(Stack& stack) { auto input_list = pop(stack); auto list = input_list.toList(); auto tup_type = list.elementType()->expect(); @@ -2120,7 +2120,7 @@ TORCH_LIBRARY_IMPL(aten, CatchAll, m) { static const OperatorGeneratorArgs opGenArgs1[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::rangelist(int n) -> int[]"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t n; pop(stack, n); @@ -2136,7 +2136,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { // because all _to_tensor conversion have to have the same operator namet OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.bool(bool a) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool b; pop(stack, b); @@ -2145,21 +2145,21 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::device(str a) -> Device"), - [](Stack* stack) { + [](Stack& stack) { push(stack, c10::Device(pop(stack).toStringRef())); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::percentFormat(str self, ...) -> str"), - [](Stack* stack) { + [](Stack& stack) { size_t num_inputs = pop(stack).toInt(); - percentFormat(*stack, num_inputs); + percentFormat(stack, num_inputs); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::to.prim_other(Tensor(a) self, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor self; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool non_blocking; @@ -2174,7 +2174,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::requires_grad(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.requires_grad()); @@ -2182,7 +2182,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::grad(Tensor a) -> Tensor(*)"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.grad()); @@ -2190,7 +2190,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::is_sparse(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.is_sparse()); @@ -2198,7 +2198,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::is_sparse_csr(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.is_sparse_csr()); @@ -2206,7 +2206,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::is_mkldnn(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.is_mkldnn()); @@ -2214,7 +2214,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::is_mlc(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.is_mlc()); @@ -2222,7 +2222,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::is_vulkan(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.is_vulkan()); @@ -2230,7 +2230,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::is_quantized(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.is_quantized()); @@ -2238,7 +2238,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::is_meta(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.is_meta()); @@ -2246,7 +2246,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::is_ort(Tensor a) -> bool"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.is_ort()); @@ -2254,7 +2254,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::name(Tensor a) -> str?"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); if (a.name() == "") { @@ -2266,7 +2266,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::index(Device self) -> int?"), - [](Stack* stack) { + [](Stack& stack) { auto d = pop(stack).toDevice(); if (d.has_index()) { push(stack, d.index()); @@ -2279,11 +2279,11 @@ static const OperatorGeneratorArgs opGenArgs1[] = { // TODO return generator object when torchscript supports RNG // first-class TORCH_SELECTIVE_SCHEMA("aten::manual_seed(int seed) -> ()"), - [](Stack* stack) { at::manual_seed(pop(stack).toInt()); }, + [](Stack& stack) { at::manual_seed(pop(stack).toInt()); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::cuda(Tensor(a) self) -> Tensor(a|b)"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a; pop(stack, a); push(stack, a.cuda()); @@ -2291,12 +2291,12 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::AutogradZero() -> Tensor"), - [](Stack* stack) { stack->emplace_back(at::Tensor()); }, + [](Stack& stack) { stack.emplace_back(at::Tensor()); }, aliasAnalysisSpecialCase()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "prim::ReductionSizes(int[] size, int[] red_axes, bool keepdim = False) -> int[]"), - [](Stack* stack) { + [](Stack& stack) { bool keepdim = pop(stack).toBool(); c10::List axes = pop(stack).toIntList(); c10::List size = pop(stack).toIntList(); @@ -2324,7 +2324,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::BroadcastSizes(...) -> int[]"), - [](Stack* stack) { + [](Stack& stack) { auto num_inputs = pop(stack).toInt(); std::vector size; size.reserve(8); @@ -2339,7 +2339,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::warn(str message, int stacklevel=2) -> ()"), - [](Stack* stack) { + [](Stack& stack) { TORCH_CHECK(false, "warn is implemented directly in the interpreter"); }, aliasAnalysisFromSchema()), @@ -2347,7 +2347,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "onnx::Reshape(Tensor input, Tensor shape) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor input, shape; pop(stack, input, shape); shape = shape.contiguous(); @@ -2358,7 +2358,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { aliasAnalysisSpecialCase()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("onnx::Shape(Tensor t) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { auto t = pop(stack).toTensor(); at::IntArrayRef sizes = t.sizes(); auto sizes_tensor = torch::empty( @@ -2367,12 +2367,12 @@ static const OperatorGeneratorArgs opGenArgs1[] = { for (const auto i : c10::irange(sizes.size())) { accessor[i] = sizes[i]; } - stack->emplace_back(sizes_tensor); + stack.emplace_back(sizes_tensor); }, aliasAnalysisSpecialCase()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::AutogradAnyNonZero(...) -> bool"), - [](Stack* stack) { + [](Stack& stack) { auto num_inputs = pop(stack).toInt(); bool result = false; for (const IValue& v : last(stack, num_inputs)) { @@ -2395,12 +2395,12 @@ static const OperatorGeneratorArgs opGenArgs1[] = { } } drop(stack, num_inputs); - stack->emplace_back(result); + stack.emplace_back(result); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::AutogradAllZero(...) -> bool"), - [](Stack* stack) { + [](Stack& stack) { auto num_inputs = pop(stack).toInt(); bool result = true; for (const IValue& v : last(stack, num_inputs)) { @@ -2411,12 +2411,12 @@ static const OperatorGeneratorArgs opGenArgs1[] = { } } drop(stack, num_inputs); - stack->emplace_back(result); + stack.emplace_back(result); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::AutogradAllNonZero(...) -> bool"), - [](Stack* stack) { + [](Stack& stack) { auto num_inputs = pop(stack).toInt(); bool result = true; for (const IValue& v : last(stack, num_inputs)) { @@ -2427,31 +2427,31 @@ static const OperatorGeneratorArgs opGenArgs1[] = { } } drop(stack, num_inputs); - stack->emplace_back(result); + stack.emplace_back(result); }, aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::AutogradAdd(Any a, Any b) -> Any"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor a, b; pop(stack, a, b); // NOLINTNEXTLINE(bugprone-branch-clone) if (!a.defined() && !b.defined()) { // undef + undef == undef - stack->emplace_back(a); + stack.emplace_back(a); } else if (!a.defined()) { - stack->emplace_back(b); + stack.emplace_back(b); } else if (!b.defined()) { - stack->emplace_back(a); + stack.emplace_back(a); } else { - stack->emplace_back(a + b); + stack.emplace_back(a + b); } }, aliasAnalysisSpecialCase()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::_size_if_not_equal(int[] self_size, int[] other_size) -> int[]?"), - [](Stack* stack) { + [](Stack& stack) { IValue self_size, other_size; pop(stack, self_size, other_size); auto s = self_size.toIntVector(); @@ -2466,7 +2466,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::_unwrap_optional(t(a)? optional) -> t(a)"), - [](Stack* stack) { + [](Stack& stack) { auto val = pop(stack); TORCH_CHECK(!val.isNone(), "Unwrapping null optional"); push(stack, std::move(val)); @@ -2476,7 +2476,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = { RegisterOperators reg1( createOperators(opGenArgs1, sizeof(opGenArgs1) / sizeof(opGenArgs1[0]))); -void hashValue(Stack* stack) { +void hashValue(Stack& stack) { auto value = pop(stack); push(stack, value.hash()); } @@ -2618,7 +2618,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { #define DEFINE_CONVERT_BASE_OP(op_name, prefix, char_op) \ OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA(#op_name "(int i) -> str"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ auto i = pop(stack).toInt(); \ std::stringstream ss; \ if (i < 0) { \ @@ -2635,7 +2635,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::bin(int i) -> str"), - [](Stack* stack) { + [](Stack& stack) { auto i = pop(stack).toInt(); std::stringstream ss; if (i == 0) { @@ -2656,7 +2656,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "prim::StringIndex(str string, int index) -> str"), - [](Stack* stack) { + [](Stack& stack) { auto index = pop(stack).toInt(); auto string = pop(stack).toStringRef(); auto norm_index = normalizeIndex(index, string.size()); @@ -2666,7 +2666,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::chr(int i) -> str"), - [](Stack* stack) { + [](Stack& stack) { auto i = pop(stack).toInt(); std::stringstream ss; TORCH_CHECK( @@ -2684,7 +2684,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::modf(float a) -> (float, float)"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double a; pop(stack, a); @@ -2696,7 +2696,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::frexp(float a) -> (float, int)"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double a; pop(stack, a); @@ -2710,7 +2710,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::ldexp(float x, int i) -> float"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double a; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -2810,7 +2810,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { float), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::abs(Tensor x) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor x; pop(stack, x); push(stack, x.abs()); @@ -2833,7 +2833,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { float), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::_tensor_to_list(Tensor self) -> int[]"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor t; pop(stack, t); c10::List elems; @@ -2846,7 +2846,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::_list_to_tensor(int[] self) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { c10::List l = pop(stack).toIntList(); auto t = torch::empty( {static_cast(l.size())}, at::dtype(at::kInt)); @@ -2858,7 +2858,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::sum.int(int[] self) -> int"), - [](Stack* stack) { + [](Stack& stack) { c10::List l = pop(stack).toIntList(); auto sum = 0; for (const auto& elem : l) { @@ -2869,7 +2869,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::sum.float(float[] self) -> float"), - [](Stack* stack) { + [](Stack& stack) { c10::List l = pop(stack).toDoubleList(); auto sum = 0.0; for (const auto& elem : l) { @@ -2880,7 +2880,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::sum.complex(complex[] self) -> complex"), - [](Stack* stack) { + [](Stack& stack) { c10::List> l = pop(stack).toComplexDoubleList(); c10::complex sum = 0.0; for (const auto i : c10::irange(l.size())) { @@ -2891,7 +2891,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::sum.bool(bool[] self) -> int"), - [](Stack* stack) { + [](Stack& stack) { c10::List l = pop(stack).toBoolList(); auto sum = 0; for (const auto& elem : l) { @@ -2904,7 +2904,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::any.str(str[] self) -> bool"), - [](Stack* stack) { + [](Stack& stack) { auto l = pop(stack).toList(); for (const auto& elem : l) { if (elem != "") { @@ -2917,7 +2917,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::any.int(int[] self) -> bool"), - [](Stack* stack) { + [](Stack& stack) { c10::List l = pop(stack).toIntList(); for (const auto& elem : l) { if (elem) { @@ -2930,7 +2930,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::any.float(float[] self) -> bool"), - [](Stack* stack) { + [](Stack& stack) { c10::List l = pop(stack).toDoubleList(); for (const auto& elem : l) { if (elem) { @@ -2943,7 +2943,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::any.bool(bool[] self) -> bool"), - [](Stack* stack) { + [](Stack& stack) { c10::List l = pop(stack).toBoolList(); for (const auto& elem : l) { if (elem) { @@ -2956,7 +2956,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::all.int(int[] self) -> bool"), - [](Stack* stack) { + [](Stack& stack) { c10::List l = pop(stack).toIntList(); for (const auto& elem : l) { if (!elem) { @@ -2969,7 +2969,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::all.float(float[] self) -> bool"), - [](Stack* stack) { + [](Stack& stack) { c10::List l = pop(stack).toDoubleList(); for (const auto& elem : l) { if (!elem) { @@ -2982,7 +2982,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::all.bool(bool[] self) -> bool"), - [](Stack* stack) { + [](Stack& stack) { c10::List l = pop(stack).toBoolList(); for (const auto& elem : l) { if (!elem) { @@ -2995,7 +2995,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("aten::divmod.int(int x, int y) -> (int, int)"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t a, b; lldiv_t divresult = {}; @@ -3018,7 +3018,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::divmod.float(float x, float y) -> (float, float)"), - [](Stack* stack) { + [](Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double a, b; pop(stack, a, b); @@ -3035,7 +3035,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA("prim::id(AnyClassType? x) -> int"), - [](Stack* stack) { + [](Stack& stack) { IValue a; pop(stack, a); if (a.isNone()) { @@ -3050,7 +3050,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::divmod." #type_a "_" #type_b "(" #type_a \ " x," #type_b " y) -> (float, float)"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ type_a a; \ type_b b; \ pop(stack, a, b); \ @@ -3076,7 +3076,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_a "_" #type_b "(" #type_a \ " x," #type_b " y) -> complex"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ actual_type_a a; \ actual_type_b b; \ pop(stack, a, b); \ @@ -3090,7 +3090,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_a "_" #type_b "(" #type_a \ " x," #type_b " y) -> complex"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ actual_type_a a; \ actual_type_b b; \ pop(stack, a, b); \ @@ -3101,7 +3101,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = { OperatorGeneratorArgs( \ TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_b "_" #type_a \ "(" #type_b " x," #type_a " y) -> complex"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ actual_type_b a; \ actual_type_a b; \ pop(stack, a, b); \ diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp index 43c278be474fd..e43c7c052a673 100644 --- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp @@ -31,7 +31,7 @@ RegisterOperators reg( {Operator( prim::profile, [](const Node* node) -> Operation { - return [](Stack* stack) { + return [](Stack& stack) { AT_ERROR( "Must be lowered to Interpreter's PROFILE instruction"); // NOLINT }; @@ -40,7 +40,7 @@ RegisterOperators reg( Operator( prim::profile_ivalue, [](const Node* node) -> Operation { - return [](Stack* stack) { + return [](Stack& stack) { AT_ERROR( "Must be lowered to Interpreter's PROFILE instruction"); // NOLINT }; @@ -50,9 +50,9 @@ RegisterOperators reg( prim::FusionGroup, [](const Node* node) -> Operation { const auto key = registerFusion(node); - return [key](Stack* stack) { + return [key](Stack& stack) { RECORD_FUNCTION("FusionGroup", std::vector()); - runFusion(key, *stack); + runFusion(key, stack); }; }, aliasAnalysisSpecialCase()), @@ -67,7 +67,7 @@ RegisterOperators reg( t->castRaw()->requiresGrad().has_value()); return *t->castRaw()->requiresGrad(); }); - return [rg_props](Stack* stack) { + return [rg_props](Stack& stack) { auto num_inputs = rg_props.size(); // Check every input's shape against profiled (expected) shape. for (const auto i : c10::irange(num_inputs)) { @@ -91,14 +91,14 @@ RegisterOperators reg( auto outputs_used = fmap(node->outputs(), [](const Value* v) { return v->uses().size() > 0; }); - return [=](Stack* stack) { + return [=](Stack& stack) { RECORD_FUNCTION("chunk", last(stack, 1)); at::Tensor t; pop(stack, t); auto result = at::chunk(t, chunks, dim); - stack->insert( - stack->end(), + stack.insert( + stack.end(), std::make_move_iterator(result.begin()), std::make_move_iterator(result.end())); // NB: Chunk can sometimes return a smaller number of outputs. @@ -121,7 +121,7 @@ RegisterOperators reg( num_results); // We know that the output is unused, so it's ok to push // anything on the stack. - stack->emplace_back(); + stack.emplace_back(); } } }; @@ -132,7 +132,7 @@ RegisterOperators reg( [](const Node* node) -> Operation { int64_t raw_dim = node->i(attr::dim); int64_t chunks = node->i(attr::chunks); - return [raw_dim, chunks](Stack* stack) { + return [raw_dim, chunks](Stack& stack) { c10::List shape = pop(stack).toIntList(); c10::List regular_shape = shape.copy(); c10::List last_shape = shape.copy(); @@ -158,7 +158,7 @@ RegisterOperators reg( aliasAnalysisSpecialCase()), Operator( "aten::_grad_sum_to_size(Tensor(a) self, int[]? size) -> Tensor(a)", - [](Stack* stack) { + [](Stack& stack) { RECORD_FUNCTION("_grad_sum_to_size", std::vector()); IValue self, size; pop(stack, self, size); @@ -175,7 +175,7 @@ RegisterOperators reg( OperatorGenerator( TORCH_SELECTIVE_SCHEMA( "prim::ModuleContainerIndex.list(Any self, int ind) -> Any"), - [](Stack* stack) { + [](Stack& stack) { IValue ind = pop(stack); IValue module_dict = pop(stack); std::stringstream ss; @@ -189,7 +189,7 @@ RegisterOperators reg( OperatorGenerator( TORCH_SELECTIVE_SCHEMA( "prim::ModuleContainerIndex.dict(Any self, str ind) -> Any"), - [](Stack* stack) { + [](Stack& stack) { IValue ind = pop(stack); IValue module_dict = pop(stack); push(stack, module_dict.toModule().attr(ind.toStringRef())); @@ -198,7 +198,7 @@ RegisterOperators reg( Operator( prim::TypeCheck /* (...) -> (..., bool) */, [](const Node* /* node */) -> Operation { - return [](Stack* /* stack */) { + return [](Stack& /* stack */) { AT_ERROR("prim::TypeCheck not yet implemented"); // NOLINT }; }, @@ -206,7 +206,7 @@ RegisterOperators reg( Operator( prim::FallbackGraph, [](const Node* node) -> Operation { - return [](Stack* stack) { + return [](Stack& stack) { AT_ERROR( "Must be converted to prim::FunctionCall by replaceFallbackGraphWithFallbackFunction"); // NOLINT }; @@ -214,17 +214,17 @@ RegisterOperators reg( aliasAnalysisSpecialCase()), Operator( "prim::Guard(Tensor(a) t) -> Tensor(a)", - [](Stack* stack) { AT_ERROR("Should be replaced by prim::BailOut"); }, + [](Stack& stack) { AT_ERROR("Should be replaced by prim::BailOut"); }, aliasAnalysisFromSchema()), Operator( "prim::BailOut(...) -> Tensor(a)", - [](Stack* /* stack */) { + [](Stack& /* stack */) { AT_ERROR("prim::BailOut not yet implemented"); // NOLINT }, aliasAnalysisFromSchema()), Operator( "prim::BailoutTemplate() -> int", - [](Stack* stack) { + [](Stack& stack) { // TODO: today, we put a single bailout template at the front to // carry the un-optimized graph for bailout nodes to use. Ideally // this should never run, but we haven't written the code to remove @@ -237,7 +237,7 @@ RegisterOperators reg( aliasAnalysisFromSchema()), Operator( "aten::grad(Tensor[] outputs, Tensor[] inputs, Tensor?[]? grad_outputs=None, bool? retain_graph=None, bool create_graph=False, bool allow_unused=False) -> Tensor?[]", - [](Stack* stack) { + [](Stack& stack) { bool allow_unused = pop(stack).toBool(); bool create_graph = pop(stack).toBool(); auto retain_graph = pop(stack).toOptional(); @@ -277,7 +277,7 @@ RegisterOperators reg( // create_graph=True so we use aliasAnalysisConservative for these two OPs Operator( "aten::backward.TensorList(Tensor[] tensors, Tensor?[]? grad_tensors=None, bool? retain_graph=None, bool create_graph=False) -> ()", - [](Stack* stack) { + [](Stack& stack) { bool create_graph = pop(stack).toBool(); auto retain_graph = pop(stack).toOptional(); auto grad_tensors = pop(stack); @@ -298,7 +298,7 @@ RegisterOperators reg( aliasAnalysisConservative()), Operator( "aten::save(t item, str filename) -> ()", - [](Stack* stack) { + [](Stack& stack) { auto filename = pop(stack).toStringRef(); auto ivalue = pop(stack); @@ -312,7 +312,7 @@ RegisterOperators reg( aliasAnalysisFromSchema()), Operator( "prim::IgnoredPythonOp(...) -> None", - [](Stack* stack) { + [](Stack& stack) { throw JITException( "This Python function is annotated to be ignored" " and cannot be and has not been included in the exported" @@ -323,7 +323,7 @@ RegisterOperators reg( aliasAnalysisFromSchema()), Operator( "aten::wait(Future(t) self) -> t", - [](Stack* stack) { + [](Stack& stack) { TORCH_CHECK( false, "wait is implemented directly in the interpreter"); }, @@ -332,7 +332,7 @@ RegisterOperators reg( RegisterOperators logging_operators( {Operator( "prim::AddStatValue(str key, int val) -> ()", - [](Stack* stack) { + [](Stack& stack) { auto val = pop(stack).toInt(); auto key = pop(stack).toString(); @@ -353,7 +353,7 @@ RegisterOperators logging_operators( aliasAnalysisFromSchema()), Operator( "prim::TimePoint() -> int", - [](Stack* stack) { + [](Stack& stack) { auto schema = parseSchema("prim::TimePoint() -> int"); Node* node = nullptr; // TODO: remove this custom tracing code once the custom op bugfix @@ -372,7 +372,7 @@ RegisterOperators logging_operators( }, aliasAnalysisFromSchema())}); -C10_UNUSED void hashValue(Stack* stack) { +C10_UNUSED void hashValue(Stack& stack) { auto value = pop(stack); push(stack, value.hash()); } @@ -453,7 +453,7 @@ bool isSortableListOfObjectsOrTuples( } template -void sort_op(Stack* stack) { +void sort_op(Stack& stack) { bool reverse = has_reverse_arg ? pop(stack).toBool() : false; auto g_list = pop(stack).toList(); @@ -697,7 +697,7 @@ at::Tensor interpolate( ") "); } -void interpolate_op(Stack* stack) { +void interpolate_op(Stack& stack) { at::Tensor input; IValue size; IValue scale_factors; @@ -743,7 +743,7 @@ IValue convert_scale_factor_to_double(const IValue& int_ivalue) { return scale_factor_double; } -void upsample_nearest_op(Stack* stack) { +void upsample_nearest_op(Stack& stack) { at::Tensor input; IValue size; IValue scale_factor_int; @@ -754,7 +754,7 @@ void upsample_nearest_op(Stack* stack) { push(stack, std::move(res)); } -void upsample_op(Stack* stack) { +void upsample_op(Stack& stack) { at::Tensor input; IValue size; IValue scale_factor_int; @@ -772,7 +772,7 @@ void upsample_op(Stack* stack) { push(stack, std::move(res)); } -void upsample_bilinear_op(Stack* stack) { +void upsample_bilinear_op(Stack& stack) { at::Tensor input; IValue size; IValue scale_factor_int; diff --git a/torch/csrc/jit/runtime/register_special_ops.cpp b/torch/csrc/jit/runtime/register_special_ops.cpp index ace87f20b9c35..015d607044ddb 100644 --- a/torch/csrc/jit/runtime/register_special_ops.cpp +++ b/torch/csrc/jit/runtime/register_special_ops.cpp @@ -184,7 +184,7 @@ void recursiveStore( } template -void createTensorFromList(Stack* stack) { +void createTensorFromList(Stack& stack) { // torch.tensor has a fourth requires_grad arg but torch.as_tensor not, so // we use the template arg to distinguish between these two cases // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -246,7 +246,7 @@ RegisterOperators reg({ OperatorGenerator( TORCH_SELECTIVE_SCHEMA( "aten::split(Tensor self, int[] split_sizes, int dim=0) -> Tensor[]"), - [](Stack* stack) { + [](Stack& stack) { RECORD_FUNCTION("split_with_sizes", last(stack, 3)); auto result = at::split_with_sizes( @@ -264,7 +264,7 @@ RegisterOperators reg({ "aten::tensor." #operator_type "(" #operator_type \ " t, *, ScalarType? dtype=None, Device? device=None" \ ", bool requires_grad=False) -> Tensor"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ c_type scalar_val; \ IValue dtype; \ IValue device; \ @@ -280,7 +280,7 @@ RegisterOperators reg({ TORCH_SELECTIVE_SCHEMA( \ "aten::as_tensor." #operator_type "(" #operator_type \ " t, *, ScalarType? dtype=None, Device? device=None) -> Tensor"), \ - [](Stack* stack) { \ + [](Stack& stack) { \ c_type scalar_val; \ IValue dtype; \ IValue device; \ @@ -319,7 +319,7 @@ RegisterOperators reg({ // tensor_new.cpp OperatorGenerator( TORCH_SELECTIVE_SCHEMA("aten::_infer_size(int[] a, int[] b) -> int[]"), - [](Stack* stack) { + [](Stack& stack) { auto a = pop(stack); auto b = pop(stack); push(stack, at::infer_size(a.toIntVector(), b.toIntVector())); @@ -328,7 +328,7 @@ RegisterOperators reg({ OperatorGenerator( TORCH_SELECTIVE_SCHEMA( "aten::_no_grad_embedding_renorm_(Tensor weight, Tensor input, float max_norm, float norm_type) -> Tensor"), - [](Stack* stack) { + [](Stack& stack) { at::Tensor weight; at::Tensor input; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -353,7 +353,7 @@ RegisterOperators reg({ OperatorGenerator( TORCH_SELECTIVE_SCHEMA( "aten::as_tensor(Tensor(a) data, *, ScalarType? dtype=None, Device? device=None) -> Tensor(a|b)"), - [](Stack* stack) { + [](Stack& stack) { auto device = pop(stack).toOptional(); auto dtype = pop(stack).toOptional(); at::Tensor data = pop(stack).toTensor(); @@ -377,24 +377,24 @@ RegisterOperators reg({ TORCH_SELECTIVE_SCHEMA( "aten::_pack_sequence(Tensor output, Tensor batch_sizes, Tensor? sorted_indices, " "Tensor? unsorted_indices) -> (Tensor, Tensor, Tensor?, Tensor?)"), - [](Stack* stack) {}, + [](Stack& stack) {}, aliasAnalysisFromSchema()), OperatorGenerator( TORCH_SELECTIVE_SCHEMA("aten::_get_tracing_state() -> bool"), - [](Stack* stack) { push(stack, false); }, + [](Stack& stack) { push(stack, false); }, aliasAnalysisFromSchema()), OperatorGenerator( TORCH_SELECTIVE_SCHEMA("aten::is_scripting() -> bool"), - [](Stack* stack) { push(stack, true); }, + [](Stack& stack) { push(stack, true); }, aliasAnalysisFromSchema()), OperatorGenerator( TORCH_SELECTIVE_SCHEMA("aten::has_torch_function(...) -> bool"), - [](Stack* stack) { push(stack, false); }, + [](Stack& stack) { push(stack, false); }, aliasAnalysisFromSchema()), OperatorGenerator( TORCH_SELECTIVE_SCHEMA( "aten::_no_grad_uniform_(Tensor(a!) tensor, float a, float b) -> Tensor(a!)"), - [](Stack* stack) { + [](Stack& stack) { // TODO: remove when script supports setting grad mode torch::NoGradGuard no_grad; @@ -410,7 +410,7 @@ RegisterOperators reg({ OperatorGenerator( TORCH_SELECTIVE_SCHEMA( "aten::_no_grad_normal_(Tensor(a!) tensor, float mean, float std) -> Tensor(a!)"), - [](Stack* stack) { + [](Stack& stack) { // TODO: remove when script supports setting grad mode torch::NoGradGuard no_grad; @@ -426,7 +426,7 @@ RegisterOperators reg({ OperatorGenerator( TORCH_SELECTIVE_SCHEMA( "aten::_no_grad_fill_(Tensor(a!) tensor, float val) -> Tensor(a!)"), - [](Stack* stack) { + [](Stack& stack) { // TODO: remove when script supports setting grad mode torch::NoGradGuard no_grad; @@ -440,7 +440,7 @@ RegisterOperators reg({ OperatorGenerator( TORCH_SELECTIVE_SCHEMA( "aten::_no_grad_zero_(Tensor(a!) tensor) -> Tensor(a!)"), - [](Stack* stack) { + [](Stack& stack) { // TODO: remove when script supports setting grad mode torch::NoGradGuard no_grad; @@ -451,11 +451,11 @@ RegisterOperators reg({ aliasAnalysisFromSchema()), Operator( "aten::is_grad_enabled() -> bool", - [](Stack* stack) { push(stack, torch::GradMode::is_enabled()); }, + [](Stack& stack) { push(stack, torch::GradMode::is_enabled()); }, aliasAnalysisConservative()), Operator( "aten::set_grad_enabled(bool val) -> ()", - [](Stack* stack) { torch::GradMode::set_enabled(pop(stack).toBool()); }, + [](Stack& stack) { torch::GradMode::set_enabled(pop(stack).toBool()); }, aliasAnalysisConservative()), }); } // namespace diff --git a/torch/csrc/jit/runtime/static/fusion.cpp b/torch/csrc/jit/runtime/static/fusion.cpp index b08b59fc6890a..0b41b8e48a345 100644 --- a/torch/csrc/jit/runtime/static/fusion.cpp +++ b/torch/csrc/jit/runtime/static/fusion.cpp @@ -39,7 +39,7 @@ Operation createStaticSubgraphRuntime(const Node* node) { auto g = node->g(attr::Subgraph); auto module = std::make_shared(g); auto num_inputs = module->num_inputs(); - return [module, num_inputs](Stack* stack) { + return [module, num_inputs](Stack& stack) { RECORD_FUNCTION("Static Runtime", std::vector()); auto inps = torch::jit::last(stack, num_inputs); // TODO maybe avoid call to vec @@ -48,10 +48,10 @@ Operation createStaticSubgraphRuntime(const Node* node) { if (module->num_outputs() > 1) { for (auto& o : outputs.toTuple()->elements()) { - push_one(*stack, std::move(o)); + push_one(stack, std::move(o)); } } else { - push_one(*stack, std::move(outputs)); + push_one(stack, std::move(outputs)); } return 0; }; diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index ee8e9038b1c48..e22447819ea67 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -1440,7 +1440,7 @@ void ProcessedNode::run() { } DCHECK(op_); - op_->operator()(&stack); + op_->operator()(stack); DCHECK_EQ(stack.size(), node_->outputs().size()); for (const auto i : c10::irange(node_->outputs().size())) { From 5b0dfd0f8aff50e2fce8f2f1fe6f2ef0594a9e25 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 30 Aug 2021 12:14:09 -0700 Subject: [PATCH 350/530] Fix bad use of channels last kernel in sync batch norm backward (#64100) Summary: Fixes https://github.com/pytorch/pytorch/issues/64039 There are two distinct problems here. 1. If `grad_output` is channels last but not input, then input would be read as-if it were channels last. So reading the wrong values. 2. `use_channels_last_kernels` doesn't guarunte that `suggest_memory_format` will actually return channels last, so use `empty_like` instead so the strides always match. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64100 Reviewed By: mruberry Differential Revision: D30622127 Pulled By: ngimel fbshipit-source-id: e28cc57215596817f1432fcdd6c49d69acfedcf2 --- aten/src/ATen/native/cuda/Normalization.cu | 4 +- aten/src/ATen/native/cuda/Normalization.cuh | 6 ++- test/test_nn.py | 42 +++++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index 0238b1b682877..1d4d1cc4bda4e 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -648,7 +648,9 @@ Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, c c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - if (at::cuda::detail::canUse32BitIndexMath(self) && batch_norm_use_channels_last_kernels(self)){ + if (at::cuda::detail::canUse32BitIndexMath(self) && + batch_norm_use_channels_last_kernels(self) && + batch_norm_use_channels_last_kernels(input)) { return batch_norm_backward_elemt_channels_last_cuda_template(self, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count); } diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh index af074f5d2c6fd..6daa2b0858044 100644 --- a/aten/src/ATen/native/cuda/Normalization.cuh +++ b/aten/src/ATen/native/cuda/Normalization.cuh @@ -1649,7 +1649,8 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template( const auto stride = input.sizes()[1]; const auto reduction_size = input.numel() / stride; - at::Tensor grad_input = at::empty_like(input, input.suggest_memory_format()); + // Input is guarunteed to be channels-last compatible + at::Tensor grad_input = at::empty_like(input); dim3 block; dim3 grid; @@ -1716,7 +1717,8 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template( const auto reduction_size = input.numel() / stride; auto norm_fct = 1.0 / reduction_size; - at::Tensor grad_input = at::empty_like(input, input.suggest_memory_format()); + // Input is guarunteed to be channels-last compatible + at::Tensor grad_input = at::empty_like(input); dim3 block; dim3 grid; diff --git a/test/test_nn.py b/test/test_nn.py index bb4dd59be5271..c9815dbf2ee0e 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -11192,6 +11192,48 @@ def test_convert_sync_batchnorm(self): self.assertEqual(layer.state_dict()[key].device, converted_layer.state_dict()[key].device) self.assertEqual(layer.state_dict()[key], converted_layer.state_dict()[key]) + @unittest.skipIf(not TEST_CUDA, "CUDA not available") + def test_sync_batchnorm_backward_elemt(self): + device = 'cuda' + saved_input = torch.rand(2, 3, 2, 1, device=device) + grad_output = torch.rand(2, 3, 2, 1, device=device) + mean = torch.rand(3, device=device) + invstd = torch.rand(3, device=device) + weight = torch.rand(3, device=device) + sum_dy = torch.rand(3, device=device) + sum_dy_xmu = torch.rand(3, device=device) + count_tensor = torch.tensor([5, 5, 5], dtype=torch.int32, device=device) + + gI_contiguous = torch.batch_norm_backward_elemt( + grad_output, + saved_input, + mean, + invstd, + weight, + sum_dy, + sum_dy_xmu, + count_tensor + ) + + # Test batch_norm_backward_elemt gives the same answer for all + # combinations of contiguous as channels_last input + for a, b in [ + (torch.channels_last, torch.contiguous_format), + (torch.contiguous_format, torch.channels_last), + (torch.channels_last, torch.channels_last), + ]: + gI_actual = torch.batch_norm_backward_elemt( + grad_output.contiguous(memory_format=a), + saved_input.contiguous(memory_format=b), + mean, + invstd, + weight, + sum_dy, + sum_dy_xmu, + count_tensor + ) + self.assertEqual(gI_actual, gI_contiguous) + @unittest.skipIf(not TEST_CUDA, "CUDA not available") def test_sync_batchnorm_accuracy_cuda(self): # The target of this test is to test the functionality and accuracy of From d37636901ed1c65c1f8b68e36e37e59eb503c554 Mon Sep 17 00:00:00 2001 From: Kushashwa Ravi Shrimali Date: Mon, 30 Aug 2021 12:16:23 -0700 Subject: [PATCH 351/530] [Doc] `make_tensor` to `torch.testing` module (#63925) Summary: This PR aims to add `make_tensor` to the `torch.testing` module in PyTorch docs. TODOs: * [x] Add examples cc: pmeier mruberry brianjo Pull Request resolved: https://github.com/pytorch/pytorch/pull/63925 Reviewed By: ngimel Differential Revision: D30633487 Pulled By: mruberry fbshipit-source-id: 8e5a1f880c6ece5925b4039fee8122bd739538af --- docs/source/testing.rst | 1 + test/test_autograd.py | 3 +- test/test_binary_ufuncs.py | 4 +- test/test_buffer_protocol.py | 7 +- test/test_foreach.py | 4 +- test/test_indexing.py | 3 +- test/test_jit.py | 3 +- test/test_linalg.py | 4 +- test/test_ops.py | 4 +- test/test_reductions.py | 4 +- test/test_shape_ops.py | 3 +- test/test_sort_and_select.py | 4 +- test/test_sparse.py | 3 +- test/test_sparse_csr.py | 4 +- test/test_tensor_creation_ops.py | 3 +- test/test_testing.py | 3 +- test/test_torch.py | 3 +- test/test_unary_ufuncs.py | 4 +- test/test_view_ops.py | 3 +- torch/testing/__init__.py | 1 + torch/testing/_creation.py | 155 ++++++++++++++++++ .../_internal/common_methods_invocations.py | 4 +- torch/testing/_internal/common_modules.py | 26 +-- torch/testing/_internal/common_utils.py | 100 +---------- 24 files changed, 213 insertions(+), 140 deletions(-) create mode 100644 torch/testing/_creation.py diff --git a/docs/source/testing.rst b/docs/source/testing.rst index 981a636c53390..9f1e2c3c53f89 100644 --- a/docs/source/testing.rst +++ b/docs/source/testing.rst @@ -9,3 +9,4 @@ torch.testing .. automodule:: torch.testing .. autofunction:: assert_close +.. autofunction:: make_tensor diff --git a/test/test_autograd.py b/test/test_autograd.py index 4d416459c2af4..364d48807b737 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -24,13 +24,14 @@ from torch.autograd.profiler_util import (_format_time, EventList, FunctionEvent, FunctionEventAvg) import torch.autograd.functional as autogradF from torch.utils.checkpoint import checkpoint +from torch.testing import make_tensor from torch.testing._internal.common_cuda import TEST_CUDA from torch.testing._internal.common_utils import (TestCase, run_tests, skipIfNoLapack, suppress_warnings, slowTest, load_tests, IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck, TEST_WITH_ROCM, disable_gc, - gradcheck, gradgradcheck, make_tensor) + gradcheck, gradgradcheck) from torch.autograd import Variable, Function, detect_anomaly, kineto_available from torch.autograd.function import InplaceFunction import torch.autograd.forward_ad as fwAD diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index 4995e0dfc6cc7..1e9e804ab86d1 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -13,12 +13,12 @@ from torch._six import inf, nan from torch.testing._internal.common_utils import ( TestCase, iter_indices, TEST_WITH_ASAN, run_tests, - torch_to_numpy_dtype_dict, make_tensor, TEST_SCIPY, set_default_dtype) + torch_to_numpy_dtype_dict, TEST_SCIPY, set_default_dtype) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyOnCPUAndCUDA, skipCUDAIfRocm, skipIf, ops) -from torch.testing import all_types_and_complex_and, integral_types_and +from torch.testing import all_types_and_complex_and, integral_types_and, make_tensor from torch.testing._internal.common_methods_invocations import binary_ufuncs if TEST_SCIPY: diff --git a/test/test_buffer_protocol.py b/test/test_buffer_protocol.py index c797b913f033c..619386e6d5665 100644 --- a/test/test_buffer_protocol.py +++ b/test/test_buffer_protocol.py @@ -1,4 +1,5 @@ import torch.testing._internal.common_utils as common +from torch.testing import make_tensor from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, dtypes @@ -23,7 +24,7 @@ def _run_test(self, shape, dtype, count=-1, first=0, offset=None, **kwargs): if offset is None: offset = first * get_dtype_size(dtype) - numpy_original = common.make_tensor(shape, torch.device("cpu"), dtype).numpy() + numpy_original = make_tensor(shape, torch.device("cpu"), dtype).numpy() original = memoryview(numpy_original) # First call PyTorch's version in case of errors. # If this call exits successfully, the NumPy version must also do so. @@ -125,7 +126,7 @@ def test_invalid_positional_args(self, device, dtype): @dtypes(*common.torch_to_numpy_dtype_dict.keys()) def test_shared_buffer(self, device, dtype): - x = common.make_tensor((1,), device, dtype) + x = make_tensor((1,), device, dtype) # Modify the whole tensor arr, tensor = self._run_test(SHAPE, dtype) tensor[:] = x @@ -158,7 +159,7 @@ def test_not_a_buffer(self, device, dtype): @dtypes(*common.torch_to_numpy_dtype_dict.keys()) def test_non_writable_buffer(self, device, dtype): - numpy_arr = common.make_tensor((1,), device, dtype).numpy() + numpy_arr = make_tensor((1,), device, dtype).numpy() byte_arr = numpy_arr.tobytes() with self.assertWarnsOnceRegex(UserWarning, r"The given buffer is not writable."): diff --git a/test/test_foreach.py b/test/test_foreach.py index ce9b0d7ee55e3..123ef35bb7093 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -4,11 +4,13 @@ import re import torch import unittest + +from torch.testing import make_tensor from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ROCM, TEST_WITH_SLOW from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, onlyCUDA, skipCUDAIfRocm, skipMeta, ops) from torch.testing._internal.common_methods_invocations import \ - (foreach_unary_op_db, foreach_binary_op_db, foreach_pointwise_op_db, foreach_minmax_op_db, make_tensor) + (foreach_unary_op_db, foreach_binary_op_db, foreach_pointwise_op_db, foreach_minmax_op_db) # Includes some values such that N * N won't be a multiple of 4, # which should ensure we test the vectorized and non-vectorized diff --git a/test/test_indexing.py b/test/test_indexing.py index 61580910f2cfb..8b8a2ead9ed72 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -8,7 +8,8 @@ import numpy as np -from torch.testing._internal.common_utils import TestCase, run_tests, make_tensor +from torch.testing import make_tensor +from torch.testing._internal.common_utils import TestCase, run_tests from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCUDA, dtypes, dtypesIfCPU, dtypesIfCUDA, onlyOnCPUAndCUDA) diff --git a/test/test_jit.py b/test/test_jit.py index 2595411c01848..d1a170da6f750 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -69,8 +69,7 @@ from torch.autograd import Variable from torch.jit.annotations import BroadcastingList2, BroadcastingList3, Any # noqa: F401 from torch.nn.utils.rnn import PackedSequence -from torch.testing import FileCheck -from torch.testing._internal.common_utils import make_tensor +from torch.testing import FileCheck, make_tensor import torch.autograd.profiler import torch.cuda import torch.jit diff --git a/test/test_linalg.py b/test/test_linalg.py index 8ba3373d38ce4..f7ce39272bf86 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -14,14 +14,14 @@ from torch.testing._internal.common_utils import \ (TestCase, run_tests, TEST_SCIPY, IS_MACOS, IS_WINDOWS, slowTest, - TEST_WITH_ASAN, make_tensor, TEST_WITH_ROCM, IS_FBCODE, IS_REMOTE_GPU, + TEST_WITH_ASAN, TEST_WITH_ROCM, IS_FBCODE, IS_REMOTE_GPU, iter_indices, gradcheck, gradgradcheck) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, onlyCPU, skipCUDAIf, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfRocm, onlyOnCPUAndCUDA, dtypesIfCUDA, onlyCUDA, skipCUDAVersionIn, skipMeta, skipCUDAIfNoCusolver) -from torch.testing import floating_and_complex_types, floating_types, all_types +from torch.testing import floating_and_complex_types, floating_types, all_types, make_tensor from torch.testing._internal.common_cuda import SM53OrLater, tf32_on_and_off, CUDA11OrLater, CUDA9 from torch.distributions.binomial import Binomial diff --git a/test/test_ops.py b/test/test_ops.py index a6baf8dbe699a..27aee72f00846 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -5,9 +5,9 @@ import torch from torch.testing import \ - (FileCheck, floating_and_complex_types_and, get_all_dtypes) + (FileCheck, floating_and_complex_types_and, get_all_dtypes, make_tensor) from torch.testing._internal.common_utils import \ - (TestCase, is_iterable_of_tensors, run_tests, IS_SANDCASTLE, clone_input_helper, make_tensor, + (TestCase, is_iterable_of_tensors, run_tests, IS_SANDCASTLE, clone_input_helper, gradcheck, gradgradcheck, IS_IN_CI, suppress_warnings) from torch.testing._internal.common_methods_invocations import \ (op_db, _NOTHING, UnaryUfuncInfo, ReductionOpInfo, SpectralFuncInfo) diff --git a/test/test_reductions.py b/test/test_reductions.py index eed7f732051cd..ca3042b66cf91 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -10,10 +10,10 @@ from torch._six import inf, nan from torch.testing import ( - integral_types_and, floating_and_complex_types_and, get_all_dtypes) + integral_types_and, floating_and_complex_types_and, get_all_dtypes, make_tensor) from torch.testing._internal.common_utils import ( TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict, - IS_WINDOWS, make_tensor) + IS_WINDOWS) from torch.testing._internal.common_device_type import ( OpDTypes, instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, onlyOnCPUAndCUDA, onlyCUDA, largeTensorTest, ops, precisionOverride) diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py index 916adee666307..cb4ec3c18f82a 100644 --- a/test/test_shape_ops.py +++ b/test/test_shape_ops.py @@ -7,8 +7,9 @@ import warnings from torch._six import nan +from torch.testing import make_tensor from torch.testing._internal.common_utils import ( - TestCase, run_tests, make_tensor, torch_to_numpy_dtype_dict) + TestCase, run_tests, torch_to_numpy_dtype_dict) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCPU, onlyCUDA, dtypes, onlyOnCPUAndCUDA, dtypesIfCPU, dtypesIfCUDA, largeTensorTest) diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py index 564258aa77b51..e562e389a3fc8 100644 --- a/test/test_sort_and_select.py +++ b/test/test_sort_and_select.py @@ -5,9 +5,9 @@ from torch._six import nan from itertools import permutations, product -from torch.testing import all_types, all_types_and +from torch.testing import all_types, all_types_and, make_tensor from torch.testing._internal.common_utils import \ - (TEST_WITH_ROCM, TestCase, run_tests, make_tensor, slowTest) + (TEST_WITH_ROCM, TestCase, run_tests, slowTest) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, onlyOnCPUAndCUDA, skipCUDAIfRocm, onlyCUDA, dtypesIfCUDA, dtypesIfCPU, onlyCPU, largeTensorTest) diff --git a/test/test_sparse.py b/test/test_sparse.py index abe5e93889498..333f29f13138e 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -5,8 +5,9 @@ import random from collections import defaultdict import unittest +from torch.testing import make_tensor from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \ - do_test_empty_full, load_tests, TEST_NUMPY, TEST_SCIPY, IS_WINDOWS, gradcheck, coalescedonoff, make_tensor, \ + do_test_empty_full, load_tests, TEST_NUMPY, TEST_SCIPY, IS_WINDOWS, gradcheck, coalescedonoff, \ DeterministicGuard from torch.testing._internal.common_cuda import TEST_CUDA, _get_torch_cuda_version from numbers import Number diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index b9f48855e46db..fbb2b30e46304 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -3,8 +3,10 @@ import unittest import random import itertools + +from torch.testing import make_tensor from torch.testing._internal.common_utils import \ - (IS_MACOS, IS_WINDOWS, TestCase, run_tests, load_tests, coalescedonoff, make_tensor) + (IS_MACOS, IS_WINDOWS, TestCase, run_tests, load_tests, coalescedonoff) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, onlyCPU, onlyCUDA) diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index 192e03f61cac0..9ef374248984e 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -8,9 +8,10 @@ from itertools import product, combinations, combinations_with_replacement, permutations import random +from torch.testing import make_tensor from torch.testing._internal.common_utils import ( TestCase, run_tests, do_test_empty_full, TEST_WITH_ROCM, suppress_warnings, - torch_to_numpy_dtype_dict, slowTest, make_tensor, TEST_SCIPY, IS_MACOS, IS_PPC, + torch_to_numpy_dtype_dict, slowTest, TEST_SCIPY, IS_MACOS, IS_PPC, IS_WINDOWS) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, diff --git a/test/test_testing.py b/test/test_testing.py index 7e67569bb4799..f38183d4a3769 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -10,8 +10,9 @@ import torch +from torch.testing import make_tensor from torch.testing._internal.common_utils import \ - (IS_FBCODE, IS_SANDCASTLE, IS_WINDOWS, TestCase, make_tensor, run_tests, skipIfRocm, slowTest) + (IS_FBCODE, IS_SANDCASTLE, IS_WINDOWS, TestCase, run_tests, skipIfRocm, slowTest) from torch.testing._internal.common_device_type import \ (PYTORCH_TESTING_DEVICE_EXCEPT_FOR_KEY, PYTORCH_TESTING_DEVICE_ONLY_FOR_KEY, dtypes, get_device_type_test_bases, instantiate_device_type_tests, onlyCUDA, onlyOnCPUAndCUDA, diff --git a/test/test_torch.py b/test/test_torch.py index 15e36c83654db..c50b7ca99e883 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -27,13 +27,14 @@ from itertools import product, combinations, permutations from functools import partial from torch import multiprocessing as mp +from torch.testing import make_tensor from torch.testing._internal.common_utils import ( TestCase, TEST_WITH_ROCM, run_tests, IS_WINDOWS, IS_FILESYSTEM_UTF8_ENCODING, NO_MULTIPROCESSING_SPAWN, do_test_dtypes, IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, load_tests, slowTest, skipCUDAMemoryLeakCheckIf, BytesIOContext, noarchTest, skipIfRocm, skipIfNoSciPy, TemporaryFileName, TemporaryDirectoryName, - wrapDeterministicFlagAPITest, DeterministicGuard, CudaSyncGuard, make_tensor) + wrapDeterministicFlagAPITest, DeterministicGuard, CudaSyncGuard) from multiprocessing.reduction import ForkingPickler from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index e5b8c4a66093b..22f61519a2853 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -11,7 +11,7 @@ from torch._six import inf, nan from torch.testing._internal.common_utils import ( TestCase, run_tests, torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict, - suppress_warnings, make_tensor, TEST_SCIPY, slowTest, skipIfNoSciPy, IS_WINDOWS) + suppress_warnings, TEST_SCIPY, slowTest, skipIfNoSciPy, IS_WINDOWS) from torch.testing._internal.common_methods_invocations import ( unary_ufuncs, _NOTHING) from torch.testing._internal.common_device_type import ( @@ -19,7 +19,7 @@ onlyCUDA, dtypesIfCUDA, precisionOverride, skipCUDAIfRocm, dtypesIfCPU, OpDTypes) from torch.testing import ( - floating_types_and, all_types_and_complex_and, floating_and_complex_types_and) + floating_types_and, all_types_and_complex_and, floating_and_complex_types_and, make_tensor) if TEST_SCIPY: import scipy diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 306c6cb411f3f..7bb6906ef1cc7 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -6,8 +6,9 @@ from functools import partial import random +from torch.testing import make_tensor from torch.testing._internal.common_utils import \ - (TestCase, run_tests, suppress_warnings, make_tensor) + (TestCase, run_tests, suppress_warnings) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, onlyCPU, dtypes, onlyOnCPUAndCUDA) diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py index 526d02c71e322..7ea18a4f9cea2 100644 --- a/torch/testing/__init__.py +++ b/torch/testing/__init__.py @@ -1,4 +1,5 @@ from ._core import * # noqa: F403 from ._asserts import * # noqa: F403 +from ._creation import * # noqa: F403 from ._check_kernel_launches import * # noqa: F403 from ._deprecated import * # noqa: F403 diff --git a/torch/testing/_creation.py b/torch/testing/_creation.py new file mode 100644 index 0000000000000..4eb10d1d5d26b --- /dev/null +++ b/torch/testing/_creation.py @@ -0,0 +1,155 @@ +""" +This module contains tensor creation utilities. +""" + +import torch +from typing import Optional, List, Tuple, Union, cast +import math + +__all__ = [ + "make_tensor", +] + +def make_tensor( + shape: Union[torch.Size, List[int], Tuple[int, ...]], + device: Union[str, torch.device], + dtype: torch.dtype, + *, + low: Optional[float] = None, + high: Optional[float] = None, + requires_grad: bool = False, + noncontiguous: bool = False, + exclude_zero: bool = False +) -> torch.Tensor: + r"""Creates a tensor with the given :attr:`shape`, :attr:`device`, and :attr:`dtype`, and filled with + values uniformly drawn from ``[low, high)``. + + If :attr:`low` or :attr:`high` are specified and are outside the range of the :attr:`dtype`'s representable + finite values then they are clamped to the lowest or highest representable finite value, respectively. + If ``None``, then the following table describes the default values for :attr:`low` and :attr:`high`, + which depend on :attr:`dtype`. + + +---------------------------+------------+----------+ + | ``dtype`` | ``low`` | ``high`` | + +===========================+============+==========+ + | boolean type | ``0`` | ``2`` | + +---------------------------+------------+----------+ + | unsigned integral type | ``0`` | ``10`` | + +---------------------------+------------+----------+ + | signed integral types | ``-9`` | ``10`` | + +---------------------------+------------+----------+ + | floating types | ``-9`` | ``9`` | + +---------------------------+------------+----------+ + | complex types | ``-9`` | ``9`` | + +---------------------------+------------+----------+ + + Args: + shape (Tuple[int, ...]): A sequence of integers defining the shape of the output tensor. + device (Union[str, torch.device]): The device of the returned tensor. + dtype (:class:`torch.dtype`): The data type of the returned tensor. + low (Optional[Number]): Sets the lower limit (inclusive) of the given range. If a number is provided it is + clamped to the least representable finite value of the given dtype. When ``None`` (default), + this value is determined based on the :attr:`dtype` (see the table above). Default: ``None``. + high (Optional[Number]): Sets the upper limit (exclusive) of the given range. If a number is provided it is + clamped to the greatest representable finite value of the given dtype. When ``None`` (default) this value + is determined based on the :attr:`dtype` (see the table above). Default: ``None``. + requires_grad (Optional[bool]): If autograd should record operations on the returned tensor. Default: ``False``. + noncontiguous (Optional[bool]): If `True`, the returned tensor will be noncontiguous. This argument is + ignored if the constructed tensor has fewer than two elements. + exclude_zero (Optional[bool]): If ``True`` then zeros are replaced with the dtype's small positive value + depending on the :attr:`dtype`. For bool and integer types zero is replaced with one. For floating + point types it is replaced with the dtype's smallest positive normal number (the "tiny" value of the + :attr:`dtype`'s :func:`~torch.finfo` object), and for complex types it is replaced with a complex number + whose real and imaginary parts are both the smallest positive normal number representable by the complex + type. Default ``False``. + + Raises: + ValueError: If ``low > high``. + ValueError: If either :attr:`low` or :attr:`high` is ``nan``. + TypeError: If :attr:`dtype` isn't supported by this function. + + Examples: + >>> from torch.testing import make_tensor + >>> # Creates a float tensor with values in [-1, 1) + >>> make_tensor((3,), device='cpu', dtype=torch.float32, low=-1, high=1) + tensor([ 0.1205, 0.2282, -0.6380]) + >>> # Creates a bool tensor on CUDA + >>> make_tensor((2, 2), device='cuda', dtype=torch.bool) + tensor([[False, False], + [False, True]], device='cuda:0') + """ + def _modify_low_high(low, high, lowest, highest, default_low, default_high, dtype): + """ + Modifies (and raises ValueError when appropriate) low and high values given by the user (input_low, input_high) if required. + """ + def clamp(a, l, h): + return min(max(a, l), h) + + low = low if low is not None else default_low + high = high if high is not None else default_high + + # Checks for error cases + if low != low or high != high: + raise ValueError("make_tensor: one of low or high was NaN!") + if low > high: + raise ValueError("make_tensor: low must be weakly less than high!") + + low = clamp(low, lowest, highest) + high = clamp(high, lowest, highest) + + if dtype in [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]: + return math.floor(low), math.ceil(high) + + return low, high + + _integral_types = [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64] + _floating_types = [torch.float16, torch.bfloat16, torch.float32, torch.float64] + _complex_types = [torch.cfloat, torch.cdouble] + + if dtype is torch.bool: + result = torch.randint(0, 2, shape, device=device, dtype=dtype) + elif dtype is torch.uint8: + ranges = (torch.iinfo(dtype).min, torch.iinfo(dtype).max) + low, high = cast(Tuple[int, int], _modify_low_high(low, high, ranges[0], ranges[1], 0, 10, dtype)) + result = torch.randint(low, high, shape, device=device, dtype=dtype) + elif dtype in _integral_types: + ranges = (torch.iinfo(dtype).min, torch.iinfo(dtype).max) + low, high = _modify_low_high(low, high, ranges[0], ranges[1], -9, 10, dtype) + result = torch.randint(low, high, shape, device=device, dtype=dtype) # type: ignore[call-overload] + elif dtype in _floating_types: + ranges_floats = (torch.finfo(dtype).min, torch.finfo(dtype).max) + low, high = _modify_low_high(low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype) + rand_val = torch.rand(shape, device=device, dtype=dtype) + result = high * rand_val + low * (1 - rand_val) + elif dtype in _complex_types: + float_dtype = torch.float if dtype is torch.cfloat else torch.double + ranges_floats = (torch.finfo(float_dtype).min, torch.finfo(float_dtype).max) + low, high = _modify_low_high(low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype) + real_rand_val = torch.rand(shape, device=device, dtype=float_dtype) + imag_rand_val = torch.rand(shape, device=device, dtype=float_dtype) + real = high * real_rand_val + low * (1 - real_rand_val) + imag = high * imag_rand_val + low * (1 - imag_rand_val) + result = torch.complex(real, imag) + else: + raise TypeError(f"The requested dtype '{dtype}' is not supported by torch.testing.make_tensor()." + " To request support, file an issue at: https://github.com/pytorch/pytorch/issues") + + if noncontiguous and result.numel() > 1: + result = torch.repeat_interleave(result, 2, dim=-1) + result = result[..., ::2] + + if exclude_zero: + if dtype in _integral_types or dtype is torch.bool: + replace_with = torch.tensor(1, device=device, dtype=dtype) + elif dtype in _floating_types: + replace_with = torch.tensor(torch.finfo(dtype).tiny, device=device, dtype=dtype) + else: # dtype in _complex_types: + float_dtype = torch.float if dtype is torch.cfloat else torch.double + float_eps = torch.tensor(torch.finfo(float_dtype).tiny, device=device, dtype=float_dtype) + replace_with = torch.complex(float_eps, float_eps) + result[result == 0] = replace_with + + if dtype in _floating_types + _complex_types: + result.requires_grad = requires_grad + + return result diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 2230808b5fd43..a3d61b477b4a4 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -19,7 +19,7 @@ (make_non_contiguous, floating_types, floating_types_and, complex_types, floating_and_complex_types, floating_and_complex_types_and, all_types_and_complex_and, all_types_and, all_types_and_complex, - integral_types_and, all_types, double_types) + integral_types_and, all_types, double_types, make_tensor) from .._core import _dispatch_dtypes from torch.testing._internal.common_device_type import \ (onlyOnCPUAndCUDA, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfNoCusolver, @@ -32,7 +32,7 @@ random_symmetric_pd_matrix, make_symmetric_matrices, make_symmetric_pd_matrices, random_square_matrix_of_rank, random_fullrank_matrix_distinct_singular_value, - TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, make_tensor, TEST_SCIPY, + TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, TEST_SCIPY, torch_to_numpy_dtype_dict, TEST_WITH_ASAN, GRADCHECK_NONDET_TOL,) import torch.testing._internal.opinfo_helper as opinfo_helper diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py index 99525a7b68756..6ef4de398a39e 100644 --- a/torch/testing/_internal/common_modules.py +++ b/torch/testing/_internal/common_modules.py @@ -2,11 +2,11 @@ from copy import deepcopy from functools import wraps, partial from itertools import chain -from torch.testing import floating_types +from torch.testing import floating_types, make_tensor from torch.testing._internal.common_device_type import ( _TestParametrizer, _dtype_test_suffix, _update_param_kwargs, skipIf) from torch.testing._internal.common_nn import nllloss_reference, get_reduction -from torch.testing._internal.common_utils import make_tensor, freeze_rng_state +from torch.testing._internal.common_utils import freeze_rng_state from types import ModuleType from typing import List, Tuple, Type, Set, Dict @@ -225,7 +225,7 @@ def generate_regression_criterion_inputs(make_input): return [ ModuleInput( constructor_input=FunctionInput(reduction=reduction), - forward_input=FunctionInput(make_input(size=(4, )), make_input(size=4,)), + forward_input=FunctionInput(make_input(shape=(4, )), make_input(shape=4,)), reference_fn=no_batch_dim_reference_criterion_fn, desc='no_batch_dim_{}'.format(reduction) ) for reduction in ['none', 'mean', 'sum']] @@ -236,7 +236,7 @@ def module_inputs_torch_nn_AvgPool1d(module_info, device, dtype, requires_grad, return [ ModuleInput(constructor_input=FunctionInput(kernel_size=2), - forward_input=FunctionInput(make_input(size=(3, 6))), + forward_input=FunctionInput(make_input(shape=(3, 6))), desc='no_batch_dim', reference_fn=no_batch_dim_reference_fn)] @@ -246,13 +246,13 @@ def module_inputs_torch_nn_ELU(module_info, device, dtype, requires_grad, **kwar return [ ModuleInput(constructor_input=FunctionInput(alpha=2.), - forward_input=FunctionInput(make_input(size=(3, 2, 5))), + forward_input=FunctionInput(make_input(shape=(3, 2, 5))), reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2 * (i.exp() - 1))), ModuleInput(constructor_input=FunctionInput(alpha=2.), - forward_input=FunctionInput(make_input(size=())), + forward_input=FunctionInput(make_input(shape=())), desc='scalar'), ModuleInput(constructor_input=FunctionInput(), - forward_input=FunctionInput(make_input(size=(3,))), + forward_input=FunctionInput(make_input(shape=(3,))), desc='no_batch_dim', reference_fn=no_batch_dim_reference_fn)] @@ -262,14 +262,14 @@ def module_inputs_torch_nn_CELU(module_info, device, dtype, requires_grad, **kwa return [ ModuleInput(constructor_input=FunctionInput(alpha=2.), - forward_input=FunctionInput(make_input(size=(3, 2, 5))), + forward_input=FunctionInput(make_input(shape=(3, 2, 5))), reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2. * ((.5 * i).exp() - 1))), ModuleInput(constructor_input=FunctionInput(alpha=2.), - forward_input=FunctionInput(make_input(size=())), + forward_input=FunctionInput(make_input(shape=())), reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2 * (i.exp() - 1)), desc='scalar'), ModuleInput(constructor_input=FunctionInput(alpha=2.), - forward_input=FunctionInput(make_input(size=(3,))), + forward_input=FunctionInput(make_input(shape=(3,))), desc='no_batch_dim', reference_fn=no_batch_dim_reference_fn)] @@ -279,12 +279,12 @@ def module_inputs_torch_nn_L1Loss(module_info, device, dtype, requires_grad, **k return [ ModuleInput(constructor_input=FunctionInput(), - forward_input=FunctionInput(make_input(size=(2, 3, 4)), - make_input(size=(2, 3, 4))), + forward_input=FunctionInput(make_input(shape=(2, 3, 4)), + make_input(shape=(2, 3, 4))), reference_fn=lambda m, p, i, t: 1. / i.numel() * sum((a - b).abs().sum() for a, b in zip(i, t))), ModuleInput(constructor_input=FunctionInput(), - forward_input=FunctionInput(make_input(size=()), make_input(size=())), + forward_input=FunctionInput(make_input(shape=()), make_input(shape=())), reference_fn=lambda m, p, i, t: 1. / i.numel() * (i - t).abs().sum(), desc='scalar')] + generate_regression_criterion_inputs(make_input) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index b8e5b097bd6c0..90f3551caae94 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -43,13 +43,13 @@ import numpy as np -from torch.testing import floating_types_and, integral_types, complex_types, get_all_dtypes import expecttest from .._core import \ (_compare_tensors_internal, _compare_scalars_internal, _compare_return_type) import torch import torch.cuda +from torch.testing import make_tensor from torch._utils_internal import get_writable_path from torch._six import string_classes from torch import Tensor @@ -1939,103 +1939,7 @@ def f_retry(*args, **kwargs): return deco_retry -# Methods for matrix and tensor generation - -def make_tensor(size, device: torch.device, dtype: torch.dtype, *, low=None, high=None, - requires_grad: bool = False, noncontiguous: bool = False, - exclude_zero: bool = False) -> torch.Tensor: - """ Creates a random tensor with the given size, device and dtype. - - Default values for low and high: - * boolean type: low = 0, high = 2 - * uint8 type: low = 0, high = 9 - * floating and integral types: low = -9 and high = 9 - * complex types, for each real and imaginary part: low = -9, high = 9 - If low/high are specified and within dtype limits: low = low, high = high - If low/high are specified but exceed the limits: low = dtype_min, high = dtype_max - If low is -inf and/or high is inf: low = dtype_min, high = dtype_max - If low is inf or nan and/or high is -inf or nan: ValueError raised - - If noncontiguous=True, a noncontiguous tensor with the given size will be returned unless the size - specifies a tensor with a 1 or 0 elements in which case the noncontiguous parameter is ignored because - it is not possible to create a noncontiguous Tensor with a single element. - - If exclude_zero is passed with True (default is False), all the matching values (with zero) in - created tensor are replaced with a tiny (smallest positive representable number) value if floating type, - [`tiny` + `tiny`.j] if complex type and 1 if integer/boolean type. - """ - def _modify_low_high(low, high, lowest, highest, default_low, default_high, dtype): - """ - Modifies (and raises ValueError when appropriate) low and high values given by the user (input_low, input_high) if required. - """ - def clamp(a, l, h): - return min(max(a, l), h) - - low = low if low is not None else default_low - high = high if high is not None else default_high - - # Checks for error cases - if low != low or high != high: - raise ValueError("make_tensor: one of low or high was NaN!") - if low > high: - raise ValueError("make_tensor: low must be weakly less than high!") - - low = clamp(low, lowest, highest) - high = clamp(high, lowest, highest) - - if dtype in integral_types(): - return math.floor(low), math.ceil(high) - - return low, high - - if dtype is torch.bool: - result = torch.randint(0, 2, size, device=device, dtype=dtype) - elif dtype is torch.uint8: - ranges = (torch.iinfo(dtype).min, torch.iinfo(dtype).max) - low, high = _modify_low_high(low, high, ranges[0], ranges[1], 0, 9, dtype) - result = torch.randint(low, high, size, device=device, dtype=dtype) - elif dtype in integral_types(): - ranges = (torch.iinfo(dtype).min, torch.iinfo(dtype).max) - low, high = _modify_low_high(low, high, ranges[0], ranges[1], -9, 9, dtype) - result = torch.randint(low, high, size, device=device, dtype=dtype) - elif dtype in floating_types_and(torch.half, torch.bfloat16): - ranges_floats = (torch.finfo(dtype).min, torch.finfo(dtype).max) - low, high = _modify_low_high(low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype) - rand_val = torch.rand(size, device=device, dtype=dtype) - result = high * rand_val + low * (1 - rand_val) - else: - assert dtype in complex_types() - float_dtype = torch.float if dtype is torch.cfloat else torch.double - ranges_floats = (torch.finfo(float_dtype).min, torch.finfo(float_dtype).max) - low, high = _modify_low_high(low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype) - real_rand_val = torch.rand(size, device=device, dtype=float_dtype) - imag_rand_val = torch.rand(size, device=device, dtype=float_dtype) - real = high * real_rand_val + low * (1 - real_rand_val) - imag = high * imag_rand_val + low * (1 - imag_rand_val) - result = torch.complex(real, imag) - - if noncontiguous and result.numel() > 1: - result = torch.repeat_interleave(result, 2, dim=-1) - result = result[..., ::2] - - if exclude_zero: - if dtype in integral_types() or dtype is torch.bool: - replace_with = torch.tensor(1, device=device, dtype=dtype) - elif dtype in floating_types_and(torch.half, torch.bfloat16): - replace_with = torch.tensor(torch.finfo(dtype).tiny, device=device, dtype=dtype) - elif dtype in complex_types(): - float_dtype = torch.float if dtype is torch.cfloat else torch.double - float_eps = torch.tensor(torch.finfo(float_dtype).tiny, device=device, dtype=float_dtype) - replace_with = torch.complex(float_eps, float_eps) - else: - raise ValueError(f"Invalid dtype passed, supported dtypes are: {get_all_dtypes()}") - result[result == 0] = replace_with - - if dtype in floating_types_and(torch.half, torch.bfloat16) or\ - dtype in complex_types(): - result.requires_grad = requires_grad - - return result +# Methods for matrix generation def random_square_matrix_of_rank(l, rank, dtype=torch.double, device='cpu'): assert rank <= l From a8ffe81b2c3123926354b4ec2001693b38daa80d Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 30 Aug 2021 12:25:29 -0700 Subject: [PATCH 352/530] Bring back old algorithm for sorting on small number of segments (#64127) Summary: Fixes https://github.com/pytorch/pytorch/issues/63456 The code was copy-pasted from the previous commit without modification. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64127 Reviewed By: mruberry Differential Revision: D30632090 Pulled By: ngimel fbshipit-source-id: 58bbdd9b0423f01d4e65e2ec925ad9a3f88efc9b --- aten/src/ATen/native/cuda/Sort.cu | 95 +++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/cuda/Sort.cu b/aten/src/ATen/native/cuda/Sort.cu index 83fce65d33b6c..9cb32bc5ac14c 100644 --- a/aten/src/ATen/native/cuda/Sort.cu +++ b/aten/src/ATen/native/cuda/Sort.cu @@ -207,6 +207,87 @@ struct offset_t { } +namespace { + +// Segmented sort by full sort algorithm:. +// Say we are sorting a (2, 3) tensor. We have in flattened form: +// values 0.4 1.2 5.3 6.2 1.3 2.3 +// indices 0 1 2 0 1 2 +// segment_id 0 0 0 1 1 1 + +// First we sort by values, globally: +// values 6.2 5.3 2.3 1.2 1.3 0.4 +// indices 0 2 2 1 1 0 +// segment_id 1 0 1 0 1 0 + +// Then we stable sort by segment id: +// values 5.3 1.2 0.4 6.2 2.3 1.3 +// indices 2 1 0 0 2 1 +// segment_id 0 0 0 1 1 1 + +// This method can only work if the slice we are sorting (`dim`) is +// innermost, and both values and indices are contiguous. We do this +// by re-arranging the input into this form as needed, which will +// unfortunately allocate memory if the request is not in this form. +// Vectorized sort is slower than iterated sort if the number of +// slices is small (since we're sorting twice, instead of invoking a +// smaller sort `numSlices` times), but the cub sort +// implementation here is a catch-all, so we're not looking for +// efficiency, but instead correctness. + +template +__global__ void sort_postprocess_kernel(const scalar_t *in, scalar_t *out, int64_t *index, const int2 *i_s_ptr, int nsegments, int nsort) { + CUDA_KERNEL_LOOP(i, nsegments * nsort) { + int segment = i / nsort; + int j = i % nsort; + + int offset = segment * nsort; + const scalar_t *in_ = in + offset; + scalar_t *out_ = out + offset; + int64_t *index_ = index + offset; + const int2 *i_s_ptr_ = i_s_ptr + offset; + + int idx = i_s_ptr_[j].y; + index_[j] = idx; + out_[j] = in_[idx]; + } +} + +template +inline void segmented_sort_pairs_by_full_sort( + int64_t nsegments, int64_t nsort, int64_t n, bool descending, const Tensor &indices, + const scalar_t *self_ptr, scalar_t *values_ptr, int64_t *indices_ptr +) { + int64_t segment_bits = std::max(1L, static_cast(std::ceil(std::log2(nsegments)))); + + auto int_options = indices.options().dtype(kInt); + auto indices_and_segment = at::empty({nsegments, nsort, 2}, int_options); + indices_and_segment.select(-1, 0).copy_( // segment id + at::arange(nsegments, int_options).view({nsegments, 1}).expand({nsegments, nsort})); + indices_and_segment.select(-1, 1).copy_( // reverse indices + at::arange(nsort, int_options).view({1, nsort}).expand({nsegments, nsort})); + + auto i_s_ptr = reinterpret_cast(indices_and_segment.data_ptr()); + auto indices_and_segment2 = at::empty_like(indices_and_segment); + auto i_s_ptr2 = reinterpret_cast(indices_and_segment2.data_ptr()); + + at::cuda::cub::sort_pairs( + self_ptr, nullptr, i_s_ptr, i_s_ptr2, + n, descending); + + TORCH_INTERNAL_ASSERT(segment_bits <= 32); + + // sort on lower 32bits, i.e. segment index + at::cuda::cub::sort_keys( + reinterpret_cast(i_s_ptr2), reinterpret_cast(i_s_ptr), + n, false, 0, segment_bits); + + sort_postprocess_kernel<<<(n + 511) / 512, 512, 0, at::cuda::getCurrentCUDAStream()>>>( + self_ptr, values_ptr, indices_ptr, i_s_ptr, nsegments, nsort); +} + +} // namespace + // We perform a segmented sort in cub with inputs that have // more than 1024/2048 elements along the selected dimension. // Otherwise, we do an inplace bitonic sort (see sortKeyValueInplace). @@ -349,11 +430,15 @@ std::tuple sort_out_stable_cuda(const Tensor & self, c10::opt int64_t n = std::min(remaining, nbatch); int64_t nsegments = n / nsort; - auto reverse_indices = at::arange(nsort, indices.options()).view({1, nsort}).expand({nsegments, nsort}).contiguous(); - - at::cuda::cub::segmented_sort_pairs(self_ptr, values_ptr, - reverse_indices.data_ptr(), indices_ptr, n, nsegments, - offset_t{(int)nsort, 0}, offset_t{(int)nsort, 1}, descending); + if (nsegments < 128) { + segmented_sort_pairs_by_full_sort(nsegments, nsort, n, descending, + indices, self_ptr, values_ptr, indices_ptr); + } else { + auto reverse_indices = at::arange(nsort, indices.options()).view({1, nsort}).expand({nsegments, nsort}).contiguous(); + at::cuda::cub::segmented_sort_pairs(self_ptr, values_ptr, + reverse_indices.data_ptr(), indices_ptr, n, nsegments, + offset_t{(int)nsort, 0}, offset_t{(int)nsort, 1}, descending); + } remaining -= n; self_ptr += n; From 401bbb2aa0a183ddfb309740c020fb4962367ac9 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 30 Aug 2021 12:28:39 -0700 Subject: [PATCH 353/530] remove componentwise comparison of complex values in TestCase.assertEqual (#63572) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63572 Addresses #61906. Issue will be fixed later in the stack when `torch.testing.assert_close` got the same treatment. cc ezyang gchanan Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D30633527 Pulled By: mruberry fbshipit-source-id: c2002a4998a7a75cb2ab83f87190bde43a9d4f7c --- test/test_tensor_creation_ops.py | 2 +- test/test_testing.py | 54 ++++------------------- test/test_torch.py | 4 +- test/test_unary_ufuncs.py | 5 +-- torch/testing/_core.py | 75 ++++---------------------------- 5 files changed, 21 insertions(+), 119 deletions(-) diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index 9ef374248984e..dcb49386c9ff8 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -3258,7 +3258,7 @@ def seed(generator): self.assertTrue((res1 >= 0).all().item()) @dtypes(torch.half, torch.float, torch.bfloat16, torch.double, - torch.complex32, torch.complex64, torch.complex128) + torch.complex64, torch.complex128) def test_randn(self, device, dtype): SIZE = 100 for size in [0, SIZE]: diff --git a/test/test_testing.py b/test/test_testing.py index f38183d4a3769..fdc3463edcb2b 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -88,25 +88,19 @@ def test__comparescalars_debug_msg(self, device): "atol=1e-05 is only 1.9100000000000003e-05!") self.assertEqual(debug_msg, expected_msg) - # complex x complex, real difference + # complex x complex result, debug_msg = self._compareScalars(complex(1, 3), complex(3, 1)) - expected_msg = ("Comparing the real part 1.0 and 3.0 gives a difference " - "of 2.0, but the allowed difference with rtol=1.3e-06 " - "and atol=1e-05 is only 1.39e-05!") - self.assertEqual(debug_msg, expected_msg) - - # complex x complex, imaginary difference - result, debug_msg = self._compareScalars(complex(1, 3), complex(1, 5.5)) - expected_msg = ("Comparing the imaginary part 3.0 and 5.5 gives a " - "difference of 2.5, but the allowed difference with " - "rtol=1.3e-06 and atol=1e-05 is only 1.715e-05!") + expected_msg = ("Comparing (1+3j) and (3+1j) gives a difference " + "of 2.8284271247461903, but the allowed difference " + "with rtol=1.3e-06 and atol=1e-05 is only " + "1.4110960958218895e-05!") self.assertEqual(debug_msg, expected_msg) # complex x int result, debug_msg = self._compareScalars(complex(1, -2), 1) - expected_msg = ("Comparing the imaginary part -2.0 and 0.0 gives a " - "difference of 2.0, but the allowed difference with " - "rtol=1.3e-06 and atol=1e-05 is only 1e-05!") + expected_msg = ("Comparing (1-2j) and 1 gives a difference of 2.0, " + "but the allowed difference with rtol=1.3e-06 and " + "atol=1e-05 is only 1.13e-05!") self.assertEqual(debug_msg, expected_msg) # NaN x NaN, equal_nan=False @@ -170,28 +164,6 @@ def test__comparetensors_debug_msg(self, device): "occuring at index 0.") self.assertEqual(debug_msg, expected_msg) - # Checks complex tensor comparisons (real part) - a = torch.tensor((1 - 1j, 4 + 3j), device=device) - b = torch.tensor((1 - 1j, 1 + 3j), device=device) - result, debug_msg = self._compareTensors(a, b) - expected_msg = ("Real parts failed to compare as equal! " - "With rtol=1.3e-06 and atol={0}, " - "found 1 element(s) (out of 2) whose difference(s) exceeded the " - "margin of error (including 0 nan comparisons). The greatest difference was " - "3.0 (4.0 vs. 1.0), which occurred at index 1.").format(atol) - self.assertEqual(debug_msg, expected_msg) - - # Checks complex tensor comparisons (imaginary part) - a = torch.tensor((1 - 1j, 4 + 3j), device=device) - b = torch.tensor((1 - 1j, 4 - 21j), device=device) - result, debug_msg = self._compareTensors(a, b) - expected_msg = ("Imaginary parts failed to compare as equal! " - "With rtol=1.3e-06 and atol={0}, " - "found 1 element(s) (out of 2) whose difference(s) exceeded the " - "margin of error (including 0 nan comparisons). The greatest difference was " - "24.0 (3.0 vs. -21.0), which occurred at index 1.").format(atol) - self.assertEqual(debug_msg, expected_msg) - # Checks size mismatch a = torch.tensor((1, 2), device=device) b = torch.tensor((3), device=device) @@ -407,7 +379,7 @@ def test_isclose_comparetensors_complex(self, device, dtype): tests = ( (complex(1, -1), complex(-1, 1), False), (complex(1, -1), complex(2, -2), True), - (complex(1, 99), complex(4, 100), False), + (complex(1, 99), complex(4, 100), True), ) self._comparetensors_helper(tests, device, dtype, False, atol=.5, rtol=.5) @@ -421,14 +393,6 @@ def test_isclose_comparetensors_complex(self, device, dtype): (complex(float('nan'), float('nan')), complex(float('nan'), float('nan')), True), ) self._isclose_helper(tests, device, dtype, True) - - tests = ( - (complex(1, 1), complex(1, float('nan')), False), - (complex(1, 1), complex(float('nan'), 1), False), - (complex(float('nan'), 1), complex(float('nan'), 1), True), - (complex(float('nan'), 1), complex(1, float('nan')), False), - (complex(float('nan'), float('nan')), complex(float('nan'), float('nan')), True), - ) self._comparetensors_helper(tests, device, dtype, True) # Tests that isclose with rtol or atol values less than zero throws a diff --git a/test/test_torch.py b/test/test_torch.py index c50b7ca99e883..b267b9cd6b610 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -5121,7 +5121,7 @@ def filter_shape(shape, dim): spacing = [space.cpu().detach().numpy() for space in spacing] expected = np.gradient(t_np, *self._wrap_to_list(spacing), axis=dims, edge_order=edge_order) actual, expected = self._inf_nan_preprocess(list(actual), self._wrap_to_list(expected)) - self.assertEqual(actual, expected, equal_nan="relaxed", atol=1e-4, rtol=0, exact_dtype=False) + self.assertEqual(actual, expected, equal_nan=True, atol=1e-4, rtol=0, exact_dtype=False) @onlyOnCPUAndCUDA @dtypes(torch.long, torch.float32, torch.complex64) @@ -5188,7 +5188,7 @@ def test_gradient_type_promotion(self, device): self.assertEqual(expected[i].imag, torch.zeros(actual[i].shape), exact_dtype=False) else: actual, expected = self._inf_nan_preprocess(list(actual), expected) - self.assertEqual(actual, expected, equal_nan="relaxed", exact_dtype=False) + self.assertEqual(actual, expected, equal_nan=True, exact_dtype=False) @onlyOnCPUAndCUDA @dtypes(torch.long, torch.float32, torch.complex64) diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 22f61519a2853..526b67a6b03da 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -359,10 +359,7 @@ def test_reference_numerics_extremal(self, device, dtype, op): tensors = generate_numeric_tensors_extremal(device, dtype, domain=op.domain) - # https://github.com/pytorch/pytorch/issues/50749 - equal_nan = "relaxed" if device.startswith('cuda') else True - - self._test_reference_numerics(dtype, op, tensors, equal_nan) + self._test_reference_numerics(dtype, op, tensors) # Tests for testing (non)contiguity consistency diff --git a/torch/testing/_core.py b/torch/testing/_core.py index d9806150047c5..66060f8cbcee0 100644 --- a/torch/testing/_core.py +++ b/torch/testing/_core.py @@ -6,7 +6,7 @@ import random import math import cmath -from typing import cast, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import operator FileCheck = torch._C.FileCheck @@ -78,27 +78,12 @@ def _unravel_index(flat_index, shape): # Two tensors are "equal" if they are "close", in the sense of torch.allclose. # The only exceptions are complex tensors and bool tensors. # -# Complex tensors are "equal" if both the -# real and complex parts (separately) are close. This is divergent from -# torch.allclose's behavior, which compares the absolute values of the -# complex numbers instead. -# -# Using torch.allclose would be a less strict -# comparison that would allow large complex values with -# significant real or imaginary differences to be considered "equal," -# and would make setting rtol and atol for complex tensors distinct from -# other tensor types. -# # Bool tensors are equal only if they are identical, regardless of # the rtol and atol values. # # The `equal_nan` can be True or False, which maps to the True or False -# in `torch.allclose`. `equal_nan` can also be "relaxed", which means -# the complex will be compared in the relaxed mode: -# 2 + nan j == 3 + nan j ---> False when equal_nan=True -# True when equal_nan="relaxed" -def _compare_tensors_internal(a: torch.Tensor, b: torch.Tensor, *, rtol, atol, equal_nan: Union[str, bool]) -> _compare_return_type: - assert equal_nan in {True, False, "relaxed"} +# in `torch.allclose`. +def _compare_tensors_internal(a: torch.Tensor, b: torch.Tensor, *, rtol, atol, equal_nan) -> _compare_return_type: debug_msg : Optional[str] # Integer (including bool) comparisons are identity comparisons # when rtol is zero and atol is less than one @@ -129,48 +114,19 @@ def _compare_tensors_internal(a: torch.Tensor, b: torch.Tensor, *, rtol, atol, e _unravel_index(greatest_diff_index, a.shape))) return (False, debug_msg) - # Compares complex tensors' real and imaginary parts separately. - # (see NOTE Test Framework Tensor "Equality") - if a.is_complex(): - if equal_nan == "relaxed": - a = a.clone() - b = b.clone() - a.real[a.imag.isnan()] = math.nan - a.imag[a.real.isnan()] = math.nan - b.real[b.imag.isnan()] = math.nan - b.imag[b.real.isnan()] = math.nan - - real_result, debug_msg = _compare_tensors_internal(a.real, b.real, - rtol=rtol, atol=atol, - equal_nan=equal_nan) - - if not real_result: - debug_msg = "Real parts failed to compare as equal! " + cast(str, debug_msg) - return (real_result, debug_msg) - - imag_result, debug_msg = _compare_tensors_internal(a.imag, b.imag, - rtol=rtol, atol=atol, - equal_nan=equal_nan) - - if not imag_result: - debug_msg = "Imaginary parts failed to compare as equal! " + cast(str, debug_msg) - return (imag_result, debug_msg) - - return (True, None) - # All other comparisons use torch.allclose directly - if torch.allclose(a, b, rtol=rtol, atol=atol, equal_nan=(equal_nan in {"relaxed", True})): + if torch.allclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan): return (True, None) # Gathers debug info for failed float tensor comparison # NOTE: converts to float64 to best represent differences - a_flat = a.to(torch.float64).flatten() - b_flat = b.to(torch.float64).flatten() + a_flat = a.to(torch.float64 if not a.dtype.is_complex else torch.complex128).flatten() + b_flat = b.to(torch.float64 if not a.dtype.is_complex else torch.complex128).flatten() diff = torch.abs(a_flat - b_flat) # Masks close values # NOTE: this avoids (inf - inf) oddities when computing the difference - close = torch.isclose(a_flat, b_flat, rtol, atol, (equal_nan in {"relaxed", True})) + close = torch.isclose(a_flat, b_flat, rtol, atol, equal_nan) diff[close] = 0 nans = torch.isnan(diff) num_nans = nans.sum() @@ -212,7 +168,7 @@ def _helper(a, b, s) -> _compare_return_type: # Special-case for infinity comparisons # NOTE: if b is inf then allowed_diff will be inf when rtol is not 0 - if ((math.isinf(a) or math.isinf(b)) and a != b): + if ((cmath.isinf(a) or cmath.isinf(b)) and a != b): result = False msg = None @@ -228,21 +184,6 @@ def _helper(a, b, s) -> _compare_return_type: ) return result, msg - if isinstance(a, complex) or isinstance(b, complex): - a = complex(a) - b = complex(b) - - if equal_nan == "relaxed": - if cmath.isnan(a) and cmath.isnan(b): - return (True, None) - - result, msg = _helper(a.real, b.real, " the real part ") - - if not result: - return (False, msg) - - return _helper(a.imag, b.imag, " the imaginary part ") - return _helper(a, b, " ") From eafe33c995d47d45fceaf42801717f3120d799b9 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 30 Aug 2021 12:28:39 -0700 Subject: [PATCH 354/530] remove componentwise comparison of complex values in torch.testing.assert_close (#63841) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63841 Closes #61906. cc ezyang gchanan Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D30633526 Pulled By: mruberry fbshipit-source-id: ddb5d61838cd1e12d19d0093799e827344382cdc --- test/test_testing.py | 65 +++++++++++++++-------------- torch/testing/_asserts.py | 86 ++++++++------------------------------- 2 files changed, 50 insertions(+), 101 deletions(-) diff --git a/test/test_testing.py b/test/test_testing.py index fdc3463edcb2b..a5ea232122e08 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -863,20 +863,43 @@ def test_matching_atol(self): for fn in assert_close_with_inputs(actual, expected): fn(rtol=0.0, atol=eps * 2) - def test_matching_nan(self): - actual = torch.tensor(float("NaN")) - expected = actual.clone() + # TODO: the code that this test was designed for was removed in https://github.com/pytorch/pytorch/pull/56058 + # We need to check if this test is still needed or if this behavior is now enabled by default. + def test_matching_conjugate_bit(self): + actual = torch.tensor(complex(1, 1)).conj() + expected = torch.tensor(complex(1, -1)) for fn in assert_close_with_inputs(actual, expected): - with self.assertRaises(AssertionError): - fn() + fn() + + def test_matching_nan(self): + nan = float("NaN") + + tests = ( + (nan, nan), + (complex(nan, 0), complex(0, nan)), + (complex(nan, nan), complex(nan, 0)), + (complex(nan, nan), complex(nan, nan)), + ) + + for actual, expected in tests: + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaises(AssertionError): + fn() def test_matching_nan_with_equal_nan(self): - actual = torch.tensor(float("NaN")) - expected = actual.clone() + nan = float("NaN") - for fn in assert_close_with_inputs(actual, expected): - fn(equal_nan=True) + tests = ( + (nan, nan), + (complex(nan, 0), complex(0, nan)), + (complex(nan, nan), complex(nan, 0)), + (complex(nan, nan), complex(nan, nan)), + ) + + for actual, expected in tests: + for fn in assert_close_with_inputs(actual, expected): + fn(equal_nan=True) def test_numpy(self): tensor = torch.rand(2, 2, dtype=torch.float32) @@ -1181,30 +1204,6 @@ def test_mapping_mismatching_values_msg(self): torch.testing.assert_close(actual, expected) -class TestAssertCloseComplex(TestCase): - def test_mismatching_nan_with_equal_nan(self): - actual = torch.tensor(complex(1, float("NaN"))) - expected = torch.tensor(complex(float("NaN"), 1)) - - for fn in assert_close_with_inputs(actual, expected): - with self.assertRaises(AssertionError): - fn(equal_nan=True) - - def test_mismatching_nan_with_equal_nan_relaxed(self): - actual = torch.tensor(complex(1, float("NaN"))) - expected = torch.tensor(complex(float("NaN"), 1)) - - for fn in assert_close_with_inputs(actual, expected): - fn(equal_nan="relaxed") - - def test_matching_conjugate_bit(self): - actual = torch.tensor(complex(1, 1)).conj() - expected = torch.tensor(complex(1, -1)) - - for fn in assert_close_with_inputs(actual, expected): - fn() - - class TestAssertCloseSparseCOO(TestCase): def test_matching_coalesced(self): indices = ( diff --git a/torch/testing/_asserts.py b/torch/testing/_asserts.py index 2de2cc0735529..073e2e2230820 100644 --- a/torch/testing/_asserts.py +++ b/torch/testing/_asserts.py @@ -44,52 +44,6 @@ def _get_default_rtol_and_atol(actual: Tensor, expected: Tensor) -> Tuple[float, return max(actual_rtol, expected_rtol), max(actual_atol, expected_atol) -def _check_complex_components_individually( - check_tensors: Callable[..., Optional[_TestingErrorMeta]] -) -> Callable[..., Optional[_TestingErrorMeta]]: - """Decorates real-valued tensor check functions to handle complex components individually. - - If the inputs are not complex, this decorator is a no-op. - - Args: - check_tensors (Callable[[Tensor, Tensor], Optional[_TestingErrorMeta]]): Tensor check function for real-valued - tensors. - """ - - @functools.wraps(check_tensors) - def wrapper( - actual: Tensor, expected: Tensor, *, equal_nan: Union[str, bool], **kwargs: Any - ) -> Optional[_TestingErrorMeta]: - if equal_nan == "relaxed": - relaxed_complex_nan = True - equal_nan = True - else: - relaxed_complex_nan = False - - if actual.dtype not in (torch.complex32, torch.complex64, torch.complex128): - return check_tensors(actual, expected, equal_nan=equal_nan, **kwargs) - - if relaxed_complex_nan: - actual, expected = [ - t.clone().masked_fill( - t.real.isnan() | t.imag.isnan(), complex(float("NaN"), float("NaN")) # type: ignore[call-overload] - ) - for t in (actual, expected) - ] - - error_meta = check_tensors(actual.real, expected.real, equal_nan=equal_nan, **kwargs) - if error_meta: - return error_meta - - error_meta = check_tensors(actual.imag, expected.imag, equal_nan=equal_nan, **kwargs) - if error_meta: - return error_meta - - return None - - return wrapper - - def _check_sparse_coo_members_individually( check_tensors: Callable[..., Optional[_TestingErrorMeta]] ) -> Callable[..., Optional[_TestingErrorMeta]]: @@ -430,10 +384,24 @@ def append_difference(msg: str, *, type: str, difference: float, index: Tuple[in return msg.strip() +def _get_comparison_dtype(dtype: torch.dtype) -> torch.dtype: + """Selects the comparison dtype based on the input dtype. + + Returns: + Highest precision dtype of the same dtype category as the input. :class:`torch.bool` is treated as integral + dtype. + """ + if dtype.is_complex: + return torch.complex128 + elif dtype.is_floating_point: + return torch.float64 + else: + return torch.int64 + + @_check_quantized @_check_sparse_coo_members_individually @_check_sparse_csr_members_individually -@_check_complex_components_individually def _check_values_close( actual: Tensor, expected: Tensor, @@ -457,7 +425,7 @@ def _check_values_close( Returns: (Optional[AssertionError]): If check did not pass. """ - dtype = torch.float64 if actual.dtype.is_floating_point else torch.int64 + dtype = _get_comparison_dtype(actual.dtype) actual = actual.to(dtype) expected = expected.to(dtype) mismatches = ~torch.isclose(actual, expected, rtol=rtol, atol=atol, equal_nan=equal_nan) @@ -740,7 +708,7 @@ def assert_close( allow_subclasses: bool = True, rtol: Optional[float] = None, atol: Optional[float] = None, - equal_nan: Union[bool, str] = False, + equal_nan: bool = False, check_device: bool = True, check_dtype: bool = True, check_stride: bool = False, @@ -761,9 +729,6 @@ def assert_close( (``-inf`` and ``inf``) are only considered close if and only if they are equal. ``NaN``'s are only considered equal to each other if :attr:`equal_nan` is ``True``. - If :attr:`actual` and :attr:`expected` are complex-valued, they are considered close if both their real and - imaginary components are considered close according to the definition above. - If :attr:`actual` and :attr:`expected` are sparse (either having COO or CSR layout), their strided members are checked individually. Indices, namely ``indices`` for COO or ``crow_indices`` and ``col_indices`` for CSR layout, are always checked for equality whereas the values are checked for closeness according to the definition above. @@ -795,8 +760,7 @@ def assert_close( default values based on the :attr:`~torch.Tensor.dtype` are selected with the below table. atol (Optional[float]): Absolute tolerance. If specified :attr:`rtol` must also be specified. If omitted, default values based on the :attr:`~torch.Tensor.dtype` are selected with the below table. - equal_nan (Union[bool, str]): If ``True``, two ``NaN`` values will be considered equal. If ``"relaxed"``, - complex values are considered as ``NaN`` if either the real **or** imaginary component is ``NaN``. + equal_nan (Union[bool, str]): If ``True``, two ``NaN`` values will be considered equal. check_device (bool): If ``True`` (default), asserts that corresponding tensors are on the same :attr:`~torch.Tensor.device`. If this check is disabled, tensors on different :attr:`~torch.Tensor.device`'s are moved to the CPU before being compared. @@ -956,20 +920,6 @@ def assert_close( Relative difference: nan (up to 1.3e-06 allowed) >>> torch.testing.assert_close(actual, expected, equal_nan=True) - >>> # If equal_nan=True, the real and imaginary NaN's of complex inputs have to match. - >>> expected = torch.tensor(complex(float("NaN"), 0)) - >>> actual = torch.tensor(complex(0, float("NaN"))) - >>> torch.testing.assert_close(actual, expected, equal_nan=True) - Traceback (most recent call last): - ... - AssertionError: Scalars are not close! - - Absolute difference: nan (up to 1e-05 allowed) - Relative difference: nan (up to 1.3e-06 allowed) - >>> # If equal_nan="relaxed", however, then complex numbers are treated as NaN if any - >>> # of the real or imaginary components is NaN. - >>> torch.testing.assert_close(actual, expected, equal_nan="relaxed") - >>> expected = torch.tensor([1.0, 2.0, 3.0]) >>> actual = torch.tensor([1.0, 4.0, 5.0]) >>> # The default mismatch message can be overwritten. From e98173ff3423247c597e21c923c8f47470ef07ab Mon Sep 17 00:00:00 2001 From: Tanvir Zaman Date: Mon, 30 Aug 2021 12:56:15 -0700 Subject: [PATCH 355/530] Fix bytes_written and bytes_read (#64040) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64040 In operator cost inference functions, in many places we are using sizeof(x.data_type()). Since data_type() returns a 32 bit integer from [this enum](https://www.internalfb.com/code/fbsource/[15e7ffe4073cf08c61077c7c24a4839504b964a2]/fbcode/caffe2/caffe2/proto/caffe2.proto?lines=20), we are basically always getting 4 for sizeof(x.data_type()) no matter what actual data type x has. Big thanks to Jack Langman for specifically pointing to this bug. We would instead use the size in bytes based on actual data type. Test Plan: Added unit tests BatchMatMulMemCostTest: buck test //caffe2/caffe2/fb/fbgemm:batch_matmul_op_test -- BatchMatMulMemCostTest Extended existing unit test test_columnwise_concat for different data types: buck test //caffe2/caffe2/python/operator_test:concat_op_cost_test -- test_columnwise_concat Differential Revision: D30561459 fbshipit-source-id: 976fa5167097a35af548498480001aafd7851d93 --- caffe2/core/operator_schema.h | 17 ++- caffe2/operators/batch_matmul_op.cc | 113 ++++++++++-------- caffe2/operators/concat_split_op.cc | 15 ++- caffe2/operators/conv_pool_op_base.h | 15 ++- caffe2/operators/distance_op.cc | 28 +++-- caffe2/operators/fc_inference.cc | 22 ++-- caffe2/operators/one_hot_ops.cc | 30 +++-- caffe2/operators/utility_ops.cc | 13 +- .../operator_test/concat_op_cost_test.py | 54 +++++---- caffe2/python/workspace_test.py | 2 +- caffe2/sgd/adagrad_op.cc | 55 ++++++--- 11 files changed, 224 insertions(+), 140 deletions(-) diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index 64f5ef3ed883a..0d048eb8d26e9 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -6,12 +6,13 @@ #include #include #include -#include #include +#include #include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" +#include "caffe2/core/types.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/filler.h" #include "caffe2/utils/proto_utils.h" @@ -273,8 +274,8 @@ class TORCH_API OpSchema { OpSchema& Arg(const char* name, const char* description, bool required = false); -#define DECLARE_STANDARD_ARG(name, str) \ - static const char* Arg_##name; \ +#define DECLARE_STANDARD_ARG(name, str) \ + static const char* Arg_##name; \ OpSchema& Arg##name(const char* description); DECLARE_STANDARD_ARG(IsTest, is_test) @@ -339,7 +340,9 @@ class TORCH_API OpSchema { return inplace_enforced_(x, y); } - TORCH_API friend std::ostream& operator<<(std::ostream& out, const OpSchema& schema); + TORCH_API friend std::ostream& operator<<( + std::ostream& out, + const OpSchema& schema); const std::vector& args() const { return args_; @@ -562,8 +565,10 @@ OpSchema::Cost PointwiseCostInference( } c.flops = nElemX * OpsPerPoint; - c.bytes_read = nElemRead * sizeof(X.data_type()); - c.bytes_written = nElemX * sizeof(X.data_type()); + auto const& X_element_size_byte = + DataTypeToTypeMeta(X.data_type()).itemsize(); + c.bytes_read = nElemRead * X_element_size_byte; + c.bytes_written = nElemX * X_element_size_byte; return c; } diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc index 32799ced10671..205acf74f1572 100644 --- a/caffe2/operators/batch_matmul_op.cc +++ b/caffe2/operators/batch_matmul_op.cc @@ -1,6 +1,7 @@ #include "caffe2/operators/batch_matmul_op.h" #include "caffe2/core/operator_schema.h" +#include "caffe2/core/types.h" namespace caffe2 { @@ -116,9 +117,13 @@ OpSchema::Cost CostInferenceForBatchMatMul( K = in[0].dims(ndims_A - 1); } + auto const& A_element_size_byte = + DataTypeToTypeMeta(A.data_type()).itemsize(); + auto const& Y_element_size_byte = + DataTypeToTypeMeta(Y.data_type()).itemsize(); c.flops = 2 * nElemY * K; - c.bytes_read = (nElemA + nElemB) * sizeof(A.data_type()); - c.bytes_written = nElemY * sizeof(Y.data_type()); + c.bytes_read = (nElemA + nElemB) * A_element_size_byte; + c.bytes_written = nElemY * Y_element_size_byte; c.params_bytes = 0; return c; } @@ -180,72 +185,76 @@ class GetBatchMatMulGradient : public GradientMakerBase { auto no_trans_arg = vector(); auto trans_a_arg = vector{MakeArgument("trans_a", 1)}; auto trans_b_arg = vector{MakeArgument("trans_b", 1)}; - auto trans_both_arg = vector{MakeArgument("trans_a", 1), - MakeArgument("trans_b", 1)}; + auto trans_both_arg = vector{ + MakeArgument("trans_a", 1), MakeArgument("trans_b", 1)}; if (trans_a) { if (trans_b) { // A'B': // dA = B'G', dB = G'A' - return vector{CreateOperatorDef( - "BatchMatMul", - "", - vector{I(1), GO(0)}, - vector{GI(0)}, - trans_both_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(0)}, - vector{GI(1)}, - trans_both_arg)}; + return vector{ + CreateOperatorDef( + "BatchMatMul", + "", + vector{I(1), GO(0)}, + vector{GI(0)}, + trans_both_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(0)}, + vector{GI(1)}, + trans_both_arg)}; } else { // A'B: // dA = BG', dB = AG - return vector{CreateOperatorDef( - "BatchMatMul", - "", - vector{I(1), GO(0)}, - vector{GI(0)}, - trans_b_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{I(0), GO(0)}, - vector{GI(1)}, - no_trans_arg)}; + return vector{ + CreateOperatorDef( + "BatchMatMul", + "", + vector{I(1), GO(0)}, + vector{GI(0)}, + trans_b_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{I(0), GO(0)}, + vector{GI(1)}, + no_trans_arg)}; } } else { if (trans_b) { // AB': // dA = GB, dB = G'A - return vector{CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(1)}, - vector{GI(0)}, - no_trans_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(0)}, - vector{GI(1)}, - trans_a_arg)}; + return vector{ + CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(1)}, + vector{GI(0)}, + no_trans_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(0)}, + vector{GI(1)}, + trans_a_arg)}; } else { // AB: // dA = GB', dB = A'G - return vector{CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(1)}, - vector{GI(0)}, - trans_b_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{I(0), GO(0)}, - vector{GI(1)}, - trans_a_arg)}; + return vector{ + CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(1)}, + vector{GI(0)}, + trans_b_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{I(0), GO(0)}, + vector{GI(1)}, + trans_a_arg)}; } } } diff --git a/caffe2/operators/concat_split_op.cc b/caffe2/operators/concat_split_op.cc index 8eceb5ab4a577..8aa9e282adb84 100644 --- a/caffe2/operators/concat_split_op.cc +++ b/caffe2/operators/concat_split_op.cc @@ -101,9 +101,12 @@ OpSchema::Cost CostInferenceForSplit( CAFFE_ENFORCE_GT(in.size(), 0); struct OpSchema::Cost cost; cost.flops = 0; - auto input_bytes_count = nElemFromDim(in[0]) * sizeof(in[0].data_type()); - auto split_bytes_count = - (in.size() == 1) ? 0 : nElemFromDim(in[1]) * sizeof(in[1].data_type()); + auto const& input_0_element_size_byte = + DataTypeToTypeMeta(in[0].data_type()).itemsize(); + auto const& input_1_element_size_byte = + (in.size() > 1) ? DataTypeToTypeMeta(in[1].data_type()).itemsize() : 0; + auto input_bytes_count = nElemFromDim(in[0]) * input_0_element_size_byte; + auto split_bytes_count = nElemFromDim(in[1]) * input_1_element_size_byte; // There can be two input blobs: // (1) actual tensor to be split // (2) lengths of outputs along split axis @@ -329,11 +332,13 @@ OpSchema::Cost CostInferenceForConcat( } auto split_info_bytes_count = in.size() * sizeof(int); + auto const& input_0_element_size_byte = + DataTypeToTypeMeta(in[0].data_type()).itemsize(); struct OpSchema::Cost cost; cost.flops = 0; - cost.bytes_read = nElemRead * sizeof(in[0].data_type()); + cost.bytes_read = nElemRead * input_0_element_size_byte; cost.bytes_written = - size * sizeof(in[0].data_type()) + split_info_bytes_count; + size * input_0_element_size_byte + split_info_bytes_count; cost.params_bytes = 0; return cost; } diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h index 25bd99a92e50f..b356ef952d79c 100644 --- a/caffe2/operators/conv_pool_op_base.h +++ b/caffe2/operators/conv_pool_op_base.h @@ -7,6 +7,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" +#include "caffe2/core/types.h" #include "caffe2/proto/caffe2_legacy.pb.h" #include "caffe2/utils/math.h" @@ -519,14 +520,20 @@ class ConvPoolOpBase : public Operator { uint64_t nElemW = nElemFromDim(W); uint64_t nElemBias = inputs.size() > 2 ? nElemFromDim(inputs[2]) : 0; + auto const& X_elemenet_size_byte = + DataTypeToTypeMeta(X.data_type()).itemsize(); + auto const& Y_element_size_byte = + DataTypeToTypeMeta(Y.data_type()).itemsize(); + auto const& W_element_size_byte = + DataTypeToTypeMeta(W.data_type()).itemsize(); + // grouping is NOT properly handled yet c.flops = N * Y_t * Y_h * Y_w * kernel_t * kernel_w * kernel_h * in_channels * out_channels * 2; - c.bytes_read = (nElemX + nElemW + nElemBias) * sizeof(X.data_type()); - c.bytes_written = - N * out_channels * Y_t * Y_h * Y_w * sizeof(Y.data_type()); + c.bytes_read = (nElemX + nElemW + nElemBias) * X_elemenet_size_byte; + c.bytes_written = N * out_channels * Y_t * Y_h * Y_w * Y_element_size_byte; c.params_bytes = out_channels * in_channels * kernel_t * kernel_h * - kernel_w * sizeof(W.data_type()); + kernel_w * W_element_size_byte; return c; } diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc index 1529534d8fb2e..9ea8eea5a2725 100644 --- a/caffe2/operators/distance_op.cc +++ b/caffe2/operators/distance_op.cc @@ -1,4 +1,5 @@ #include "caffe2/operators/distance_op.h" +#include "caffe2/core/types.h" #include "caffe2/utils/eigen_utils.h" #ifdef CAFFE2_USE_MKLDNN #include @@ -7,7 +8,7 @@ namespace caffe2 { -template<> +template <> bool SquaredL2DistanceOp::RunOnDevice() { auto& X = Input(0); auto& Y = Input(1); @@ -257,7 +258,9 @@ OpSchema::Cost CostInferenceForDotProduct( CAFFE_ENFORCE_EQ(out[0].dims().size(), 1); struct OpSchema::Cost c = PointwiseCostInference<2>(def, in); - c.bytes_written = out[0].dims(0) * sizeof(out[0].data_type()); + auto const& out_0_element_size_byte = + DataTypeToTypeMeta(out[0].data_type()).itemsize(); + c.bytes_written = out[0].dims(0) * out_0_element_size_byte; c.params_bytes = 0; return c; } @@ -379,10 +382,12 @@ bool DotProductWithPaddingOp::RunOnDevice() { } // L2 -REGISTER_CPU_OPERATOR(SquaredL2Distance, - SquaredL2DistanceOp); -REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient, - SquaredL2DistanceGradientOp); +REGISTER_CPU_OPERATOR( + SquaredL2Distance, + SquaredL2DistanceOp); +REGISTER_CPU_OPERATOR( + SquaredL2DistanceGradient, + SquaredL2DistanceGradientOp); OPERATOR_SCHEMA(SquaredL2Distance) .NumInputs(2) @@ -402,7 +407,8 @@ class GetSquaredL2DistanceGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; vector GetGradientDefs() override { return SingleGradientDef( - "SquaredL2DistanceGradient", "", + "SquaredL2DistanceGradient", + "", vector{I(0), I(1), GO(0)}, vector{GI(0), GI(1)}); } @@ -762,9 +768,9 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase { replicate = GetArgument(Def(), "replicate").i(); } - const auto dot_arg = - vector{MakeArgument("pad_value", pad_value), - MakeArgument("replicate", replicate)}; + const auto dot_arg = vector{ + MakeArgument("pad_value", pad_value), + MakeArgument("replicate", replicate)}; return SingleGradientDef( "DotProductWithPaddingGradient", @@ -775,4 +781,4 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase { } }; REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient); -} // namespace caffe2 +} // namespace caffe2 diff --git a/caffe2/operators/fc_inference.cc b/caffe2/operators/fc_inference.cc index a44c230980c7f..ba1b7122cdc9d 100644 --- a/caffe2/operators/fc_inference.cc +++ b/caffe2/operators/fc_inference.cc @@ -1,4 +1,5 @@ #include "caffe2/operators/fc_inference.h" +#include "caffe2/core/types.h" namespace caffe2 { std::vector FCShapeInference( @@ -51,11 +52,12 @@ OpSchema::Cost CostInferenceForFC( ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1])) : size_to_dim_(canonical_axis_w, GetDimsVector(in[1])); - const auto& X = in[0]; + auto const& X_element_size_byte = + DataTypeToTypeMeta(in[0].data_type()).itemsize(); c.flops = M * N * (2 * K + 1); - c.bytes_read = (K * (M + N) + N) * sizeof(X.data_type()); - c.bytes_written = M * N * sizeof(X.data_type()); - c.params_bytes = (K * N + N) * sizeof(X.data_type()); + c.bytes_read = (K * (M + N) + N) * X_element_size_byte; + c.bytes_written = M * N * X_element_size_byte; + c.params_bytes = (K * N + N) * X_element_size_byte; return c; } @@ -94,7 +96,11 @@ OpSchema::Cost CostInferenceForFCGradient( CAFFE_ENFORCE_LT(0, out.size()); const TensorShape dW = out[0]; + auto const& dW_element_size_byte = + DataTypeToTypeMeta(dW.data_type()).itemsize(); const TensorShape db = out[1]; + auto const& db_element_size_byte = + DataTypeToTypeMeta(db.data_type()).itemsize(); auto axis = helper.GetSingleArgument("axis", 1); const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size()); @@ -111,15 +117,17 @@ OpSchema::Cost CostInferenceForFCGradient( uint64_t size_db = nElemFromDim(db); c.flops = M * N * (2 * K + 1); - c.bytes_written = (size_dW + size_db) * sizeof(float); + c.bytes_written = + size_dW * dW_element_size_byte + size_db * db_element_size_byte; c.params_bytes = (K * N + N) * sizeof(float); if (out.size() == 3) { const TensorShape dX = out[2]; uint64_t size_dX = nElemFromDim(dX); - + auto const& dX_element_size_byte = + DataTypeToTypeMeta(dX.data_type()).itemsize(); c.flops += 2 * M * N * K; - c.bytes_written += size_dX * sizeof(float); + c.bytes_written += size_dX * dX_element_size_byte; } return c; } diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc index c3eaf05db0e8f..55c73a5be22c4 100644 --- a/caffe2/operators/one_hot_ops.cc +++ b/caffe2/operators/one_hot_ops.cc @@ -2,6 +2,7 @@ #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" +#include "caffe2/core/types.h" namespace caffe2 { @@ -78,12 +79,21 @@ OpSchema::Cost CostInferenceForBatchOneHot( const auto& length = in[1]; const auto& values = in[2]; - uint64_t nBytesData = nElemFromDim(data) * sizeof(data.data_type()); - uint64_t nBytesLength = nElemFromDim(length) * sizeof(length.data_type()); - uint64_t nBytesValues = nElemFromDim(values) * sizeof(values.data_type()); + auto const& data_element_size_byte = + DataTypeToTypeMeta(data.data_type()).itemsize(); + auto const& length_element_size_byte = + DataTypeToTypeMeta(length.data_type()).itemsize(); + auto const& values_element_size_byte = + DataTypeToTypeMeta(values.data_type()).itemsize(); + auto const& output_element_size_byte = + DataTypeToTypeMeta(output.data_type()).itemsize(); + + uint64_t nBytesData = nElemFromDim(data) * data_element_size_byte; + uint64_t nBytesLength = nElemFromDim(length) * length_element_size_byte; + uint64_t nBytesValues = nElemFromDim(values) * values_element_size_byte; c.flops = 0; c.bytes_read = nBytesData + nBytesLength + nBytesValues; - c.bytes_written = nElemFromDim(output) * sizeof(output.data_type()); + c.bytes_written = nElemFromDim(output) * output_element_size_byte; c.params_bytes = 0; return c; } @@ -145,15 +155,15 @@ bool BatchBucketOneHotOp::RunOnDevice() { for (int64_t j = 0; j < D; j++) { // here we assume the boundary values for each feature are sorted int64_t lower_bucket_idx = std::lower_bound( - boundaries_offset, - boundaries_offset + lens_data[j], - input_data[pos]) - + boundaries_offset, + boundaries_offset + lens_data[j], + input_data[pos]) - boundaries_offset; int64_t upper_bucket_idx = std::upper_bound( - boundaries_offset, - boundaries_offset + lens_data[j], - input_data[pos]) - + boundaries_offset, + boundaries_offset + lens_data[j], + input_data[pos]) - boundaries_offset; int64_t bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2; diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index 8b5e116024b81..561da9189b388 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -1,6 +1,7 @@ #include "caffe2/operators/utility_ops.h" #include #include +#include "caffe2/core/types.h" #include "caffe2/utils/eigen_utils.h" namespace caffe2 { @@ -34,9 +35,11 @@ OpSchema::Cost CostInferenceForWeightedSum( const auto& nElem = nElemFromDim(X0); const auto& nInputs = in.size(); c.flops = (nInputs - 1) * nElem; - c.bytes_read = (nInputs / 2) * (nElem + 1) * sizeof(X0.data_type()); - c.bytes_written = nElem * sizeof(X0.data_type()); - c.params_bytes = (nInputs / 2) * sizeof(X0.data_type()); + auto const& X0_element_size_byte = + DataTypeToTypeMeta(X0.data_type()).itemsize(); + c.bytes_read = (nInputs / 2) * (nElem + 1) * X0_element_size_byte; + c.bytes_written = nElem * X0_element_size_byte; + c.params_bytes = (nInputs / 2) * X0_element_size_byte; return c; } @@ -48,9 +51,7 @@ REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp); REGISTER_CPU_OPERATOR(SumInt, SumOp); REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp); REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp); -REGISTER_CPU_OPERATOR( - ScatterWeightedSum, - ScatterWeightedSumOp); +REGISTER_CPU_OPERATOR(ScatterWeightedSum, ScatterWeightedSumOp); REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp); REGISTER_CPU_OPERATOR(Scatter, ScatterOp); diff --git a/caffe2/python/operator_test/concat_op_cost_test.py b/caffe2/python/operator_test/concat_op_cost_test.py index 996b330be4947..7dab4d6bd5d1f 100644 --- a/caffe2/python/operator_test/concat_op_cost_test.py +++ b/caffe2/python/operator_test/concat_op_cost_test.py @@ -7,33 +7,39 @@ class TestConcatOpCost(TestCase): def test_columnwise_concat(self): - workspace.ResetWorkspace() - workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)) - workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=np.int32)) - concat_op = core.CreateOperator( - "Concat", - ["input_1", "input_2"], - ["output", "split_info"], - ) - workspace.RunOperatorOnce(concat_op) + def _test_columnwise_concat_for_type(dtype): + workspace.ResetWorkspace() + workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)) + workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=dtype)) + concat_op = core.CreateOperator( + "Concat", + ["input_1", "input_2"], + ["output", "split_info"], + ) + workspace.RunOperatorOnce(concat_op) - output = workspace.FetchBlob("output") - self.assertTupleEqual(output.shape, (2, 4)) - np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]]) + output = workspace.FetchBlob("output") + self.assertTupleEqual(output.shape, (2, 4)) + np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]]) - flops, bytes_written, bytes_read = workspace.GetOperatorCost( - concat_op, concat_op.input - ) + flops, bytes_written, bytes_read = workspace.GetOperatorCost( + concat_op, concat_op.input + ) - self.assertEqual(flops, 0) - self.assertEqual( - bytes_read, - sum(workspace.FetchBlob(b).nbytes for b in concat_op.input), - ) - self.assertEqual( - bytes_written, - sum(workspace.FetchBlob(b).nbytes for b in concat_op.output), - ) + self.assertEqual(flops, 0) + self.assertEqual( + bytes_read, + sum(workspace.FetchBlob(b).nbytes for b in concat_op.input), + ) + self.assertEqual( + bytes_written, + sum(workspace.FetchBlob(b).nbytes for b in concat_op.output), + ) + + [ + _test_columnwise_concat_for_type(t) + for t in [np.int64, np.float, np.half, np.int8] + ] def test_split_then_concat(self): workspace.ResetWorkspace() diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index afb2065027075..1bf7b607e1b7e 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -60,7 +60,7 @@ def testGetOperatorCost(self): self.assertTupleEqual( op_cost, namedtuple("Cost", ["flops", "bytes_written", "bytes_read"])( - 1152, 256, 2084 + 1152, 256, 4168 ), ) diff --git a/caffe2/sgd/adagrad_op.cc b/caffe2/sgd/adagrad_op.cc index 0de50f03e62d5..0b6f604b48cdb 100644 --- a/caffe2/sgd/adagrad_op.cc +++ b/caffe2/sgd/adagrad_op.cc @@ -1,4 +1,5 @@ #include "adagrad_op.h" +#include "caffe2/core/types.h" namespace caffe2 { @@ -23,22 +24,30 @@ static OpSchema::Cost CostInferenceForAdagrad( // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = grad_size * 10; + auto const& moment_element_size_byte = + DataTypeToTypeMeta(moment.data_type()).itemsize(); + auto const& param_element_size_byte = + DataTypeToTypeMeta(param.data_type()).itemsize(); + auto const& grad_element_size_byte = + DataTypeToTypeMeta(grad.data_type()).itemsize(); + auto const& lr_element_size_byte = + DataTypeToTypeMeta(lr.data_type()).itemsize(); uint64_t bytes_written = - grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type())); + grad_size * param_element_size_byte + moment_element_size_byte; if (output_size == 3) { // also need to output effective learning rate in this case // assume it's the same data type as lr - bytes_written += grad_size * sizeof(lr.data_type()); + bytes_written += grad_size * lr_element_size_byte; } else if (output_size == 4) { // also need to output effective learning rate and updates in this case // assume update is the same data type as param bytes_written += - grad_size * (sizeof(lr.data_type()) + sizeof(param.data_type())); + grad_size * (lr_element_size_byte + param_element_size_byte); } c.bytes_written = bytes_written; c.bytes_read = c.bytes_written + - grad_size * (sizeof(grad.data_type()) + sizeof(lr.data_type())); + grad_size * (grad_element_size_byte + lr_element_size_byte); return c; } @@ -102,10 +111,18 @@ static OpSchema::Cost CostInferenceForSparseAdagrad( // (optimistically count sqrt as one flop). // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = grad_size * 7; + auto const& param_element_size_byte = + DataTypeToTypeMeta(param.data_type()).itemsize(); + auto const& moment_element_size_byte = + DataTypeToTypeMeta(moment.data_type()).itemsize(); c.bytes_written = - grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type())); - c.bytes_read = c.bytes_written + grad_size * sizeof(grad.data_type()) + - n * sizeof(indices.data_type()); + grad_size * (param_element_size_byte + moment_element_size_byte); + auto const& grad_element_size_byte = + DataTypeToTypeMeta(grad.data_type()).itemsize(); + auto const& indices_element_size_byte = + DataTypeToTypeMeta(indices.data_type()).itemsize(); + c.bytes_read = c.bytes_written + grad_size * grad_element_size_byte + + n * indices_element_size_byte; return c; } @@ -153,6 +170,16 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad( OpSchema::Cost c; if (n > 0) { + auto const& param_element_size_byte = + DataTypeToTypeMeta(param.data_type()).itemsize(); + auto const& moment_element_size_byte = + DataTypeToTypeMeta(moment.data_type()).itemsize(); + auto const& grad_element_size_byte = + DataTypeToTypeMeta(grad.data_type()).itemsize(); + auto const& indices_element_size_byte = + DataTypeToTypeMeta(indices.data_type()).itemsize(); + auto const& lr_element_size_byte = + DataTypeToTypeMeta(lr.data_type()).itemsize(); auto block_size = grad_size / n; if (block_size == 1) { // +2: applying weight decay and add to grads @@ -161,22 +188,22 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad( // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = n * 9; c.bytes_written = - n * (sizeof(param.data_type()) + sizeof(moment.data_type())); + n * (param_element_size_byte + moment_element_size_byte); c.bytes_read = c.bytes_written + n * - (sizeof(grad.data_type()) + sizeof(indices.data_type()) + - sizeof(lr.data_type())); + (grad_element_size_byte + indices_element_size_byte + + lr_element_size_byte); } else { // 5 per block (not counting index transforms) // 8 for each value of a block // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = n * (5 + (block_size * 8)); - c.bytes_written = - n * sizeof(moment.data_type()) + n * block_size * (param.data_type()); + c.bytes_written = n * moment_element_size_byte + + n * block_size * param_element_size_byte; - c.bytes_read = c.bytes_written + n * (sizeof(lr.data_type())) + + c.bytes_read = c.bytes_written + n * lr_element_size_byte + 2 * n * block_size * - (sizeof(grad.data_type()) + sizeof(param.data_type())); + (grad_element_size_byte + param_element_size_byte); } } return c; From f3e329cbec5f4f32e195bbe3b8b5b4d2b1323128 Mon Sep 17 00:00:00 2001 From: lezcano Date: Mon, 30 Aug 2021 13:10:23 -0700 Subject: [PATCH 356/530] Implements the orthogonal parametrization (#62089) Summary: Implements an orthogonal / unitary parametrisation. It does passes the tests and I have trained a couple models with this implementation, so I believe it should be somewhat correct. Now, the implementation is very subtle. I'm tagging nikitaved and IvanYashchuk as reviewers in case they have comments / they see some room for optimisation of the code, in particular of the `forward` function. Fixes https://github.com/pytorch/pytorch/issues/42243 Pull Request resolved: https://github.com/pytorch/pytorch/pull/62089 Reviewed By: ezyang Differential Revision: D30639063 Pulled By: albanD fbshipit-source-id: 988664f333ac7a75ce71ba44c8d77b986dff2fe6 --- docs/source/nn.rst | 3 +- test/test_nn.py | 133 +++++++++++++ torch/nn/utils/parametrizations.py | 298 +++++++++++++++++++++++++++-- torch/nn/utils/parametrize.py | 73 ++++--- 4 files changed, 464 insertions(+), 43 deletions(-) diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 07ce4db2f48af..6eca9d4b16b6a 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -389,6 +389,7 @@ in :func:`torch.nn.utils.parameterize.register_parametrization`. :toctree: generated :nosignatures: + parametrizations.orthogonal parametrizations.spectral_norm Utility functions to parametrize Tensors on existing Modules. @@ -396,7 +397,7 @@ Note that these functions can be used to parametrize a given Parameter or Buffer given a specific function that maps from an input space to the parametrized space. They are not parameterizations that would transform an object into a parameter. See the -`Parametrizations `__ tutorial +`Parametrizations tutorial `_ for more information on how to implement your own parametrizations. .. autosummary:: diff --git a/test/test_nn.py b/test/test_nn.py index c9815dbf2ee0e..c6d0e78044126 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -4518,6 +4518,139 @@ def test_spectral_norm_pickle(self): m = pickle.loads(pickle.dumps(m)) self.assertIsInstance(m, nn.Linear) + def test_orthogonal_parametrization(self): + # Orthogonal implements 6 algorithms (3x parametrizations times 2 options of use_trivialization) + + def assert_is_orthogonal(X): + n, k = X.size(-2), X.size(-1) + if n < k: + X = X.transpose(-2, -1) + n, k = k, n + Id = torch.eye(k, dtype=X.dtype, device=X.device).expand(*(X.size()[:-2]), k, k) + eps = 10 * n * torch.finfo(X.dtype).eps + torch.testing.assert_allclose(X.transpose(-2, -1).conj() @ X, Id, atol=eps, rtol=0.) + + + def assert_weight_allclose_Q(weight, W): + # Test that weight is equal to the Q part of the QR decomposition of W + # (or of its transpose if the matrix is wide) + wide_matrix = W.size(-2) < W.size(-1) + if wide_matrix: + W = W.transpose(-2, -1) + Q, R = torch.linalg.qr(W) + Q *= R.diagonal(dim1=-2, dim2=-1).sgn().unsqueeze(-2) + if wide_matrix: + Q = Q.transpose(-2, -1) + torch.testing.assert_allclose(Q, weight, atol=1e-5, rtol=0.) + + + for shape, dtype, use_linear in product(((4, 4), (5, 3), (3, 5)), # square/ tall / wide + (torch.float32, torch.complex64), + (True, False)): + # Conv2d does not support complex yet + if not use_linear and dtype.is_complex: + continue + + if use_linear: + input = torch.randn(3, shape[0], dtype=dtype) + else: + input = torch.randn(2, 2, shape[0] + 2, shape[1] + 1, dtype=dtype) + + for parametrization, use_trivialization in product(("matrix_exp", "cayley", "householder"), + (False, True)): + # right_inverse for Cayley and matrix_exp not implemented for use_trivialization=False + # See Note [right_inverse expm cayley] + can_initialize = use_trivialization or parametrization == "householder" + + # We generate them every time to always start with fresh weights + if use_linear: + m = nn.Linear(*shape, dtype=dtype) + else: + m = nn.Conv2d(2, 3, shape, dtype=dtype) + + # We do not support householder for complex inputs + # See Note [Householder complex] + w_init = m.weight.clone() + if parametrization == "householder" and m.weight.is_complex(): + msg = "householder parametrization does not support complex tensors" + with self.assertRaisesRegex(ValueError, msg): + torch.nn.utils.parametrizations.orthogonal(m, + "weight", + parametrization, + use_trivialization=use_trivialization) + continue + + wide_matrix = w_init.size(-2) < w_init.size(-1) + torch.nn.utils.parametrizations.orthogonal(m, + "weight", + parametrization, + use_trivialization=use_trivialization) + # Forwards works as expected + self.assertEqual(w_init.shape, m.weight.shape) + assert_is_orthogonal(m.weight) + if can_initialize: + assert_weight_allclose_Q(m.weight, w_init) + + # Intializing with a given orthogonal matrix works + X = torch.randn_like(m.weight) + if wide_matrix: + X = X.transpose(-2, -1) + w_new = torch.linalg.qr(X).Q + if wide_matrix: + w_new = w_new.transpose(-2, -1) + if can_initialize: + m.weight = w_new + torch.testing.assert_allclose(w_new, m.weight, atol=1e-5, rtol=0.) + else: + msg = "assign to the matrix exponential or the Cayley parametrization" + with self.assertRaisesRegex(NotImplementedError, msg): + m.weight = w_new + + # Intializing with a non-orthogonal matrix makes m.weight be the Q part of the given matrix + w_new = torch.randn_like(m.weight) + if can_initialize: + m.weight = w_new + assert_weight_allclose_Q(m.weight, w_new) + else: + msg = "assign to the matrix exponential or the Cayley parametrization" + with self.assertRaisesRegex(NotImplementedError, msg): + m.weight = w_new + + opt = torch.optim.SGD(m.parameters(), lr=0.1) + for _ in range(2): + opt.zero_grad() + m(input).norm().backward() + grad = m.parametrizations.weight.original.grad + self.assertIsNotNone(grad) + # We do not update the upper triangular part of the matrix if tall tril if wide + if grad.size(-2) >= grad.size(-1): + zeros_grad = grad.triu(1) + else: + zeros_grad = grad.tril(-1) + self.assertEqual(zeros_grad, torch.zeros_like(zeros_grad)) + # The gradient in the diagonal can only be imaginary because a skew-Hermitian + # matrix has imaginary diagonal + diag_grad = grad.diagonal(dim1=-2, dim2=-1) + if grad.is_complex(): + diag_grad = diag_grad.real + self.assertEqual(diag_grad, torch.zeros_like(diag_grad)) + opt.step() + assert_is_orthogonal(m.weight) + + def test_orthogonal_errors(self): + m = nn.Linear(3, 4) + with self.assertRaisesRegex(ValueError, "has to be one of"): + torch.nn.utils.parametrizations.orthogonal(m, "weight", "foo") + + with self.assertRaisesRegex(ValueError, "Expected a matrix"): + torch.nn.utils.parametrizations.orthogonal(m, "bias") + + torch.nn.utils.parametrizations.orthogonal(m, "weight") + with self.assertRaisesRegex(ValueError, "matrices of shape"): + m.weight = torch.randn(5, 5) + torch.nn.utils.parametrize.remove_parametrizations(m, "weight") + + def test_threshold_int(self): x = torch.tensor([-3, -2, -1, 0, 1, 2, 3]) expected = torch.tensor([99, 99, 99, 99, 1, 2, 3]) diff --git a/torch/nn/utils/parametrizations.py b/torch/nn/utils/parametrizations.py index de3d5c7144f9e..de67aa814f39c 100644 --- a/torch/nn/utils/parametrizations.py +++ b/torch/nn/utils/parametrizations.py @@ -1,10 +1,286 @@ +from enum import Enum, auto + import torch +from torch import Tensor from ..utils import parametrize from ..modules import Module from .. import functional as F from typing import Optional + +def _is_orthogonal(Q, eps=None): + n, k = Q.size(-2), Q.size(-1) + Id = torch.eye(k, dtype=Q.dtype, device=Q.device) + # A reasonable eps, but not too large + eps = 10. * n * torch.finfo(Q.dtype).eps + return torch.allclose(Q.transpose(-2, -1).conj() @ Q, Id, atol=eps) + + +def _make_orthogonal(A): + """ Assume that A is a tall matrix. + Compute the Q factor s.t. A = QR (A may be complex) and diag(R) is real and non-negative + """ + X, tau = torch.geqrf(A) + Q = torch.linalg.householder_product(X, tau) + # The diagonal of X is the diagonal of R (which is always real) so we normalise by its signs + Q *= X.diagonal(dim1=-2, dim2=-1).sgn().unsqueeze(-2) + return Q + + +class _OrthMaps(Enum): + matrix_exp = auto() + cayley = auto() + householder = auto() + + +class _Orthogonal(Module): + base: Tensor + + def __init__(self, + weight, + orthogonal_map: _OrthMaps, + *, + use_trivialization=True) -> None: + super().__init__() + + # Note [Householder complex] + # For complex tensors, it is not possible to compute the tensor `tau` necessary for + # linalg.householder_product from the reflectors. + # To see this, note that the reflectors have a shape like: + # 0 0 0 + # * 0 0 + # * * 0 + # which, for complex matrices, give n(n-1) (real) parameters. Now, you need n^2 parameters + # to parametrize the unitary matrices. Saving tau on its own does not work either, because + # not every combination of `(A, tau)` gives a unitary matrix, meaning that if we optimise + # them as independent tensors we would not maintain the constraint + # An equivalent reasoning holds for rectangular matrices + if weight.is_complex() and orthogonal_map == _OrthMaps.householder: + raise ValueError("The householder parametrization does not support complex tensors.") + + self.shape = weight.shape + self.orthogonal_map = orthogonal_map + if use_trivialization: + self.register_buffer("base", None) + + def forward(self, X: torch.Tensor) -> torch.Tensor: + n, k = X.size(-2), X.size(-1) + transposed = n < k + if transposed: + X = X.transpose(-2, -1) + n, k = k, n + # Here n > k and X is a tall matrix + if self.orthogonal_map == _OrthMaps.matrix_exp or self.orthogonal_map == _OrthMaps.cayley: + # We just need n x k - k(k-1)/2 parameters + X = X.tril() + if n != k: + # Embed into a square matrix + X = torch.cat([X, X.new_zeros(n, n - k).expand(*X.shape[:-2], -1, -1)], dim=-1) + A = X - X.transpose(-2, -1).conj() + # A is skew-symmetric (or skew-hermitian) + if self.orthogonal_map == _OrthMaps.matrix_exp: + Q = torch.matrix_exp(A) + elif self.orthogonal_map == _OrthMaps.cayley: + # Computes the Cayley retraction (I+A/2)(I-A/2)^{-1} + Id = torch.eye(n, dtype=A.dtype, device=A.device) + Q = torch.linalg.solve(torch.add(Id, A, alpha=-0.5), torch.add(Id, A, alpha=0.5)) + # Q is now orthogonal (or unitary) of size (..., n, n) + if n != k: + Q = Q[..., :k] + # Q is now the size of the X (albeit perhaps transposed) + else: + # X is real here, as we do not support householder with complex numbers + A = X.tril(diagonal=-1) + tau = 2. / (1. + (A * A).sum(dim=-2)) + Q = torch.linalg.householder_product(A, tau) + # The diagonal of X is 1's and -1's + # We do not want to differentiate through this or update the diagonal of X hence the casting + Q = Q * X.diagonal(dim1=-2, dim2=-1).int().unsqueeze(-2) + + if hasattr(self, "base"): + Q = self.base @ Q + if transposed: + Q = Q.transpose(-2, -1) + return Q + + @torch.autograd.no_grad() + def right_inverse(self, Q: torch.Tensor) -> torch.Tensor: + if Q.shape != self.shape: + raise ValueError(f"Expected a matrix or batch of matrices of shape {self.shape}. " + f"Got a tensor of shape {Q.shape}.") + + Q_init = Q + n, k = Q.size(-2), Q.size(-1) + transpose = n < k + if transpose: + Q = Q.transpose(-2, -1) + n, k = k, n + + # We always make sure to always copy Q in every path + if not hasattr(self, "base"): + # Note [right_inverse expm cayley] + # If we do not have use_trivialization=True, we just implement the inverse of the forward + # map for the Householder. To see why, think that for the Cayley map, + # we would need to find the matrix X \in R^{n x k} such that: + # Y = torch.cat([X.tril(), X.new_zeros(n, n - k).expand(*X.shape[:-2], -1, -1)], dim=-1) + # A = Y - Y.transpose(-2, -1).conj() + # cayley(A)[:, :k] + # gives the original tensor. It is not clear how to do this. + # Perhaps via some algebraic manipulation involving the QR like that of + # Corollary 2.2 in Edelman, Arias and Smith? + if self.orthogonal_map == _OrthMaps.cayley or self.orthogonal_map == _OrthMaps.matrix_exp: + raise NotImplementedError("It is not possible to assign to the matrix exponential " + "or the Cayley parametrizations when use_trivialization=False.") + + # If parametrization == _OrthMaps.householder, make Q orthogonal via the QR decomposition. + # Here Q is always real because we do not support householder and complex matrices. + # See note [Householder complex] + A, tau = torch.geqrf(Q) + # We want to have a decomposition X = QR with diag(R) > 0, as otherwise we could + # decompose an orthogonal matrix Q as Q = (-Q)@(-Id), which is a valid QR decomposition + # The diagonal of Q is the diagonal of R from the qr decomposition + A.diagonal(dim1=-2, dim2=-1).sign_() + # Equality with zero is ok because LAPACK returns exactly zero when it does not want + # to use a particular reflection + A.diagonal(dim1=-2, dim2=-1)[tau == 0.] *= -1 + return A.transpose(-2, -1) if transpose else A + else: + if n == k: + # We check whether Q is orthogonal + if not _is_orthogonal(Q): + Q = _make_orthogonal(Q) + else: # Is orthogonal + Q = Q.clone() + else: + # Complete Q into a full n x n orthogonal matrix + N = torch.randn(*(Q.size()[:-2] + (n, n - k)), dtype=Q.dtype, device=Q.device) + Q = torch.cat([Q, N], dim=-1) + Q = _make_orthogonal(Q) + self.base = Q + + # It is necessary to return the -Id, as we use the diagonal for the + # Householder parametrization. Using -Id makes: + # householder(torch.zeros(m,n)) == torch.eye(m,n) + # Poor man's version of eye_like + neg_Id = torch.zeros_like(Q_init) + neg_Id.diagonal(dim1=-2, dim2=-1).fill_(-1.) + return neg_Id + + +def orthogonal(module: Module, + name: str = 'weight', + orthogonal_map: Optional[str] = None, + *, + use_trivialization: bool = True) -> Module: + r"""Applies an orthogonal or unitary parametrization to a matrix or a batch of matrices. + + Letting :math:`\mathbb{K}` be :math:`\mathbb{R}` or :math:`\mathbb{C}`, the parametrized + matrix :math:`Q \in \mathbb{K}^{m \times n}` is **orthogonal** as + + .. math:: + + \begin{align*} + Q^{\text{H}}Q &= \mathrm{I}_n \mathrlap{\qquad \text{if }m \geq n}\\ + QQ^{\text{H}} &= \mathrm{I}_m \mathrlap{\qquad \text{if }m < n} + \end{align*} + + where :math:`Q^{\text{H}}` is the conjugate transpose when :math:`Q` is complex + and the transpose when :math:`Q` is real-valued, and + :math:`\mathrm{I}_n` is the `n`-dimensional identity matrix. + In plain words, :math:`Q` will have orthonormal columns whenever :math:`m \geq n` + and orthonormal rows otherwise. + + If the tensor has more than two dimensions, we consider it as a batch of matrices of shape `(..., m, n)`. + + The matrix :math:`Q` may be parametrized via three different ``orthogonal_map`` in terms of the original tensor: + + - ``"matrix_exp"``/``"cayley"``: + the :func:`~torch.matrix_exp` :math:`Q = \exp(A)` and the `Cayley map`_ + :math:`Q = (\mathrm{I}_n + A/2)(\mathrm{I}_n - A/2)^{-1}` are applied to a skew-symmetric + :math:`A` to give an orthogonal matrix. + - ``"householder"``: computes a product of Householder reflectors + (:func:`~torch.linalg.householder_product`). + + ``"matrix_exp"``/``"cayley"`` often make the parametrized weight converge faster than + ``"householder"``, but they are slower to compute for very thin or very wide matrices. + + If ``use_trivialization=True`` (default), the parametrization implements the "Dynamic Trivialization Framework", + where an extra matrix :math:`B \in \mathbb{K}^{n \times n}` is stored under + ``module.parametrizations.weight[0].base``. This helps the + convergence of the parametrized layer at the expense of some extra memory use. + See `Trivializations for Gradient-Based Optimization on Manifolds`_ . + + Initial value of :math:`Q`: + If the original tensor is not parametrized and ``use_trivialization=True`` (default), the initial value + of :math:`Q` is that of the original tensor if it is orthogonal (or unitary in the complex case) + and it is orthogonalized via the QR decomposition otherwise (see :func:`torch.linalg.qr`). + Same happens when it is not parametrized and ``orthogonal_map="householder"`` even when ``use_trivialization=False``. + Otherwise, the initial value is the result of the composition of all the registered + parametrizations applied to the original tensor. + + .. note:: + This function is implemented using the parametrization functionality + in :func:`~torch.nn.utils.parametrize.register_parametrization`. + + + .. _`Cayley map`: https://en.wikipedia.org/wiki/Cayley_transform#Matrix_map + .. _`Trivializations for Gradient-Based Optimization on Manifolds`: https://arxiv.org/abs/1909.09501 + + Args: + module (nn.Module): module on which to register the parametrization. + name (str, optional): name of the tensor to make orthogonal. Default: ``"weight"``. + orthogonal_map (str, optional): One of the following: ``"matrix_exp"``, ``"cayley"``, ``"householder"``. + Default: ``"matrix_exp"`` if the matrix is square or complex, ``"householder"`` otherwise. + use_trivialization (bool, optional): whether to use the dynamic trivialization framework. + Default: ``True``. + + Returns: + The original module with an orthogonal parametrization registered to the specified + weight + + Example:: + + >>> orth_linear = orthogonal(nn.Linear(20, 40)) + >>> orth_linear + ParametrizedLinear( + in_features=20, out_features=40, bias=True + (parametrizations): ModuleDict( + (weight): ParametrizationList( + (0): _Orthogonal() + ) + ) + ) + >>> Q = orth_linear.weight + >>> torch.dist(Q.T @ Q, torch.eye(20)) + tensor(4.9332e-07) + """ + weight = getattr(module, name, None) + if not isinstance(weight, Tensor): + raise ValueError( + "Module '{}' has no parameter ot buffer with name '{}'".format(module, name) + ) + + # We could implement this for 1-dim tensors as the maps on the sphere + # but I believe it'd bite more people than it'd help + if weight.ndim < 2: + raise ValueError("Expected a matrix or batch of matrices. " + f"Got a tensor of {weight.ndim} dimensions.") + + if orthogonal_map is None: + orthogonal_map = "matrix_exp" if weight.size(-2) == weight.size(-1) or weight.is_complex() else "householder" + + orth_enum = getattr(_OrthMaps, orthogonal_map, None) + if orth_enum is None: + raise ValueError('orthogonal_map has to be one of "matrix_exp", "cayley", "householder". ' + f'Got: {orthogonal_map}') + orth = _Orthogonal(weight, + orth_enum, + use_trivialization=use_trivialization) + parametrize.register_parametrization(module, name, orth, unsafe=True) + return module + + class _SpectralNorm(Module): def __init__( self, @@ -147,8 +423,8 @@ def spectral_norm(module: Module, .. _`Spectral Normalization for Generative Adversarial Networks`: https://arxiv.org/abs/1802.05957 .. note:: - This function is implemented using the new parametrization functionality - in :func:`torch.nn.utils.parametrize.register_parametrization`. It is a + This function is implemented using the parametrization functionality + in :func:`~torch.nn.utils.parametrize.register_parametrization`. It is a reimplementation of :func:`torch.nn.utils.spectral_norm`. .. note:: @@ -165,13 +441,13 @@ def spectral_norm(module: Module, Args: module (nn.Module): containing module - name (str, optional): name of weight parameter + name (str, optional): name of weight parameter. Default: ``"weight"``. n_power_iterations (int, optional): number of power iterations to - calculate spectral norm + calculate spectral norm. Default: ``1``. eps (float, optional): epsilon for numerical stability in - calculating norms - dim (int, optional): dimension corresponding to number of outputs, - the default is ``0``, except for modules that are instances of + calculating norms. Default: ``1e-12``. + dim (int, optional): dimension corresponding to number of outputs. + Default: ``0``, except for modules that are instances of ConvTranspose{1,2,3}d, when it is ``1`` Returns: @@ -193,13 +469,11 @@ def spectral_norm(module: Module, >>> torch.linalg.matrix_norm(snm.weight, 2) tensor(1.0000, grad_fn=) """ - if not hasattr(module, name): + weight = getattr(module, name, None) + if not isinstance(weight, Tensor): raise ValueError( - "Module '{}' has no attribute with name '{}'".format(module, name) + "Module '{}' has no parameter or buffer with name '{}'".format(module, name) ) - # getattr should get the correct parametrized weight if there - # is already an parametrization registered - weight = getattr(module, name) if dim is None: if isinstance(module, (torch.nn.ConvTranspose1d, diff --git a/torch/nn/utils/parametrize.py b/torch/nn/utils/parametrize.py index 332fe762b8309..d8f2a947352cf 100644 --- a/torch/nn/utils/parametrize.py +++ b/torch/nn/utils/parametrize.py @@ -129,8 +129,11 @@ def __init__( new = original for module in reversed(self): # type: ignore[call-overload] if hasattr(module, "right_inverse"): - new = module.right_inverse(new) - # else, we assume that right_inverse is the identity + try: + new = module.right_inverse(new) + except NotImplementedError: + pass + # else, or if it throws, we assume that right_inverse is the identity if not isinstance(new, Tensor) and not isinstance(new, collections.abc.Sequence): raise ValueError("'right_inverse' must return a Tensor or a Sequence of tensors (list, tuple...). " @@ -209,7 +212,9 @@ def right_inverse(self, value: Tensor) -> None: for module in reversed(self): # type: ignore[call-overload] if hasattr(module, "right_inverse"): value = module.right_inverse(value) - # else we assume that right_inverse is the identity + else: + raise RuntimeError(f"parametrization {type(module).__name__} does not implement " + "right_inverse.") if self.is_tensor: # These exceptions should only throw when a right_inverse function does not # return the same dtype for every input, which should most likely be caused by a bug @@ -372,16 +377,12 @@ def register_parametrization( def right_inverse(self, X: Tensor) -> Union[Tensor, Sequence[Tensor]] - If this method is not implemented, it defaults to the identity. This method is called on the unparametrized tensor when the first parametrization - is registered. + is registered to compute the initial value of the original tensor. + If this method is not implemented, the original tensor will be just the unparametrized tensor. - In most situations, ``right_inverse`` will be a function such that - ``forward(right_inverse(X)) == X`` (see - `right inverse `_). - Sometimes, when the parametrization is not surjective, it may be reasonable - to relax this. - This may be used to initialize the tensor, as shown in the example below. + If all the parametrizations registered on a tensor implement `right_inverse` it is possible + to initialize a parametrized tensor by assigning to it, as shown in the example below. It is possible for the first parametrization to depend on several inputs. This may be implemented returning a tuple of tensors from ``right_inverse`` @@ -397,6 +398,14 @@ def right_inverse(self, X: Tensor) -> Union[Tensor, Sequence[Tensor]] If unsafe=True, then right_inverse will be called if the tensor is not parametrized, and nothing will be called otherwise. + .. note:: + + In most situations, ``right_inverse`` will be a function such that + ``forward(right_inverse(X)) == X`` (see + `right inverse `_). + Sometimes, when the parametrization is not surjective, it may be reasonable + to relax this. + .. warning:: If a parametrization depends on several inputs, :func:`~register_parametrization` @@ -483,25 +492,29 @@ def right_inverse(self, X: Tensor) -> Union[Tensor, Sequence[Tensor]] f"parametrization(module.{tensor_name}).shape: {X.shape}" ) if hasattr(parametrization, "right_inverse"): - Z = parametrization.right_inverse(X) # type: ignore[operator] - if not isinstance(Z, Tensor): - raise ValueError( - f"parametrization.right_inverse must return a tensor. Got: {type(Z).__name__}" - ) - if Z.dtype != Y.dtype: - raise ValueError( - "The tensor returned by parametrization.right_inverse must have the same dtype " - f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n" - f"module.{tensor_name}.dtype: {Y.dtype}\n" - f"returned dtype: {Z.dtype}" - ) - if Z.shape != Y.shape: - raise ValueError( - "The tensor returned by parametrization.right_inverse must have the same shape " - f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n" - f"module.{tensor_name}.shape: {Y.shape}\n" - f"returned shape: {Z.shape}" - ) + try: + Z = parametrization.right_inverse(X) # type: ignore[operator] + except NotImplementedError: + pass + else: + if not isinstance(Z, Tensor): + raise ValueError( + f"parametrization.right_inverse must return a tensor. Got: {type(Z).__name__}" + ) + if Z.dtype != Y.dtype: + raise ValueError( + "The tensor returned by parametrization.right_inverse must have the same dtype " + f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n" + f"module.{tensor_name}.dtype: {Y.dtype}\n" + f"returned dtype: {Z.dtype}" + ) + if Z.shape != Y.shape: + raise ValueError( + "The tensor returned by parametrization.right_inverse must have the same shape " + f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n" + f"module.{tensor_name}.shape: {Y.shape}\n" + f"returned shape: {Z.shape}" + ) # else right_inverse is assumed to be the identity # add the new parametrization to the parametrization list From b9933f08b985f9105e00804d5c99016841bd4cc7 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 30 Aug 2021 13:26:00 -0700 Subject: [PATCH 357/530] Fix type annotation in tools/nightly.py (#64202) Summary: `tempfile.TemporaryDirectory` is a generic only in python-3.9 and above Workaround by wrapping type annotation in quotes Fixes https://github.com/pytorch/pytorch/issues/64017 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64202 Reviewed By: janeyx99 Differential Revision: D30644215 Pulled By: malfet fbshipit-source-id: 3c16240b9fa899bd4d572c1732a7d87d3dd0fbd5 --- tools/nightly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nightly.py b/tools/nightly.py index 0b387e3b32dcf..7a46a011d232a 100755 --- a/tools/nightly.py +++ b/tools/nightly.py @@ -324,7 +324,7 @@ def deps_install(deps: List[str], existing_env: bool, env_opts: List[str]) -> No @timed("Installing pytorch nightly binaries") -def pytorch_install(url: str) -> tempfile.TemporaryDirectory[str]: +def pytorch_install(url: str) -> "tempfile.TemporaryDirectory[str]": """"Install pytorch into a temporary directory""" pytdir = tempfile.TemporaryDirectory() cmd = ["conda", "create", "--yes", "--no-deps", "--prefix", pytdir.name, url] From 85df73658ca38e894542e649bd053f269e77880a Mon Sep 17 00:00:00 2001 From: Will Constable Date: Mon, 30 Aug 2021 13:29:51 -0700 Subject: [PATCH 358/530] Make name() part of IMethod interface (#63995) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63995 JIT methods already have name() in their interface, and Py methods have names in their implementation. I'm adding this for a particular case where someone tried to use name() on a JIT method that we're replacing with an IMethod. Test Plan: add case to imethod API test Reviewed By: suo Differential Revision: D30559401 fbshipit-source-id: 76236721f5cd9a9d9d488ddba12bfdd01d679a2c --- test/cpp/api/imethod.cpp | 3 +++ torch/csrc/api/include/torch/imethod.h | 2 ++ torch/csrc/deploy/deploy.h | 4 ++++ torch/csrc/jit/api/method.h | 2 +- 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/test/cpp/api/imethod.cpp b/test/cpp/api/imethod.cpp index 8673e55fb5629..b8c12c649fd19 100644 --- a/test/cpp/api/imethod.cpp +++ b/test/cpp/api/imethod.cpp @@ -28,6 +28,9 @@ TEST(IMethodTest, CallMethod) { auto pyModel = package.load_pickle("model", "model.pkl"); torch::deploy::PythonMethodWrapper pyMethod(pyModel, "forward"); + EXPECT_EQ(scriptMethod.name(), "forward"); + EXPECT_EQ(pyMethod.name(), "forward"); + auto input = torch::ones({10, 20}); auto outputPy = pyMethod({input}); auto outputScript = scriptMethod({input}); diff --git a/torch/csrc/api/include/torch/imethod.h b/torch/csrc/api/include/torch/imethod.h index af010785a8016..5ab9b83888214 100644 --- a/torch/csrc/api/include/torch/imethod.h +++ b/torch/csrc/api/include/torch/imethod.h @@ -28,6 +28,8 @@ class TORCH_API IMethod { std::vector args, const IValueMap& kwargs = IValueMap()) const = 0; + virtual const std::string& name() const = 0; + // Returns an ordered list of argument names, possible in both // script and python methods. This is a more portable dependency // than a ScriptMethod FunctionSchema, which has more information diff --git a/torch/csrc/deploy/deploy.h b/torch/csrc/deploy/deploy.h index 20364797edd8a..f34e4bc5fdbcc 100644 --- a/torch/csrc/deploy/deploy.h +++ b/torch/csrc/deploy/deploy.h @@ -232,6 +232,10 @@ class PythonMethodWrapper : public torch::IMethod { std::string method_name) : model_(std::move(model)), method_name_(std::move(method_name)) {} + const std::string& name() const override { + return method_name_; + } + c10::IValue operator()( std::vector args, const IValueMap& kwargs = IValueMap()) const override { diff --git a/torch/csrc/jit/api/method.h b/torch/csrc/jit/api/method.h index bcd44a1df343a..3fcc4421891a0 100644 --- a/torch/csrc/jit/api/method.h +++ b/torch/csrc/jit/api/method.h @@ -46,7 +46,7 @@ struct TORCH_API Method : public torch::IMethod { return function_->graph(); } - const std::string& name() const { + const std::string& name() const override { return function_->name(); } From 9035a1cb4d6fd927b04d8491cd0e8e073ee22025 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Mon, 30 Aug 2021 13:55:19 -0700 Subject: [PATCH 359/530] .github: Adding configuration for docs_test (#64201) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64201 Adds docs_test to our existing test matrix for github actions Signed-off-by: Eli Uriegas cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D30646765 Pulled By: seemethere fbshipit-source-id: 946adae01ff1f1f7ebe626e408e161b77b19a011 --- .github/scripts/generate_ci_workflows.py | 2 ++ .github/scripts/generate_pytorch_test_matrix.py | 2 ++ .github/templates/linux_ci_workflow.yml.j2 | 2 +- .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 2 +- .../generated-linux-bionic-py3.8-gcc9-coverage.yml | 2 +- .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 2 +- .../generated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 2 +- .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml | 2 +- .../generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 2 +- .jenkins/pytorch/test.sh | 6 ++++++ 10 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index f1819dbac589d..3ae63051c327a 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -148,6 +148,7 @@ class CIWorkflow: enable_nogpu_no_avx_test: YamlShellBool = "''" enable_nogpu_no_avx2_test: YamlShellBool = "''" enable_slow_test: YamlShellBool = "''" + enable_docs_test: YamlShellBool = "''" def __post_init__(self) -> None: if self.is_libtorch: @@ -266,6 +267,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: test_runner_type=LINUX_CPU_TEST_RUNNER, on_pull_request=True, enable_doc_jobs=True, + enable_docs_test=1, num_test_shards=2, ciflow_config=CIFlowConfig( enabled=True, diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py index 75df57cfa2f89..7dc29097ab83a 100755 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ b/.github/scripts/generate_pytorch_test_matrix.py @@ -55,6 +55,8 @@ def main() -> None: configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} if os.getenv('ENABLE_SLOW_TEST'): configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} + if os.getenv('ENABLE_DOCS_TEST'): + configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} matrix = { 'include': [ { diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 7d9020790710e..e7681b049464a 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -254,6 +254,7 @@ jobs: ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }} ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }} ENABLE_SLOW_TEST: !{{ enable_slow_test }} + ENABLE_DOCS_TEST: !{{ enable_docs_test }} NUM_TEST_SHARDS: !{{ num_test_shards }} MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -346,7 +347,6 @@ jobs: env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: - BUILD_ENVIRONMENT: !{{ build_environment }}-${{ matrix.config }} PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index c51f8f047e986..0c5096146273c 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -230,6 +230,7 @@ jobs: ENABLE_NOGPU_NO_AVX_TEST: '' ENABLE_NOGPU_NO_AVX2_TEST: '' ENABLE_SLOW_TEST: '' + ENABLE_DOCS_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -325,7 +326,6 @@ jobs: env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: - BUILD_ENVIRONMENT: linux-bionic-cuda10.2-py3.9-gcc7-${{ matrix.config }} PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index 014b1d1162d07..536274b7df5c0 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -230,6 +230,7 @@ jobs: ENABLE_NOGPU_NO_AVX_TEST: '' ENABLE_NOGPU_NO_AVX2_TEST: '' ENABLE_SLOW_TEST: '' + ENABLE_DOCS_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -325,7 +326,6 @@ jobs: env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: - BUILD_ENVIRONMENT: linux-bionic-py3.8-gcc9-coverage-${{ matrix.config }} PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index 76b973eebce24..42d10cb4782ce 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -230,6 +230,7 @@ jobs: ENABLE_NOGPU_NO_AVX_TEST: 1 ENABLE_NOGPU_NO_AVX2_TEST: 1 ENABLE_SLOW_TEST: 1 + ENABLE_DOCS_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -325,7 +326,6 @@ jobs: env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: - BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7-${{ matrix.config }} PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 8114bd541fdb3..8c3b8d40d7651 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -230,6 +230,7 @@ jobs: ENABLE_NOGPU_NO_AVX_TEST: '' ENABLE_NOGPU_NO_AVX2_TEST: '' ENABLE_SLOW_TEST: '' + ENABLE_DOCS_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -325,7 +326,6 @@ jobs: env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: - BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7-${{ matrix.config }} PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index 6cc391ba15991..cbbfa5981cb44 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -230,6 +230,7 @@ jobs: ENABLE_NOGPU_NO_AVX_TEST: '' ENABLE_NOGPU_NO_AVX2_TEST: '' ENABLE_SLOW_TEST: '' + ENABLE_DOCS_TEST: 1 NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -325,7 +326,6 @@ jobs: env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: - BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc5.4-${{ matrix.config }} PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 7b947790902ec..fc43fa74c8bfd 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -228,6 +228,7 @@ jobs: ENABLE_NOGPU_NO_AVX_TEST: '' ENABLE_NOGPU_NO_AVX2_TEST: '' ENABLE_SLOW_TEST: '' + ENABLE_DOCS_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -323,7 +324,6 @@ jobs: env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: - BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ matrix.config }} PR_NUMBER: ${{ github.event.pull_request.number }} run: | if [[ $TEST_CONFIG == 'multigpu' ]]; then diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 5014f603e4bb9..9710d3aafb35b 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -494,6 +494,10 @@ test_torch_deploy() { assert_git_not_dirty } +test_docs_test() { + .jenkins/pytorch/docs-test.sh +} + if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") @@ -532,6 +536,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then elif [[ "${BUILD_ENVIRONMENT}" == *distributed* ]]; then test_distributed test_rpc +elif [[ "${TEST_CONFIG}" = docs_test ]]; then + test_docs_test else install_torchvision install_monkeytype From 09e53c0cfe81a40c32610f0cb76b3072e3bfca02 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Mon, 30 Aug 2021 13:55:19 -0700 Subject: [PATCH 360/530] .github: Adding configuration for backwards_compat (#64204) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64204 Adds backwards_compat to our existing test matrix for github actions Signed-off-by: Eli Uriegas cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D30646764 Pulled By: seemethere fbshipit-source-id: f0da6027e29fab03aff058cb13466fae5dcf3678 --- .github/scripts/generate_ci_workflows.py | 2 ++ .github/scripts/generate_pytorch_test_matrix.py | 2 ++ .github/templates/linux_ci_workflow.yml.j2 | 1 + .../workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 1 + .../workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml | 1 + .../workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 1 + .../workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 1 + .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml | 1 + .../generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 1 + 9 files changed, 11 insertions(+) diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 3ae63051c327a..467d13d0dc45d 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -149,6 +149,7 @@ class CIWorkflow: enable_nogpu_no_avx2_test: YamlShellBool = "''" enable_slow_test: YamlShellBool = "''" enable_docs_test: YamlShellBool = "''" + enable_backwards_compat_test: YamlShellBool = "''" def __post_init__(self) -> None: if self.is_libtorch: @@ -268,6 +269,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: on_pull_request=True, enable_doc_jobs=True, enable_docs_test=1, + enable_backwards_compat_test=1, num_test_shards=2, ciflow_config=CIFlowConfig( enabled=True, diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py index 7dc29097ab83a..cb71f588ece5e 100755 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ b/.github/scripts/generate_pytorch_test_matrix.py @@ -57,6 +57,8 @@ def main() -> None: configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} if os.getenv('ENABLE_DOCS_TEST'): configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} + if os.getenv('ENABLE_BACKWARDS_COMPAT_TEST'): + configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} matrix = { 'include': [ { diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index e7681b049464a..d7be808898476 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -255,6 +255,7 @@ jobs: ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }} ENABLE_SLOW_TEST: !{{ enable_slow_test }} ENABLE_DOCS_TEST: !{{ enable_docs_test }} + ENABLE_BACKWARDS_COMPAT_TEST: !{{ enable_backwards_compat_test }} NUM_TEST_SHARDS: !{{ num_test_shards }} MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 0c5096146273c..f34765c98160b 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -231,6 +231,7 @@ jobs: ENABLE_NOGPU_NO_AVX2_TEST: '' ENABLE_SLOW_TEST: '' ENABLE_DOCS_TEST: '' + ENABLE_BACKWARDS_COMPAT_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index 536274b7df5c0..6162b3cac1604 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -231,6 +231,7 @@ jobs: ENABLE_NOGPU_NO_AVX2_TEST: '' ENABLE_SLOW_TEST: '' ENABLE_DOCS_TEST: '' + ENABLE_BACKWARDS_COMPAT_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index 42d10cb4782ce..2fe24a515ea2a 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -231,6 +231,7 @@ jobs: ENABLE_NOGPU_NO_AVX2_TEST: 1 ENABLE_SLOW_TEST: 1 ENABLE_DOCS_TEST: '' + ENABLE_BACKWARDS_COMPAT_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 8c3b8d40d7651..59b0e2535b3bb 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -231,6 +231,7 @@ jobs: ENABLE_NOGPU_NO_AVX2_TEST: '' ENABLE_SLOW_TEST: '' ENABLE_DOCS_TEST: '' + ENABLE_BACKWARDS_COMPAT_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index cbbfa5981cb44..1fa72f51255dd 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -231,6 +231,7 @@ jobs: ENABLE_NOGPU_NO_AVX2_TEST: '' ENABLE_SLOW_TEST: '' ENABLE_DOCS_TEST: 1 + ENABLE_BACKWARDS_COMPAT_TEST: 1 NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index fc43fa74c8bfd..246d5cabd86de 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -229,6 +229,7 @@ jobs: ENABLE_NOGPU_NO_AVX2_TEST: '' ENABLE_SLOW_TEST: '' ENABLE_DOCS_TEST: '' + ENABLE_BACKWARDS_COMPAT_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge From 65050ec9247ef4566d035bfe3d0c58eb6e7f091b Mon Sep 17 00:00:00 2001 From: Daya Khudia Date: Mon, 30 Aug 2021 13:58:47 -0700 Subject: [PATCH 361/530] Back out "[JIT] Add aten::slice optimization" Summary: Original commit changeset: d12ee39f6828 build-break overriding_review_checks_triggers_an_audit_and_retroactive_review Oncall Short Name: dskhudia Test Plan: Local run succeeds Differential Revision: D30633990 fbshipit-source-id: 91cf7cc0ad7e47d919347c2a1527688e062e0c62 --- test/jit/test_peephole.py | 74 +------------- .../csrc/jit/passes/peephole_list_idioms.cpp | 97 ++++++------------- torch/csrc/jit/passes/peephole_list_idioms.h | 8 -- 3 files changed, 31 insertions(+), 148 deletions(-) diff --git a/test/jit/test_peephole.py b/test/jit/test_peephole.py index ecb4a06dfe0b9..23de44807761c 100644 --- a/test/jit/test_peephole.py +++ b/test/jit/test_peephole.py @@ -2,7 +2,7 @@ from torch.testing._internal.jit_utils import JitTestCase, RUN_CUDA, _inline_everything from torch import nn from torch.testing import FileCheck -from typing import Callable, List +from typing import List import unittest @@ -721,75 +721,3 @@ def foo(): self.run_pass("peephole", foo.graph) FileCheck().check("DictConstruct").check("len").run(foo.graph) self.assertEqual(foo(), 1) - - def test_peephole_slice_all_three_args(self): - def foo(x: int): - return [1, 2, x, 4, 5, 6, 7][-5:6:2] - - graph = torch.jit.script(foo).graph - self.run_pass("peephole", graph) - FileCheck().check_not("aten::slice").run(graph) - self.checkScript(foo, (3, )) - - def test_peephole_slice_one_empty_arg(self): - def check_helper(fn: Callable[[int], None]) -> None: - graph = torch.jit.script(fn).graph - self.run_pass("peephole", graph) - FileCheck().check_not("aten::slice").run(graph) - self.checkScript(fn, (3, )) - - def foo(x: int): - return [1, 2, x, 4, 5, 6, 7][1::2] - - check_helper(foo) - - def foo(x: int): - return [1, 2, x, 4, 5, 6, 7][:5:3] - - check_helper(foo) - - def foo(x: int): - return [1, 2, x, 4, 5, 6, 7][0:4] - - check_helper(foo) - - def test_peephole_slice_two_empty_args(self): - def check_helper(fn: Callable[[int], None]) -> None: - graph = torch.jit.script(fn).graph - self.run_pass("peephole", graph) - FileCheck().check_not("aten::slice").run(graph) - self.checkScript(fn, (3, )) - - def foo(x: int): - return [1, 2, x, 4, 5, 6, 7][::2] - - check_helper(foo) - - def foo(x: int): - return [1, 2, x, 4, 5, 6, 7][:5] - - check_helper(foo) - - def foo(x: int): - return [1, 2, x, 4, 5, 6, 7][1:] - - check_helper(foo) - - def test_peephole_slice_optimization_not_applied_list_modified(self): - @torch.jit.script - def foo(): - li = [1, 2, 3, 4, 5, 6, 7] - li[0] = 0 - return li[2:5] - - self.run_pass("peephole", foo.graph) - FileCheck().check("aten::slice").run(foo.graph) - - def test_peephole_slice_optimization_not_applied_non_const_args(self): - @torch.jit.script - def foo(x: int, y: int): - li = [1, 2, 3, 4, 5, 6, 7] - return li[x:y] - - self.run_pass("peephole", foo.graph) - FileCheck().check("aten::slice").run(foo.graph) diff --git a/torch/csrc/jit/passes/peephole_list_idioms.cpp b/torch/csrc/jit/passes/peephole_list_idioms.cpp index ec3d249b8b1be..f33f388259d20 100644 --- a/torch/csrc/jit/passes/peephole_list_idioms.cpp +++ b/torch/csrc/jit/passes/peephole_list_idioms.cpp @@ -7,9 +7,7 @@ #include #include #include -#include #include -#include namespace torch { namespace jit { @@ -59,7 +57,7 @@ struct ListLenRefiner { } auto first_input = n->input(0); - if (first_input->type()->castRaw() && + if (first_input->type()->cast() && !mutated_lists_.count(first_input)) { if (!li_with_len_use.count(first_input)) { li_with_len_use.insert(first_input); @@ -174,7 +172,7 @@ struct PeepholeOptimizeListIdiomsImpl { private: void checkForMutatedList(Value* v) { - if (v->type()->castRaw() && aliasDb_->hasWriters(v)) { + if (v->type()->cast() && aliasDb_->hasWriters(v)) { mutated_lists_.insert(v); } } @@ -193,43 +191,6 @@ struct PeepholeOptimizeListIdiomsImpl { } } - bool optimizeSlice(Node* slice_node, Node* list_construct_node) { - auto start_val = toIValue(slice_node->input(1)); - auto end_val = toIValue(slice_node->input(2)); - auto step_val = toIValue(slice_node->input(3)); - - // All args must be constant to apply this optimization. - if (start_val == c10::nullopt || end_val == c10::nullopt || - step_val == c10::nullopt) { - return false; - } - - int64_t start = start_val->isInt() ? start_val->to() - : std::numeric_limits::max(); - int64_t end = end_val->isInt() ? end_val->to() - : std::numeric_limits::max(); - int64_t step = step_val->isInt() ? step_val->to() : 1; - - size_t list_size = list_construct_node->inputs().size(); - size_t num_values = slice_indices_adjust(list_size, &start, &end, step); - - WithInsertPoint guard(slice_node); - auto slice_list_construct = - graph_->insertNode(graph_->create(prim::ListConstruct)); - slice_list_construct->output()->setType(slice_node->output()->type()); - for (size_t i = start, j = 0; j < num_values; ++j) { - slice_list_construct->addInput(list_construct_node->input(i)); - i += step; - } - - slice_node->output()->replaceAllUsesWith(slice_list_construct->output()); - if (mutated_lists_.count(slice_node->output())) { - mutated_lists_.insert(slice_list_construct->output()); - } - - return true; - } - bool runBlock(Block* block) { bool changed = false; for (Node* node : block->nodes()) { @@ -239,7 +200,7 @@ struct PeepholeOptimizeListIdiomsImpl { // only optimizing list ops if (node->inputs().size() == 0 || - !node->input(0)->type()->castRaw()) { + !node->input(0)->type()->cast()) { continue; } @@ -250,33 +211,36 @@ struct PeepholeOptimizeListIdiomsImpl { continue; } - auto list_creation_node = first_input->node(); - if (list_creation_node->kind() != prim::ListConstruct) { - continue; - } - if (node->kind() == aten::len) { - WithInsertPoint guard(node); - node->output()->replaceAllUsesWith(graph_->insertConstant( - static_cast(first_input->node()->inputs().size()))); - changed = true; + if (first_input->node()->kind() == prim::ListConstruct) { + WithInsertPoint guard(node); + node->output()->replaceAllUsesWith(graph_->insertConstant( + static_cast(first_input->node()->inputs().size()))); + changed = true; + } } else if (node->kind() == aten::__getitem__) { - if (auto index = toIValue(node->input(1))) { - size_t list_size = list_creation_node->inputs().size(); - if (auto norm_index = normalizeIndex(index->toInt(), list_size)) { - node->output()->replaceAllUsesWith( - list_creation_node->input(*norm_index)); - changed = true; + auto list_creation_node = first_input->node(); + if (list_creation_node->kind() == prim::ListConstruct) { + if (auto index = toIValue(node->input(1))) { + size_t list_size = list_creation_node->inputs().size(); + if (auto norm_index = normalizeIndex(index->toInt(), list_size)) { + node->output()->replaceAllUsesWith( + list_creation_node->input(*norm_index)); + changed = true; + } } } } else if (node->kind() == prim::ListUnpack) { - // if sizes are unequal it's a runtime error - if (list_creation_node->inputs().size() != node->outputs().size()) { - continue; - } - for (size_t i = 0; i < node->outputs().size(); ++i) { - node->output(i)->replaceAllUsesWith(list_creation_node->input(i)); - changed = true; + auto list_creation_node = first_input->node(); + if (list_creation_node->kind() == prim::ListConstruct) { + // if sizes are unequal it's a runtime error + if (list_creation_node->inputs().size() != node->outputs().size()) { + continue; + } + for (size_t i = 0; i < node->outputs().size(); ++i) { + node->output(i)->replaceAllUsesWith(list_creation_node->input(i)); + changed = true; + } } } else if (node->kind() == aten::add) { if (node->inputs().size() != 2) { @@ -287,7 +251,8 @@ struct PeepholeOptimizeListIdiomsImpl { if (mutated_lists_.count(second_input)) { continue; } - if (second_input->node()->kind() != prim::ListConstruct) { + if (first_input->node()->kind() != prim::ListConstruct || + second_input->node()->kind() != prim::ListConstruct) { continue; } WithInsertPoint guard(node); @@ -305,8 +270,6 @@ struct PeepholeOptimizeListIdiomsImpl { mutated_lists_.insert(list_construct->output()); } changed = true; - } else if (node->kind() == aten::slice) { - changed |= optimizeSlice(node, first_input->node()); } } return changed; diff --git a/torch/csrc/jit/passes/peephole_list_idioms.h b/torch/csrc/jit/passes/peephole_list_idioms.h index d20df9571db01..c8add4849d4ce 100644 --- a/torch/csrc/jit/passes/peephole_list_idioms.h +++ b/torch/csrc/jit/passes/peephole_list_idioms.h @@ -51,14 +51,6 @@ namespace jit { // // This is only applied to lists that are not modified. // -// 5. Slice -// Given a function like this: -// def foo(): -// return [1, 2, 3, 4, 5][0:2] -// This pass produces (after deadcode elimination): -// def foo(): -// return [1, 2] -// // Currently this is invoked as part of PeepholeOptimize // return true if graph is modified. // If `refine_list_len` is true will attempt to refine the len of lists through From 8f88f797dbff54aa4d2b153e9f0dc87794e4cf38 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Mon, 30 Aug 2021 14:21:39 -0700 Subject: [PATCH 362/530] [quant][graphmode][fx] Add reference quantized conv module (#63828) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63828 Added reference quantized conv module for the custom backend flow, the reference quantized module will have the following code: ``` w(float) -- quant - dequant \ x(float) ------------- F.conv2d --- ``` In the full model, we will see ``` w(float) -- quant - *dequant \ x -- quant --- *dequant -- *F.conv2d --- *quant - dequant ``` and the backend should be able to fuse the ops with `*` into a quantized linear Test Plan: python test/test_quantization.py TestQuantizeFx.test_conv_linear_reference Imported from OSS Reviewed By: vkuzo Differential Revision: D30504749 fbshipit-source-id: e1d8c43a0e0d6d9ea2375b8ca59a9c0f455514fb --- .../core/test_quantized_module.py | 84 +++---- test/quantization/fx/test_quantize_fx.py | 68 +++-- .../quantized/_reference/__init__.py | 1 - .../quantized/_reference/modules/__init__.py | 8 - .../quantized/_reference/modules/conv_relu.py | 58 ----- torch/nn/quantized/_reference/modules/conv.py | 237 ++++++++++++------ .../quantization/fx/quantization_patterns.py | 30 ++- torch/quantization/quantization_mappings.py | 15 -- 8 files changed, 257 insertions(+), 244 deletions(-) delete mode 100644 torch/nn/intrinsic/quantized/_reference/__init__.py delete mode 100644 torch/nn/intrinsic/quantized/_reference/modules/__init__.py delete mode 100644 torch/nn/intrinsic/quantized/_reference/modules/conv_relu.py diff --git a/test/quantization/core/test_quantized_module.py b/test/quantization/core/test_quantized_module.py index bc8a6b397eef8..b0bc78294d9b5 100644 --- a/test/quantization/core/test_quantized_module.py +++ b/test/quantization/core/test_quantized_module.py @@ -2,9 +2,7 @@ import torch.nn as nn import torch.nn.intrinsic as nni import torch.nn.intrinsic.quantized as nniq -import torch.nn.intrinsic.quantized._reference as nniqr import torch.nn.quantized as nnq -import torch.nn.quantized._reference as nnqr import torch.nn.quantized.dynamic as nnqd import torch.quantization @@ -211,12 +209,11 @@ def test_quant_dequant_api(self): self.assertEqual(rqr, rqr2) def _test_conv_api_impl( - self, module_name, qconv_module, conv_module, batch_size, - in_channels_per_group, input_feature_map_size, out_channels_per_group, - groups, kernel_size, stride, padding, padding_mode, dilation, - X_scale, X_zero_point, W_scale, W_zero_point, Y_scale, Y_zero_point, - use_bias, use_fused, use_channelwise, is_reference - ): + self, module_name, qconv_module, conv_module, batch_size, + in_channels_per_group, input_feature_map_size, out_channels_per_group, + groups, kernel_size, stride, padding, padding_mode, dilation, + X_scale, X_zero_point, W_scale, W_zero_point, Y_scale, Y_zero_point, + use_bias, use_fused, use_channelwise): for i in range(len(kernel_size)): assume(input_feature_map_size[i] + 2 * padding[i] >= dilation[i] * (kernel_size[i] - 1) + 1) @@ -245,8 +242,7 @@ def _test_conv_api_impl( # Test members self.assertTrue(module_name == qconv_module._get_name(), module_name + " " + qconv_module._get_name()) - if not is_reference: - self.assertTrue(hasattr(qconv_module, '_packed_params')) + self.assertTrue(hasattr(qconv_module, '_packed_params')) self.assertTrue(hasattr(qconv_module, 'scale')) self.assertTrue(hasattr(qconv_module, 'zero_point')) @@ -275,9 +271,8 @@ def _test_conv_api_impl( # For example, the result of round(2.5) + 1 is 3 while round(2.5 + 1) is # 4 assuming the rounding mode is round-to-nearest, ties-to-even. # skip numerics checking for reference module - if not is_reference: - np.testing.assert_array_almost_equal( - Y_exp.int_repr().numpy(), Y_act.int_repr().numpy(), decimal=0) + np.testing.assert_array_almost_equal( + Y_exp.int_repr().numpy(), Y_act.int_repr().numpy(), decimal=0) # Test serialization of quantized Conv Module using state_dict model_dict = qconv_module.state_dict() @@ -297,8 +292,7 @@ def _test_conv_api_impl( self.assertTrue(dir(loaded_qconv_module) == dir(qconv_module)) self.assertTrue(module_name == loaded_qconv_module._get_name()) - if not is_reference: - self.assertTrue(hasattr(loaded_qconv_module, '_packed_params')) + self.assertTrue(hasattr(loaded_qconv_module, '_packed_params')) self.assertTrue(hasattr(loaded_qconv_module, '_weight_bias')) self.assertEqual(qconv_module.weight(), loaded_qconv_module.weight()) @@ -308,9 +302,8 @@ def _test_conv_api_impl( self.assertEqual(qconv_module.zero_point, loaded_qconv_module.zero_point) Y_loaded = loaded_qconv_module(X_q) - if not is_reference: - np.testing.assert_array_almost_equal( - Y_exp.int_repr().numpy(), Y_loaded.int_repr().numpy(), decimal=0) + np.testing.assert_array_almost_equal( + Y_exp.int_repr().numpy(), Y_loaded.int_repr().numpy(), decimal=0) # Test serialization b = io.BytesIO() @@ -330,9 +323,8 @@ def _test_conv_api_impl( self.assertEqual(copied_conv.zero_point, qconv_module.zero_point) Y_copied = copied_conv(X_q) - if not is_reference: - np.testing.assert_array_almost_equal( - Y_exp.int_repr().numpy(), Y_copied.int_repr().numpy(), decimal=0) + np.testing.assert_array_almost_equal( + Y_exp.int_repr().numpy(), Y_copied.int_repr().numpy(), decimal=0) deepcopied_conv = copy.deepcopy(qconv_module) self.assertEqual(deepcopied_conv.bias(), qconv_module.bias()) @@ -340,9 +332,8 @@ def _test_conv_api_impl( self.assertEqual(deepcopied_conv.zero_point, qconv_module.zero_point) Y_deepcopied = copied_conv(X_q) - if not is_reference: - np.testing.assert_array_almost_equal( - Y_exp.int_repr().numpy(), Y_deepcopied.int_repr().numpy(), decimal=0) + np.testing.assert_array_almost_equal( + Y_exp.int_repr().numpy(), Y_deepcopied.int_repr().numpy(), decimal=0) # JIT testing self.checkScriptable( @@ -377,9 +368,8 @@ def test_conv1d_api(self): [True, False], # use_bias [True, False], # use_fused [True, False], # use_channelwise - [True, False] # is_reference ) - for pad_mode, use_bias, use_fused, use_channelwise, is_reference in options: + for pad_mode, use_bias, use_fused, use_channelwise in options: if torch.backends.quantized.engine == "qnnpack": use_channelwise = False batch_size = 2 @@ -407,15 +397,13 @@ def test_conv1d_api(self): Y_zero_point = 4 if torch.backends.quantized.engine == 'qnnpack': use_channelwise = False - # (use_fused, is_reference) -> quantized class + # use_fused -> quantized class class_map = { - (True, True): (nniqr.ConvReLU1d, "QuantizedConvReLU1d(Reference)"), - (True, False): (nniq.ConvReLU1d, "QuantizedConvReLU1d"), - (False, True): (nnqr.Conv1d, "QuantizedConv1d(Reference)"), - (False, False): (nnq.Conv1d, "QuantizedConv1d") + True: (nniq.ConvReLU1d, "QuantizedConvReLU1d"), + False: (nnq.Conv1d, "QuantizedConv1d") } - qconv_cls, module_name = class_map[(use_fused, is_reference)] + qconv_cls, module_name = class_map[use_fused] qconv_module = qconv_cls( in_channels, out_channels, kernel, stride, pad, dilation, groups, use_bias, padding_mode=pad_mode @@ -434,7 +422,7 @@ def test_conv1d_api(self): in_channels_per_group, input_feature_map_size, out_channels_per_group, groups, kernel_size, stride, pad, pad_mode, dilation, X_scale, X_zero_point, W_scale, W_zero_point, Y_scale, - Y_zero_point, use_bias, use_fused, use_channelwise, is_reference) + Y_zero_point, use_bias, use_fused, use_channelwise) @override_qengines def test_conv2d_api(self): @@ -443,9 +431,8 @@ def test_conv2d_api(self): [True, False], # use_bias [True, False], # use_fused [True, False], # use_channelwise - [True, False] # is_reference ) - for pad_mode, use_bias, use_fused, use_channelwise, is_reference in options: + for pad_mode, use_bias, use_fused, use_channelwise in options: if torch.backends.quantized.engine == "qnnpack": use_channelwise = False batch_size = 2 @@ -475,15 +462,13 @@ def test_conv2d_api(self): W_zero_point = [3] Y_scale = 5.0 Y_zero_point = 4 - # (use_fused, is_reference) -> quantized class + # use_fused -> quantized class class_map = { - (True, True): (nniqr.ConvReLU2d, "QuantizedConvReLU2d(Reference)"), - (True, False): (nniq.ConvReLU2d, "QuantizedConvReLU2d"), - (False, True): (nnqr.Conv2d, "QuantizedConv2d(Reference)"), - (False, False): (nnq.Conv2d, "QuantizedConv2d") + True: (nniq.ConvReLU2d, "QuantizedConvReLU2d"), + False: (nnq.Conv2d, "QuantizedConv2d") } - qconv_cls, module_name = class_map[(use_fused, is_reference)] + qconv_cls, module_name = class_map[use_fused] qconv_module = qconv_cls( in_channels, out_channels, kernel_size, stride, padding, dilation, groups, use_bias, padding_mode=pad_mode @@ -502,7 +487,7 @@ def test_conv2d_api(self): in_channels_per_group, input_feature_map_size, out_channels_per_group, groups, kernel_size, stride, padding, pad_mode, dilation, X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_fused, use_channelwise, is_reference) + Y_scale, Y_zero_point, use_bias, use_fused, use_channelwise) @skipIfNoFBGEMM def test_conv3d_api(self): @@ -510,9 +495,8 @@ def test_conv3d_api(self): [True, False], # use_bias [True, False], # use_fused [True, False], # use_channelwise - [True, False] # is_reference ) - for use_bias, use_fused, use_channelwise, is_reference in options: + for use_bias, use_fused, use_channelwise in options: if torch.backends.quantized.engine == "qnnpack": use_channelwise = False batch_size = 2 @@ -547,16 +531,14 @@ def test_conv3d_api(self): W_zero_point = [3] Y_scale = 5.0 Y_zero_point = 4 - # (use_fused, is_reference) -> quantized class + # use_fused -> quantized class class_map = { - (True, True): (nniqr.ConvReLU3d, "QuantizedConvReLU3d(Reference)"), - (True, False): (nniq.ConvReLU3d, "QuantizedConvReLU3d"), - (False, True): (nnqr.Conv3d, "QuantizedConv3d(Reference)"), - (False, False): (nnq.Conv3d, "QuantizedConv3d") + True: (nniq.ConvReLU3d, "QuantizedConvReLU3d"), + False: (nnq.Conv3d, "QuantizedConv3d") } with override_quantized_engine('fbgemm'): - qconv_cls, module_name = class_map[(use_fused, is_reference)] + qconv_cls, module_name = class_map[use_fused] qconv_module = qconv_cls( in_channels, out_channels, kernel_size, stride, padding, dilation, groups, use_bias, padding_mode=pad_mode @@ -576,7 +558,7 @@ def test_conv3d_api(self): out_channels_per_group, groups, kernel_size, stride, padding, pad_mode, dilation, X_scale, X_zero_point, W_scale, W_zero_point, Y_scale, Y_zero_point, use_bias, use_fused, - use_channelwise, is_reference) + use_channelwise) def test_pool_api(self): """Tests the correctness of the pool module. diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 7ae29e03f6a46..9682da14483df 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -532,7 +532,7 @@ def forward(self, x): Conv1d, conv1d_module_args, (conv1d_input,), - ns.call_module(nn.Conv1d if is_reference else nnq.Conv1d), + ns.call_module(nnqr.Conv1d if is_reference else nnq.Conv1d), None ), ( @@ -540,7 +540,7 @@ def forward(self, x): Conv2d, conv2d_module_args, (conv2d_input,), - ns.call_module(nn.Conv2d if is_reference else nnq.Conv2d), + ns.call_module(nnqr.Conv2d if is_reference else nnq.Conv2d), None ), ( @@ -548,7 +548,7 @@ def forward(self, x): Conv3d, conv3d_module_args, (conv3d_input,), - ns.call_module(nn.Conv3d if is_reference else nnq.Conv3d), + ns.call_module(nnqr.Conv3d if is_reference else nnq.Conv3d), None ), ( @@ -631,11 +631,7 @@ def _get_keys(prefix, is_dynamic): qr = result_dict["quantized_reference"] def checkWeightQParams(model): - for module_name in ("conv",): - if hasattr(model, module_name): - self.assertTrue(hasattr(qr.get_submodule(module_name), "_weight_qparams")) - self.assertTrue("Reference" in qr.get_submodule(module_name)._get_name()) - for module_name in ("linear",): + for module_name in ("linear", "conv"): if hasattr(model, module_name): self.assertTrue(hasattr(qr.get_submodule(module_name), "weight_qscheme")) self.assertTrue(hasattr(qr.get_submodule(module_name), "weight_scale")) @@ -643,19 +639,7 @@ def checkWeightQParams(model): self.assertTrue("Reference" in qr.get_submodule(module_name)._get_name()) def checkSerDeser(model, is_dynamic): - for module_name in ("conv",): - if hasattr(model, module_name): - # make sure seralization works - state_dict = copy.deepcopy(model.state_dict()) - self.assertTrue(module_name + "._weight_qparams" in state_dict) - - # check load_state_dict restores states - module = getattr(model, module_name) - prev_scale = module._weight_qparams["scale"] - module._weight_qparams["scale"] = None - model.load_state_dict(state_dict) - self.assertTrue(torch.equal(prev_scale, module._weight_qparams["scale"])) - for module_name in ("linear",): + for module_name in ("linear", "conv"): if hasattr(model, module_name): # make sure seralization works state_dict = copy.deepcopy(model.state_dict()) @@ -3001,6 +2985,44 @@ def forward(self, x): result_ref = m_ref(data) self.assertTrue(torch.equal(result, result_ref)) + def test_ref_conv_module(self): + """ Make sure the numerics for models with ref conv module + matches models with fbgemm/qnnpack module + """ + convs = { + 1: nn.Conv1d, + 2: nn.Conv2d, + 3: nn.Conv3d, + } + + class M1(torch.nn.Module): + def __init__(self, dim): + super().__init__() + self.conv = convs[dim](3, 3, 3) + + def forward(self, x): + return self.conv(x) + + class M2(torch.nn.Module): + def __init__(self, dim): + super().__init__() + self.conv = convs[dim](3, 3, 3) + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(self.conv(x)) + + for dim, M in itertools.product([1, 2, 3], [M1, M2]): + m = M(dim).eval() + m = prepare_fx(m, {"": default_qconfig}) + m_copy = copy.deepcopy(m) + m = convert_fx(m, is_reference=False) + m_ref = convert_fx(m_copy, is_reference=True) + data = self.img_data_dict[dim][0][0] + result = m(data) + result_ref = m_ref(data) + self.assertTrue(torch.equal(result, result_ref)) + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops @@ -4558,13 +4580,13 @@ def forward(self, x): reference_order_check = [ ns.call_function(torch.quantize_per_tensor), ns.call_method('dequantize'), - ns.call_module(nn.Conv2d), + ns.call_module(nnqr.Conv2d), ns.call_function(torch.quantize_per_tensor), ns.call_method('dequantize'), ns.call_module(nn.Sigmoid), ns.call_function(torch.quantize_per_tensor), ns.call_method('dequantize'), - ns.call_module(nn.Conv2d), + ns.call_module(nnqr.Conv2d), ns.call_function(torch.quantize_per_tensor), ns.call_method('dequantize'), ] diff --git a/torch/nn/intrinsic/quantized/_reference/__init__.py b/torch/nn/intrinsic/quantized/_reference/__init__.py deleted file mode 100644 index 3d79bdbfe8320..0000000000000 --- a/torch/nn/intrinsic/quantized/_reference/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .modules import * # noqa: F403 diff --git a/torch/nn/intrinsic/quantized/_reference/modules/__init__.py b/torch/nn/intrinsic/quantized/_reference/modules/__init__.py deleted file mode 100644 index 33b18d8cf7d3f..0000000000000 --- a/torch/nn/intrinsic/quantized/_reference/modules/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -import torch -from .conv_relu import ConvReLU1d, ConvReLU2d, ConvReLU3d - -__all__ = [ - 'ConvReLU1d', - 'ConvReLU2d', - 'ConvReLU3d', -] diff --git a/torch/nn/intrinsic/quantized/_reference/modules/conv_relu.py b/torch/nn/intrinsic/quantized/_reference/modules/conv_relu.py deleted file mode 100644 index b0305f6207d95..0000000000000 --- a/torch/nn/intrinsic/quantized/_reference/modules/conv_relu.py +++ /dev/null @@ -1,58 +0,0 @@ -import torch -import torch.nn.quantized._reference as nnqr -import torch.nn.functional as F - -class ConvReLU1d(nnqr.Conv1d): - _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU1d - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x_dequant = x.dequantize() - weight_dequant = self._qweight.dequantize() - float_result = F.conv1d( - x_dequant, weight_dequant, self._bias, self._conv1d_stride, # type: ignore[has-type] - self._conv1d_padding, self._conv1d_dilation, self.groups) # type: ignore[has-type] - float_result = F.relu(float_result, inplace=True) - # NEEDFIX: we don't have dtype in the Linear module APIs right now! - result = torch.quantize_per_tensor( - float_result, self.scale, self.zero_point, torch.quint8) - return result - - def _get_name(self): - return "QuantizedConvReLU1d(Reference)" - - -class ConvReLU2d(nnqr.Conv2d): - _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU2d - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x_dequant = x.dequantize() - weight_dequant = self._qweight.dequantize() - float_result = F.conv2d( - x_dequant, weight_dequant, self._bias, self.stride, - self.padding, self.dilation, self.groups) - float_result = F.relu(float_result, inplace=True) - # NEEDFIX: we don't have dtype in the Linear module APIs right now! - result = torch.quantize_per_tensor( - float_result, self.scale, self.zero_point, torch.quint8) - return result - - def _get_name(self): - return "QuantizedConvReLU2d(Reference)" - -class ConvReLU3d(nnqr.Conv3d): - _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU3d - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x_dequant = x.dequantize() - weight_dequant = self._qweight.dequantize() - float_result = F.conv3d( - x_dequant, weight_dequant, self._bias, self.stride, - self.padding, self.dilation, self.groups) - float_result = F.relu(float_result, inplace=True) - # NEEDFIX: we don't have dtype in the Linear module APIs right now! - result = torch.quantize_per_tensor( - float_result, self.scale, self.zero_point, torch.quint8) - return result - - def _get_name(self): - return "QuantizedConvReLU3d(Reference)" diff --git a/torch/nn/quantized/_reference/modules/conv.py b/torch/nn/quantized/_reference/modules/conv.py index 036f8e46212c5..6b03bb0491ad1 100644 --- a/torch/nn/quantized/_reference/modules/conv.py +++ b/torch/nn/quantized/_reference/modules/conv.py @@ -1,42 +1,101 @@ import torch -import torch.nn.quantized as nnq +import torch.nn as nn import torch.nn.functional as F -from typing import Optional +from typing import Optional, Dict, Any from torch.nn.common_types import _size_1_t -from torch.nn.modules.utils import _single +from .utils import _quantize_and_dequantize_weight +from .utils import _save_weight_qparams +from .utils import _get_weight_qparam_keys -class _ConvNd(nnq._ConvNd): +class _ConvNd(torch.nn.modules.conv._ConvNd): """ A reference version of nn.quantized.Conv2d we will not pack the parameters in this module, since weight packing is an optimization for quantized backends supported in PyTorch (fbgemm/qnnpack), this is useful when user want to use this module in other backends like Glow. """ - __annotations__ = {"_bias": Optional[torch.Tensor]} + __annotations__ = {"bias": Optional[torch.Tensor]} def _save_to_state_dict(self, destination, prefix, keep_vars): super()._save_to_state_dict(destination, prefix, keep_vars) - destination[prefix + '_qweight'] = self._qweight - destination[prefix + '_bias'] = self._bias + _save_weight_qparams( + destination, prefix, self.weight_qscheme, self.weight_dtype, + self.weight_scale, self.weight_zero_point, self.weight_axis) def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): - self._qweight = state_dict[prefix + '_qweight'] - self._bias = state_dict[prefix + '_bias'] - state_dict.pop(prefix + '_qweight') - state_dict.pop(prefix + '_bias') + for key in _get_weight_qparam_keys(state_dict, prefix): + setattr(self, key, state_dict[prefix + key]) + state_dict.pop(prefix + key) super()._load_from_state_dict( state_dict, prefix, local_metadata, False, missing_keys, unexpected_keys, error_msgs) - def _weight_bias(self): - return self._qweight, self._bias - - def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: - self._qweight = w - self._bias = b - -class Conv1d(_ConvNd, nnq.Conv1d): + def _init_weight_qparams(self, weight_qparams, device): + if weight_qparams is None: + weight_qparams = { + "qscheme": torch.per_tensor_affine, + "dtype": torch.quint8, + "scale": 1.0, + "zero_point": 0 + } + self.weight_qscheme = weight_qparams["qscheme"] + self.weight_dtype = weight_qparams["dtype"] + assert self.weight_qscheme in [None, torch.per_tensor_affine, torch.per_channel_affine], \ + Exception(f"qscheme: {self.weight_qscheme} is not support in reference quantized linear module") + if self.weight_qscheme is not None: + self.register_buffer( + "weight_scale", + torch.tensor(weight_qparams["scale"], dtype=torch.float, device=device)) + self.register_buffer( + "weight_zero_point", + torch.tensor(weight_qparams["zero_point"], dtype=torch.int, device=device)) + if self.weight_qscheme == torch.per_channel_affine: + self.register_buffer( + "weight_axis", + torch.tensor(weight_qparams["axis"], dtype=torch.int, device=device)) + else: + # added for TorchScriptability, not used + self.register_buffer( + "weight_axis", torch.tensor(0, dtype=torch.int, device=device)) + + def get_weight(self): + """ + Fake quantize (quantize and dequantize) the weight with + the quantization parameters for weight, this is used to + simulate the numerics for the quantized weight in a quantized + model + """ + # supress mypy warning + assert isinstance(self.weight, torch.Tensor) + assert isinstance(self.weight_scale, torch.Tensor) + assert isinstance(self.weight_zero_point, torch.Tensor) + assert isinstance(self.weight_axis, torch.Tensor) + return _quantize_and_dequantize_weight( + self.weight, self.weight_qscheme, + self.weight_dtype, self.weight_scale, self.weight_zero_point, self.weight_axis) + + @staticmethod + def from_float(cls, float_conv, weight_qparams): + qref_conv = cls( + float_conv.in_channels, + float_conv.out_channels, + float_conv.kernel_size, # type: ignore[arg-type] + float_conv.stride, # type: ignore[arg-type] + float_conv.padding, # type: ignore[arg-type] + float_conv.dilation, # type: ignore[arg-type] + float_conv.groups, + float_conv.bias is not None, # type: ignore[arg-type] + float_conv.padding_mode, + device=float_conv.weight.device, + dtype=float_conv.weight.dtype, + weight_qparams=weight_qparams) + qref_conv.weight = torch.nn.Parameter(float_conv.weight.detach()) + if float_conv.bias is not None: + qref_conv.bias = torch.nn.Parameter(float_conv.bias.detach()) + return qref_conv + +class Conv1d(_ConvNd, nn.Conv1d): def __init__(self, in_channels: int, out_channels: int, @@ -46,91 +105,107 @@ def __init__(self, dilation: _size_1_t = 1, groups: int = 1, bias: bool = True, - padding_mode: str = 'zeros'): - nnq.Conv1d.__init__( + padding_mode: str = "zeros", + device=None, + dtype=None, + weight_qparams: Optional[Dict[str, Any]] = None): + nn.Conv1d.__init__( self, in_channels, out_channels, kernel_size, stride, padding, dilation, - groups, bias, padding_mode) - # self.stride, self.padding, self.dilation are 2d tuple since - # current quantized conv1d is using Conv2dPackedParams - # TODO: we should fix this if we implemenet Conv1dPackedParams - self._conv1d_stride = _single(self.stride[0]) - self._conv1d_padding = _single(self.padding[0]) - self._conv1d_dilation = _single(self.dilation[0]) + groups, bias, padding_mode, device, dtype) + self._init_weight_qparams(weight_qparams, device) def forward(self, x: torch.Tensor) -> torch.Tensor: - x_dequant = x.dequantize() - weight_dequant = self._qweight.dequantize() - float_result = F.conv1d( - x_dequant, weight_dequant, self._bias, self._conv1d_stride, - self._conv1d_padding, self._conv1d_dilation, self.groups) - # NEEDFIX: we don't have dtype in the Linear module APIs right now! - result = torch.quantize_per_tensor( - float_result, self.scale, self.zero_point, torch.quint8) + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.conv1d --- + + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.conv1d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv1d + """ + weight_dequant = self.get_weight() + result = F.conv1d( + x, weight_dequant, self.bias, self.stride, + self.padding, self.dilation, self.groups) return result def _get_name(self): - return 'QuantizedConv1d(Reference)' - - @torch.jit.export - def __setstate__(self, state): - self.in_channels = state[0] - self.out_channels = state[1] - self.kernel_size = state[2] - self.stride = state[3] - self.padding = state[4] - self.dilation = state[5] - self.transposed = state[6] - self.output_padding = state[7] - self.groups = state[8] - self.padding_mode = state[9] - self.set_weight_bias(state[10], state[11]) - self.scale = state[12] - self.zero_point = state[13] - self.training = state[14] - self._conv1d_stride = (self.stride[0],) - self._conv1d_padding = (self.padding[0],) - self._conv1d_dilation = (self.dilation[0],) - -class Conv2d(_ConvNd, nnq.Conv2d): + return "QuantizedConv1d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): + return _ConvNd.from_float(cls, float_conv, weight_qparams) + +class Conv2d(_ConvNd, nn.Conv2d): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, - padding_mode='zeros'): - nnq.Conv2d.__init__( + padding_mode='zeros', + device=None, + dtype=None, + weight_qparams: Optional[Dict[str, Any]] = None): + nn.Conv2d.__init__( self, in_channels, out_channels, kernel_size, stride, padding, dilation, - groups, bias, padding_mode) + groups, bias, padding_mode, device, dtype) + self._init_weight_qparams(weight_qparams, device) def forward(self, x: torch.Tensor) -> torch.Tensor: - x_dequant = x.dequantize() - weight_dequant = self._qweight.dequantize() - float_result = F.conv2d( - x_dequant, weight_dequant, self._bias, self.stride, + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.conv2d --- + + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.conv2d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv2d + """ + weight_dequant = self.get_weight() + result = F.conv2d( + x, weight_dequant, self.bias, self.stride, self.padding, self.dilation, self.groups) - # NEEDFIX: we don't have dtype in the Linear module APIs right now! - result = torch.quantize_per_tensor( - float_result, self.scale, self.zero_point, torch.quint8) return result def _get_name(self): - return 'QuantizedConv2d(Reference)' + return "QuantizedConv2d(Reference)" -class Conv3d(_ConvNd, nnq.Conv3d): + @classmethod + def from_float(cls, float_conv, weight_qparams): + return _ConvNd.from_float(cls, float_conv, weight_qparams) + +class Conv3d(_ConvNd, nn.Conv3d): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, - padding_mode='zeros'): - nnq.Conv3d.__init__( + padding_mode="zeros", + device=None, + dtype=None, + weight_qparams: Optional[Dict[str, Any]] = None): + nn.Conv3d.__init__( self, in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, padding_mode) + self._init_weight_qparams(weight_qparams, device) def forward(self, x: torch.Tensor) -> torch.Tensor: - x_dequant = x.dequantize() - weight_dequant = self._qweight.dequantize() - float_result = F.conv3d( - x_dequant, weight_dequant, self._bias, self.stride, + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.conv3d --- + + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.conv3d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv3d + """ + weight_dequant = self.get_weight() + result = F.conv3d( + x, weight_dequant, self.bias, self.stride, self.padding, self.dilation, self.groups) - # NEEDFIX: we don't have dtype in the Linear module APIs right now! - result = torch.quantize_per_tensor( - float_result, self.scale, self.zero_point, torch.quint8) return result def _get_name(self): - return 'QuantizedConv3d(Reference)' + return "QuantizedConv3d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): + return _ConvNd.from_float(cls, float_conv, weight_qparams) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 779dfcf07aece..418cae1511c35 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -638,19 +638,22 @@ def convert(self, # and qparam is a dictionary of # {"qscheme": ..., "scale": ..., "zero_point": ...} for per tensor quantization or # {"qscheme": ..., "scale": ..., "zero_point": ..., "axis": ...} for per channel quantization + float_conv = self.conv + fused_conv = None if isinstance( - self.conv, + float_conv, QAT_CONV_MODULE_CLASSES): # case 1. converting qat conv module to # a float conv module, we need to attch # weight fake_quant to the conv module, # weight fake_quant is assumed to be run during # QAT so we don't need to run it again here - float_conv = self.conv.to_float() + float_conv = self.conv.to_float() # type: ignore[operator] # change qat conv to conv parent_name, name = _parent_name(self.conv_node.target) setattr(modules[parent_name], name, float_conv) if isinstance(float_conv, torch.nn.intrinsic._FusedModule): + fused_conv = float_conv float_conv = float_conv[0] weight_post_process = self.conv.weight_fake_quant else: @@ -658,15 +661,28 @@ def convert(self, # to float conv module, we need to attach # weight observer to the conv module and run it # with conv weight - float_conv = self.conv - if isinstance(self.conv, torch.nn.intrinsic._FusedModule): - float_conv = self.conv[0] + if isinstance(float_conv, torch.nn.intrinsic._FusedModule): + fused_conv = float_conv + float_conv = float_conv[0] # type: ignore[index] assert qconfig is not None weight_post_process = qconfig.weight() # run weight observer - weight_post_process(float_conv.weight) + weight_post_process(float_conv.weight) # type: ignore[operator] weight_qparams = get_qparam_dict(weight_post_process) - _to_reference(float_conv, weight_qparams) + # hardcoded for now, TODO: expose the api to user, + # we can have a map from module to reference module + # and allow user to register new ones + qconv_cls = get_static_quant_module_class( + type(float_conv), is_reference=is_reference) + ref_conv = qconv_cls.from_float(float_conv, weight_qparams) # type: ignore[attr-defined] + # if the parent is a fused conv (Sequential), we can replace the first + # item to ref conv, otherwise we can update + # the conv instance in the module tree + if fused_conv is not None: + fused_conv[0] = ref_conv + else: + parent_name, name = _parent_name(self.conv_node.target) + setattr(modules[parent_name], name, ref_conv) op_out = quantized_graph.create_node( 'call_module', self.conv_node.target, diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py index 03b177805bac3..6851ba7bd447d 100644 --- a/torch/quantization/quantization_mappings.py +++ b/torch/quantization/quantization_mappings.py @@ -7,7 +7,6 @@ import torch.nn.intrinsic as nni import torch.nn.intrinsic.quantized as nniq import torch.nn.intrinsic.quantized.dynamic as nniqd -import torch.nn.intrinsic.quantized._reference as nniqr import torch.nn.intrinsic.qat as nniqat import torch.nn.quantized as nnq import torch.nn.quantized._reference as nnqr @@ -29,20 +28,6 @@ nn.Conv1d: nnqr.Conv1d, nn.Conv2d: nnqr.Conv2d, nn.Conv3d: nnqr.Conv3d, - nni.ConvReLU1d: nniqr.ConvReLU1d, - nni.ConvReLU2d: nniqr.ConvReLU2d, - nni.ConvReLU3d: nniqr.ConvReLU3d, - # QAT Modules - nnqat.Conv2d: nnqr.Conv2d, - nnqat.Conv3d: nnqr.Conv3d, - nniqat.ConvBn1d: nnqr.Conv1d, - nniqat.ConvBn2d: nnqr.Conv2d, - nniqat.ConvBn3d: nnqr.Conv3d, - nniqat.ConvBnReLU1d: nniqr.ConvReLU1d, - nniqat.ConvBnReLU2d: nniqr.ConvReLU2d, - nniqat.ConvBnReLU3d: nniqr.ConvReLU3d, - nniqat.ConvReLU2d: nniqr.ConvReLU2d, - nniqat.ConvReLU3d: nniqr.ConvReLU3d, } # Default map for swapping float module to quantized ones From e4fd2ab59ce8645f5ae9477c7724b6af82124b3b Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Mon, 30 Aug 2021 14:46:50 -0700 Subject: [PATCH 363/530] Back out "Added reference tests to ReductionOpInfo" (#64183) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64183 Original commit changeset: 6a1f82ac2819 Test Plan: CI Reviewed By: soulitzer Differential Revision: D30639835 fbshipit-source-id: e238043c6fbd0453317a9ed219e348298f98aaca --- aten/src/ATen/native/cpu/ReduceOpsKernel.cpp | 29 ++- test/test_reductions.py | 111 +---------- .../_internal/common_methods_invocations.py | 179 +++++------------- 3 files changed, 61 insertions(+), 258 deletions(-) diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 01ed54e56fc73..89d2fb21fb511 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -163,29 +163,24 @@ static void std_var_kernel_impl(TensorIterator& iter, int64_t correction, bool t } static void prod_kernel_impl(TensorIterator& iter) { - // Workaround for the error: '*' in boolean context, suggest '&&' instead - // [-Werror=int-in-bool-context] + // Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] if (iter.dtype() == ScalarType::Bool) { using scalar_t = bool; binary_kernel_reduce_vec( - iter, - [=](scalar_t a, scalar_t b) - __ubsan_ignore_undefined__ -> scalar_t { return a && b; }, - [=](Vectorized a, Vectorized b) - __ubsan_ignore_undefined__ { return a && b; }, - // NOLINTNEXTLINE(bugprone-argument-comment) - /*identity=*/1); + iter, + [=](scalar_t a, scalar_t b) -> scalar_t { return a && b; }, + [=](Vectorized a, Vectorized b) { return a && b; }, + // NOLINTNEXTLINE(bugprone-argument-comment) + /*identity=*/1); } else { AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "prod_cpu", [&] { binary_kernel_reduce_vec( - iter, - [=](scalar_t a, scalar_t b) - __ubsan_ignore_undefined__ -> scalar_t { return a * b; }, - [=](Vectorized a, Vectorized b) - __ubsan_ignore_undefined__ { return a * b; }, - // NOLINTNEXTLINE(bugprone-argument-comment) - /*identity=*/1); - }); + iter, + [=](scalar_t a, scalar_t b) -> scalar_t { return a * b; }, + [=](Vectorized a, Vectorized b) { return a * b; }, + // NOLINTNEXTLINE(bugprone-argument-comment) + /*identity=*/1); + }); } } diff --git a/test/test_reductions.py b/test/test_reductions.py index ca3042b66cf91..a9c667564d118 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -10,7 +10,7 @@ from torch._six import inf, nan from torch.testing import ( - integral_types_and, floating_and_complex_types_and, get_all_dtypes, make_tensor) + integral_types_and, floating_and_complex_types_and, make_tensor) from torch.testing._internal.common_utils import ( TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict, IS_WINDOWS) @@ -296,115 +296,6 @@ def test_empty_tensor_nonempty_slice(self, device, op: ReductionOpInfo): result = op(t, *args, dim=dim, **kwargs) self.assertEqual(result.shape, _reduced_shape(t.shape, dim)) - def _test_noncontiguous(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_kwargs): - """Helper method to test noncontiguous input tensors.""" - assert not t.is_contiguous() - - t_contig = t.contiguous() - for args, kwargs in op.generate_args_kwargs(t_contig, **reduction_kwargs): - kwargs.update(reduction_kwargs) - result = op(t, *args, **kwargs) - expected = op(t_contig, *args, **kwargs) - self.assertEqual(result, expected) - - @ops(reduction_ops) - def test_noncontiguous_innermost(self, device, dtype, op: ReductionOpInfo): - """Tests reducing along noncontiguous innermost dimension.""" - t = make_tensor((10, 10), device, dtype) - self._test_noncontiguous(op, t[:, ::2], dim=1) - - @ops(reduction_ops) - def test_noncontiguous_outermost(self, device, dtype, op: ReductionOpInfo): - """Tests reducing along noncontiguous outermost dimension.""" - t = make_tensor((10, 10), device, dtype) - self._test_noncontiguous(op, t[::2, :], dim=0) - - @ops(reduction_ops) - def test_noncontiguous_all(self, device, dtype, op: ReductionOpInfo): - """Tests reducing all dimensions of a noncontiguous tensor.""" - t = make_tensor((5, 5, 5), device, dtype) - self._test_noncontiguous(op, t[::2, ::3, 1:-1:2]) - - @ops(reduction_ops) - def test_noncontiguous_transposed(self, device, dtype, op: ReductionOpInfo): - """Tests reducing a transposed tensor.""" - t = make_tensor((5, 5), device, dtype) - self._test_noncontiguous(op, t.T) - - @ops(reduction_ops) - def test_noncontiguous_expanded(self, device, dtype, op: ReductionOpInfo): - """Tests reducing a tensor with expanded singleton dimensions.""" - t = make_tensor((2, 3), device, dtype) - self._test_noncontiguous(op, t.unsqueeze(1).expand(-1, 5, -1)) - - # NumPy does not support BFloat16 so we don't test that against reference - # implementations. We also don't compare dtypes or test for different - # keepdim because we already have other tests covering those. - # The test_reference_testing in test_ops.py only uses the samples from - # sample_inputs_func which do not test as exhaustively as these tests. - - def _test_ref(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_kwargs): - """Compares op against op.ref for the given input and reduction kwargs""" - for args, kwargs in op.generate_args_kwargs(t, **reduction_kwargs): - kwargs.update(reduction_kwargs) - result = op(t, *args, **kwargs) - expected = op.ref(t.detach().cpu().numpy(), *args, **kwargs) - self.assertEqual(result, expected, exact_dtype=False) - - @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=get_all_dtypes(include_bfloat16=False)) - def test_ref_scalar_input(self, device, dtype, op: ReductionOpInfo): - """Compares op against reference for scalar input tensors""" - self._test_ref(op, make_tensor([], device, dtype)) - - @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=get_all_dtypes(include_bfloat16=False)) - def test_ref_small_input(self, device, dtype, op: ReductionOpInfo): - """Compares op against reference for small input tensors""" - t = make_tensor((5, 3, 4, 2), device, dtype, exclude_zero=True) - self._test_ref(op, t) - for dim in [0, 1, 3] + ([[0, 2], [1, 3]] if op.supports_multiple_dims else []): - self._test_ref(op, t, dim=dim) - - @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=[torch.float32]) - def test_ref_large_input_1D(self, device, dtype, op: ReductionOpInfo): - """Compares op against reference for a large 1D input tensor to check stability""" - self._test_ref(op, make_tensor((2 ** 20,), device, dtype, low=-1, high=2, exclude_zero=True)) - - @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=[torch.float32]) - def test_ref_large_input_2D(self, device, dtype, op: ReductionOpInfo): - """Compares op against reference for a large 2D input tensor to test parallelism""" - t = make_tensor((32, 2 ** 16), device, dtype, low=-1, high=2, exclude_zero=True) - self._test_ref(op, t, dim=1) - - @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=[torch.float32]) - def test_ref_large_input_64bit_indexing(self, device, dtype, op: ReductionOpInfo): - """Compares op against reference for a very large input tensor that requires 64 bit indexing""" - self._test_ref(op, make_tensor((275000000,), device, dtype, low=-1, high=2, exclude_zero=True)) - - @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=get_all_dtypes(include_bfloat16=False)) - def test_ref_duplicate_values(self, device, dtype, op: ReductionOpInfo): - """Compares op against reference for input tensors with duplicate values""" - t = make_tensor((8, 8), device, dtype, exclude_zero=True) - t[::2, ::2] = t[1::2, 1::2] - self._test_ref(op, t) - self._test_ref(op, t, dim=0) - self._test_ref(op, t, dim=1) - - @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=[torch.float32, torch.complex64]) - def test_ref_extremal_values(self, device, dtype, op: ReductionOpInfo): - """Compares op against reference for input tensors with extremal values""" - t = make_tensor((10,), device, dtype, exclude_zero=True) - extremals = [0, 1] + [nan, inf, -inf] if torch.is_floating_point(t) else [] - for extremal in extremals: - t[5] = extremal - self._test_ref(op, t) - ########################################################################### # TODO: Legacy tests - port to ReductionOpInfo ########################################################################### diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index a3d61b477b4a4..1349a29e9d7fe 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -187,8 +187,6 @@ def _np(t): return tuple(map(to_numpy, x)) elif isinstance(x, dict): return {k: to_numpy(v) for k, v in x.items()} - elif isinstance(x, torch.dtype): - return torch.empty(0, dtype=x).numpy().dtype elif isinstance(x, (numbers.Number, bool, str)): return x @@ -784,8 +782,8 @@ def _generate_reduction_inputs(device, dtype, requires_grad): """Generates input tensors for testing reduction operators""" yield make_tensor([], device, dtype, requires_grad=requires_grad) yield make_tensor([2], device, dtype, requires_grad=requires_grad) - yield make_tensor([3, 5], device, dtype, requires_grad=requires_grad, noncontiguous=True) - yield make_tensor([3, 2, 1, 2], device, dtype, requires_grad=requires_grad) + yield make_tensor([2, 3], device, dtype, requires_grad=requires_grad, noncontiguous=True) + yield make_tensor([3, 2, 1, 5], device, dtype, requires_grad=requires_grad) def _generate_reduction_kwargs(ndim, supports_multiple_dims=True): @@ -929,8 +927,6 @@ def sample_inputs_func(*args, **kwargs): # Override OpInfo defaults and call base class __init__ kwargs.setdefault('inplace_variant', None) kwargs.setdefault('sample_inputs_func', sample_inputs_func) - kwargs.setdefault('default_test_dtypes', ( - torch.uint8, torch.int64, torch.float16, torch.bfloat16, torch.float32, torch.complex64)) super(ReductionOpInfo, self).__init__(name, **kwargs) self.identity = identity @@ -4084,6 +4080,38 @@ def generator(): return list(generator()) +def sample_inputs_prod(op_info, device, dtype, requires_grad): + def make_arg(shape): + # shrink values to be in the interval [-1, +1] for better precision in gradgradcheck + return make_tensor(shape, device, dtype, low=-1, high=+1, requires_grad=requires_grad) + + def prod_single_zero(): + result = make_arg(2 * (S,)) + with torch.no_grad(): + result[0, 1] = 0 + return result + + # will not be needed once OpInfo tests support Iterables + def sample_generator(): + for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad): + yield SampleInput(sample.input) # only Tensor, ignore other inputs + yield sample + sample.kwargs['keepdim'] = True + yield sample + yield SampleInput(prod_single_zero()) + yield SampleInput(make_arg((3, 3, 3)), args=(1,)) + yield SampleInput(make_arg((3, 3, 3)), args=(1,), kwargs={'keepdim': True}) + + # test zero scalar tensor + zero = make_arg(()) + with torch.no_grad(): + zero.zero_() + yield SampleInput(zero) + yield SampleInput(zero, args=(0,)) + yield SampleInput(zero, args=(0,), kwargs={'keepdim': True}) + + return list(sample_generator()) + def sample_inputs_nextafter(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -5493,53 +5521,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): return op(input.triu() if upper else input.tril(), upper) -def reference_reduction_numpy(f, supports_keepdims=True): - """Wraps a NumPy reduction operator. - - The wrapper function will forward dim and keepdim kwargs to the wrapped - function as the NumPy equivalent axis and keepdims kwargs. - - Args: - f: NumPy reduction operator to wrap - supports_keepdims (bool, optional): Whether the NumPy operator accepts - keepdims parameter. If it does not, the wrapper will manually unsqueeze - the reduced dimensions if it was called with keepdim=True. Defaults to True. - - Returns: - Wrapped function - """ - @wraps(f) - def wrapper(x: np.ndarray, *args, **kwargs): - # Copy keys into a set - keys = set(kwargs.keys()) - - dim = kwargs.pop('dim', None) - keepdim = kwargs.pop('keepdim', False) - - if 'dim' in keys: - if x.ndim == 0: - # NumPy reductions don't accept dim=0 for scalar inputs - for i in dim if isinstance(dim, tuple) else (dim,): - assert i in {0, -1} - kwargs['axis'] = None - else: - kwargs['axis'] = tuple(dim) if isinstance(dim, Sequence) else dim - - if 'keepdim' in keys and supports_keepdims: - kwargs['keepdims'] = keepdim - - result = f(x, *args, **kwargs) - - # Unsqueeze reduced dimensions if NumPy does not support keepdims - if keepdim and not supports_keepdims and x.ndim > 0: - dim = list(range(x.ndim)) if dim is None else dim - result = np.expand_dims(result, dim) - - return result - - return wrapper - - # Operator database (sorted alphabetically) op_db: List[OpInfo] = [ UnaryUfuncInfo('abs', @@ -7058,6 +7039,15 @@ def wrapper(x: np.ndarray, *args, **kwargs): supports_out=False, supports_forward_ad=True, sample_inputs_func=sample_inputs_max_min_reduction_no_dim,), + # TODO(@heitorschueroff) Add test for dtype kwarg + OpInfo('mean', + dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16), + assert_autodiffed=True, + supports_forward_ad=True, + sample_inputs_func=sample_inputs_reduction, + # Need to skip out test because one of the overload for mean does not support it + # TODO(@heitorschueroff) fix this when implementing ReductionInfo + skips=(SkipInfo('TestCommon', 'test_out'),)), OpInfo('quantile', dtypes=floating_types(), sample_inputs_func=sample_inputs_reduction_quantile), @@ -8900,7 +8890,6 @@ def wrapper(x: np.ndarray, *args, **kwargs): supports_autograd=False, result_dtype=torch.bool, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), - ref=reference_reduction_numpy(np.all), skips=( # FIXME: does not support passing keepdim without dim SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -8908,8 +8897,7 @@ def wrapper(x: np.ndarray, *args, **kwargs): SkipInfo('TestReductions', 'test_dim_none'), SkipInfo('TestReductions', 'test_dim_none_keepdim'), # FIXME: uint8 input returns uint8 instead of bool - SkipInfo('TestReductions', 'test_result_dtype', - dtypes=[torch.uint8]), + SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]), ), ), ReductionOpInfo( @@ -8920,7 +8908,6 @@ def wrapper(x: np.ndarray, *args, **kwargs): supports_autograd=False, result_dtype=torch.bool, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), - ref=reference_reduction_numpy(np.any), skips=( # FIXME: does not support passing keepdim without dim SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -8928,15 +8915,14 @@ def wrapper(x: np.ndarray, *args, **kwargs): SkipInfo('TestReductions', 'test_dim_none'), SkipInfo('TestReductions', 'test_dim_none_keepdim'), # FIXME: uint8 input returns uint8 instead of bool - SkipInfo('TestReductions', 'test_result_dtype', - dtypes=[torch.uint8]), + SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]), ), ), ReductionOpInfo( 'amax', nan_policy='propagate', dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), - ref=reference_reduction_numpy(np.amax), + ref=lambda a, dim=None, keepdim=False, **kwargs: np.amax(a, axis=dim, keepdims=keepdim, **kwargs), skips=( # FIXME: sum reduces all dimensions when dim=[] SkipInfo('TestReductions', 'test_dim_empty'), @@ -8947,7 +8933,7 @@ def wrapper(x: np.ndarray, *args, **kwargs): 'amin', nan_policy='propagate', dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), - ref=reference_reduction_numpy(np.amin), + ref=lambda a, dim=None, keepdim=False, **kwargs: np.amin(a, axis=dim, keepdims=keepdim, **kwargs), skips=( # FIXME: sum reduces all dimensions when dim=[] SkipInfo('TestReductions', 'test_dim_empty'), @@ -8960,7 +8946,6 @@ def wrapper(x: np.ndarray, *args, **kwargs): supports_autograd=False, result_dtype=torch.int64, dtypes=all_types_and(torch.float16, torch.bfloat16), - ref=reference_reduction_numpy(np.argmax, supports_keepdims=False), skips=( # FIXME: keepdim parameter is ignored when dim=None SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -8973,7 +8958,6 @@ def wrapper(x: np.ndarray, *args, **kwargs): supports_autograd=False, result_dtype=torch.int64, dtypes=all_types_and(torch.float16, torch.bfloat16), - ref=reference_reduction_numpy(np.argmin, supports_keepdims=False), skips=( # FIXME: keepdim parameter is ignored when dim=None SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -8988,7 +8972,6 @@ def wrapper(x: np.ndarray, *args, **kwargs): result_dtype=torch.int64, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_reduction_count_nonzero, - ref=reference_reduction_numpy(np.count_nonzero), skips=( # FIXME: count_nonzero does not accept keepdim kwarg SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -9002,35 +8985,6 @@ def wrapper(x: np.ndarray, *args, **kwargs): SkipInfo('TestReductions', 'test_dim_empty'), ), ), - ReductionOpInfo( - 'mean', - nan_policy='propagate', - supports_out=False, - supports_forward_ad=True, - assert_autodiffed=True, - promotes_int_to_float=True, - dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16), - ref=reference_reduction_numpy(np.mean), - decorators=( - # FIXME: fix precision - DecorateInfo(toleranceOverride({ - torch.float16: tol(atol=1e-05, rtol=1e-02), - }), 'TestReductions', 'test_noncontiguous_all'), - DecorateInfo(toleranceOverride({ - torch.float16: tol(atol=1e-05, rtol=1e-02), - }), 'TestReductions', 'test_ref_small_input'), - ), - skips=( - # FIXME: prod does not support passing keepdim without passing dim - SkipInfo('TestReductions', 'test_dim_default_keepdim'), - # FIXME: prod reduces all dimensions when dim=[] - SkipInfo('TestReductions', 'test_dim_empty'), - SkipInfo('TestReductions', 'test_dim_empty_keepdim'), - # FIXME: prod does not support passing None to dim - SkipInfo('TestReductions', 'test_dim_none'), - SkipInfo('TestReductions', 'test_dim_none_keepdim'), - ), - ), ReductionOpInfo( 'prod', identity=1, @@ -9041,7 +8995,7 @@ def wrapper(x: np.ndarray, *args, **kwargs): gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, dtypes=all_types_and_complex_and(torch.bool), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), - ref=reference_reduction_numpy(np.prod), + sample_inputs_func=sample_inputs_prod, skips=( # FIXME: prod does not support passing keepdim without passing dim SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -9051,11 +9005,6 @@ def wrapper(x: np.ndarray, *args, **kwargs): # FIXME: prod does not support passing None to dim SkipInfo('TestReductions', 'test_dim_none'), SkipInfo('TestReductions', 'test_dim_none_keepdim'), - # FIXME: improve precision, failing with nan != inf - SkipInfo('TestReductions', 'test_ref_small_input', - dtypes=[torch.float16, torch.complex64]), - SkipInfo('TestReductions', 'test_ref_duplicate_values', - dtypes=[torch.uint8, torch.float16, torch.complex64]), ), ), ReductionOpInfo( @@ -9066,22 +9015,6 @@ def wrapper(x: np.ndarray, *args, **kwargs): supports_forward_ad=True, promotes_int_to_int64=True, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), - ref=reference_reduction_numpy(np.sum), - decorators=( - # FIXME: fix precision - DecorateInfo(toleranceOverride({ - torch.float16: tol(atol=1e-05, rtol=1e-02), - }), 'TestReductions', 'test_noncontiguous_all'), - DecorateInfo(toleranceOverride({ - torch.float16: tol(atol=1e-03, rtol=1e-02), - }), 'TestReductions', 'test_ref_small_input'), - DecorateInfo(toleranceOverride({ - torch.float32: tol(atol=1e-03, rtol=1e-03), - }), 'TestReductions', 'test_ref_large_input_64bit_indexing'), - DecorateInfo(toleranceOverride({ - torch.float16: tol(atol=1e-05, rtol=1e-02), - }), 'TestReductions', 'test_ref_duplicate_values'), - ), skips=( # FIXME: sum does not support passing keepdim without passing dim SkipInfo('TestReductions', 'test_dim_default_keepdim'), @@ -9100,22 +9033,6 @@ def wrapper(x: np.ndarray, *args, **kwargs): supports_out=False, promotes_int_to_int64=True, dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), - ref=reference_reduction_numpy(np.nansum), - decorators=( - # FIXME: fix precision - DecorateInfo(toleranceOverride({ - torch.float16: tol(atol=1e-05, rtol=1e-02), - }), 'TestReductions', 'test_noncontiguous_all'), - DecorateInfo(toleranceOverride({ - torch.float16: tol(atol=1e-03, rtol=1e-02), - }), 'TestReductions', 'test_ref_small_input'), - DecorateInfo(toleranceOverride({ - torch.float32: tol(atol=1e-03, rtol=1e-03), - }), 'TestReductions', 'test_ref_large_input_64bit_indexing'), - DecorateInfo(toleranceOverride({ - torch.float16: tol(atol=1e-05, rtol=1e-02), - }), 'TestReductions', 'test_ref_duplicate_values'), - ), skips=( # FIXME: nansum does not support passing keepdim without passing dim SkipInfo('TestReductions', 'test_dim_default_keepdim'), From c3464e78a461c6275e9fbbe3dfa72ca3983cb4df Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Mon, 30 Aug 2021 14:56:35 -0700 Subject: [PATCH 364/530] Revert D30561459: Fix bytes_written and bytes_read Test Plan: revert-hammer Differential Revision: D30561459 (https://github.com/pytorch/pytorch/commit/e98173ff3423247c597e21c923c8f47470ef07ab) Original commit changeset: 976fa5167097 fbshipit-source-id: 43f4c234ca400820fe6db5b4f37a25e14dc4b0dd --- caffe2/core/operator_schema.h | 17 +-- caffe2/operators/batch_matmul_op.cc | 113 ++++++++---------- caffe2/operators/concat_split_op.cc | 15 +-- caffe2/operators/conv_pool_op_base.h | 15 +-- caffe2/operators/distance_op.cc | 28 ++--- caffe2/operators/fc_inference.cc | 22 ++-- caffe2/operators/one_hot_ops.cc | 30 ++--- caffe2/operators/utility_ops.cc | 13 +- .../operator_test/concat_op_cost_test.py | 54 ++++----- caffe2/python/workspace_test.py | 2 +- caffe2/sgd/adagrad_op.cc | 55 +++------ 11 files changed, 140 insertions(+), 224 deletions(-) diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index 0d048eb8d26e9..64f5ef3ed883a 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -6,13 +6,12 @@ #include #include #include -#include #include +#include #include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" -#include "caffe2/core/types.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/filler.h" #include "caffe2/utils/proto_utils.h" @@ -274,8 +273,8 @@ class TORCH_API OpSchema { OpSchema& Arg(const char* name, const char* description, bool required = false); -#define DECLARE_STANDARD_ARG(name, str) \ - static const char* Arg_##name; \ +#define DECLARE_STANDARD_ARG(name, str) \ + static const char* Arg_##name; \ OpSchema& Arg##name(const char* description); DECLARE_STANDARD_ARG(IsTest, is_test) @@ -340,9 +339,7 @@ class TORCH_API OpSchema { return inplace_enforced_(x, y); } - TORCH_API friend std::ostream& operator<<( - std::ostream& out, - const OpSchema& schema); + TORCH_API friend std::ostream& operator<<(std::ostream& out, const OpSchema& schema); const std::vector& args() const { return args_; @@ -565,10 +562,8 @@ OpSchema::Cost PointwiseCostInference( } c.flops = nElemX * OpsPerPoint; - auto const& X_element_size_byte = - DataTypeToTypeMeta(X.data_type()).itemsize(); - c.bytes_read = nElemRead * X_element_size_byte; - c.bytes_written = nElemX * X_element_size_byte; + c.bytes_read = nElemRead * sizeof(X.data_type()); + c.bytes_written = nElemX * sizeof(X.data_type()); return c; } diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc index 205acf74f1572..32799ced10671 100644 --- a/caffe2/operators/batch_matmul_op.cc +++ b/caffe2/operators/batch_matmul_op.cc @@ -1,7 +1,6 @@ #include "caffe2/operators/batch_matmul_op.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/types.h" namespace caffe2 { @@ -117,13 +116,9 @@ OpSchema::Cost CostInferenceForBatchMatMul( K = in[0].dims(ndims_A - 1); } - auto const& A_element_size_byte = - DataTypeToTypeMeta(A.data_type()).itemsize(); - auto const& Y_element_size_byte = - DataTypeToTypeMeta(Y.data_type()).itemsize(); c.flops = 2 * nElemY * K; - c.bytes_read = (nElemA + nElemB) * A_element_size_byte; - c.bytes_written = nElemY * Y_element_size_byte; + c.bytes_read = (nElemA + nElemB) * sizeof(A.data_type()); + c.bytes_written = nElemY * sizeof(Y.data_type()); c.params_bytes = 0; return c; } @@ -185,76 +180,72 @@ class GetBatchMatMulGradient : public GradientMakerBase { auto no_trans_arg = vector(); auto trans_a_arg = vector{MakeArgument("trans_a", 1)}; auto trans_b_arg = vector{MakeArgument("trans_b", 1)}; - auto trans_both_arg = vector{ - MakeArgument("trans_a", 1), MakeArgument("trans_b", 1)}; + auto trans_both_arg = vector{MakeArgument("trans_a", 1), + MakeArgument("trans_b", 1)}; if (trans_a) { if (trans_b) { // A'B': // dA = B'G', dB = G'A' - return vector{ - CreateOperatorDef( - "BatchMatMul", - "", - vector{I(1), GO(0)}, - vector{GI(0)}, - trans_both_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(0)}, - vector{GI(1)}, - trans_both_arg)}; + return vector{CreateOperatorDef( + "BatchMatMul", + "", + vector{I(1), GO(0)}, + vector{GI(0)}, + trans_both_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(0)}, + vector{GI(1)}, + trans_both_arg)}; } else { // A'B: // dA = BG', dB = AG - return vector{ - CreateOperatorDef( - "BatchMatMul", - "", - vector{I(1), GO(0)}, - vector{GI(0)}, - trans_b_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{I(0), GO(0)}, - vector{GI(1)}, - no_trans_arg)}; + return vector{CreateOperatorDef( + "BatchMatMul", + "", + vector{I(1), GO(0)}, + vector{GI(0)}, + trans_b_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{I(0), GO(0)}, + vector{GI(1)}, + no_trans_arg)}; } } else { if (trans_b) { // AB': // dA = GB, dB = G'A - return vector{ - CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(1)}, - vector{GI(0)}, - no_trans_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(0)}, - vector{GI(1)}, - trans_a_arg)}; + return vector{CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(1)}, + vector{GI(0)}, + no_trans_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(0)}, + vector{GI(1)}, + trans_a_arg)}; } else { // AB: // dA = GB', dB = A'G - return vector{ - CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(1)}, - vector{GI(0)}, - trans_b_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{I(0), GO(0)}, - vector{GI(1)}, - trans_a_arg)}; + return vector{CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(1)}, + vector{GI(0)}, + trans_b_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{I(0), GO(0)}, + vector{GI(1)}, + trans_a_arg)}; } } } diff --git a/caffe2/operators/concat_split_op.cc b/caffe2/operators/concat_split_op.cc index 8aa9e282adb84..8eceb5ab4a577 100644 --- a/caffe2/operators/concat_split_op.cc +++ b/caffe2/operators/concat_split_op.cc @@ -101,12 +101,9 @@ OpSchema::Cost CostInferenceForSplit( CAFFE_ENFORCE_GT(in.size(), 0); struct OpSchema::Cost cost; cost.flops = 0; - auto const& input_0_element_size_byte = - DataTypeToTypeMeta(in[0].data_type()).itemsize(); - auto const& input_1_element_size_byte = - (in.size() > 1) ? DataTypeToTypeMeta(in[1].data_type()).itemsize() : 0; - auto input_bytes_count = nElemFromDim(in[0]) * input_0_element_size_byte; - auto split_bytes_count = nElemFromDim(in[1]) * input_1_element_size_byte; + auto input_bytes_count = nElemFromDim(in[0]) * sizeof(in[0].data_type()); + auto split_bytes_count = + (in.size() == 1) ? 0 : nElemFromDim(in[1]) * sizeof(in[1].data_type()); // There can be two input blobs: // (1) actual tensor to be split // (2) lengths of outputs along split axis @@ -332,13 +329,11 @@ OpSchema::Cost CostInferenceForConcat( } auto split_info_bytes_count = in.size() * sizeof(int); - auto const& input_0_element_size_byte = - DataTypeToTypeMeta(in[0].data_type()).itemsize(); struct OpSchema::Cost cost; cost.flops = 0; - cost.bytes_read = nElemRead * input_0_element_size_byte; + cost.bytes_read = nElemRead * sizeof(in[0].data_type()); cost.bytes_written = - size * input_0_element_size_byte + split_info_bytes_count; + size * sizeof(in[0].data_type()) + split_info_bytes_count; cost.params_bytes = 0; return cost; } diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h index b356ef952d79c..25bd99a92e50f 100644 --- a/caffe2/operators/conv_pool_op_base.h +++ b/caffe2/operators/conv_pool_op_base.h @@ -7,7 +7,6 @@ #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" -#include "caffe2/core/types.h" #include "caffe2/proto/caffe2_legacy.pb.h" #include "caffe2/utils/math.h" @@ -520,20 +519,14 @@ class ConvPoolOpBase : public Operator { uint64_t nElemW = nElemFromDim(W); uint64_t nElemBias = inputs.size() > 2 ? nElemFromDim(inputs[2]) : 0; - auto const& X_elemenet_size_byte = - DataTypeToTypeMeta(X.data_type()).itemsize(); - auto const& Y_element_size_byte = - DataTypeToTypeMeta(Y.data_type()).itemsize(); - auto const& W_element_size_byte = - DataTypeToTypeMeta(W.data_type()).itemsize(); - // grouping is NOT properly handled yet c.flops = N * Y_t * Y_h * Y_w * kernel_t * kernel_w * kernel_h * in_channels * out_channels * 2; - c.bytes_read = (nElemX + nElemW + nElemBias) * X_elemenet_size_byte; - c.bytes_written = N * out_channels * Y_t * Y_h * Y_w * Y_element_size_byte; + c.bytes_read = (nElemX + nElemW + nElemBias) * sizeof(X.data_type()); + c.bytes_written = + N * out_channels * Y_t * Y_h * Y_w * sizeof(Y.data_type()); c.params_bytes = out_channels * in_channels * kernel_t * kernel_h * - kernel_w * W_element_size_byte; + kernel_w * sizeof(W.data_type()); return c; } diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc index 9ea8eea5a2725..1529534d8fb2e 100644 --- a/caffe2/operators/distance_op.cc +++ b/caffe2/operators/distance_op.cc @@ -1,5 +1,4 @@ #include "caffe2/operators/distance_op.h" -#include "caffe2/core/types.h" #include "caffe2/utils/eigen_utils.h" #ifdef CAFFE2_USE_MKLDNN #include @@ -8,7 +7,7 @@ namespace caffe2 { -template <> +template<> bool SquaredL2DistanceOp::RunOnDevice() { auto& X = Input(0); auto& Y = Input(1); @@ -258,9 +257,7 @@ OpSchema::Cost CostInferenceForDotProduct( CAFFE_ENFORCE_EQ(out[0].dims().size(), 1); struct OpSchema::Cost c = PointwiseCostInference<2>(def, in); - auto const& out_0_element_size_byte = - DataTypeToTypeMeta(out[0].data_type()).itemsize(); - c.bytes_written = out[0].dims(0) * out_0_element_size_byte; + c.bytes_written = out[0].dims(0) * sizeof(out[0].data_type()); c.params_bytes = 0; return c; } @@ -382,12 +379,10 @@ bool DotProductWithPaddingOp::RunOnDevice() { } // L2 -REGISTER_CPU_OPERATOR( - SquaredL2Distance, - SquaredL2DistanceOp); -REGISTER_CPU_OPERATOR( - SquaredL2DistanceGradient, - SquaredL2DistanceGradientOp); +REGISTER_CPU_OPERATOR(SquaredL2Distance, + SquaredL2DistanceOp); +REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient, + SquaredL2DistanceGradientOp); OPERATOR_SCHEMA(SquaredL2Distance) .NumInputs(2) @@ -407,8 +402,7 @@ class GetSquaredL2DistanceGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; vector GetGradientDefs() override { return SingleGradientDef( - "SquaredL2DistanceGradient", - "", + "SquaredL2DistanceGradient", "", vector{I(0), I(1), GO(0)}, vector{GI(0), GI(1)}); } @@ -768,9 +762,9 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase { replicate = GetArgument(Def(), "replicate").i(); } - const auto dot_arg = vector{ - MakeArgument("pad_value", pad_value), - MakeArgument("replicate", replicate)}; + const auto dot_arg = + vector{MakeArgument("pad_value", pad_value), + MakeArgument("replicate", replicate)}; return SingleGradientDef( "DotProductWithPaddingGradient", @@ -781,4 +775,4 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase { } }; REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient); -} // namespace caffe2 +} // namespace caffe2 diff --git a/caffe2/operators/fc_inference.cc b/caffe2/operators/fc_inference.cc index ba1b7122cdc9d..a44c230980c7f 100644 --- a/caffe2/operators/fc_inference.cc +++ b/caffe2/operators/fc_inference.cc @@ -1,5 +1,4 @@ #include "caffe2/operators/fc_inference.h" -#include "caffe2/core/types.h" namespace caffe2 { std::vector FCShapeInference( @@ -52,12 +51,11 @@ OpSchema::Cost CostInferenceForFC( ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1])) : size_to_dim_(canonical_axis_w, GetDimsVector(in[1])); - auto const& X_element_size_byte = - DataTypeToTypeMeta(in[0].data_type()).itemsize(); + const auto& X = in[0]; c.flops = M * N * (2 * K + 1); - c.bytes_read = (K * (M + N) + N) * X_element_size_byte; - c.bytes_written = M * N * X_element_size_byte; - c.params_bytes = (K * N + N) * X_element_size_byte; + c.bytes_read = (K * (M + N) + N) * sizeof(X.data_type()); + c.bytes_written = M * N * sizeof(X.data_type()); + c.params_bytes = (K * N + N) * sizeof(X.data_type()); return c; } @@ -96,11 +94,7 @@ OpSchema::Cost CostInferenceForFCGradient( CAFFE_ENFORCE_LT(0, out.size()); const TensorShape dW = out[0]; - auto const& dW_element_size_byte = - DataTypeToTypeMeta(dW.data_type()).itemsize(); const TensorShape db = out[1]; - auto const& db_element_size_byte = - DataTypeToTypeMeta(db.data_type()).itemsize(); auto axis = helper.GetSingleArgument("axis", 1); const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size()); @@ -117,17 +111,15 @@ OpSchema::Cost CostInferenceForFCGradient( uint64_t size_db = nElemFromDim(db); c.flops = M * N * (2 * K + 1); - c.bytes_written = - size_dW * dW_element_size_byte + size_db * db_element_size_byte; + c.bytes_written = (size_dW + size_db) * sizeof(float); c.params_bytes = (K * N + N) * sizeof(float); if (out.size() == 3) { const TensorShape dX = out[2]; uint64_t size_dX = nElemFromDim(dX); - auto const& dX_element_size_byte = - DataTypeToTypeMeta(dX.data_type()).itemsize(); + c.flops += 2 * M * N * K; - c.bytes_written += size_dX * dX_element_size_byte; + c.bytes_written += size_dX * sizeof(float); } return c; } diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc index 55c73a5be22c4..c3eaf05db0e8f 100644 --- a/caffe2/operators/one_hot_ops.cc +++ b/caffe2/operators/one_hot_ops.cc @@ -2,7 +2,6 @@ #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" -#include "caffe2/core/types.h" namespace caffe2 { @@ -79,21 +78,12 @@ OpSchema::Cost CostInferenceForBatchOneHot( const auto& length = in[1]; const auto& values = in[2]; - auto const& data_element_size_byte = - DataTypeToTypeMeta(data.data_type()).itemsize(); - auto const& length_element_size_byte = - DataTypeToTypeMeta(length.data_type()).itemsize(); - auto const& values_element_size_byte = - DataTypeToTypeMeta(values.data_type()).itemsize(); - auto const& output_element_size_byte = - DataTypeToTypeMeta(output.data_type()).itemsize(); - - uint64_t nBytesData = nElemFromDim(data) * data_element_size_byte; - uint64_t nBytesLength = nElemFromDim(length) * length_element_size_byte; - uint64_t nBytesValues = nElemFromDim(values) * values_element_size_byte; + uint64_t nBytesData = nElemFromDim(data) * sizeof(data.data_type()); + uint64_t nBytesLength = nElemFromDim(length) * sizeof(length.data_type()); + uint64_t nBytesValues = nElemFromDim(values) * sizeof(values.data_type()); c.flops = 0; c.bytes_read = nBytesData + nBytesLength + nBytesValues; - c.bytes_written = nElemFromDim(output) * output_element_size_byte; + c.bytes_written = nElemFromDim(output) * sizeof(output.data_type()); c.params_bytes = 0; return c; } @@ -155,15 +145,15 @@ bool BatchBucketOneHotOp::RunOnDevice() { for (int64_t j = 0; j < D; j++) { // here we assume the boundary values for each feature are sorted int64_t lower_bucket_idx = std::lower_bound( - boundaries_offset, - boundaries_offset + lens_data[j], - input_data[pos]) - + boundaries_offset, + boundaries_offset + lens_data[j], + input_data[pos]) - boundaries_offset; int64_t upper_bucket_idx = std::upper_bound( - boundaries_offset, - boundaries_offset + lens_data[j], - input_data[pos]) - + boundaries_offset, + boundaries_offset + lens_data[j], + input_data[pos]) - boundaries_offset; int64_t bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2; diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index 561da9189b388..8b5e116024b81 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -1,7 +1,6 @@ #include "caffe2/operators/utility_ops.h" #include #include -#include "caffe2/core/types.h" #include "caffe2/utils/eigen_utils.h" namespace caffe2 { @@ -35,11 +34,9 @@ OpSchema::Cost CostInferenceForWeightedSum( const auto& nElem = nElemFromDim(X0); const auto& nInputs = in.size(); c.flops = (nInputs - 1) * nElem; - auto const& X0_element_size_byte = - DataTypeToTypeMeta(X0.data_type()).itemsize(); - c.bytes_read = (nInputs / 2) * (nElem + 1) * X0_element_size_byte; - c.bytes_written = nElem * X0_element_size_byte; - c.params_bytes = (nInputs / 2) * X0_element_size_byte; + c.bytes_read = (nInputs / 2) * (nElem + 1) * sizeof(X0.data_type()); + c.bytes_written = nElem * sizeof(X0.data_type()); + c.params_bytes = (nInputs / 2) * sizeof(X0.data_type()); return c; } @@ -51,7 +48,9 @@ REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp); REGISTER_CPU_OPERATOR(SumInt, SumOp); REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp); REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp); -REGISTER_CPU_OPERATOR(ScatterWeightedSum, ScatterWeightedSumOp); +REGISTER_CPU_OPERATOR( + ScatterWeightedSum, + ScatterWeightedSumOp); REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp); REGISTER_CPU_OPERATOR(Scatter, ScatterOp); diff --git a/caffe2/python/operator_test/concat_op_cost_test.py b/caffe2/python/operator_test/concat_op_cost_test.py index 7dab4d6bd5d1f..996b330be4947 100644 --- a/caffe2/python/operator_test/concat_op_cost_test.py +++ b/caffe2/python/operator_test/concat_op_cost_test.py @@ -7,39 +7,33 @@ class TestConcatOpCost(TestCase): def test_columnwise_concat(self): - def _test_columnwise_concat_for_type(dtype): - workspace.ResetWorkspace() - workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)) - workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=dtype)) - concat_op = core.CreateOperator( - "Concat", - ["input_1", "input_2"], - ["output", "split_info"], - ) - workspace.RunOperatorOnce(concat_op) - - output = workspace.FetchBlob("output") - self.assertTupleEqual(output.shape, (2, 4)) - np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]]) + workspace.ResetWorkspace() + workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)) + workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=np.int32)) + concat_op = core.CreateOperator( + "Concat", + ["input_1", "input_2"], + ["output", "split_info"], + ) + workspace.RunOperatorOnce(concat_op) - flops, bytes_written, bytes_read = workspace.GetOperatorCost( - concat_op, concat_op.input - ) + output = workspace.FetchBlob("output") + self.assertTupleEqual(output.shape, (2, 4)) + np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]]) - self.assertEqual(flops, 0) - self.assertEqual( - bytes_read, - sum(workspace.FetchBlob(b).nbytes for b in concat_op.input), - ) - self.assertEqual( - bytes_written, - sum(workspace.FetchBlob(b).nbytes for b in concat_op.output), - ) + flops, bytes_written, bytes_read = workspace.GetOperatorCost( + concat_op, concat_op.input + ) - [ - _test_columnwise_concat_for_type(t) - for t in [np.int64, np.float, np.half, np.int8] - ] + self.assertEqual(flops, 0) + self.assertEqual( + bytes_read, + sum(workspace.FetchBlob(b).nbytes for b in concat_op.input), + ) + self.assertEqual( + bytes_written, + sum(workspace.FetchBlob(b).nbytes for b in concat_op.output), + ) def test_split_then_concat(self): workspace.ResetWorkspace() diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 1bf7b607e1b7e..afb2065027075 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -60,7 +60,7 @@ def testGetOperatorCost(self): self.assertTupleEqual( op_cost, namedtuple("Cost", ["flops", "bytes_written", "bytes_read"])( - 1152, 256, 4168 + 1152, 256, 2084 ), ) diff --git a/caffe2/sgd/adagrad_op.cc b/caffe2/sgd/adagrad_op.cc index 0b6f604b48cdb..0de50f03e62d5 100644 --- a/caffe2/sgd/adagrad_op.cc +++ b/caffe2/sgd/adagrad_op.cc @@ -1,5 +1,4 @@ #include "adagrad_op.h" -#include "caffe2/core/types.h" namespace caffe2 { @@ -24,30 +23,22 @@ static OpSchema::Cost CostInferenceForAdagrad( // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = grad_size * 10; - auto const& moment_element_size_byte = - DataTypeToTypeMeta(moment.data_type()).itemsize(); - auto const& param_element_size_byte = - DataTypeToTypeMeta(param.data_type()).itemsize(); - auto const& grad_element_size_byte = - DataTypeToTypeMeta(grad.data_type()).itemsize(); - auto const& lr_element_size_byte = - DataTypeToTypeMeta(lr.data_type()).itemsize(); uint64_t bytes_written = - grad_size * param_element_size_byte + moment_element_size_byte; + grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type())); if (output_size == 3) { // also need to output effective learning rate in this case // assume it's the same data type as lr - bytes_written += grad_size * lr_element_size_byte; + bytes_written += grad_size * sizeof(lr.data_type()); } else if (output_size == 4) { // also need to output effective learning rate and updates in this case // assume update is the same data type as param bytes_written += - grad_size * (lr_element_size_byte + param_element_size_byte); + grad_size * (sizeof(lr.data_type()) + sizeof(param.data_type())); } c.bytes_written = bytes_written; c.bytes_read = c.bytes_written + - grad_size * (grad_element_size_byte + lr_element_size_byte); + grad_size * (sizeof(grad.data_type()) + sizeof(lr.data_type())); return c; } @@ -111,18 +102,10 @@ static OpSchema::Cost CostInferenceForSparseAdagrad( // (optimistically count sqrt as one flop). // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = grad_size * 7; - auto const& param_element_size_byte = - DataTypeToTypeMeta(param.data_type()).itemsize(); - auto const& moment_element_size_byte = - DataTypeToTypeMeta(moment.data_type()).itemsize(); c.bytes_written = - grad_size * (param_element_size_byte + moment_element_size_byte); - auto const& grad_element_size_byte = - DataTypeToTypeMeta(grad.data_type()).itemsize(); - auto const& indices_element_size_byte = - DataTypeToTypeMeta(indices.data_type()).itemsize(); - c.bytes_read = c.bytes_written + grad_size * grad_element_size_byte + - n * indices_element_size_byte; + grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type())); + c.bytes_read = c.bytes_written + grad_size * sizeof(grad.data_type()) + + n * sizeof(indices.data_type()); return c; } @@ -170,16 +153,6 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad( OpSchema::Cost c; if (n > 0) { - auto const& param_element_size_byte = - DataTypeToTypeMeta(param.data_type()).itemsize(); - auto const& moment_element_size_byte = - DataTypeToTypeMeta(moment.data_type()).itemsize(); - auto const& grad_element_size_byte = - DataTypeToTypeMeta(grad.data_type()).itemsize(); - auto const& indices_element_size_byte = - DataTypeToTypeMeta(indices.data_type()).itemsize(); - auto const& lr_element_size_byte = - DataTypeToTypeMeta(lr.data_type()).itemsize(); auto block_size = grad_size / n; if (block_size == 1) { // +2: applying weight decay and add to grads @@ -188,22 +161,22 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad( // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = n * 9; c.bytes_written = - n * (param_element_size_byte + moment_element_size_byte); + n * (sizeof(param.data_type()) + sizeof(moment.data_type())); c.bytes_read = c.bytes_written + n * - (grad_element_size_byte + indices_element_size_byte + - lr_element_size_byte); + (sizeof(grad.data_type()) + sizeof(indices.data_type()) + + sizeof(lr.data_type())); } else { // 5 per block (not counting index transforms) // 8 for each value of a block // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = n * (5 + (block_size * 8)); - c.bytes_written = n * moment_element_size_byte + - n * block_size * param_element_size_byte; + c.bytes_written = + n * sizeof(moment.data_type()) + n * block_size * (param.data_type()); - c.bytes_read = c.bytes_written + n * lr_element_size_byte + + c.bytes_read = c.bytes_written + n * (sizeof(lr.data_type())) + 2 * n * block_size * - (grad_element_size_byte + param_element_size_byte); + (sizeof(grad.data_type()) + sizeof(param.data_type())); } } return c; From ad4848565e1d9f4d408c60614f213acb52035181 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Mon, 30 Aug 2021 15:03:15 -0700 Subject: [PATCH 365/530] Enable Half, BFloat16, and Complex dtypes for coo-coo sparse matmul [CUDA] (#59980) Summary: This PR enables Half, BFloat16, ComplexFloat, and ComplexDouble support for matrix-matrix multiplication of COO sparse matrices. The change is applied only to CUDA 11+ builds. `cusparseSpGEMM` also supports `CUDA_C_16F` (complex float16) and `CUDA_C_16BF` (complex bfloat16). PyTorch also supports the complex float16 dtype (`ScalarType::ComplexHalf`), but there is no convenient dispatch, so this dtype is omitted in this PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/59980 Reviewed By: ngimel Differential Revision: D29699456 Pulled By: cpuhrsch fbshipit-source-id: 407ae53392acb2f92396a62a57cbaeb0fe6e950b --- aten/src/ATen/cuda/CUDADataType.h | 61 +++++++++++++++++++ .../ATen/native/sparse/cuda/SparseMatMul.cu | 54 ++++++++++------ test/test_sparse.py | 44 +++++++------ torch/testing/_internal/common_cuda.py | 1 + torch/utils/hipify/cuda_to_hip_mappings.py | 31 +++++----- 5 files changed, 139 insertions(+), 52 deletions(-) create mode 100644 aten/src/ATen/cuda/CUDADataType.h diff --git a/aten/src/ATen/cuda/CUDADataType.h b/aten/src/ATen/cuda/CUDADataType.h new file mode 100644 index 0000000000000..71c9af9af8aac --- /dev/null +++ b/aten/src/ATen/cuda/CUDADataType.h @@ -0,0 +1,61 @@ +#pragma once + +#include + +#include +#include + +namespace at { +namespace cuda { + +template +cudaDataType getCudaDataType() { + TORCH_INTERNAL_ASSERT(false, "Cannot convert type ", typeid(scalar_t).name(), " to cudaDataType.") +} + +template<> cudaDataType getCudaDataType() { + return CUDA_R_16F; +} +template<> cudaDataType getCudaDataType() { + return CUDA_R_32F; +} +template<> cudaDataType getCudaDataType() { + return CUDA_R_64F; +} +template<> cudaDataType getCudaDataType>() { + return CUDA_C_16F; +} +template<> cudaDataType getCudaDataType>() { + return CUDA_C_32F; +} +template<> cudaDataType getCudaDataType>() { + return CUDA_C_64F; +} + +// HIP doesn't define integral types +#ifndef __HIP_PLATFORM_HCC__ +template<> cudaDataType getCudaDataType() { + return CUDA_R_8U; +} +template<> cudaDataType getCudaDataType() { + return CUDA_R_8I; +} +template<> cudaDataType getCudaDataType() { + return CUDA_R_32I; +} +#endif + +#if !defined(__HIP_PLATFORM_HCC__) && defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +template<> cudaDataType getCudaDataType() { + return CUDA_R_16I; +} +template<> cudaDataType getCudaDataType() { + return CUDA_R_64I; +} +template<> cudaDataType getCudaDataType() { + return CUDA_R_16BF; +} +#endif + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu index d5f31a1980bac..a08c93d1d71bd 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -118,14 +119,7 @@ struct csrMatrixRef { nnz_{nnz}, size_{size} { #if IS_CUSPARSE11_AVAILABLE() - cudaDataType cuda_data_type; - if ( std::is_same::value ) { - cuda_data_type = CUDA_R_32F; - } else if ( std::is_same::value) { - cuda_data_type = CUDA_R_64F; - } else { - TORCH_CHECK(false, "Tensor types must be either float32 or float64"); - } + cudaDataType cuda_data_type = at::cuda::getCudaDataType(); TORCH_CUDASPARSE_CHECK(cusparseCreateCsr( &description_, this->size(0), @@ -192,8 +186,14 @@ struct CusparseMatrixMultiplyOp { cusparseSpGEMMDescr_t spgemmDesc; CusparseMatrixMultiplyOp() { - static_assert(std::is_same::value || std::is_same::value, - "cusparse csr sparse-sparse MM only supports data type of float and double."); + static_assert( + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same, scalar_t>::value || + std::is_same, scalar_t>::value, + "cusparseSpGEMM only supports data type of half, bfloat16, float, double and complex float, double."); // SpGEMM Computation TORCH_CUDASPARSE_CHECK(cusparseSpGEMM_createDescr(&spgemmDesc)); } @@ -212,14 +212,6 @@ struct CusparseMatrixMultiplyOp { const int B_num_cols = B.size(1); - cudaDataType computeType; - if ( std::is_same::value ) { - computeType = CUDA_R_32F; - } else if ( std::is_same::value) { - computeType = CUDA_R_64F; - } else { - TORCH_CHECK(false, "Tensor types must be either float32 or float64"); - } csrOutput out({A.size(0), B.size(1)}); out.csr_pointers_ = at::empty({out.size(0) + 1}, output_indices.options().dtype(kInt)); @@ -252,6 +244,16 @@ struct CusparseMatrixMultiplyOp { cusparseSpMatDescr_t matC = C.description_; //-------------------------------------------------------------------------- + cudaDataType computeType = at::cuda::getCudaDataType(); + + // If a specific GPU model does not provide native support for a given data type, + // the routine returns CUSPARSE_STATUS_ARCH_MISMATCH error + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + TORCH_CHECK(prop->major >= 5 && !((10*prop->major + prop->minor) < 53 && computeType == CUDA_R_16F), + "sparse_mm: CUDA Float16 requires compute capability >= 53 (current: ", prop->major, prop->minor, ")"); + TORCH_CHECK(!(prop->major < 8 && computeType == CUDA_R_16BF), + "sparse_mm: CUDA BFloat16 requires compute capability >= 80 (current: ", prop->major, prop->minor, ")"); + // ask bufferSize1 bytes for external memory TORCH_CUDASPARSE_CHECK(cusparseSpGEMM_workEstimation( handle, @@ -646,8 +648,14 @@ void sparse_sparse_matmul_cuda_kernel( const Tensor& mat1, const Tensor& mat2) { - static_assert(std::is_same::value || std::is_same::value, - "sparse_sparse_matmul_cuda_kernel only supports float and double value types"); + static_assert( + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same, scalar_t>::value || + std::is_same, scalar_t>::value, + "sparse_sparse_matmul_cuda_kernel only supports data type of half, bfloat16, float, double and complex float, double."); Tensor mat1_indices_ = mat1._indices().contiguous(); Tensor mat1_values = mat1._values().contiguous(); @@ -775,9 +783,15 @@ Tensor sparse_sparse_matmul_cuda(const Tensor& mat1_, const Tensor& mat2_) { auto output = at::native::empty_like(mat1_); output.sparse_resize_and_clear_({mat1_.size(0), mat2_.size(1)}, mat1_.sparse_dim(), 0); +#if IS_CUSPARSE11_AVAILABLE() + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, mat1_.scalar_type(), "sparse_matmul", [&] { + sparse_sparse_matmul_cuda_kernel(output, mat1_.coalesce(), mat2_.coalesce()); + }); +#else AT_DISPATCH_FLOATING_TYPES(mat1_.scalar_type(), "sparse_matmul", [&] { sparse_sparse_matmul_cuda_kernel(output, mat1_.coalesce(), mat2_.coalesce()); }); +#endif return output; } diff --git a/test/test_sparse.py b/test/test_sparse.py index 333f29f13138e..aaf045c4b0ea0 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -12,8 +12,12 @@ from torch.testing._internal.common_cuda import TEST_CUDA, _get_torch_cuda_version from numbers import Number from typing import Dict, Any +from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes +from torch.testing._internal.common_cuda import \ + (SM53OrLater, SM80OrLater, CUDA11OrLater) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, ops, dtypes, dtypesIfCPU, onlyCPU, onlyCUDA, deviceCountAtLeast) + (instantiate_device_type_tests, ops, dtypes, dtypesIfCUDA, onlyCPU, onlyCUDA, precisionOverride, + deviceCountAtLeast) from torch.testing._internal.common_methods_invocations import \ (sparse_unary_ufuncs) @@ -3217,8 +3221,13 @@ def sparse_log(x): # TODO: Check after why ROCm's cusparseXcsrgemm2Nnz function doesn't return the same nnz value as CUDA @skipIfRocm @coalescedonoff - @dtypes(torch.double) - @dtypesIfCPU(torch.double, torch.cdouble) + @dtypes(*get_all_complex_dtypes(), + *get_all_fp_dtypes(include_half=False, include_bfloat16=False)) + @dtypesIfCUDA(*(get_all_complex_dtypes() if CUDA11OrLater else ()), + *get_all_fp_dtypes( + include_half=(CUDA11OrLater and SM53OrLater), + include_bfloat16=(CUDA11OrLater and SM80OrLater))) + @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2}) def test_sparse_matmul(self, device, dtype, coalesced): """ This function test `torch.sparse.mm` when both the mat1 and mat2 are sparse tensors. @@ -3328,22 +3337,23 @@ def test_sparse_matmul(sparse_dims, nnz, shape_a, shape_b): r2 = torch.sparse.mm(a, b) self.assertEqual(r1, r2) - a.requires_grad_(True) - b.requires_grad_(True) + if dtype in [torch.double, torch.cdouble]: + a.requires_grad_(True) + b.requires_grad_(True) - # check autograd support on sparse matmul - def fn(D1, D2): - return torch.sparse.mm(D1, D2).to_dense() + # check autograd support on sparse matmul + def fn(D1, D2): + return torch.sparse.mm(D1, D2).to_dense() - if a.is_cuda: - # For cuda, `nondet_tol` is set with `1e-5` - # This is because cuSparse sometimes returns approximate zero values like `~e-323` - # TODO: Check this cuSparse issue. - # This happens when you do chain multiplication `torch.sparse.mm` operations - gradcheck(fn, (a, b), check_sparse_nnz=True, nondet_tol=1e-5) - else: - gradcheck(fn, (a, b), check_sparse_nnz=True) - grad_with_custom_sparsity_pattern_test_helper(sparse_dims, nnz, shape_a, shape_b) + if a.is_cuda: + # For cuda, `nondet_tol` is set with `1e-5` + # This is because cuSparse sometimes returns approximate zero values like `~e-323` + # TODO: Check this cuSparse issue. + # This happens when you do chain multiplication `torch.sparse.mm` operations + gradcheck(fn, (a, b), check_sparse_nnz=True, nondet_tol=1e-5) + else: + gradcheck(fn, (a, b), check_sparse_nnz=True) + grad_with_custom_sparsity_pattern_test_helper(sparse_dims, nnz, shape_a, shape_b) def test_error_cases(): def fn(sparse_dims, nnz, shape_a, shape_b): diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 5d0849bb8407d..36e7f8a178577 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -20,6 +20,7 @@ CUDA9 = torch.version.cuda and torch.version.cuda.startswith('9.') SM53OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (5, 3) SM60OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (6, 0) +SM80OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0) TEST_MAGMA = TEST_CUDA if TEST_CUDA: diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index 558acc24ef3c8..6b60516efe322 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -554,6 +554,7 @@ ), ("device_functions.h", ("hip/device_functions.h", CONV_INCLUDE, API_RUNTIME)), ("driver_types.h", ("hip/driver_types.h", CONV_INCLUDE, API_RUNTIME)), + ("library_types.h", ("hip/library_types.h", CONV_INCLUDE, API_RUNTIME)), ("cuComplex.h", ("hip/hip_complex.h", CONV_INCLUDE, API_RUNTIME)), ("cuda_fp16.h", ("hip/hip_fp16.h", CONV_INCLUDE, API_RUNTIME)), ( @@ -3786,21 +3787,21 @@ ), ), ("cudaDataType_t", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDataType", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_16F", ("hipR16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_16F", ("hipC16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_32F", ("hipR32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_32F", ("hipC32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_64F", ("hipR64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_64F", ("hipC64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_8I", ("hipR8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_8I", ("hipC8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_8U", ("hipR8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_8U", ("hipC8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_32I", ("hipR32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_32I", ("hipC32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_32U", ("hipR32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_32U", ("hipC32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("cudaDataType", ("hipDataType", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_16F", ("HIP_R_16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_16F", ("HIP_C_16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_32F", ("HIP_R_32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_32F", ("HIP_C_32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_64F", ("HIP_R_64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_64F", ("HIP_C_64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_8I", ("HIP_R_8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_8I", ("HIP_C_8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_8U", ("HIP_R_8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_8U", ("HIP_C_8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_32I", ("HIP_R_32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_32I", ("HIP_C_32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_32U", ("HIP_R_32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_32U", ("HIP_C_32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), ( "MAJOR_VERSION", ("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), From a7ae73a2380c3e45394998d2d1d9bceb14f2ee55 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 30 Aug 2021 15:03:40 -0700 Subject: [PATCH 366/530] BUG Fixes regression for nllloss gradcheck (#64203) Summary: Fixes https://github.com/pytorch/pytorch/issues/64163 This PR includes the fix and the opinfo from https://github.com/pytorch/pytorch/pull/63854/ for non-regression testing. cc albanD mruberry jbschlosser Pull Request resolved: https://github.com/pytorch/pytorch/pull/64203 Reviewed By: albanD Differential Revision: D30647522 Pulled By: jbschlosser fbshipit-source-id: 2974d299763505908fa93532aca2bd5d5b71f2e9 --- aten/src/ATen/native/cuda/Loss.cu | 10 ++-- .../_internal/common_methods_invocations.py | 47 ++++++++++++++++++- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/cuda/Loss.cu b/aten/src/ATen/native/cuda/Loss.cu index ac9c3c0d8130f..2087f19dd3486 100644 --- a/aten/src/ATen/native/cuda/Loss.cu +++ b/aten/src/ATen/native/cuda/Loss.cu @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -207,7 +208,7 @@ __global__ void nll_loss_forward_reduce_cuda_kernel_1d( bool size_average, int n_classes, int64_t ignore_index) { - CUDA_KERNEL_ASSERT(threadIdx.x == 0 && threadIdx.y == 0 & threadIdx.z == 0); + CUDA_KERNEL_ASSERT(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0); int t = static_cast(*target); if (t != static_cast(ignore_index)) { @@ -263,7 +264,7 @@ __global__ void nll_loss_forward_reduce_cuda_kernel_2d( *total_weight = static_cast(total_weight_acc); if (size_average && nframe == 0) { // Mean reduction on empty tensors produces NaN - *output = std::numeric_limits::quiet_NaN(); + *output = std::numeric_limits::quiet_NaN(); } else if (size_average && total_weight_acc != 0) { *output = static_cast(output_acc / total_weight_acc); } else { @@ -286,7 +287,7 @@ void nll_loss_forward_out_cuda_template( auto weight_ = weight.defined() ? weight.contiguous() : weight; - if (reduction == Reduction::None & n_dims == 2) { + if (reduction == Reduction::None && n_dims == 2) { output.resize_({batch_size}); if (batch_size == 0) { // This guards from unnecessary operations and launching CUDA kernel with @@ -365,7 +366,8 @@ void nll_loss_forward_out_cuda_template( target.scalar_type(), "nll_loss_forward_reduce_cuda_kernel_2d_index", [&] { - nll_loss_forward_reduce_cuda_kernel_2d + using accscalar_t = at::acc_type; + nll_loss_forward_reduce_cuda_kernel_2d <<<1, NLL_LOSS_THREADS, 0, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 1349a29e9d7fe..52e8d73c6200e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -13,7 +13,7 @@ from torch._six import inf import collections.abc -from typing import Any, Callable, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, List, Optional, Sequence, Tuple, Union, Dict from torch.testing import \ (make_non_contiguous, floating_types, floating_types_and, complex_types, @@ -5221,6 +5221,36 @@ def sample_inputs_grid_sample(op_info, device, dtype, requires_grad, **kwargs): return sample_inputs +def sample_inputs_nll_loss(op_info, device, dtype, requires_grad, **kwargs): + batch_size, num_classes = shape = (2, 3) + + input_shape_and_kwargs: List[Tuple[Tuple[int, ...], Dict[str, Any]]] = [ + ((*shape, 1), dict()), + ((*shape, 1, 2), dict()), + ((*shape, 1, 2, 3), dict()), + (shape, dict(weight=make_tensor((num_classes,), device=device, dtype=dtype).abs())), + (shape, dict(ignore_index=num_classes // 2)), + (shape, dict(reduction="sum")), + (shape, dict(reduction="mean")), + ] + + sample_inputs = [] + for input_shape, kwargs in input_shape_and_kwargs: + input = make_tensor(input_shape, device=device, dtype=dtype, requires_grad=requires_grad) + + target = make_tensor( + (batch_size, *input_shape[2:]), + low=0, + high=num_classes, + device=device, + dtype=torch.long, + requires_grad=requires_grad + ) + + sample_inputs.append(SampleInput(input, args=(target,), kwargs=kwargs)) + + return sample_inputs + foreach_unary_op_db: List[OpInfo] = [ ForeachFuncInfo('exp'), ForeachFuncInfo('acos'), @@ -9044,6 +9074,21 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): SkipInfo('TestReductions', 'test_dim_none_keepdim'), ), ), + OpInfo( + "nn.functional.nll_loss", + ref=_NOTHING, + dtypesIfCPU=floating_types_and(torch.bfloat16), + dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + supports_out=False, + sample_inputs_func=sample_inputs_nll_loss, + skips=( + SkipInfo( + "TestJit", + "test_variant_consistency_jit", + dtypes=(torch.float32,), + ), + ), + ), ] # Common operator groupings From 5401159b8f8988c7dc7489d44e71192bb679cf85 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Mon, 30 Aug 2021 15:58:50 -0700 Subject: [PATCH 367/530] OpInfo for nn.functional.interpolate (#61956) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61956 Each mode goes through a different implementation so they are listed as different variants. Test Plan: - run tests Reviewed By: malfet Differential Revision: D30013751 Pulled By: zou3519 fbshipit-source-id: 4253b40b55667d7486ef2d98b441c13d807ab292 --- .../_internal/common_methods_invocations.py | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 52e8d73c6200e..04db52b2e607b 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -2535,6 +2535,48 @@ def sample_inputs_hardswish(self, device, dtype, requires_grad): requires_grad=requires_grad, low=-5, high=5)) for _ in range(1, N)] return tensors +def sample_inputs_interpolate(mode, self, device, dtype, requires_grad): + N, C = 2, 3 + D = 4 + S = 3 + L = 5 + + align_corners_options: Tuple[Any, ...] = (None,) + if mode in ('linear', 'bilinear', 'bicubic', 'trilinear'): + align_corners_options = (True, False, None) + ranks_for_mode = { + 'nearest': [1, 2, 3], + 'linear': [1], + 'bilinear': [2], + 'bicubic': [2], + 'trilinear': [3], + 'area': [1, 2, 3] + } + + def shape(size, rank, with_batch_channel=True): + if with_batch_channel: + return tuple([N, C] + ([size] * rank)) + return tuple([size] * rank) + + make_arg = partial(make_tensor, device=device, dtype=dtype, + requires_grad=requires_grad, low=-1, high=1) + + sample_inputs = [] + for align_corners in align_corners_options: + for rank in ranks_for_mode[mode]: + sample_inputs.extend([ + SampleInput(make_arg(shape(D, rank)), + args=(shape(S, rank, False), None, mode, align_corners)), + SampleInput(make_arg(shape(D, rank)), + args=(shape(L, rank, False), None, mode, align_corners)), + SampleInput(make_arg(shape(D, rank)), + args=(None, 1.7, mode, align_corners)), + SampleInput(make_arg(shape(D, rank)), + args=(None, 0.6, mode, align_corners)), + ]) + + return sample_inputs + def sample_inputs_gelu(self, device, dtype, requires_grad): N = 5 tensors = [SampleInput(make_tensor((N * 2, N * 2), device=device, dtype=dtype, @@ -7227,6 +7269,78 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypes=floating_types_and(torch.half), sample_inputs_func=sample_inputs_nn_unfold, skips=( + # JIT alias info internal asserts here + SkipInfo('TestJit', 'test_variant_consistency_jit'), + ), + supports_out=False), + OpInfo('nn.functional.interpolate', + aten_name="interpolate", + variant_test_name='nearest', + supports_autograd=True, + dtypesIfCPU=floating_types_and(torch.uint8), + dtypesIfCUDA=floating_types_and(torch.half, torch.uint8), + sample_inputs_func=partial(sample_inputs_interpolate, 'nearest'), + skips=( + # JIT alias info internal asserts here + SkipInfo('TestJit', 'test_variant_consistency_jit'), + ), + supports_out=False), + OpInfo('nn.functional.interpolate', + aten_name="interpolate", + variant_test_name='linear', + supports_autograd=True, + dtypesIfCUDA=floating_types_and(torch.half), + sample_inputs_func=partial(sample_inputs_interpolate, 'linear'), + skips=( + # JIT alias info internal asserts here + SkipInfo('TestJit', 'test_variant_consistency_jit'), + ), + supports_out=False), + OpInfo('nn.functional.interpolate', + aten_name="interpolate", + variant_test_name='bilinear', + supports_autograd=True, + dtypesIfCUDA=floating_types_and(torch.half), + gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, + sample_inputs_func=partial(sample_inputs_interpolate, 'bilinear'), + skips=( + # JIT alias info internal asserts here + SkipInfo('TestJit', 'test_variant_consistency_jit'), + ), + supports_out=False), + OpInfo('nn.functional.interpolate', + aten_name="interpolate", + variant_test_name='bicubic', + supports_autograd=True, + dtypesIfCUDA=floating_types_and(torch.half), + sample_inputs_func=partial(sample_inputs_interpolate, 'bicubic'), + gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, + skips=( + # JIT alias info internal asserts here + SkipInfo('TestJit', 'test_variant_consistency_jit'), + ), + supports_out=False), + OpInfo('nn.functional.interpolate', + aten_name="interpolate", + variant_test_name='trilinear', + supports_autograd=True, + dtypesIfCUDA=floating_types_and(torch.half), + gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, + sample_inputs_func=partial(sample_inputs_interpolate, 'trilinear'), + skips=( + # JIT alias info internal asserts here + SkipInfo('TestJit', 'test_variant_consistency_jit'), + ), + supports_out=False), + OpInfo('nn.functional.interpolate', + aten_name="interpolate", + variant_test_name='area', + supports_autograd=True, + dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16), + sample_inputs_func=partial(sample_inputs_interpolate, 'area'), + gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, + skips=( + # JIT alias info internal asserts here SkipInfo('TestJit', 'test_variant_consistency_jit'), ), supports_out=False), From 1f16c22dc8251f01627ee73ad1ef69bd18e51447 Mon Sep 17 00:00:00 2001 From: Harut Movsisyan Date: Mon, 30 Aug 2021 16:16:45 -0700 Subject: [PATCH 368/530] [Static Runtime] Implement aten::cumsum out variant (#64159) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64159 Test Plan: Confirm out variant is called for both versions: ``` > buck run //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1 ``` Reviewed By: mikeiovine Differential Revision: D30622819 fbshipit-source-id: a2c8c7f969dae5f507718fb3d513e1fb4f026736 --- benchmarks/static_runtime/test_scripts.h | 10 +++++++++ .../static_runtime/test_static_runtime.cc | 22 +++++++++++++++++++ torch/csrc/jit/runtime/static/ops.cpp | 22 +++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 7fdb113c4ed45..e26437fe4a6f9 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -801,3 +801,13 @@ const std::string cat_script = R"IR( %ret: Tensor = aten::cat(%ten_list2, %dim) return (%ret) )IR"; + +const auto cumsum_script = R"JIT( + def forward(self, a: Tensor, dim: int): + return torch.cumsum(a, dim).clone() +)JIT"; + +const auto cumsum_script_dtype = R"JIT( + def forward(self, a: Tensor, dim: int, dtype: int): + return torch.cumsum(a, dim, dtype=dtype).clone() +)JIT"; diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index b7201baa1e182..aa5cd35e38e56 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1311,3 +1311,25 @@ TEST(StaticRuntime, IndividualOps_Cat) { std::vector args1{c, d, 1}; testStaticRuntime(cat_script, args0, args1); } + + +TEST(StaticRuntime, IndividualOps_Cumsum) { + auto a = at::randn({2, 3}); + std::vector args0{a, 0}; + testStaticRuntime(cumsum_script, args0); + + auto b = at::randn({4, 3}); + std::vector args1{b, 1}; + testStaticRuntime(cumsum_script, args0, args1); +} + +TEST(StaticRuntime, IndividualOps_CumsumDtype) { + auto a = at::randn({1, 2}); + auto dtype = at::ScalarType::Float; + std::vector args0{a, 0, dtype}; + testStaticRuntime(cumsum_script_dtype, args0); + + auto b = at::randn({3, 4}); + std::vector args1{b, 1, dtype}; + testStaticRuntime(cumsum_script_dtype, args0, args1); +} diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index cf91f33a28c26..a73872b540258 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -1733,6 +1733,28 @@ REGISTER_OPERATOR_FUNCTOR(aten::cat, aten_cat, [](Node* n) -> SROperator { }; }); +REGISTER_OPERATOR_FUNCTOR(aten::cumsum, aten_cumsum, [](Node* n) -> SROperator { + if (!n->matches(torch::schema( + "aten::cumsum(Tensor self, int dim, ScalarType? dtype=None) -> Tensor"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto& input = p_node->Input(0).toTensor(); + const auto dim = p_node->Input(1).toInt(); + const auto dtype = p_node->Input(2).toOptional(); + + if (p_node->Output(0).isNone()) { + p_node->Output(0) = at::cpu::cumsum(input, dim, dtype); + return; + } + + auto& output = p_node->Output(0).toTensor(); + fastResizeToZero(output); + at::cpu::cumsum_out(output, input, dim, dtype); + }; +}); + namespace { void check_cat_no_zero_dim(const std::vector& tensors) { From ebc0aacf83a0446ed798a96059c05da815c73d3d Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Mon, 30 Aug 2021 18:36:33 -0700 Subject: [PATCH 369/530] [nnc] Fix half2float conversion and re-enable float16 (#64199) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64199 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D30643865 Pulled By: bertmaher fbshipit-source-id: 9de6adca53bd08839328cbaf6364f7de9550264b --- test/test_jit_fuser_te.py | 44 ++++++++++++++++------ test/test_tensorexpr.py | 1 - torch/csrc/jit/passes/tensorexpr_fuser.cpp | 2 +- torch/csrc/jit/tensorexpr/half_support.h | 40 ++++++++++++++++++++ torch/csrc/jit/tensorexpr/ir_verifier.cpp | 12 ++++++ 5 files changed, 86 insertions(+), 13 deletions(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 6d2432aa151f8..918cc702d83d6 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -94,8 +94,7 @@ def setUp(self): torch.bool, ] self.fp_dtypes = [ - # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed - # torch.float16, + torch.float16, torch.float32, torch.float64, ] @@ -1130,8 +1129,7 @@ def foo(x): dtypes = [ torch.bool, torch.int, - # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed - # torch.float16, + torch.float16, torch.float32, torch.float64, ] @@ -1146,6 +1144,9 @@ def forward(self, x): bad_dtypes = [] for dtype, output_dtype, device, size in product(dtypes, dtypes, self.devices, sizes): + # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed + if dtype == torch.float16 and device == "cpu": + continue if dtype == output_dtype: continue @@ -1201,18 +1202,16 @@ def test_isnan(self): torch.int16, torch.int32, torch.int64, - # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed - # torch.float16, + torch.float16, torch.float32, torch.float64, torch.bool, ] for inp, device, dtype in product(inputs, self.devices, dtypes): - # TODO - if dtype == torch.float16 and not LLVM_ENABLED: + # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed + if dtype == torch.float16 and device == "cpu": continue - inp = inp.to(device=device, dtype=dtype) try: f = torch.jit.trace(lambda x: x.isnan(), (inp,)) @@ -1272,6 +1271,9 @@ def apply(fn): gpu_only = {torch.erf, torch.erfc} sizes = [(1,), (2,), (4, 4)] for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes): + # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed + if dtype == torch.float16 and device == "cpu": + continue if op in gpu_only and device == "cpu": continue try: @@ -1323,6 +1325,8 @@ def apply(fn): ] devices = self.devices for dtype, op, device in product(self.dtypes, binary_ops, devices): + if dtype == torch.float16 and device == "cpu": + continue try: x = self.data_for(dtype, device) y = self.data_for(dtype, device) @@ -1373,6 +1377,8 @@ def fn(x, y): "[[10, 3, 4], [4, 5]]", ] for dtype, size, device in product(self.dtypes, sizes, devices): + if dtype == torch.float16 and device == "cpu": + continue try: size_x, size_y = size x = self.data_for(dtype, device, size=size_x) @@ -1417,6 +1423,8 @@ def apply_with_scalar(fn, scalar): # only using scalar values relevant to particular ops scalars = [1.5, 3, 0, -2.0, -1] for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars): + if dtype == torch.float16 and device == "cpu": + continue try: x = self.data_for(dtype, device) fn = apply_with_scalar(op, scalar) @@ -1449,6 +1457,8 @@ def apply_with_scalar(fn, scalar): # only using scalar values relevant to particular ops scalars = [1.5, 3, -2.0, -1] # skip 0 for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars): + if dtype == torch.float16 and device == "cpu": + continue try: x = self.data_for(dtype, device) fn = apply_with_scalar(op, scalar) @@ -1484,6 +1494,8 @@ def apply_with_scalar(fn, scalar): # only using scalar values relevant to particular ops scalars = [1.5, 3, 0, -2.0, -1] for dtype, op, device, scalar in product(dtypes, binary_ops, self.devices, scalars): + if dtype == torch.float16 and device == "cpu": + continue try: x = self.data_for(dtype, device) fn = apply_with_scalar(op, scalar) @@ -1512,6 +1524,8 @@ def apply(fn): ] devices = self.devices for dtype, op, device in product(self.dtypes, ternary_ops, devices): + if dtype == torch.float16 and device == "cpu": + continue try: x = self.data_for(dtype, device) y = self.data_for(dtype, device) @@ -1541,6 +1555,8 @@ def apply(fn): ] devices = self.devices for dtype, op, device in product(self.dtypes, ternary_ops, devices): + if dtype == torch.float16 and device == "cpu": + continue try: x = self.data_for(dtype, device, size=[5, 3, 128, 128]) y = self.data_for(dtype, device, size=[3]) @@ -1572,6 +1588,8 @@ def apply(fn): torch.cat, ] for dtype, op, device in product(self.dtypes, list_ops, devices): + if dtype == torch.float16 and device == "cpu": + continue try: x = self.data_for(dtype, device, size=[5, 4, 1, 7]) y = self.data_for(dtype, device, size=[5, 4, 1, 7]) @@ -1603,6 +1621,8 @@ def apply(fn): ] devices = self.devices for dtype, op, device in product(self.dtypes, ops, devices): + if dtype == torch.float16 and device == "cpu": + continue try: cond = self.data_for(torch.bool, device) x = self.data_for(dtype, device) @@ -1768,7 +1788,10 @@ def test_type_as_cat(self): with inline_fusion_groups(): def eager(x, y): return torch.cat((x, y.type_as(x)), dim=1) - for dtype1, dtype2 in product(self.dtypes, self.dtypes): + dtypes = self.dtypes.copy() + # CPU fuser doesn't support float16. + dtypes.remove(torch.float16) + for dtype1, dtype2 in product(dtypes, dtypes): x = torch.randint(2, (1, 13,)).to(dtype1) zero = torch.tensor([[0]]).to(dtype2) one = torch.tensor([[1]]).to(dtype2) @@ -1936,7 +1959,6 @@ def bn_neither(i, x): for fn in [bn, bn_no_weight, bn_no_bias, bn_neither]: test(fn, (i, x)) - works_list = [ '__radd__', '__rdiv__', diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 47c7e689aa6a4..366c262ad7c1d 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -1222,7 +1222,6 @@ def bias_gelu(bias, y): x = warmup_and_run_forward(traced, a, b) self.assertLastGraphAllFused() - @unittest.skip("float16 is not supported yet.") def test_half_bn_relu(self): devices = ["cuda"] if torch.cuda.is_available() else [] diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 1d5128c7e71e2..a3e37072a032d 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -966,7 +966,7 @@ class TensorExprFuser { // but on top of that Float16 has a few kinks on LLVM. Thus, on CPU we // additionally disable it until we either move to a more stable version // or find workarounds. - if (*st == c10::ScalarType::Half) { + if (*st == c10::ScalarType::Half && *device == c10::kCPU) { return false; } diff --git a/torch/csrc/jit/tensorexpr/half_support.h b/torch/csrc/jit/tensorexpr/half_support.h index eaf74d3c79d82..674af8a764928 100644 --- a/torch/csrc/jit/tensorexpr/half_support.h +++ b/torch/csrc/jit/tensorexpr/half_support.h @@ -128,6 +128,46 @@ class HalfRewriter : public IRMutator { return v; } + template + ExprPtr mutateArithmetic(T v) { + IRMutator::mutate(v); + if (v->dtype().scalar_type() == c10::kHalf) { + v->set_dtype(v->dtype().cloneWithScalarType(c10::kFloat)); + } + return v; + } + + ExprPtr mutate(AddPtr v) override { + return mutateArithmetic(v); + } + ExprPtr mutate(SubPtr v) override { + return mutateArithmetic(v); + } + ExprPtr mutate(MulPtr v) override { + return mutateArithmetic(v); + } + ExprPtr mutate(DivPtr v) override { + return mutateArithmetic(v); + } + ExprPtr mutate(MaxPtr v) override { + return mutateArithmetic(v); + } + ExprPtr mutate(MinPtr v) override { + return mutateArithmetic(v); + } + ExprPtr mutate(CompareSelectPtr v) override { + return mutateArithmetic(v); + } + ExprPtr mutate(BroadcastPtr v) override { + return mutateArithmetic(v); + } + ExprPtr mutate(IfThenElsePtr v) override { + return mutateArithmetic(v); + } + ExprPtr mutate(IntrinsicsPtr v) override { + return mutateArithmetic(v); + } + private: std::unordered_set inserted_half_casts_; std::unordered_map var_map; diff --git a/torch/csrc/jit/tensorexpr/ir_verifier.cpp b/torch/csrc/jit/tensorexpr/ir_verifier.cpp index f7adbdee93992..f31a935291c33 100644 --- a/torch/csrc/jit/tensorexpr/ir_verifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_verifier.cpp @@ -119,7 +119,19 @@ void IRVerifier::visit(IfThenElsePtr v) { } void IRVerifier::visit(IntrinsicsPtr v) { + if (v->op_type() == kIsNan) { + if (v->dtype().scalar_type() != c10::kInt) { + throw malformed_ir("bad dtype in intrinsic arg"); + } + IRVisitor::visit(v); + return; + } // TODO: add a check for OpArgCount and op_type + for (auto const& param : v->params()) { + if (param->dtype() != v->dtype()) { + throw malformed_ir("bad dtype in intrinsic arg"); + } + } IRVisitor::visit(v); } From 4bd03b02424d93b72f15e28c542ede13f88ea929 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Mon, 30 Aug 2021 18:39:50 -0700 Subject: [PATCH 370/530] Add python mode (#63496) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63496 This PR adds a (private) enable_python_mode context manager. (see torch/utils/_python_dispatch.py). enable_python_mode accepts the type of a __torch_dispatch__ object as its argument. Whenever an operator gets called inside of the context manager, it dispatches to the __torch_dispatch__ of the passed-in type. Example usage: ``` with enable_python_mode(LoggingTensor): z = torch.empty([]) assert isinstance(z, LoggingTensor) ``` There are quite a few changes that were made to support this. First, we added TorchDispatchTypeObject, a C++ struct that represents the type of a `__torch_dispatch__` object (e.g. LoggingTensor). It holds both the PyObject* representing the class and a PyInterpreter* so we know which Python interpreter it came from. Next, we updated the concrete_dispatch_fn in python_variable.cpp to accept a `const std::shared_ptr&` argument. When this is null, dispatching happens as usual. When it is non-null, we prepend the TorchDispatchTypeObject's PyObject* to the overloaded args list so that it is considered first for dispatch. To get that to work, we changed how `handle_torch_dispatch_no_python_arg_parser` works. The "overloaded args list" previously only consisted of Tensor PyObjects, but now it can have types in addition to Tensors! - We renamed `append_overloaded_arg` to `append_overloaded_arg` - We added a new `append_overloaded_type` that appends a type to overloaded_args - We added special handling in `handle_torch_dispatch_no_python_arg_parser` and `append_overloaded_arg` to handle types in addition to Tensors. Then, there is PythonMode and PythonModeTLS. - We reuse the DispatchKey::Python dispatch key as a mode key - We use PythonMode::enter and PythonMode::exit to enable/disable DispatchKey::Python and set the PythonModeTLS. - PythonModeTLS stores a TorchDispatchTypeObject as metadata. - PythonMode is in libtorch_python, and PythonModeTLS is in ATen. This split is due to the libtorch_python library boundary (because we need to save TLS in ATen/ThreadLocalState) - We modify the PythonFallbackKernel to look up the relevant TorchDispatchTypeObject (if Python Mode is active) and dispatch using it. There are two more miscellaneous changes: - internal_new_from_data (torch/csrc/utils/tensor_new.cpp) gets an exclude guard. enable_python_mode currently does not handle torch.tensor and the exclude guard is to prevent a bug. Future: - This PR does not allow for the nesting of Python modes. In the future we should be able to enable this with a more sane no_dispatch API and by changing the TLS to a stack. For now I did not need this for CompositeImplicitAutograd testing. Test Plan: - new tests Reviewed By: malfet, albanD Differential Revision: D30543236 Pulled By: zou3519 fbshipit-source-id: ef5444d96a5a957d1657b7e37dce80f9a497d452 --- aten/src/ATen/PythonModeTLS.cpp | 26 +++++++ aten/src/ATen/PythonModeTLS.h | 17 +++++ aten/src/ATen/ThreadLocalState.cpp | 3 + aten/src/ATen/ThreadLocalState.h | 3 + aten/src/ATen/core/PythonFallbackKernel.cpp | 13 +++- c10/core/TensorImpl.cpp | 20 ++++- c10/core/TensorImpl.h | 35 ++++++++- test/run_test.py | 1 + test/test_python_dispatch.py | 81 ++++++++++++++++++++- tools/build_variables.bzl | 2 + torch/_C/__init__.pyi.in | 2 + torch/csrc/autograd/init.cpp | 17 +++++ torch/csrc/autograd/python_mode.cpp | 27 +++++++ torch/csrc/autograd/python_mode.h | 17 +++++ torch/csrc/autograd/python_variable.cpp | 38 ++++++++-- torch/csrc/utils/python_arg_parser.cpp | 39 ++++++++-- torch/csrc/utils/python_arg_parser.h | 11 ++- torch/csrc/utils/tensor_new.cpp | 1 + torch/utils/_python_dispatch.py | 34 +++++++++ 19 files changed, 366 insertions(+), 21 deletions(-) create mode 100644 aten/src/ATen/PythonModeTLS.cpp create mode 100644 aten/src/ATen/PythonModeTLS.h create mode 100644 torch/csrc/autograd/python_mode.cpp create mode 100644 torch/csrc/autograd/python_mode.h create mode 100644 torch/utils/_python_dispatch.py diff --git a/aten/src/ATen/PythonModeTLS.cpp b/aten/src/ATen/PythonModeTLS.cpp new file mode 100644 index 0000000000000..b53043ca84147 --- /dev/null +++ b/aten/src/ATen/PythonModeTLS.cpp @@ -0,0 +1,26 @@ +#include + +namespace at { namespace impl { + +thread_local std::shared_ptr pythonModeState; + +void PythonModeTLS::set_state(const std::shared_ptr& state) { + pythonModeState = state; + if (state) { + c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); + } else { + PythonModeTLS::reset_state(); + } +} + +const std::shared_ptr& PythonModeTLS::get_state() { + return pythonModeState; +} + +void PythonModeTLS::reset_state() { + pythonModeState.reset((TorchDispatchTypeObject*)nullptr); + c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); +} + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/PythonModeTLS.h b/aten/src/ATen/PythonModeTLS.h new file mode 100644 index 0000000000000..be52b182c659b --- /dev/null +++ b/aten/src/ATen/PythonModeTLS.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include +#include + +namespace at { +namespace impl { + +struct TORCH_API PythonModeTLS { + static void set_state(const std::shared_ptr& state); + static const std::shared_ptr& get_state(); + static void reset_state(); +}; + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 98c2519e045ce..19cfa89967ccb 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -17,6 +17,7 @@ ThreadLocalState::ThreadLocalState() saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks(); bumped_record_all_functions_ = at::checkRecordAllFunctions(); + python_mode_state_ = at::impl::PythonModeTLS::get_state(); } void ThreadLocalState::set_grad_mode(bool enabled) { @@ -30,6 +31,8 @@ void ThreadLocalState::setThreadLocalState( // restore the dispatch key set TLS at the same time. c10::AutogradState::set_tls_state(state.autograd_tls_); + at::impl::PythonModeTLS::set_state(state.python_mode_state_); + at::set_record_function_tls_(state.rf_tls_); SavedTensorDefaultHooks::set_hooks( diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index 41146912819b4..c99ca6158ffa5 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -6,6 +6,7 @@ #include #include +#include namespace at { @@ -40,6 +41,8 @@ class TORCH_API ThreadLocalState { // TLS for AutogradModes AutogradState autograd_tls_; + std::shared_ptr python_mode_state_; + // TLS for saved tensors default hooks std::pair saved_tensors_default_hooks_; diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index 276eabfe458c0..8e77d0952ec75 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -1,9 +1,18 @@ #include #include +#include namespace { void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { + // If Python Mode is active, use its PyInterpreter for dispatch + const auto& maybe_python_mode_state = at::impl::PythonModeTLS::get_state(); + if (maybe_python_mode_state) { + maybe_python_mode_state->pyinterpreter()->dispatch(op, stack, maybe_python_mode_state); + return; + } + + // Otherwise, find a PyInterpreter on a Tensor const auto& schema = op.schema(); const auto num_arguments = schema.arguments().size(); // It is safe to dispatch on the very first Tensor with a pyobj_interpreter @@ -15,7 +24,7 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { if (ivalue.isTensor()) { auto* interpreter = ivalue.unsafeToTensorImpl()->pyobj_interpreter(); if (interpreter) { - interpreter->dispatch(op, stack); + interpreter->dispatch(op, stack, nullptr); return; } } else if (ivalue.isTensorList()) { @@ -24,7 +33,7 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { for (const auto& nv : ivalue.toListRef()) { auto* interpreter = nv.unsafeToTensorImpl()->pyobj_interpreter(); if (interpreter) { - interpreter->dispatch(op, stack); + interpreter->dispatch(op, stack, nullptr); return; } } diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index de829c493732d..9a72659711743 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -40,7 +40,8 @@ static c10::intrusive_ptr noop_detach_fn( static void noop_dispatch_fn( const PyInterpreter*, const c10::OperatorHandle& op, - torch::jit::Stack* stack) { + torch::jit::Stack* stack, + const std::shared_ptr& type) { TORCH_INTERNAL_ASSERT( 0, "attempted to dispatch (__torch_dispatch__) an operator on Tensor with nontrivial PyObject after corresponding interpreter died"); @@ -608,6 +609,23 @@ void TensorImpl::copy_tensor_metadata( } } +TorchDispatchTypeObject::TorchDispatchTypeObject( + PyObject* type_object, + c10::impl::PyInterpreter* pyinterpreter) + : data_(type_object), pyinterpreter_(pyinterpreter) {} + +TorchDispatchTypeObject::~TorchDispatchTypeObject() { + pyinterpreter_->decref(data_); +} + +c10::impl::PyInterpreter* TorchDispatchTypeObject::pyinterpreter() const { + return pyinterpreter_; +} + +PyObject* TorchDispatchTypeObject::ptr() const { + return data_; +} + namespace impl { namespace { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 7051e36b35516..d110a17b46590 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -161,6 +161,9 @@ struct C10_API AutogradMetaInterface { virtual ~AutogradMetaInterface(); }; +// forward declared +struct TorchDispatchTypeObject; + namespace impl { // Unfortunately, the definition of AutogradMeta lives in a separate @@ -255,7 +258,8 @@ struct C10_API PyInterpreter { using dispatch_sig = void( const PyInterpreter*, const c10::OperatorHandle&, - torch::jit::Stack* stack); + torch::jit::Stack* stack, + const std::shared_ptr& type); PyInterpreter( name_sig* name_fn, @@ -299,8 +303,9 @@ struct C10_API PyInterpreter { // Invoke the Python boxed fallback dispatch to go back into Python __ubsan_ignore_function__ void dispatch( const c10::OperatorHandle& op, - torch::jit::Stack* stack) const { - return (*dispatch_fn_)(this, op, stack); + torch::jit::Stack* stack, + const std::shared_ptr& type) const { + return (*dispatch_fn_)(this, op, stack, type); } // Disarm this PyInterpreter, making all of its methods noops. @@ -348,6 +353,30 @@ struct C10_API NamedTensorMetaInterface { }; }; +// NOTE [What is TorchDispatchTypeObject?] +// A TorchDispatchTypeObject represents the type of a Tensor subclass that has +// a __torch_dispatch__ classmethod. Concretely, it holds the class as a +// PyObject* and a PyInterpreter* that says which python interpreter the class +// came from. +// +// See NOTE [dispatch_fn's type argument] for more details +struct C10_API TorchDispatchTypeObject { + // Steals a reference to type_object + TorchDispatchTypeObject( + PyObject* type_object, + c10::impl::PyInterpreter* pyinterpreter); + + // Releases the stolen reference to type_object + ~TorchDispatchTypeObject(); + + c10::impl::PyInterpreter* pyinterpreter() const; + PyObject* ptr() const; + + private: + PyObject* data_; + c10::impl::PyInterpreter* pyinterpreter_; +}; + // NOTE [ Version Counter Sharing ] // // Every Tensor has a version counter. Version counters are incremented whenever diff --git a/test/run_test.py b/test/run_test.py index dd95e13de8e36..615aaf912c314 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -103,6 +103,7 @@ "test_optim", "test_functional_optim", "test_pytree", + "test_python_dispatch", "test_mobile_optimizer", "test_set_default_mobile_cpu_allocator", "test_xnnpack_integration", diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index 0f5b6b9cbd70e..e474f1f4783f0 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -1,6 +1,7 @@ import torch from torch.testing._internal.common_utils import TestCase, run_tests from torch.utils._pytree import tree_map +from torch.utils._python_dispatch import enable_python_mode from typing import Iterator, List import logging @@ -50,7 +51,10 @@ def unwrap(e): def wrap(e): return LoggingTensor(e) if isinstance(e, torch.Tensor) else e - rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) + # no_dispatch is only needed if you use enable_python_mode. + # It prevents infinite recursion. + with no_dispatch(): + rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) logging.getLogger("LoggingTensor").info(f"{func.__module__}.{func.__name__}", args, kwargs, rs) return rs @@ -335,6 +339,81 @@ def backward(ctx, grad_output): $5 = torch._ops.aten.mul($4, $0) $6 = torch._ops.aten.add_($1, $5)''') + def test_enable_python_mode_error(self) -> None: + with self.assertRaisesRegex(ValueError, "__torch_dispatch__"): + with enable_python_mode(torch.Tensor): + pass + z = LoggingTensor(torch.empty([])) + with self.assertRaisesRegex(ValueError, "must be the type"): + with enable_python_mode(z): + pass + + def test_enable_python_mode_basic(self) -> None: + with enable_python_mode(LoggingTensor): + z = torch.empty([]) + self.assertTrue(isinstance(z, LoggingTensor)) + + def test_enable_python_mode_unrelated_tensors(self) -> None: + x = torch.randn([]) + y = torch.randn([]) + with enable_python_mode(LoggingTensor): + z = x + y + self.assertTrue(isinstance(z, LoggingTensor)) + + def test_enable_python_mode_subclass_priority(self) -> None: + class ErrorA(RuntimeError): + pass + + class ErrorB(RuntimeError): + pass + + class A(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + raise ErrorA + + class B(A): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + raise ErrorB + + a = A(torch.empty(1)) + b = B(torch.empty(1)) + with self.assertRaises(ErrorA): + a + a + + # B has precedence over A due to the subclass relationship + with self.assertRaises(ErrorB): + with enable_python_mode(A): + b + b + with self.assertRaises(ErrorB): + with enable_python_mode(B): + a + a + with self.assertRaises(ErrorB): + with enable_python_mode(B): + a + b + + def test_enable_python_mode_respects_no_dispatch(self) -> None: + with enable_python_mode(LoggingTensor): + z = torch.ones([2, 3]) + self.assertTrue(isinstance(z, LoggingTensor)) + with no_dispatch(): + expected = torch.ones([2, 3]) + self.assertEqual(z.elem, expected) + + def test_nested_enable_python_mode(self) -> None: + with self.assertRaisesRegex(RuntimeError, "has already been set"): + with enable_python_mode(LoggingTensor): + with enable_python_mode(LoggingTensor): + pass if __name__ == '__main__': run_tests() diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 34846b5d6c7b3..dd89981094d4f 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -666,6 +666,7 @@ libtorch_python_core_sources = [ "torch/csrc/autograd/init.cpp", "torch/csrc/autograd/python_anomaly_mode.cpp", "torch/csrc/autograd/python_saved_variable_hooks.cpp", + "torch/csrc/autograd/python_mode.cpp", "torch/csrc/autograd/python_cpp_function.cpp", "torch/csrc/autograd/python_engine.cpp", "torch/csrc/autograd/python_function.cpp", @@ -793,6 +794,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/ParallelNativeTBB.cpp", "aten/src/ATen/ParallelOpenMP.cpp", "aten/src/ATen/ParallelThreadPoolNative.cpp", + "aten/src/ATen/PythonModeTLS.cpp", "aten/src/ATen/ScalarOps.cpp", "aten/src/ATen/SequenceNumber.cpp", "aten/src/ATen/SparseTensorImpl.cpp", diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 3629150d15090..c847e8deced62 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -652,6 +652,8 @@ def __set_forward_AD_enabled(enabled: _bool) -> None: ... def __is_forward_AD_enabled() -> _bool: ... def _register_default_hooks(pack_hook: Callable, unpack_hook: Callable) -> None: ... def _reset_default_hooks() -> None: ... +def _enter_python_mode(cls: Type) -> None: ... +def _exit_python_mode() -> None: ... class _InferenceMode(object): def __init__(self, mode: _bool) -> None: ... diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 697ca871f83c5..860aaec466218 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -494,6 +495,20 @@ static PyObject * python_exit_dual_level(PyObject* _unused, PyObject* args, PyOb END_HANDLE_TH_ERRORS } +static PyObject * enter_python_mode(PyObject* _unused, PyObject* arg) { + HANDLE_TH_ERRORS + PythonMode::enter(arg); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject * exit_python_mode(PyObject* _unused, PyObject* arg) { + HANDLE_TH_ERRORS + PythonMode::exit(); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + // autograd methods on torch._C static PyMethodDef methods[] = { // NOLINT {"_set_grad_enabled", set_grad_enabled, METH_O, nullptr}, @@ -514,6 +529,8 @@ static PyMethodDef methods[] = { // NOLINT {"is_anomaly_enabled", is_anomaly_mode_enabled, METH_NOARGS, nullptr}, {"_enter_dual_level", python_enter_dual_level, METH_NOARGS, nullptr}, {"_exit_dual_level", castPyCFunctionWithKeywords(python_exit_dual_level), METH_VARARGS | METH_KEYWORDS, nullptr}, + {"_enter_python_mode", enter_python_mode, METH_O, nullptr}, + {"_exit_python_mode", exit_python_mode, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr} }; diff --git a/torch/csrc/autograd/python_mode.cpp b/torch/csrc/autograd/python_mode.cpp new file mode 100644 index 0000000000000..435842631a5bb --- /dev/null +++ b/torch/csrc/autograd/python_mode.cpp @@ -0,0 +1,27 @@ +#include +#include +#include +#include +#include + +namespace torch { namespace autograd { + +void PythonMode::enter(PyObject* type) { + if (at::impl::PythonModeTLS::get_state()) { + TORCH_CHECK( + false, + "python mode has already been set. We do not yet support nested python ", + "mode. Please file us an issue and reset it before setting it again.") + } + // TorchDispatchTypeObject steals a reference, See NOTE [What is TorchDispatchTypeObject?] + Py_INCREF(type); + auto state = std::make_shared(type, getPyInterpreter()); + at::impl::PythonModeTLS::set_state(state); +} + +void PythonMode::exit() { + TORCH_INTERNAL_ASSERT(at::impl::PythonModeTLS::get_state(), "exiting Python Mode but it wasn't set!"); + at::impl::PythonModeTLS::reset_state(); +} + +}} diff --git a/torch/csrc/autograd/python_mode.h b/torch/csrc/autograd/python_mode.h new file mode 100644 index 0000000000000..03da51c1c49e0 --- /dev/null +++ b/torch/csrc/autograd/python_mode.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +namespace torch { namespace autograd { + +struct TORCH_API PythonMode { + // Enter python mode, causing all operators to dispatch to the type's __torch_dispatch__. + // `type` is the type of a Tensor subclass that has __torch_dispatch__. + static void enter(PyObject* type); + + // Exit the current python mode. + static void exit(); +}; + +}} diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 50d6eb9ab7e05..abe90105cde2b 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -32,6 +32,7 @@ #include #include +#include #include @@ -64,7 +65,12 @@ void concrete_decref_fn(const c10::impl::PyInterpreter* self, PyObject* pyobj) { return; pybind11::gil_scoped_acquire gil; - if (Py_REFCNT(pyobj) > 1) { + // Two possibilities: + // 1. We are decref-ing a tensor. Then we must be careful about + // PyObject resurrection (this only applies to Tensors, see THPVariable_clear). + // 2. We are decref-ing some other Python object. We don't do + // PyObject resurrection on non-Tensors, so we just carry on as usual + if (THPVariable_Check(pyobj) && Py_REFCNT(pyobj) > 1) { // It's still alive! This can happen if a weak ref resurrected // the PyObject without flipping ownership. At this point it is // too late to rescue the object, so just stub out the PyObject @@ -82,7 +88,11 @@ void concrete_decref_fn(const c10::impl::PyInterpreter* self, PyObject* pyobj) { }; c10::intrusive_ptr concrete_detach_fn(const c10::impl::PyInterpreter*, const c10::TensorImpl* self); -void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack); +void concrete_dispatch_fn( + const c10::impl::PyInterpreter*, + const c10::OperatorHandle& op, + torch::jit::Stack* stack, + const std::shared_ptr& type); class PyInterpreterHolder { public: @@ -1491,7 +1501,19 @@ bool isPythonTensor(const Tensor& tensor) { return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Python); } -void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack) { +// NOTE [dispatch_fn's type argument] +// `type` is nullable and represents the PythonMode going on. +// Right now we only support a single PythonMode, but in the future we could +// change this to a stack of PythonModes. +// +// If `type` isn't null, then we consider the type for dispatch by prepending +// it to the overloaded_args list. `handle_torch_funciton_no_python_arg_parser` +// is responsible for doing overload resolution. +void concrete_dispatch_fn( + const c10::impl::PyInterpreter*, + const c10::OperatorHandle& op, + torch::jit::Stack* stack, + const std::shared_ptr& type) { const auto& schema = op.schema(); const auto num_returns = schema.returns().size(); @@ -1568,13 +1590,17 @@ void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHa auto args = py::reinterpret_steal(PyTuple_New(positional_default_start)); py::dict kwargs; + if (type) { + append_overloaded_type(&overloaded_args, type->ptr()); + } + // Find overloaded tensors for (int64_t idx = 0; idx < arguments.size(); idx++) { const auto& ivalue = arguments[idx]; if (ivalue.isTensor()) { const auto& tensor = ivalue.toTensor(); if (isPythonTensor(tensor)) { - append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr()); + append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr()); } } else if (ivalue.isList()) { const auto& list = ivalue.toListRef(); @@ -1583,7 +1609,7 @@ void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHa if (nv.isTensor()) { const auto& tensor = nv.toTensor(); if (isPythonTensor(tensor)) { - append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr()); + append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr()); } } } @@ -1633,7 +1659,7 @@ c10::intrusive_ptr concrete_detach_fn(const c10::impl::PyInterpreter Tensor self_t = Tensor(c10::intrusive_ptr::unsafe_reclaim_from_nonowning(const_cast(self))); auto self_p = py::reinterpret_steal(THPVariable_Wrap(self_t)); TORCH_INTERNAL_ASSERT(isPythonTensor(self_t)); - append_overloaded_arg(&overloaded_args, self_p.ptr()); + append_overloaded_tensor(&overloaded_args, self_p.ptr()); auto args = py::reinterpret_steal(PyTuple_New(1)); PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr()); diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index 6115dcdfbe61a..3ee20c055bf94 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -200,12 +200,28 @@ auto handle_torch_function(PyObject* self, const std::string& func_name, PyObjec return ret.release().ptr(); } +// Note: [Overloaded args] +// An overloaded arg may be one of the following: +// - an instance of an object that has a __torch_function__ method +// - an instance of an object that has a __torch_dispatch__ classmethod +// - a class type that has a __torch_dispatch__ classmethod +// +// This function returns the type of the arg (if the arg is an instance), +// otherwise, it returns the arg. +static PyObject* get_type_of_overloaded_arg(PyObject* obj_or_type) { + if (PyType_Check(obj_or_type)) { + return obj_or_type; + } + return (PyObject*)Py_TYPE(obj_or_type); +} + +// See Note: [Overloaded args] for what they hold auto handle_torch_function_no_python_arg_parser(const std::vector &overloaded_args, PyObject* args, PyObject* kwargs, const char* func_name, PyObject* torch_api_function, const char* module_name, const char* torch_function_name) -> PyObject* { // overloaded_args already all have unique types std::vector overloaded_types; overloaded_types.reserve(overloaded_args.size()); for (auto &arg : overloaded_args) { - overloaded_types.push_back(py::reinterpret_borrow((PyObject *) Py_TYPE(arg.ptr()))); + overloaded_types.push_back(py::reinterpret_borrow(get_type_of_overloaded_arg(arg.ptr()))); } py::tuple py_types = py::cast(overloaded_types); py::object ret; @@ -231,7 +247,7 @@ auto handle_torch_function_no_python_arg_parser(const std::vector &o ss << "no implementation found for '" << module_name << "." << func_name << "' on types that implement " << torch_function_name << ": ["; for (auto &arg : overloaded_args) { - ss << arg.ptr()->ob_type->tp_name; + ss << PyObject_Repr(get_type_of_overloaded_arg(arg.ptr())); if (!arg.is(overloaded_args.back())) { ss << ", "; } @@ -328,10 +344,11 @@ auto handle_torch_function_indexing(PyObject* self, PyObject* index, PyObject* v * */ -void append_overloaded_arg(std::vector* overloaded_args, PyObject* obj) { +static void append_overloaded_arg(std::vector* overloaded_args, PyObject* obj, bool obj_is_type) { bool class_not_seen_yet = true; + PyObject* obj_type = obj_is_type ? obj : (PyObject*)Py_TYPE(obj); for (auto &arg : *overloaded_args) { - if (Py_TYPE(obj) == Py_TYPE(arg.ptr())) { + if (obj_type == get_type_of_overloaded_arg(arg.ptr())) { // obj is the same type as another parameter we've seen in a prior // iteration of the loop over parameters so we already have an entry // with the proper __torch_function__ implementation to call, so skip @@ -343,7 +360,7 @@ void append_overloaded_arg(std::vector* overloaded_args, PyObject* o if (class_not_seen_yet) { int arg_index = overloaded_args->size(); for(const auto j : c10::irange(arg_index)) { - if (PyObject_IsInstance(obj, (PyObject*)(Py_TYPE((*overloaded_args)[j].ptr())))) { + if (PyObject_IsSubclass(obj_type, (PyObject*)(get_type_of_overloaded_arg((*overloaded_args)[j].ptr())))) { // obj is a subclass of another object we've seen already so its // __torch_function__ should be called first, therefore we // insert it into overloaded_args before the superclass @@ -358,6 +375,14 @@ void append_overloaded_arg(std::vector* overloaded_args, PyObject* o } } +void append_overloaded_tensor(std::vector* overloaded_args, PyObject* obj) { + append_overloaded_arg(overloaded_args, obj, /*obj_is_type*/false); +} + +void append_overloaded_type(std::vector* overloaded_args, PyObject* obj) { + append_overloaded_arg(overloaded_args, obj, /*obj_is_type*/true); +} + bool is_tensor_and_append_overloaded(PyObject* obj, std::vector* overloaded_args) { if (THPVariable_CheckExact(obj)) { // torch.Tensor instances (not subclasses, except for Parameter) @@ -366,7 +391,7 @@ bool is_tensor_and_append_overloaded(PyObject* obj, std::vector* ove if (check_has_torch_function(obj)) { // tensor subclasses and unrelated objects with __torch_function__ - append_overloaded_arg(overloaded_args, obj); + append_overloaded_tensor(overloaded_args, obj); return true; } else if (THPVariable_Check(obj)) { // tensor subclasses without __torch_function__ @@ -905,7 +930,7 @@ bool FunctionSignature::parse(PyObject* self, PyObject* args, PyObject* kwargs, int i = 0; if (self != nullptr && check_has_torch_function(self)) { - append_overloaded_arg(&this->overloaded_args, self); + append_overloaded_tensor(&this->overloaded_args, self); } for (auto& param : params) { PyObject* obj = nullptr; diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index d132185ccaefb..6a05807e5a314 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -818,6 +818,15 @@ bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector * 'overloaded_args': the vector to append the overloaded args * 'obj': the input tensor that is overloaded */ -void append_overloaded_arg(std::vector* overloaded_args, PyObject* obj); +void append_overloaded_tensor(std::vector* overloaded_args, PyObject* obj); + +/* Given an argument that is definitely a type and is definitely overloaded, + * append it to the overloaded arguments list. Use this only with __torch_dispatch__, + * where we operate on classes that have a __torch_dispatch__ classmethod. + * + * 'overloaded_args': the vector to append the overloaded type + * 'obj': the input class that has a __torch_dispatch__ classmethod. + */ +void append_overloaded_type(std::vector* overloaded_args, PyObject* obj); } // namespace torch diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 17d7acc37640c..25e9a5962614f 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -267,6 +267,7 @@ Tensor internal_new_from_data( { at::AutoDispatchBelowADInplaceOrView guard; // TODO: remove at::tracer::impl::NoTracerDispatchMode tracer_guard; + c10::impl::ExcludeDispatchKeyGuard pythonmode_guard(c10::DispatchKey::Python); // functorch uses FuncTorchDynamicLayerBackMode as a mode key to wrap all // tensors returned from operators in special TensorWrapper tensor extension // The problem with this is that TensorWrapper does not have storage so diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py new file mode 100644 index 0000000000000..a7cfae10c37c1 --- /dev/null +++ b/torch/utils/_python_dispatch.py @@ -0,0 +1,34 @@ +import torch +import contextlib +from typing import Iterator + +# Context manager that causes all pytorch operators to dispatch to the passed-in +# type's __torch_dispatch__ function. +# operation that accepts no tensors but returns a tensor. +# +# enable_python_mode is affected by torch._C._DisableTorchDispatch. +# +# NB: Calling an operator inside __torch_dispatch__ does go through +# __torch_dispatch__ again. Please use _DisableTorchDispatch inside +# __torch_dispatch__ to prevent infinite recursion. +# +# TODO: Limitations and things about enable_python_mode we should fix before exposing it: +# - it currently cannot be nested. This should be simple to implement; we need a +# stack of TorchDispatchTypeObjects and the next bullet point. +# - We need a better user-facing api for torch._C._DisableTorchDispatch that +# is able to selectively disable __torch_dispatch__ of a particular class. +# - It doesn't work with the tensor constructors (torch.tensor, torch.Tensor) +# - Better name (see https://github.com/pytorch/pytorch/pull/63496#discussion_r694091694) +@contextlib.contextmanager +def enable_python_mode(cls) -> Iterator[None]: + if not hasattr(cls, '__torch_dispatch__'): + raise ValueError('The class passed to enable_python_mode ' + 'must have a __torch_dispatch__ classmethod') + if not isinstance(cls, type) or not issubclass(cls, (torch.Tensor,)): + raise ValueError('The argument passed to enable_python_mode ' + 'must be the type of a Tensor subclass') + torch._C._enter_python_mode(cls) + try: + yield + finally: + torch._C._exit_python_mode() From af85bc5ffd1d4ad52e0fed255aa7afe2fdfbc5e2 Mon Sep 17 00:00:00 2001 From: Erjia Guan Date: Mon, 30 Aug 2021 18:41:08 -0700 Subject: [PATCH 371/530] Replace group_by_key by group_by IterDataPipe (#64220) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64220 Remove `ByKeyGrouperIterDataPipe` due to duplicated functionality. Fix a bug in `GrouperIterDataPipe` using the existing test. Test Plan: Imported from OSS Reviewed By: VitalyFedyunin Differential Revision: D30650542 Pulled By: ejguan fbshipit-source-id: 666b4d28282fb4f49f3ff101b8d08be16a50d836 --- test/test_datapipe.py | 22 +++- torch/utils/data/datapipes/iter/__init__.py | 4 +- torch/utils/data/datapipes/iter/grouping.py | 121 +------------------- 3 files changed, 23 insertions(+), 124 deletions(-) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 86e53fa699142..c35698e057c89 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -299,7 +299,7 @@ def _helper(prior_dp, dp, channel_first=False): _helper(cached, datapipe4, channel_first=True) # TODO(VitalyFedyunin): Generates unclosed buffer warning, need to investigate - def test_groupbykey_iterable_datapipe(self): + def test_groupby_iterable_datapipe(self): temp_dir = self.temp_dir.name temp_tarfile_pathname = os.path.join(temp_dir, "test_tar.tar") file_list = [ @@ -316,13 +316,25 @@ def test_groupbykey_iterable_datapipe(self): datapipe1 = dp.iter.FileLister(temp_dir, '*.tar') datapipe2 = dp.iter.FileLoader(datapipe1) datapipe3 = dp.iter.TarArchiveReader(datapipe2) - datapipe4 = dp.iter.ByKeyGrouper(datapipe3, group_size=2) - expected_result = [("a.png", "a.json"), ("c.png", "c.json"), ("b.png", "b.json"), ("d.png", "d.json"), ( - "f.png", "f.json"), ("g.png", "g.json"), ("e.png", "e.json"), ("h.json", "h.txt")] + def group_fn(data): + filepath, _ = data + return os.path.basename(filepath).split(".")[0] + + datapipe4 = dp.iter.Grouper(datapipe3, group_key_fn=group_fn, group_size=2) + + def order_fn(data): + data.sort(key=lambda f: f[0], reverse=True) + return data + + datapipe5 = dp.iter.Mapper(datapipe4, fn=order_fn) # type: ignore[var-annotated] + + expected_result = [ + ("a.png", "a.json"), ("c.png", "c.json"), ("b.png", "b.json"), ("d.png", "d.json"), + ("f.png", "f.json"), ("g.png", "g.json"), ("e.png", "e.json"), ("h.txt", "h.json")] count = 0 - for rec, expected in zip(datapipe4, expected_result): + for rec, expected in zip(datapipe5, expected_result): count = count + 1 self.assertEqual(os.path.basename(rec[0][0]), expected[0]) self.assertEqual(os.path.basename(rec[1][0]), expected[1]) diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py index f302fd3a2b7ea..8478577c8f2aa 100644 --- a/torch/utils/data/datapipes/iter/__init__.py +++ b/torch/utils/data/datapipes/iter/__init__.py @@ -19,7 +19,7 @@ from torch.utils.data.datapipes.iter.grouping import ( BatcherIterDataPipe as Batcher, BucketBatcherIterDataPipe as BucketBatcher, - ByKeyGrouperIterDataPipe as ByKeyGrouper, + GrouperIterDataPipe as Grouper, ) from torch.utils.data.datapipes.iter.httpreader import ( HTTPReaderIterDataPipe as HttpReader, @@ -48,12 +48,12 @@ __all__ = ['Batcher', 'BucketBatcher', - 'ByKeyGrouper', 'Collator', 'Concater', 'FileLister', 'FileLoader', 'Filter', + 'Grouper', 'HttpReader', 'IterableWrapper', 'LineReader', diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py index 5f449489ac756..f47299ce1cf9c 100644 --- a/torch/utils/data/datapipes/iter/grouping.py +++ b/torch/utils/data/datapipes/iter/grouping.py @@ -1,12 +1,10 @@ -import functools -import os import random import warnings from collections import defaultdict from torch.utils.data import IterDataPipe, functional_datapipe, DataChunk -from typing import Any, Callable, Dict, Iterator, List, Optional, Sized, Tuple, TypeVar, DefaultDict +from typing import Any, Callable, DefaultDict, Iterator, List, Optional, Sized, TypeVar T_co = TypeVar('T_co', covariant=True) @@ -225,35 +223,6 @@ def __len__(self) -> int: raise TypeError("{} instance doesn't have valid length".format(type(self).__name__)) -# defaut group key is the file pathname without the extension. -# Assuming the passed in data is a tuple and 1st item is file's pathname. -def default_group_key_fn(dataitem: Tuple[str, Any]): - return os.path.splitext(dataitem[0])[0] - - -def default_sort_data_fn(datalist: List[Tuple[str, Any]]): - txt_ext = ['.json', '.jsn', '.txt', '.text'] - - def cmp_fn(a: Tuple[str, Any], b: Tuple[str, Any]): - a_is_txt = os.path.splitext(a[0])[1] in txt_ext - b_is_txt = os.path.splitext(b[0])[1] in txt_ext - - # if a is txt but b is not, b go front - if a_is_txt and not b_is_txt: - return 1 - # if a is not txt but b is txt, a go front - if not a_is_txt and b_is_txt: - return -1 - # if a and b both are or are not txt, sort in alphabetic order - if a[0] < b[0]: - return -1 - elif a[0] > b[0]: - return 1 - return 0 - - return sorted(datalist, key=functools.cmp_to_key(cmp_fn)) - - @functional_datapipe('groupby') class GrouperIterDataPipe(IterDataPipe): # TODO(VtalyFedyunin): Add inline docs and tests (they are partially available in notebooks) @@ -309,6 +278,9 @@ def __iter__(self): for x in self.datapipe: key = self.group_key_fn(x) + buffer_elements[key].append(x) + buffer_size += 1 + if self.group_size is not None and self.group_size == len(buffer_elements[key]): yield self.wrapper_class(buffer_elements[key]) buffer_size -= len(buffer_elements[key]) @@ -319,92 +291,7 @@ def __iter__(self): if result_to_yield is not None: yield self.wrapper_class(result_to_yield) - buffer_elements[key].append(x) - buffer_size += 1 - while buffer_size: (result_to_yield, buffer_size) = self._remove_biggest_key(buffer_elements, buffer_size) if result_to_yield is not None: yield self.wrapper_class(result_to_yield) - - -@functional_datapipe('group_by_key') -class ByKeyGrouperIterDataPipe(IterDataPipe[list]): - r""" :class:`GroupByKeyIterDataPipe`. - - Iterable datapipe to group data from input iterable by keys which are generated from `group_key_fn`, - yields a list with `group_size` items in it, each item in the list is a tuple of key and data - - args: - datapipe: Iterable datapipe that provides data. (typically str key (eg. pathname) and data stream in tuples) - group_size: the size of group - max_buffer_size: the max size of stream buffer which is used to store not yet grouped but iterated data - group_key_fn: a function which is used to generate group key from the data in the input datapipe - sort_data_fn: a function which is used to sort the grouped data before yielding back - length: a nominal length of the datapipe - """ - datapipe: IterDataPipe[Tuple[str, Any]] - group_size: int - max_buffer_size: int - group_key_fn: Callable - sort_data_fn: Callable - curr_buffer_size: int - stream_buffer: Dict[str, List[Tuple[str, Any]]] - length: int - - def __init__( - self, - datapipe: IterDataPipe[Tuple[str, Any]], - *, - group_size: int, - max_buffer_size: Optional[int] = None, - group_key_fn: Callable = default_group_key_fn, - sort_data_fn: Callable = default_sort_data_fn, - length: int = -1): - super().__init__() - - assert group_size > 0 - self.datapipe = datapipe - self.group_size = group_size - - # default max buffer size is group_size * 10 - self.max_buffer_size = max_buffer_size if max_buffer_size is not None else group_size * 10 - assert self.max_buffer_size >= self.group_size - - self.group_key_fn = group_key_fn # type: ignore[assignment] - self.sort_data_fn = sort_data_fn # type: ignore[assignment] - self.curr_buffer_size = 0 - self.stream_buffer = {} - self.length = length - - def __iter__(self) -> Iterator[list]: - if self.group_size == 1: - for data in self.datapipe: - yield [data] - else: - for data in self.datapipe: - key = self.group_key_fn(data) - if key not in self.stream_buffer: - self.stream_buffer[key] = [] - res = self.stream_buffer[key] - res.append(data) - if len(res) == self.group_size: - yield self.sort_data_fn(res) - del self.stream_buffer[key] - self.curr_buffer_size = self.curr_buffer_size - self.group_size + 1 - else: - if self.curr_buffer_size == self.max_buffer_size: - raise OverflowError( - "stream_buffer is overflow, please adjust the order of data " - "in the input datapipe or increase the buffer size!") - self.curr_buffer_size = self.curr_buffer_size + 1 - - if self.curr_buffer_size > 0: - msg = "Not able to group [{}] with group size {}.".format( - ','.join([v[0] for _, vs in self.stream_buffer.items() for v in vs]), str(self.group_size)) - raise RuntimeError(msg) - - def __len__(self) -> int: - if self.length == -1: - raise TypeError("{} instance doesn't have valid length".format(type(self).__name__)) - return self.length From a49907f984670781a718ef6aa0046709886eae5a Mon Sep 17 00:00:00 2001 From: Erjia Guan Date: Mon, 30 Aug 2021 18:41:08 -0700 Subject: [PATCH 372/530] Modify inline doc for DataPipe (#64221) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64221 List of tasks in this PR - [x] Add inline doc for DataPipe - [x] Improve the inline doc - [x] Expose DataPipe to `datapipes.iter` (`UnBatcher`) Note: `Forker`, `Demux`, `Mux` are exposed in another PR authored by Kevin - [x] Add correct typing to DataPipe - [x] Unify the argument to `datapipe` rather than `source_datapipe` Test Plan: Imported from OSS Reviewed By: VitalyFedyunin Differential Revision: D30650541 Pulled By: ejguan fbshipit-source-id: c09d1b9742b8097d8e645c15947cef80c876877b --- torch/utils/data/datapipes/iter/__init__.py | 2 + torch/utils/data/datapipes/iter/callable.py | 14 ++++--- .../data/datapipes/iter/combinatorics.py | 9 ++-- torch/utils/data/datapipes/iter/combining.py | 8 ++-- torch/utils/data/datapipes/iter/filelister.py | 21 +++++----- torch/utils/data/datapipes/iter/fileloader.py | 5 ++- torch/utils/data/datapipes/iter/grouping.py | 42 +++++++++++++------ torch/utils/data/datapipes/iter/httpreader.py | 12 +++--- torch/utils/data/datapipes/iter/linereader.py | 9 ++-- .../data/datapipes/iter/routeddecoder.py | 6 ++- torch/utils/data/datapipes/iter/selecting.py | 7 ++-- .../utils/data/datapipes/iter/streamreader.py | 14 ++++--- .../data/datapipes/iter/tararchivereader.py | 12 +++--- torch/utils/data/datapipes/iter/utils.py | 7 ++++ .../data/datapipes/iter/ziparchivereader.py | 15 ++++--- 15 files changed, 114 insertions(+), 69 deletions(-) diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py index 8478577c8f2aa..b55bbf6667509 100644 --- a/torch/utils/data/datapipes/iter/__init__.py +++ b/torch/utils/data/datapipes/iter/__init__.py @@ -20,6 +20,7 @@ BatcherIterDataPipe as Batcher, BucketBatcherIterDataPipe as BucketBatcher, GrouperIterDataPipe as Grouper, + UnBatcherIterDataPipe as UnBatcher, ) from torch.utils.data.datapipes.iter.httpreader import ( HTTPReaderIterDataPipe as HttpReader, @@ -63,6 +64,7 @@ 'Shuffler', 'StreamReader', 'TarArchiveReader', + 'UnBatcher', 'ZipArchiveReader', 'Zipper'] diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py index 18f6f17fff156..2c5ca3d024392 100644 --- a/torch/utils/data/datapipes/iter/callable.py +++ b/torch/utils/data/datapipes/iter/callable.py @@ -31,14 +31,15 @@ class MapperIterDataPipe(IterDataPipe[T_co]): Iterable DataPipe to run a function over each item from the source DataPipe. The function can be any regular python function or partial object. Lambda function is not recommended as it is not supported by pickle. - args: + + Args: datapipe: Source Iterable DataPipe fn: Function called over each item fn_args: Positional arguments for `fn` fn_kwargs: Keyword arguments for `fn` - nesting_level: Determines which level the fn gets applied to, by default it applies to the top level (= 0) - This also accepts -1 as input to apply the function to the lowest nesting level. It currently doesn't support - argument < -1. + nesting_level: Determines which level the fn gets applied to, by default it applies to the top level (= 0). + This also accepts -1 as input to apply the function to the lowest nesting level. It currently doesn't support + argument < -1. """ datapipe: IterDataPipe fn: Callable @@ -112,10 +113,11 @@ class CollatorIterDataPipe(MapperIterDataPipe): Iterable DataPipe to collate samples from datapipe to Tensor(s) by `util_.collate.default_collate`, or customized Data Structure by collate_fn. - args: + + Args: datapipe: Iterable DataPipe being collated collate_fn: Customized collate function to collect and combine data or a batch of data. - Default function collates to Tensor(s) based on data type. + Default function collates to Tensor(s) based on data type. fn_args: Positional arguments for `collate_fn` fn_kwargs: Keyword arguments for `collate_fn` diff --git a/torch/utils/data/datapipes/iter/combinatorics.py b/torch/utils/data/datapipes/iter/combinatorics.py index d1a7dd0368221..4d6fac749729d 100644 --- a/torch/utils/data/datapipes/iter/combinatorics.py +++ b/torch/utils/data/datapipes/iter/combinatorics.py @@ -10,10 +10,11 @@ class SamplerIterDataPipe(IterDataPipe[T_co]): r""" :class:`SamplerIterDataPipe`. Iterable DataPipe to generate sample elements. - args: - datapipe: IterDataPipe sampled from + + Args: + datapipe: IterDataPipe to sample from sampler: Sampler class to genereate sample elements from input DataPipe. - Default is :class:`SequentialSampler` for IterDataPipe + Default is :class:`SequentialSampler` for IterDataPipe """ datapipe: IterDataPipe sampler: Sampler @@ -63,7 +64,7 @@ class ShufflerIterDataPipe(IterDataPipe[T_co]): mode (:attr:`num_worker > 0`), `worker_init_fn` is used to set up a random seed for each worker process. - args: + Args: datapipe: The IterDataPipe being shuffled buffer_size: The buffer size for shuffling (default to 10000) unbatch_level: Specifies if it necessary to unbatch source data before diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py index 4b28e0926c42b..879e8be27ff0c 100644 --- a/torch/utils/data/datapipes/iter/combining.py +++ b/torch/utils/data/datapipes/iter/combining.py @@ -11,7 +11,8 @@ class ConcaterIterDataPipe(IterDataPipe): r""" :class:`ConcaterIterDataPipe`. Iterable DataPipe to concatenate multiple Iterable DataPipes. - args: + + Args: datapipes: Iterable DataPipes being concatenated """ datapipes: Tuple[IterDataPipe] @@ -97,12 +98,13 @@ def __iter__(self): @functional_datapipe('zip') class ZipperIterDataPipe(IterDataPipe[Tuple[T_co]]): - r""" :class:`ZipIterDataPipe`. + r""" :class:`ZipperIterDataPipe`. Iterable DataPipe aggregates elements into a tuple from each of the input DataPipe. The output DataPipe is stopped when the shortest input DataPipe is exhausted. - args: + + Args: *datapipes: Iterable DataPipes being aggregated """ datapipes: Tuple[IterDataPipe] diff --git a/torch/utils/data/datapipes/iter/filelister.py b/torch/utils/data/datapipes/iter/filelister.py index 48fdce9f52ef5..aef147d2d2941 100644 --- a/torch/utils/data/datapipes/iter/filelister.py +++ b/torch/utils/data/datapipes/iter/filelister.py @@ -6,11 +6,12 @@ class FileListerIterDataPipe(IterDataPipe[str]): r""" :class:`FileListerIterDataPipe` Iterable DataPipe to load file pathname(s) (path + filename), yield pathname from given disk root dir. - args: - root : root dir - mask : a unix style filter string or string list for filtering file name(s) - abspath : whether to return relative pathname or absolute pathname - length : a nominal length of the datapipe + + Args: + root: Root directory + mask: Unix style filter string or string list for filtering file name(s) + abspath: Whether to return relative pathname or absolute pathname + length: Nominal length of the datapipe """ def __init__( @@ -22,11 +23,11 @@ def __init__( abspath: bool = False, length: int = -1): super().__init__() - self.root : str = root - self.masks : Union[str, List[str]] = masks - self.recursive : bool = recursive - self.abspath : bool = abspath - self.length : int = length + self.root: str = root + self.masks: Union[str, List[str]] = masks + self.recursive: bool = recursive + self.abspath: bool = abspath + self.length: int = length def __iter__(self) -> Iterator[str] : yield from get_file_pathnames_from_root(self.root, self.masks, self.recursive, self.abspath) diff --git a/torch/utils/data/datapipes/iter/fileloader.py b/torch/utils/data/datapipes/iter/fileloader.py index 2b73e4e156b70..7c048fc054378 100644 --- a/torch/utils/data/datapipes/iter/fileloader.py +++ b/torch/utils/data/datapipes/iter/fileloader.py @@ -10,13 +10,14 @@ class FileLoaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]): Iterable Datapipe to load file streams from given pathnames, yield pathname and file stream in a tuple. - args: + + Args: datapipe: Iterable datapipe that provides pathnames mode: An optional string that specifies the mode in which the file is opened by `open()`. It defaults to 'b' which means open for reading in binary mode. Another option is 't' for text mode - length: a nominal length of the datapipe + length: Nominal length of the datapipe Note: The opened file handles will be closed by Python's GC periodly. Users can choose diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py index f47299ce1cf9c..aece256d10650 100644 --- a/torch/utils/data/datapipes/iter/grouping.py +++ b/torch/utils/data/datapipes/iter/grouping.py @@ -30,26 +30,27 @@ def __iter__(self): @functional_datapipe('batch') -class BatcherIterDataPipe(IterDataPipe[DataChunk[T_co]]): +class BatcherIterDataPipe(IterDataPipe[DataChunk]): r""" :class:`BatcherIterDataPipe`. Iterable DataPipe to create mini-batches of data. An outer dimension will be added as `batch_size` if `drop_last` is set to `True`, or `length % batch_size` for the last batch if `drop_last` is set to `False`. - args: + + Args: datapipe: Iterable DataPipe being batched batch_size: The size of each batch drop_last: Option to drop the last batch if it's not full unbatch_level: Specifies if it necessary to unbatch source data before applying new batching rule """ - datapipe: IterDataPipe[T_co] + datapipe: IterDataPipe batch_size: int drop_last: bool length: Optional[int] def __init__(self, - datapipe: IterDataPipe[T_co], + datapipe: IterDataPipe, batch_size: int, drop_last: bool = False, unbatch_level: int = 0, @@ -66,8 +67,8 @@ def __init__(self, self.length = None self.wrapper_class = DataChunk - def __iter__(self) -> Iterator[DataChunk[T_co]]: - batch: List[T_co] = [] + def __iter__(self) -> Iterator[DataChunk]: + batch: List = [] for x in self.datapipe: batch.append(x) if len(batch) == self.batch_size: @@ -96,13 +97,16 @@ class UnBatcherIterDataPipe(IterDataPipe): Iterable DataPipe to undo batching of data. In other words, it flattens the data up to the specified level within a batched DataPipe. - args: + + Args: datapipe: Iterable DataPipe being un-batched unbatch_level: Defaults to `1` (only flattening the top level). If set to `2`, it will flatten the top 2 levels, - and `-1` will flatten the entire DataPipe. + and `-1` will flatten the entire DataPipe. """ - def __init__(self, datapipe, unbatch_level: int = 1): + def __init__(self, + datapipe: IterDataPipe, + unbatch_level: int = 1): self.datapipe = datapipe self.unbatch_level = unbatch_level @@ -143,7 +147,8 @@ class BucketBatcherIterDataPipe(IterDataPipe[DataChunk[T_co]]): Iterable DataPipe to create mini-batches of data from sorted bucket. An outer dimension will be added as `batch_size` if `drop_last` is set to `True`, or `length % batch_size` for the last batch if `drop_last` is set to `False`. - args: + + Args: datapipe: Iterable DataPipe being batched batch_size: The size of each batch drop_last: Option to drop the last batch if it's not full @@ -224,8 +229,21 @@ def __len__(self) -> int: @functional_datapipe('groupby') -class GrouperIterDataPipe(IterDataPipe): - # TODO(VtalyFedyunin): Add inline docs and tests (they are partially available in notebooks) +class GrouperIterDataPipe(IterDataPipe[DataChunk]): + r""":class:`GrouperIterDataPipe`. + + Iterable datapipe to group data from input IterDataPipe by keys which are generated from `group_key_fn`, + and yield a DataChunk with size ranging from `guaranteed_group_size` to `group_size`. + + Args: + datapipe: Iterable datapipe to be grouped + group_key_fn: Function used to generate group key from the data of the source datapipe + buffer_size: The size of buffer for ungrouped data + group_size: The size of each group + unbatch_level: Specifies if it necessary to unbatch source data before grouping + guaranteed_group_size: The guaranteed minimum group size + drop_remaining: Specifies if the group smaller than `guaranteed_group_size` will be dropped from buffer + """ def __init__(self, datapipe: IterDataPipe[T_co], group_key_fn: Callable, diff --git a/torch/utils/data/datapipes/iter/httpreader.py b/torch/utils/data/datapipes/iter/httpreader.py index c663a18cdaab8..747b5d567e4cd 100644 --- a/torch/utils/data/datapipes/iter/httpreader.py +++ b/torch/utils/data/datapipes/iter/httpreader.py @@ -10,16 +10,18 @@ class HTTPReaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]): Iterable DataPipe to load file url(s) (http url(s) pointing to file(s)), yield file url and IO stream in a tuple - args: - timeout : timeout for http request + + Args: + datapipe: Iterable DataPipe providing urls + timeout: Timeout for http request """ - def __init__(self, source_datapipe, timeout=None): - self.source_datapipe = source_datapipe + def __init__(self, datapipe, timeout=None): + self.datapipe = datapipe self.timeout = timeout def __iter__(self): - for furl in self.source_datapipe: + for furl in self.datapipe: try: if self.timeout is None: r = urllib.urlopen(furl) diff --git a/torch/utils/data/datapipes/iter/linereader.py b/torch/utils/data/datapipes/iter/linereader.py index 2b15b93c9c60a..04b992d647b77 100644 --- a/torch/utils/data/datapipes/iter/linereader.py +++ b/torch/utils/data/datapipes/iter/linereader.py @@ -7,12 +7,15 @@ class LineReaderIterDataPipe(IterDataPipe[Tuple[str, str]]): Iterable DataPipe to load file name and stream as source IterDataPipe and yield filename and line(s). + + Args: + datapipe: Iterable DataPipe providing file name and string file stream """ - def __init__(self, source_datapipe): - self.source_datapipe = source_datapipe + def __init__(self, datapipe): + self.datapipe = datapipe def __iter__(self): - for file_name, stream in self.source_datapipe: + for file_name, stream in self.datapipe: for line in stream: yield file_name, line diff --git a/torch/utils/data/datapipes/iter/routeddecoder.py b/torch/utils/data/datapipes/iter/routeddecoder.py index f149c074e63fe..ea47742f8e80b 100644 --- a/torch/utils/data/datapipes/iter/routeddecoder.py +++ b/torch/utils/data/datapipes/iter/routeddecoder.py @@ -6,7 +6,8 @@ Decoder, basichandlers as decoder_basichandlers, imagehandler as decoder_imagehandler, - extension_extract_fn) + extension_extract_fn +) @functional_datapipe('decode') @@ -15,7 +16,8 @@ class RoutedDecoderIterDataPipe(IterDataPipe[Tuple[str, Any]]): Iterable datapipe to decode binary streams from input DataPipe, yield pathname and decoded data in a tuple. - args: + + Args: datapipe: Iterable datapipe that provides pathname and binary stream in tuples handlers: Optional user defined decoder handlers. If None, basic and image decoder handlers will be set as default. If multiple handles are provided, the priority diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py index 83872cebdb53d..a89bfdfb39e5c 100644 --- a/torch/utils/data/datapipes/iter/selecting.py +++ b/torch/utils/data/datapipes/iter/selecting.py @@ -11,15 +11,16 @@ class FilterIterDataPipe(MapperIterDataPipe): r""" :class:`FilterIterDataPipe`. Iterable DataPipe to filter elements from datapipe according to filter_fn. - args: + + Args: datapipe: Iterable DataPipe being filtered filter_fn: Customized function mapping an element to a boolean. fn_args: Positional arguments for `filter_fn` fn_kwargs: Keyword arguments for `filter_fn` drop_empty_batches: By default, drops batch if it is empty after filtering instead of keeping an empty list nesting_level: Determines which level the fn gets applied to, by default it applies to the top level (= 0). - This also accepts -1 as input to apply filtering to the lowest nesting level. It currently doesn't support - argument < -1. + This also accepts -1 as input to apply filtering to the lowest nesting level. + It currently doesn't support argument < -1. """ drop_empty_batches: bool diff --git a/torch/utils/data/datapipes/iter/streamreader.py b/torch/utils/data/datapipes/iter/streamreader.py index f74efe746a759..197fb8e2b3005 100644 --- a/torch/utils/data/datapipes/iter/streamreader.py +++ b/torch/utils/data/datapipes/iter/streamreader.py @@ -7,16 +7,18 @@ class StreamReaderIterDataPipe(IterDataPipe[Tuple[str, bytes]]): Iterable DataPipe to load IO stream with label name, and to yield bytes with label name in a tuple - args: - chunk : bytes to read from stream on each iteration. - If None, stream reads to the EOF. + + Args: + datapipe: Iterable DataPipe provides url and byte stream + chunk: Number of bytes to be read from stream per iteration. + If None, all bytes will be read util the EOF. """ - def __init__(self, source_datapipe, chunk=None): - self.source_datapipe = source_datapipe + def __init__(self, datapipe, chunk=None): + self.datapipe = datapipe self.chunk = chunk def __iter__(self): - for (furl, stream) in self.source_datapipe: + for furl, stream in self.datapipe: while True: d = stream.read(self.chunk) if not d: diff --git a/torch/utils/data/datapipes/iter/tararchivereader.py b/torch/utils/data/datapipes/iter/tararchivereader.py index 9145f5f1dbc11..c34583a4d9420 100644 --- a/torch/utils/data/datapipes/iter/tararchivereader.py +++ b/torch/utils/data/datapipes/iter/tararchivereader.py @@ -12,9 +12,11 @@ class TarArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]): Iterable datapipe to extract tar binary streams from input iterable which contains tuples of pathname and tar binary stream, yields pathname and extracted binary stream in a tuple. - args: + + Args: datapipe: Iterable datapipe that provides pathname and tar binary stream in tuples - mode: File mode used by `tarfile.open` to read file object. Mode has to be a string of the form 'filemode[:compression]' + mode: File mode used by `tarfile.open` to read file object. + Mode has to be a string of the form 'filemode[:compression]' length: a nominal length of the datapipe Note: @@ -24,13 +26,13 @@ class TarArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]): """ def __init__( self, - datapipe : Iterable[Tuple[str, BufferedIOBase]], + datapipe: Iterable[Tuple[str, BufferedIOBase]], mode: str = "r:*", - length : int = -1 + length: int = -1 ): super().__init__() self.datapipe: Iterable[Tuple[str, BufferedIOBase]] = datapipe - self.mode = mode + self.mode: str = mode self.length: int = length def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]: diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py index ee04abc455fba..9ba80e3576f77 100644 --- a/torch/utils/data/datapipes/iter/utils.py +++ b/torch/utils/data/datapipes/iter/utils.py @@ -2,6 +2,13 @@ class IterableWrapperIterDataPipe(IterDataPipe): + r""":class:`IterableWrapperIterDataPipe`. + + Iterable datapipe that wraps an iterable object. + + Args: + iterable: Iterable object to be wrapped into an IterDataPipe + """ def __init__(self, iterable): self.iterable = iterable diff --git a/torch/utils/data/datapipes/iter/ziparchivereader.py b/torch/utils/data/datapipes/iter/ziparchivereader.py index e98bd179760c8..881d00598151a 100644 --- a/torch/utils/data/datapipes/iter/ziparchivereader.py +++ b/torch/utils/data/datapipes/iter/ziparchivereader.py @@ -13,9 +13,10 @@ class ZipArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]): Iterable data pipe to extract zip binary streams from input iterable which contains tuples of pathname and zip binary stream, yields pathname and extracted binary stream in a tuple. - args: + + Args: datapipe: Iterable datapipe that provides pathname and zip binary stream in tuples - length: a nominal length of the datapipe + length: Nominal length of the datapipe Note: The opened file handles will be closed automatically if the default DecoderDataPipe @@ -24,12 +25,11 @@ class ZipArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]): """ def __init__( self, - datapipe : Iterable[Tuple[str, BufferedIOBase]], - length : int = -1): + datapipe: Iterable[Tuple[str, BufferedIOBase]], + length: int = -1): super().__init__() - self.datapipe : Iterable[Tuple[str, BufferedIOBase]] = datapipe - self.length : int = length - + self.datapipe: Iterable[Tuple[str, BufferedIOBase]] = datapipe + self.length: int = length def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]: if not isinstance(self.datapipe, Iterable): @@ -60,7 +60,6 @@ def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]: "Unable to extract files from corrupted zipfile stream {} due to: {}, abort!".format(pathname, e)) raise e - def __len__(self): if self.length == -1: raise TypeError("{} instance doesn't have valid length".format(type(self).__name__)) From ffc2612087be1ab469e5a2cd5a1106bf8ec9e753 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 30 Aug 2021 19:08:45 -0700 Subject: [PATCH 373/530] Add acc_gpu_kernel_with_scalars and port add to use it (#63884) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63884 See https://dev-discuss.pytorch.org/t/cuda-loops-case-study-code-generation-vs-templates/302 for explanation of what's going on here. Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D30545296 Pulled By: ezyang fbshipit-source-id: f0da52153ae63599fe1d57e90e73f50ca2116939 --- .../ATen/native/cuda/BinaryAddSubKernel.cu | 51 +++----------- aten/src/ATen/native/cuda/Loops.cuh | 68 +++++++++++++------ 2 files changed, 58 insertions(+), 61 deletions(-) diff --git a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu index a07fd663581fe..b1c76e119a78a 100644 --- a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu @@ -10,53 +10,20 @@ namespace at { namespace native { -template +template struct AddFunctor { - AddFunctor(accscalar_t a): alpha(a) {} - __device__ __forceinline__ scalar_t operator() (const scalar_t a, const scalar_t b) const { - return a + alpha * b; + AddFunctor(T alpha) : alpha_(alpha) {} + T alpha_; + __device__ __forceinline__ T operator()(T a, T b) const __ubsan_ignore_undefined__ { + return a + b * alpha_; } - private: - accscalar_t alpha; -}; - -template -struct AddScalarFunctor { - static_assert(SCALAR_ARG == 1 || SCALAR_ARG == 2, "SCALAR_ARG must be either 1 or 2"); - AddScalarFunctor(accscalar_t alpha, accscalar_t b): alpha(alpha), b(b) {} - __device__ __forceinline__ scalar_t operator() (const scalar_t a) const { - return static_cast(SCALAR_ARG == 1 ? b + alpha * a : a + alpha * b); - } - private: - accscalar_t alpha; - accscalar_t b; }; void add_kernel_cuda(TensorIteratorBase& iter, const Scalar& alpha_scalar) { - if (!isIntegralType(iter.common_dtype(), /* includeBool */ true) && (iter.is_cpu_scalar(1) || iter.is_cpu_scalar(2))) { - // if common dtype is half the scalar constant can overflow in half precision, and yet the result can - // still be representable in the half dtype. Cast scalar to acc_type to have better accuracy. - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() { - using accscalar_t = at::acc_type; - int scalar_arg = iter.is_cpu_scalar(1) ? 1 : 2; - auto b = iter.scalar_value(scalar_arg); - iter.remove_operand(scalar_arg); - const cuda::OptionalCUDAGuard device_guard(device_of(iter.tensor(1))); - if (scalar_arg == 1) { - AddScalarFunctor f(alpha_scalar.to(), b); - gpu_kernel(iter, f); - } else { - AddScalarFunctor f(alpha_scalar.to(), b); - gpu_kernel(iter, f); - } - }); - } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() { - using accscalar_t = at::acc_type; - AddFunctor f(alpha_scalar.to()); - gpu_kernel_with_scalars(iter, f); - }); - } + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() { + using opmath_t = at::opmath_type; + opmath_gpu_kernel_with_scalars(iter, AddFunctor(alpha_scalar.to())); + }); } static void sub_kernel_cuda(TensorIteratorBase& iter, const Scalar& alpha_scalar) { diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index fde8e86409db7..8849293e20210 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -111,49 +112,64 @@ void gpu_kernel(TensorIteratorBase& iter, const func_t& f) { gpu_kernel_impl(iter, f); } -template +template struct AUnaryFunctor { using traits = function_traits; - using arg1_t = typename traits::template arg<0>::type; - using arg2_t = typename traits::template arg<1>::type; - using return_t = typename traits::result_type; + using opmath_arg1_t = typename traits::template arg<0>::type; __device__ return_t operator()(arg2_t b) const { return f(a, b); } - AUnaryFunctor(func_t f_, arg1_t a_): f(f_), a(a_) {} + // NB: scalar is stored in higher precision! + AUnaryFunctor(func_t f_, opmath_arg1_t a_): f(f_), a(a_) {} private: func_t f; - arg1_t a; + opmath_arg1_t a; }; -template +template struct BUnaryFunctor { using traits = function_traits; - using arg1_t = typename traits::template arg<0>::type; - using arg2_t = typename traits::template arg<1>::type; - using return_t = typename traits::result_type; + using opmath_arg2_t = typename traits::template arg<1>::type; __device__ return_t operator()(arg1_t a) const { return f(a, b); } - BUnaryFunctor(func_t f_, arg2_t b_): f(f_), b(b_) {} + // NB: scalar is stored in higher precision! + BUnaryFunctor(func_t f_, opmath_arg2_t b_): f(f_), b(b_) {} private: func_t f; - arg2_t b; + opmath_arg2_t b; }; -template -void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { +// Though seemingly noop, this inserts casts from arg1_t to func_t's type +// (which may be higher precision), as well as casts to return_t +template +struct BinaryFunctor { + __device__ return_t operator()(arg1_t a, arg2_t b) const { + return f(a, b); + } + BinaryFunctor(func_t f_): f(f_) {} + private: + func_t f; +}; + +// Unlike gpu_kernel_with_scalars, this allows you to pass a func_t which +// accepts inputs at higher precision (typically opmath_t), but then +// ensure that we load from memory at the correct precision (scalar_t) +// to avoid expensive loads. For the whole sordid story see +// https://dev-discuss.pytorch.org/t/cuda-loops-case-study-code-generation-vs-templates/302 +template +void opmath_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); using traits = function_traits; + using opmath_arg1_t = typename traits::template arg<0>::type; + using opmath_arg2_t = typename traits::template arg<1>::type; static_assert( traits::arity == 2, "gpu_kernel_with_scalars only supports two input arguments"); - using arg1_t = typename traits::template arg<0>::type; - using arg2_t = typename traits::template arg<1>::type; if (iter.is_cpu_scalar(1)) { - AUnaryFunctor af(f, iter.scalar_value(1)); + AUnaryFunctor af(f, iter.scalar_value(1)); iter.remove_operand(1); // TODO: When all kernels that use gpu_kernel_with_scalars are // ported to structured, this device guard can be deleted. This @@ -163,14 +179,28 @@ void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { const OptionalDeviceGuard device_guard(device_of(iter.tensor(1))); gpu_kernel(iter, af); } else if (iter.is_cpu_scalar(2)) { - BUnaryFunctor bf(f, iter.scalar_value(2)); + BUnaryFunctor bf(f, iter.scalar_value(2)); iter.remove_operand(2); gpu_kernel(iter, bf); } else { - gpu_kernel(iter, f); + gpu_kernel(iter, BinaryFunctor(f)); } } +// Legacy variant that assumes that func_t has the correct types +// that we expect to load from memory +template +void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { + using traits = function_traits; + static_assert( + traits::arity == 2, + "gpu_kernel_with_scalars only supports two input arguments"); + using arg1_t = typename traits::template arg<0>::type; + using arg2_t = typename traits::template arg<1>::type; + using return_t = typename traits::result_type; + opmath_gpu_kernel_with_scalars(iter, f); +} + namespace { // functions for `gpu_kernel_multiple_outputs`. // check the return type is `thrust::tuple`, not `std::tuple`. From cb7cf823b30dcf623d1bceb76c6e16a899f5dc46 Mon Sep 17 00:00:00 2001 From: CaoE Date: Mon, 30 Aug 2021 19:12:23 -0700 Subject: [PATCH 374/530] add BFloat16 support for fold and unfold on CPU (#62880) Summary: Add BFloat16 support for fold and unfold operators on CPU. Pull Request resolved: https://github.com/pytorch/pytorch/pull/62880 Reviewed By: iramazanli Differential Revision: D30576387 Pulled By: zou3519 fbshipit-source-id: c48f6e56702bfea34448db1b3a1634c49c5d8ec8 --- aten/src/ATen/native/Col2Im.cpp | 2 +- aten/src/ATen/native/Im2Col.cpp | 2 +- test/test_nn.py | 16 ++++++++++++++++ .../_internal/common_methods_invocations.py | 1 + 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/Col2Im.cpp b/aten/src/ATen/native/Col2Im.cpp index e1cc31df60f54..7e11b1bdd5b6f 100644 --- a/aten/src/ATen/native/Col2Im.cpp +++ b/aten/src/ATen/native/Col2Im.cpp @@ -136,7 +136,7 @@ static void col2im_out_cpu_template( output.resize_({batch_size, n_output_plane, output_height, output_width}); output.zero_(); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf, + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "col2im_out_cpu", [&] { Tensor input_n = Tensor(); Tensor output_n = Tensor(); diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp index 0970095a68fa9..586b9612f80f4 100644 --- a/aten/src/ATen/native/Im2Col.cpp +++ b/aten/src/ATen/native/Im2Col.cpp @@ -86,7 +86,7 @@ static void im2col_out_cpu_template( output.resize_({batch_size, n_output_plane, output_length}); output.zero_(); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf, + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "im2col_out_cpu", [&] { Tensor input_n; Tensor output_n; diff --git a/test/test_nn.py b/test/test_nn.py index c6d0e78044126..96321ba183be0 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -17438,14 +17438,30 @@ def test_softshrink_negative(self, device): m(input) def test_fold(self, device): + def test_dtype(fn, input, dtype): + input = input.detach().clone().to(dtype=dtype).requires_grad_(True) + input2 = input.detach().clone().float().requires_grad_(True) + out = fn(input) + out.sum().backward() + out2 = fn(input2) + out2.sum().backward() + self.assertEqual(out.dtype, dtype) + self.assertEqual(input.grad.dtype, dtype) + self.assertEqual(out, out2.to(dtype=dtype), atol=0.05, rtol=0) + self.assertEqual(input.grad, input2.grad.to(dtype=dtype)) + def func(x): return F.fold(x, output_size=(4, 5), kernel_size=(2, 2)) + seeds = (44, 83, 71, 25, 999) for sd in seeds: torch.manual_seed(sd) x = torch.randn(1, 12, 12, device=device, requires_grad=True) gradcheck(func, [x]) gradgradcheck(func, [x]) + if device == 'cpu': + test_dtype(func, x, torch.bfloat16) + def test_logsigmoid_out(self, device): # this isn't actually documented, but was broken previously: diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 04db52b2e607b..e7d93807511a5 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -7267,6 +7267,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): OpInfo('nn.functional.unfold', aten_name='im2col', dtypes=floating_types_and(torch.half), + dtypesIfCPU=floating_types_and(torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_nn_unfold, skips=( # JIT alias info internal asserts here From c7c711bfb88fcb0ef573125a5a8655c49156055b Mon Sep 17 00:00:00 2001 From: Samantha Andow Date: Mon, 30 Aug 2021 19:15:16 -0700 Subject: [PATCH 375/530] Add optional tensor arguments to (#63967) Summary: Fixes https://github.com/pytorch/pytorch/issues/63435 Adds optional tensor arguments to check handling torch function checks. The only one I didn't do this for in the functional file was `multi_head_attention_forward` since that already took care of some optional tensor arguments but not others so it seemed like arguments were specifically chosen Pull Request resolved: https://github.com/pytorch/pytorch/pull/63967 Reviewed By: albanD Differential Revision: D30640441 Pulled By: ezyang fbshipit-source-id: 5ef9554d2fb6c14779f8f45542ab435fb49e5d0f --- torch/nn/functional.py | 72 +++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index c11e261d9b85f..4b0449c8f5672 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -442,10 +442,10 @@ def fractional_max_pool2d_with_indices( .. _Fractional MaxPooling: http://arxiv.org/abs/1412.6071 """ - if has_torch_function_unary(input): + if has_torch_function_variadic(input, _random_samples): return handle_torch_function( fractional_max_pool2d_with_indices, - (input,), + (input, _random_samples), input, kernel_size, output_size=output_size, @@ -473,10 +473,10 @@ def _fractional_max_pool2d( return_indices: bool = False, _random_samples: Optional[Tensor] = None ) -> Tensor: - if has_torch_function_unary(input): + if has_torch_function_variadic(input, _random_samples): return handle_torch_function( fractional_max_pool2d, - (input,), + (input, _random_samples), input, kernel_size, output_size=output_size, @@ -537,10 +537,10 @@ def fractional_max_pool3d_with_indices( .. _Fractional MaxPooling: http://arxiv.org/abs/1412.6071 """ - if has_torch_function_unary(input): + if has_torch_function_variadic(input, _random_samples): return handle_torch_function( fractional_max_pool3d_with_indices, - (input,), + (input, _random_samples), input, kernel_size, output_size=output_size, @@ -571,10 +571,10 @@ def _fractional_max_pool3d( return_indices: bool = False, _random_samples: Optional[Tensor] = None ) -> Tensor: - if has_torch_function_unary(input): + if has_torch_function_variadic(input, _random_samples): return handle_torch_function( fractional_max_pool3d, - (input,), + (input, _random_samples), input, kernel_size, output_size=output_size, @@ -1843,8 +1843,8 @@ def linear(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tens - Bias: :math:`(out\_features)` - Output: :math:`(N, *, out\_features)` """ - if has_torch_function_variadic(input, weight): - return handle_torch_function(linear, (input, weight), input, weight, bias=bias) + if has_torch_function_variadic(input, weight, bias): + return handle_torch_function(linear, (input, weight, bias), input, weight, bias=bias) return torch._C._nn.linear(input, weight, bias) @@ -1865,10 +1865,10 @@ def bilinear(input1: Tensor, input2: Tensor, weight: Tensor, bias: Optional[Tens - output: :math:`(N, *, H_{out})` where :math:`H_{out}=\text{out\_features}` and all but the last dimension are the same shape as the input. """ - if has_torch_function_variadic(input1, input2, weight): + if has_torch_function_variadic(input1, input2, weight, bias): return handle_torch_function( bilinear, - (input1, input2, weight), + (input1, input2, weight, bias), input1, input2, weight, bias=bias ) @@ -2135,10 +2135,10 @@ def embedding_bag( tensor([[ 0.0000, 0.0000, 0.0000], [-0.7082, 3.2145, -2.6251]]) """ - if has_torch_function_variadic(input, weight): + if has_torch_function_variadic(input, weight, offsets, per_sample_weights): return handle_torch_function( embedding_bag, - (input, weight), + (input, weight, offsets, per_sample_weights), input, weight, offsets=offsets, @@ -2263,10 +2263,10 @@ def batch_norm( See :class:`~torch.nn.BatchNorm1d`, :class:`~torch.nn.BatchNorm2d`, :class:`~torch.nn.BatchNorm3d` for details. """ - if has_torch_function_unary(input): + if has_torch_function_variadic(input, running_mean, running_var, weight, bias): return handle_torch_function( batch_norm, - (input,), + (input, running_mean, running_var, weight, bias), input, running_mean, running_var, @@ -2309,10 +2309,10 @@ def instance_norm( See :class:`~torch.nn.InstanceNorm1d`, :class:`~torch.nn.InstanceNorm2d`, :class:`~torch.nn.InstanceNorm3d` for details. """ - if has_torch_function_unary(input): + if has_torch_function_variadic(input, running_mean, running_var, weight, bias): return handle_torch_function( instance_norm, - (input,), + (input, running_mean, running_var, weight, bias), input, running_mean=running_mean, running_var=running_var, @@ -2340,9 +2340,9 @@ def layer_norm( See :class:`~torch.nn.LayerNorm` for details. """ - if has_torch_function_unary(input): + if has_torch_function_variadic(input, weight, bias): return handle_torch_function( - layer_norm, (input,), input, normalized_shape, weight=weight, bias=bias, eps=eps + layer_norm, (input, weight, bias), input, normalized_shape, weight=weight, bias=bias, eps=eps ) return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) @@ -2354,8 +2354,8 @@ def group_norm( See :class:`~torch.nn.GroupNorm` for details. """ - if has_torch_function_unary(input): - return handle_torch_function(group_norm, (input,), input, num_groups, weight=weight, bias=bias, eps=eps) + if has_torch_function_variadic(input, weight, bias): + return handle_torch_function(group_norm, (input, weight, bias,), input, num_groups, weight=weight, bias=bias, eps=eps) _verify_batch_size([input.size(0) * input.size(1) // num_groups, num_groups] + list(input.size()[2:])) return torch.group_norm(input, num_groups, weight, bias, eps, torch.backends.cudnn.enabled) @@ -2515,10 +2515,10 @@ def nll_loss( >>> output = F.nll_loss(F.log_softmax(input), target) >>> output.backward() """ - if has_torch_function_variadic(input, target): + if has_torch_function_variadic(input, target, weight): return handle_torch_function( nll_loss, - (input, target), + (input, target, weight), input, target, weight=weight, @@ -2828,10 +2828,10 @@ def cross_entropy( >>> loss = F.cross_entropy(input, target) >>> loss.backward() """ - if has_torch_function_variadic(input, target): + if has_torch_function_variadic(input, target, weight): return handle_torch_function( cross_entropy, - (input, target), + (input, target, weight), input, target, weight=weight, @@ -2887,10 +2887,10 @@ def binary_cross_entropy( >>> loss = F.binary_cross_entropy(F.sigmoid(input), target) >>> loss.backward() """ - if has_torch_function_variadic(input, target): + if has_torch_function_variadic(input, target, weight): return handle_torch_function( binary_cross_entropy, - (input, target), + (input, target, weight), input, target, weight=weight, @@ -2959,10 +2959,10 @@ def binary_cross_entropy_with_logits( >>> loss = F.binary_cross_entropy_with_logits(input, target) >>> loss.backward() """ - if has_torch_function_variadic(input, target): + if has_torch_function_variadic(input, target, weight, pos_weight): return handle_torch_function( binary_cross_entropy_with_logits, - (input, target), + (input, target, weight, pos_weight), input, target, weight=weight, @@ -3243,10 +3243,10 @@ def multilabel_soft_margin_loss( See :class:`~torch.nn.MultiLabelSoftMarginLoss` for details. """ - if has_torch_function_variadic(input, target): + if has_torch_function_variadic(input, target, weight): return handle_torch_function( multilabel_soft_margin_loss, - (input, target), + (input, target, weight), input, target, weight=weight, @@ -3323,10 +3323,10 @@ def multi_margin_loss( See :class:`~torch.nn.MultiMarginLoss` for details. """ - if has_torch_function_variadic(input, target): + if has_torch_function_variadic(input, target, weight): return handle_torch_function( multi_margin_loss, - (input, target), + (input, target, weight), input, target, p=p, @@ -4443,8 +4443,8 @@ def normalize(input: Tensor, p: float = 2.0, dim: int = 1, eps: float = 1e-12, o out (Tensor, optional): the output tensor. If :attr:`out` is used, this operation won't be differentiable. """ - if has_torch_function_unary(input): - return handle_torch_function(normalize, (input,), input, p=p, dim=dim, eps=eps, out=out) + if has_torch_function_variadic(input, out): + return handle_torch_function(normalize, (input, out), input, p=p, dim=dim, eps=eps, out=out) if out is None: denom = input.norm(p, dim, keepdim=True).clamp_min(eps).expand_as(input) return input / denom From 6b85c99ce562cf81749e5efc49bd835041e43f92 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Mon, 30 Aug 2021 19:17:21 -0700 Subject: [PATCH 376/530] Avoid an unnecessary list creation in `DataChunk` (#64111) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64111 Reviewed By: mruberry Differential Revision: D30639383 Pulled By: ezyang fbshipit-source-id: 96b243307413c99a67d55d862a71937e1ef210f4 --- torch/utils/data/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py index 7a069d61de6cc..609e1a1eb6e2d 100644 --- a/torch/utils/data/dataset.py +++ b/torch/utils/data/dataset.py @@ -31,7 +31,7 @@ def __init__(self, items): self.items = items def as_str(self, indent=''): - res = indent + "[" + ", ".join([str(i) for i in iter(self)]) + "]" + res = indent + "[" + ", ".join(str(i) for i in iter(self)) + "]" return res def __iter__(self) -> Iterator[T]: From 93f1090267df7e3023017b83f885edbc59e48913 Mon Sep 17 00:00:00 2001 From: oleshp <31859680+oleshp@users.noreply.github.com> Date: Mon, 30 Aug 2021 19:22:05 -0700 Subject: [PATCH 377/530] Update contribution_guide.rst (#64142) Summary: Grammatical update. Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/64142 Reviewed By: mruberry Differential Revision: D30639394 Pulled By: ezyang fbshipit-source-id: cf1a4dfbd8e34b0772f1b09f5d820278e8ef8574 --- docs/source/community/contribution_guide.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/community/contribution_guide.rst b/docs/source/community/contribution_guide.rst index 166aa7526e731..7cba558dbdb54 100644 --- a/docs/source/community/contribution_guide.rst +++ b/docs/source/community/contribution_guide.rst @@ -200,8 +200,8 @@ Triaging issues ~~~~~~~~~~~~~~~ If you feel that an issue could benefit from a particular tag or level -of complexity comment on the issue and share your opinion. If you -feel an issue isn't categorized properly comment and let the team know. +of complexity, comment on the issue and share your opinion. If you +feel an issue isn't categorized properly, comment and let the team know. About open source development ----------------------------- From 09dfaa0339c944d1c4b64193e0962972affbe0c7 Mon Sep 17 00:00:00 2001 From: leslie-fang-intel Date: Mon, 30 Aug 2021 19:28:59 -0700 Subject: [PATCH 378/530] add operation list for AutocastCPU (#63534) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63534 In this PR: * We have changed the default dtype of `AutocastCPU` from `float16` to `bfloat16` as discussed here `https://github.com/pytorch/pytorch/pull/61002` * We also update the operation list which needs casting to `lower_precision_fp` or `float32`. Test Plan: Imported from OSS Reviewed By: zou3519 Differential Revision: D30644914 Pulled By: ezyang fbshipit-source-id: 8b93485ba452b3759611e3f0ac88e920fe495ac1 --- aten/src/ATen/autocast_mode.cpp | 300 ++++++++++++++++-- test/run_test.py | 1 + torch/cpu/amp/autocast_mode.py | 2 +- .../testing/_internal/autocast_test_lists.py | 14 +- 4 files changed, 277 insertions(+), 40 deletions(-) diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 1ac5ad1c88ba6..9f5f486bb7581 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -461,22 +461,22 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(conv1d), "conv1d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp) KERNEL_CPU(ADD_NS(conv2d), "conv2d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp) KERNEL_CPU(ADD_NS(conv3d), "conv3d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp) - KERNEL_CPU(ADD_NS(_log_softmax), "_log_softmax", Tensor (const Tensor &, int64_t, bool), lower_precision_fp) KERNEL_CPU(ADD_NS(bmm), "bmm", Tensor (const Tensor &, const Tensor &), lower_precision_fp) KERNEL_CPU(ADD_NS(mm), "mm", Tensor (const Tensor &, const Tensor &), lower_precision_fp) KERNEL_CPU(ADD_NS(baddbmm), "baddbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp) KERNEL_CPU(ADD_NS(addmm), "addmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp) KERNEL_CPU(ADD_NS(addbmm), "addbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp) KERNEL_CPU(ADD_NS(linear), "linear", Tensor (const Tensor &, const Tensor &, const c10::optional &), lower_precision_fp) + KERNEL_CPU(ADD_NS(_convolution), "_convolution.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool), lower_precision_fp) // fp32 cast policy + KERNEL_CPU(ADD_NS(conv_transpose1d), "conv_transpose1d", Tensor (const Tensor &, const Tensor &, const c10::optional &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32) + KERNEL_CPU(ADD_NS(conv_transpose2d), "conv_transpose2d.input", Tensor (const Tensor &, const Tensor &, const c10::optional &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(conv_transpose3d), "conv_transpose3d.input", Tensor (const Tensor &, const Tensor &, const c10::optional &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(batch_norm), "batch_norm", Tensor (const Tensor &, const c10::optional &, const c10::optional &, const c10::optional &, const c10::optional &, bool, double, double, bool), fp32) - KERNEL_CPU(ADD_NS(max_pool2d), "max_pool2d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, bool), fp32) - KERNEL_CPU(ADD_NS(adaptive_avg_pool2d), "adaptive_avg_pool2d", Tensor (const Tensor &, IntArrayRef), fp32) - KERNEL_CPU(ADD_NS(convolution), "convolution", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t), fp32) KERNEL_CPU(ADD_NS(dropout), "dropout", Tensor (const Tensor &, double, bool), fp32) + KERNEL_CPU(ADD_NS(avg_pool1d), "avg_pool1d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool), fp32) KERNEL_CPU(ADD_NS(avg_pool2d), "avg_pool2d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional), fp32) KERNEL_CPU(ADD_NS(avg_pool3d), "avg_pool3d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional), fp32) KERNEL_CPU(ADD_NS(gelu), "gelu", Tensor (const Tensor &), fp32) @@ -492,45 +492,285 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(upsample_bilinear2d), "upsample_bilinear2d.vec", Tensor (const Tensor &, c10::optional, bool, c10::optional>), fp32) KERNEL_CPU(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional, c10::optional, c10::optional), fp32) KERNEL_CPU(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d.vec", Tensor (const Tensor &, c10::optional, bool, c10::optional>), fp32) + KERNEL_CPU(ADD_NS(binary_cross_entropy), "binary_cross_entropy", Tensor (const Tensor &, const Tensor &, const c10::optional&, int64_t), fp32) KERNEL_CPU(ADD_NS(binary_cross_entropy_with_logits), "binary_cross_entropy_with_logits", Tensor (const Tensor &, const Tensor &, const c10::optional&, const c10::optional&, int64_t), fp32) - KERNEL_CPU(ADD_NS(pow), "pow.Tensor_Scalar", Tensor (const Tensor &, const Scalar &), fp32) - KERNEL_CPU(ADD_NS(pow), "pow.Tensor_Tensor", Tensor (const Tensor &, const Tensor &), fp32) - KERNEL_CPU(ADD_NS(pow), "pow.Scalar", Tensor (const Scalar&, const Tensor &), fp32) - KERNEL_CPU(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t, double), fp32) - KERNEL_CPU(ADD_NS(reflection_pad1d), "reflection_pad1d", Tensor (const Tensor &, IntArrayRef), fp32) - KERNEL_CPU(ADD_NS(std), "std", Tensor (const Tensor &, bool), fp32) - KERNEL_CPU(ADD_NS(std), "std.dim", Tensor (const Tensor &, IntArrayRef, bool, bool), fp32) KERNEL_CPU(ADD_NS(instance_norm), "instance_norm", Tensor (const Tensor &, const c10::optional&, const c10::optional&, const c10::optional&, const c10::optional&, bool, double, double, bool), fp32) + KERNEL_CPU(ADD_NS(grid_sampler), "grid_sampler", Tensor(const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) + KERNEL_CPU(ADD_NS(polar), "polar", Tensor(const Tensor &, const Tensor &), fp32) + KERNEL_CPU(ADD_NS(multinomial), "multinomial", Tensor(const Tensor &, int64_t, bool, c10::optional), fp32) + KERNEL_CPU(ADD_NS(poisson), "poisson", Tensor(const Tensor &, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fmod), "fmod.Tensor", Tensor(const Tensor &, const Tensor &), fp32) + KERNEL_CPU(ADD_NS(fmod), "fmod.Scalar", Tensor(const Tensor &, const Scalar &), fp32) + KERNEL_CPU(ADD_NS(prod), "prod", Tensor(const Tensor &, c10::optional), fp32) + KERNEL_CPU(ADD_NS(prod), "prod.dim_int", Tensor(const Tensor &, int64_t, bool, c10::optional), fp32) + KERNEL_CPU(ADD_NS(prod), "prod.dim_Dimname", Tensor(const Tensor &, at::Dimname, bool, c10::optional), fp32) + KERNEL_CPU(ADD_NS(quantile), "quantile", Tensor(const Tensor &, const Tensor &, c10::optional, bool), fp32) + KERNEL_CPU(ADD_NS(quantile), "quantile.scalar", Tensor(const Tensor &, double, c10::optional, bool), fp32) + KERNEL_CPU(ADD_NS(quantile), "quantile.new", Tensor(const Tensor &, const Tensor &, c10::optional, bool, c10::string_view), fp32) + KERNEL_CPU(ADD_NS(quantile), "quantile.new_scalar", Tensor(const Tensor &, double, c10::optional, bool, c10::string_view), fp32) + KERNEL_CPU(ADD_NS(nanquantile), "nanquantile", Tensor(const Tensor &, const Tensor &, c10::optional, bool), fp32) + KERNEL_CPU(ADD_NS(nanquantile), "nanquantile.scalar", Tensor(const Tensor &, double, c10::optional, bool), fp32) + KERNEL_CPU(ADD_NS(nanquantile), "nanquantile.new", Tensor(const Tensor &, const Tensor &, c10::optional, bool, c10::string_view), fp32) + KERNEL_CPU(ADD_NS(nanquantile), "nanquantile.new_scalar", Tensor(const Tensor &, double, c10::optional, bool, c10::string_view), fp32) + KERNEL_CPU(ADD_NS(stft), "stft", Tensor(const Tensor &, int64_t, c10::optional, c10::optional, const c10::optional &, bool, c10::optional, c10::optional), fp32) + KERNEL_CPU(ADD_NS(cdist), "cdist", Tensor(const Tensor &, const Tensor &, double, c10::optional), fp32) + KERNEL_CPU(ADD_NS(cross), "cross", Tensor(const Tensor &, const Tensor &, c10::optional), fp32) + KERNEL_CPU(ADD_NS(cumprod), "cumprod", Tensor(const Tensor &, int64_t, c10::optional), fp32) + KERNEL_CPU(ADD_NS(cumprod), "cumprod.dimname", Tensor(const Tensor &, at::Dimname, c10::optional), fp32) + KERNEL_CPU(ADD_NS(cumsum), "cumsum", Tensor(const Tensor &, int64_t, c10::optional), fp32) + KERNEL_CPU(ADD_NS(cumsum), "cumsum.dimname", Tensor(const Tensor &, at::Dimname, c10::optional), fp32) + KERNEL_CPU(ADD_NS(diag), "diag", Tensor(const Tensor &, int64_t), fp32) + KERNEL_CPU(ADD_NS(diagflat), "diagflat", Tensor(const Tensor &, int64_t), fp32) + KERNEL_CPU(ADD_NS(histc), "histc", Tensor(const Tensor &, int64_t, const at::Scalar &, const at::Scalar &), fp32) + KERNEL_CPU(ADD_NS(logcumsumexp), "logcumsumexp", Tensor(const Tensor &, int64_t), fp32) + KERNEL_CPU(ADD_NS(searchsorted), "searchsorted.Tensor", Tensor(const Tensor &, const Tensor &, bool, bool), fp32) + KERNEL_CPU(ADD_NS(searchsorted), "searchsorted.Scalar", Tensor(const Tensor &, const at::Scalar &, bool, bool), fp32) + KERNEL_CPU(ADD_NS(trace), "trace", Tensor(const Tensor &), fp32) + KERNEL_CPU(ADD_NS(tril), "tril", Tensor(const Tensor &, int64_t), fp32) + KERNEL_CPU(ADD_NS(triu), "triu", Tensor(const Tensor &, int64_t), fp32) + KERNEL_CPU(ADD_NS(vander), "vander", Tensor(const Tensor &, c10::optional, bool), fp32) + KERNEL_CPU(ADD_NS(view_as_complex), "view_as_complex", Tensor(const Tensor &), fp32) + KERNEL_CPU(ADD_NS(cholesky), "cholesky", Tensor(const Tensor &, bool), fp32) + KERNEL_CPU(ADD_NS(cholesky_inverse), "cholesky_inverse", Tensor(const Tensor &, bool), fp32) + KERNEL_CPU(ADD_NS(cholesky_solve), "cholesky_solve", Tensor(const Tensor &, const Tensor &, bool), fp32) + KERNEL_CPU(ADD_NS(dot), "dot", Tensor(const Tensor &, const Tensor &), fp32) + KERNEL_CPU(ADD_NS(inverse), "inverse", Tensor(const Tensor &), fp32) + KERNEL_CPU(ADD_NS(lu_solve), "lu_solve", Tensor(const Tensor &, const Tensor &, const Tensor &), fp32) + KERNEL_CPU(ADD_NS(matrix_rank), "matrix_rank", Tensor(const Tensor &, bool), fp32) + KERNEL_CPU(ADD_NS(orgqr), "orgqr", Tensor(const Tensor &, const Tensor &), fp32) + KERNEL_CPU(ADD_NS(ormqr), "ormqr", Tensor(const Tensor &, const Tensor &, const Tensor &, bool, bool), fp32) + KERNEL_CPU(ADD_NS(pinverse), "pinverse", Tensor(const Tensor &, double), fp32) + KERNEL_CPU(ADD_NS(vdot), "vdot", Tensor(const Tensor &, const Tensor &), fp32) + KERNEL_CPU(ADD_NS(im2col), "im2col", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef), fp32) + KERNEL_CPU(ADD_NS(col2im), "col2im", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef), fp32) + KERNEL_CPU(ADD_NS(max_pool3d), "max_pool3d", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, bool), fp32) + KERNEL_CPU(ADD_NS(max_unpool2d), "max_unpool2d", Tensor(const Tensor &, const Tensor &, IntArrayRef), fp32) + KERNEL_CPU(ADD_NS(max_unpool3d), "max_unpool3d", Tensor(const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef), fp32) + KERNEL_CPU(ADD_NS(adaptive_avg_pool3d), "adaptive_avg_pool3d", Tensor(const Tensor &, IntArrayRef), fp32) + KERNEL_CPU(ADD_NS(reflection_pad1d), "reflection_pad1d", Tensor(const Tensor &, IntArrayRef), fp32) + KERNEL_CPU(ADD_NS(reflection_pad2d), "reflection_pad2d", Tensor(const Tensor &, IntArrayRef), fp32) + KERNEL_CPU(ADD_NS(replication_pad1d), "replication_pad1d", Tensor(const Tensor &, IntArrayRef), fp32) + KERNEL_CPU(ADD_NS(replication_pad2d), "replication_pad2d", Tensor(const Tensor &, IntArrayRef), fp32) + KERNEL_CPU(ADD_NS(replication_pad3d), "replication_pad3d", Tensor(const Tensor &, IntArrayRef), fp32) + KERNEL_CPU(ADD_NS(elu), "elu", Tensor(const Tensor &, const Scalar &, const Scalar &, const Scalar &), fp32) + KERNEL_CPU(ADD_NS(hardshrink), "hardshrink", Tensor(const Tensor &, const Scalar &), fp32) + KERNEL_CPU(ADD_NS(hardsigmoid), "hardsigmoid", Tensor(const Tensor &), fp32) + KERNEL_CPU(ADD_NS(hardswish), "hardswish", Tensor(const Tensor &), fp32) + KERNEL_CPU(ADD_NS(log_sigmoid), "log_sigmoid", Tensor(const Tensor &), fp32) + KERNEL_CPU(ADD_NS(prelu), "prelu", Tensor(const Tensor &, const Tensor &), fp32) + KERNEL_CPU(ADD_NS(selu), "selu", Tensor(const Tensor &), fp32) + KERNEL_CPU(ADD_NS(celu), "celu", Tensor(const Tensor &, const Scalar &), fp32) + KERNEL_CPU(ADD_NS(softplus), "softplus", Tensor(const Tensor &, const Scalar &, const Scalar &), fp32) + KERNEL_CPU(ADD_NS(softshrink), "softshrink", Tensor(const Tensor &, const Scalar &), fp32) + KERNEL_CPU(ADD_NS(group_norm), "group_norm", Tensor(const Tensor &, int64_t, const c10::optional &, const c10::optional &, double, bool), fp32) + KERNEL_CPU(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t, double), fp32) + KERNEL_CPU(ADD_NS(mse_loss), "mse_loss", Tensor(const Tensor &, const Tensor &, int64_t), fp32) + KERNEL_CPU(ADD_NS(ctc_loss), "ctc_loss.IntList", Tensor(const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, int64_t, int64_t, bool), fp32) + KERNEL_CPU(ADD_NS(ctc_loss), "ctc_loss.Tensor", Tensor(const Tensor &, const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) + KERNEL_CPU(ADD_NS(kl_div), "kl_div", Tensor(const Tensor &, const Tensor &, int64_t, bool), fp32) + KERNEL_CPU(ADD_NS(multilabel_margin_loss), "multilabel_margin_loss", Tensor(const Tensor &, const Tensor &, int64_t), fp32) + KERNEL_CPU(ADD_NS(fft_fft), "fft_fft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_ifft), "fft_ifft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_fft2), "fft_fft2", Tensor(const Tensor &, c10::optional, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_ifft2), "fft_ifft2", Tensor(const Tensor &, c10::optional, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_fftn), "fft_fftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_ifftn), "fft_ifftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_rfft), "fft_rfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_irfft), "fft_irfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_rfft2), "fft_rfft2", Tensor(const Tensor &, c10::optional, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_irfft2), "fft_irfft2", Tensor(const Tensor &, c10::optional, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_rfftn), "fft_rfftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_irfftn), "fft_irfftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_hfft), "fft_hfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_ihfft), "fft_ihfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) + KERNEL_CPU(ADD_NS(conv_tbc), "conv_tbc", Tensor(const Tensor &, const Tensor &, const Tensor &, int64_t), fp32) + KERNEL_CPU(ADD_NS(linalg_matrix_norm), "linalg_matrix_norm", Tensor(const Tensor &, const at::Scalar &, at::IntArrayRef, bool, c10::optional), fp32) + KERNEL_CPU(ADD_NS(linalg_matrix_norm), "linalg_matrix_norm.str_ord", Tensor(const Tensor &, c10::string_view, at::IntArrayRef, bool, c10::optional), fp32) + KERNEL_CPU(ADD_NS(linalg_cond), "linalg_cond", Tensor(const Tensor &, const c10::optional &), fp32) + KERNEL_CPU(ADD_NS(linalg_cond), "linalg_cond.p_str", Tensor(const Tensor &, c10::string_view), fp32) + KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank", Tensor(const Tensor &, const c10::optional, bool), fp32) + KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank.tol_tensor", Tensor(const Tensor &, const Tensor &, bool), fp32) + KERNEL_CPU(ADD_NS(linalg_solve), "linalg_solve", Tensor(const Tensor &, const Tensor &), fp32) + KERNEL_CPU(ADD_NS(linalg_cholesky), "linalg_cholesky", Tensor(const Tensor &, bool), fp32) + KERNEL_CPU(ADD_NS(linalg_svdvals), "linalg_svdvals", Tensor(const Tensor &), fp32) + KERNEL_CPU(ADD_NS(linalg_eigvals), "linalg_eigvals", Tensor(const Tensor &), fp32) + KERNEL_CPU(ADD_NS(linalg_eigvalsh), "linalg_eigvalsh", Tensor(const Tensor &, c10::string_view), fp32) + KERNEL_CPU(ADD_NS(linalg_inv), "linalg_inv", Tensor(const Tensor &), fp32) + KERNEL_CPU(ADD_NS(linalg_householder_product), "linalg_householder_product", Tensor(const Tensor &, const Tensor &), fp32) + KERNEL_CPU(ADD_NS(linalg_tensorinv), "linalg_tensorinv", Tensor(const Tensor &, int64_t), fp32) + KERNEL_CPU(ADD_NS(linalg_tensorsolve), "linalg_tensorsolve", Tensor(const Tensor &, const Tensor &, c10::optional), fp32) KERNEL_CPU(ADD_NS(fake_quantize_per_tensor_affine), "fake_quantize_per_tensor_affine", Tensor (const Tensor &, double, int64_t, int64_t, int64_t), fp32) + KERNEL_CPU(ADD_NS(glu), "glu", Tensor (const Tensor &, int64_t), fp32) - // promote - KERNEL_CPU(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote) - KERNEL_CPU(ADD_NS(stack), "stack", Tensor (TensorList, int64_t), promote) + m.impl(TORCH_SELECTIVE_NAME("aten::cummax"), + TORCH_FN((&WrapFunction (const Tensor &, int64_t), + std::tuple (const Tensor &, int64_t), + &ADD_NS(cummax)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::cummax.dimname"), + TORCH_FN((&WrapFunction (const Tensor &, at::Dimname), + std::tuple (const Tensor &, at::Dimname), + &ADD_NS(cummax)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::cummin"), + TORCH_FN((&WrapFunction (const Tensor &, int64_t), + std::tuple (const Tensor &, int64_t), + &ADD_NS(cummin)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::cummin.dimname"), + TORCH_FN((&WrapFunction (const Tensor &, at::Dimname), + std::tuple (const Tensor &, at::Dimname), + &ADD_NS(cummin)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::eig"), + TORCH_FN((&WrapFunction (const Tensor &, bool), + std::tuple (const Tensor &, bool), + &ADD_NS(eig)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::geqrf"), + TORCH_FN((&WrapFunction (const Tensor &), + std::tuple (const Tensor &), + &ADD_NS(geqrf)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::lstsq"), + TORCH_FN((&WrapFunction (const Tensor &, const Tensor &), + std::tuple (const Tensor &, const Tensor &), + &ADD_NS(lstsq)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::_lu_with_info"), + TORCH_FN((&WrapFunction (const Tensor &, bool, bool), + std::tuple (const Tensor &, bool, bool), + &ADD_NS(_lu_with_info)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::lu_unpack"), + TORCH_FN((&WrapFunction (const Tensor &, const Tensor &, bool, bool), + std::tuple (const Tensor &, const Tensor &, bool, bool), + &ADD_NS(lu_unpack)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::qr"), + TORCH_FN((&WrapFunction (const Tensor &, bool), + std::tuple (const Tensor &, bool), + &ADD_NS(qr)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::solve"), + TORCH_FN((&WrapFunction (const Tensor &, const Tensor &), + std::tuple (const Tensor &, const Tensor &), + &ADD_NS(solve)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::svd"), + TORCH_FN((&WrapFunction (const Tensor &, bool, bool), + std::tuple (const Tensor &, bool, bool), + &ADD_NS(svd)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::symeig"), + TORCH_FN((&WrapFunction (const Tensor &, bool, bool), + std::tuple (const Tensor &, bool, bool), + &ADD_NS(symeig)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::triangular_solve"), + TORCH_FN((&WrapFunction (const Tensor &, const Tensor &, bool, bool, bool), + std::tuple (const Tensor &, const Tensor &, bool, bool, bool), + &ADD_NS(triangular_solve)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::fractional_max_pool2d"), + TORCH_FN((&WrapFunction (const Tensor &, IntArrayRef, IntArrayRef, const Tensor &), + std::tuple (const Tensor &, IntArrayRef, IntArrayRef, const Tensor &), + &ADD_NS(fractional_max_pool2d)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::fractional_max_pool3d"), + TORCH_FN((&WrapFunction (const Tensor &, IntArrayRef, IntArrayRef, const Tensor &), + std::tuple (const Tensor &, IntArrayRef, IntArrayRef, const Tensor &), + &ADD_NS(fractional_max_pool3d)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool1d"), + TORCH_FN((&WrapFunction (const Tensor &, IntArrayRef), + std::tuple (const Tensor &, IntArrayRef), + &ADD_NS(adaptive_max_pool1d)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool2d"), + TORCH_FN((&WrapFunction (const Tensor &, IntArrayRef), + std::tuple (const Tensor &, IntArrayRef), + &ADD_NS(adaptive_max_pool2d)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool3d"), + TORCH_FN((&WrapFunction (const Tensor &, IntArrayRef), + std::tuple (const Tensor &, IntArrayRef), + &ADD_NS(adaptive_max_pool3d)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::multilabel_margin_loss_forward"), + TORCH_FN((&WrapFunction (const Tensor &, const Tensor &, int64_t), + std::tuple (const Tensor &, const Tensor &, int64_t), + &ADD_NS(multilabel_margin_loss_forward)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::linalg_qr"), + TORCH_FN((&WrapFunction (const Tensor &, c10::string_view), + std::tuple (const Tensor &, c10::string_view), + &ADD_NS(linalg_qr)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::linalg_cholesky_ex"), + TORCH_FN((&WrapFunction (const Tensor &, bool, bool), + std::tuple (const Tensor &, bool, bool), + &ADD_NS(linalg_cholesky_ex)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::topk"), + m.impl(TORCH_SELECTIVE_NAME("aten::linalg_svd"), TORCH_FN((&WrapFunction (const Tensor &, int64_t, int64_t, bool, bool), - std::tuple (const Tensor &, int64_t, int64_t, bool, bool), - &ADD_NS(topk)>::type::call))); + std::tuple (const Tensor &, bool), + std::tuple (const Tensor &, bool), + &ADD_NS(linalg_svd)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::sort"), + m.impl(TORCH_SELECTIVE_NAME("aten::linalg_eig"), TORCH_FN((&WrapFunction (const Tensor &, int64_t, bool), - std::tuple (const Tensor &, int64_t, bool), - &ADD_NS(sort)>::type::call))); + std::tuple (const Tensor &), + std::tuple (const Tensor &), + &ADD_NS(linalg_eig)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::kthvalue"), + m.impl(TORCH_SELECTIVE_NAME("aten::linalg_eigh"), TORCH_FN((&WrapFunction (const Tensor &, int64_t, int64_t, bool), - std::tuple (const Tensor &, int64_t, int64_t, bool), - &ADD_NS(kthvalue)>::type::call))); + std::tuple (const Tensor &, c10::string_view), + std::tuple (const Tensor &, c10::string_view), + &ADD_NS(linalg_eigh)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::kthvalue.dimname"), + m.impl(TORCH_SELECTIVE_NAME("aten::linalg_lstsq"), TORCH_FN((&WrapFunction (const Tensor &, int64_t, at::Dimname, bool), - std::tuple (const Tensor &, int64_t, at::Dimname, bool), - &ADD_NS(kthvalue)>::type::call))); + std::tuple (const Tensor &, const Tensor &, c10::optional, c10::optional), + std::tuple (const Tensor &, const Tensor &, c10::optional, c10::optional), + &ADD_NS(linalg_lstsq)>::type::call))); + + m.impl(TORCH_SELECTIVE_NAME("aten::linalg_inv_ex"), + TORCH_FN((&WrapFunction (const Tensor &, bool), + std::tuple (const Tensor &, bool), + &ADD_NS(linalg_inv_ex)>::type::call))); + + // promote + KERNEL_CPU(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote) + KERNEL_CPU(ADD_NS(stack), "stack", Tensor (TensorList, int64_t), promote) + KERNEL_CPU(ADD_NS(index_copy), "index_copy", Tensor (const Tensor &, int64_t, const Tensor &, const Tensor &), promote) + KERNEL_CPU(ADD_NS(index_copy), "index_copy.dimname", Tensor (const Tensor &, at::Dimname, const Tensor &, const Tensor &), promote) + } } // namespace } // namespace autocast diff --git a/test/run_test.py b/test/run_test.py index 615aaf912c314..77e7f150c16e9 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -75,6 +75,7 @@ "distributed/test_pg_wrapper", "distributed/algorithms/test_join", "test_cuda", + "test_autocast", "test_jit_cuda_fuser", "test_cuda_primary_ctx", "test_dataloader", diff --git a/torch/cpu/amp/autocast_mode.py b/torch/cpu/amp/autocast_mode.py index 08ea200a2bdc4..8c65f727753e2 100644 --- a/torch/cpu/amp/autocast_mode.py +++ b/torch/cpu/amp/autocast_mode.py @@ -5,5 +5,5 @@ class autocast(torch.autocast_mode.autocast): See :class:`torch.autocast`. ``torch.cpu.amp.autocast(args...)`` is equivalent to ``torch.autocast("cpu", args...)`` """ - def __init__(self, enabled=True, dtype=torch.float16): + def __init__(self, enabled=True, dtype=torch.bfloat16): super().__init__("cpu", enabled=enabled, dtype=dtype) diff --git a/torch/testing/_internal/autocast_test_lists.py b/torch/testing/_internal/autocast_test_lists.py index 754ccca11ed9d..8350845e4ef19 100644 --- a/torch/testing/_internal/autocast_test_lists.py +++ b/torch/testing/_internal/autocast_test_lists.py @@ -307,7 +307,6 @@ def __init__(self, dev): ("conv1d", conv_args_fp32[0]), ("conv2d", conv_args_fp32[1]), ("conv3d", conv_args_fp32[2]), - ("log_softmax", pointwise0_fp32 + (0,)), ("bmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32), torch.randn((n, n, n), device=dev, dtype=torch.float32))), ("mm", mat0_fp32 + mat1_fp32), @@ -319,24 +318,22 @@ def __init__(self, dev): torch.randn((n, n, n), device=dev, dtype=torch.float32))), ] self.torch_fp32 = [ + ("conv_transpose1d", conv_args_bf16[0]), + ("conv_transpose2d", conv_args_bf16[1]), ("conv_transpose3d", conv_args_bf16[2]), ("batch_norm", dummy_bf16[2], {"weight": None, "bias": None, "running_mean": torch.rand((n), dtype=torch.float32), "running_var": torch.rand((n), dtype=torch.float32), "training": False, "momentum": 0.1, "eps": 1e-5, "cudnn_enabled": False}), - ("max_pool2d", dummy_bf16[2], {"kernel_size": (3, 2), "stride": (1, 1)}), ("dropout", dummy_bf16[2], {"p": 0.1, "train": False}), ("binary_cross_entropy_with_logits", mat0_bf16 + (torch.rand((n, n), device=dev, dtype=torch.bfloat16),)), - ("pow", ((pointwise0_bf16[0] + 1.).clamp(0.0, 100.0),) + pointwise1_bf16), - ("pow", ((pointwise0_bf16[0] + 1.).clamp(0.0, 100.0),) + (1.7,)), - ("instance_norm", dummy_bf16[2], {"weight": None, "bias": None, "running_mean": torch.rand((n), dtype=torch.float32), - "running_var": torch.rand((n), dtype=torch.float32), "use_input_stats": False, + ("instance_norm", dummy_bf16[1], {"weight": None, "bias": None, "running_mean": None, + "running_var": None, "use_input_stats": True, "momentum": 0.1, "eps": 1e-5, "cudnn_enabled": False}), ] self.nn_bf16 = [ ("linear", mat0_fp32 + mat1_fp32), ] self.nn_fp32 = [ - ("adaptive_avg_pool2d", dummy_bf16[2], {"output_size": (3, 2)}), ("avg_pool2d", dummy_bf16[2], {"kernel_size": (3, 2), "stride": (1, 1)}), ("avg_pool3d", dummy_bf16[3], {"kernel_size": (3, 3, 3), "stride": (1, 1, 1)}), ("gelu", dummy_bf16[3]), @@ -348,9 +345,8 @@ def __init__(self, dev): ("upsample_trilinear3d", dummy_bf16[4], {"output_size": (n, n, n), "align_corners": False}), ("binary_cross_entropy", (torch.rand((n, n), device=dev, dtype=torch.bfloat16),) + (torch.rand((n, n), device=dev, dtype=torch.bfloat16),)), - ("smooth_l1_loss", mat0_bf16 + mat1_bf16), ("reflection_pad1d", dummy_bf16[2], {"padding": (3, 3)}), - ("std", dummy_bf16[2]), + ("smooth_l1_loss", mat0_bf16 + mat1_bf16), ] self.torch_need_autocast_promote = [ ("cat", (pointwise0_bf16 + pointwise1_fp32,)), From 538647fe1fb94b7822ea3b8bbbd6901961431d60 Mon Sep 17 00:00:00 2001 From: James Reed Date: Mon, 30 Aug 2021 19:54:50 -0700 Subject: [PATCH 379/530] [WIP][FX] BC guarantees for 1.10 (#63888) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63888 Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D30523133 Pulled By: jamesr66a fbshipit-source-id: b04cc0d842a74862f42ecba98b757310cd2ec7b0 --- ..._compat-fx_backcompat_class_members.expect | 19 ++ ...t-fx_backcompat_function_signatures.expect | 70 +++++ test/test_fx.py | 242 ++++++++++++++++++ test/test_fx_experimental.py | 8 +- torch/fx/__init__.py | 23 +- torch/fx/_compatibility.py | 34 +++ torch/fx/_symbolic_trace.py | 55 ++-- torch/fx/annotate.py | 3 +- torch/fx/experimental/fx_acc/acc_ops.py | 10 +- torch/fx/graph.py | 41 ++- torch/fx/graph_module.py | 33 ++- torch/fx/immutable_collections.py | 4 + torch/fx/interpreter.py | 24 +- torch/fx/node.py | 51 +++- torch/fx/operator_schemas.py | 7 + torch/fx/passes/shape_prop.py | 8 +- torch/fx/passes/split_module.py | 3 + torch/fx/proxy.py | 27 +- torch/fx/subgraph_rewriter.py | 10 +- torch/fx/tensor_type.py | 6 +- torch/quantization/ns/graph_passes.py | 2 + 21 files changed, 603 insertions(+), 77 deletions(-) create mode 100644 test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect create mode 100644 test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect create mode 100644 torch/fx/_compatibility.py diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect new file mode 100644 index 0000000000000..88e4654b568df --- /dev/null +++ b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect @@ -0,0 +1,19 @@ +torch.fx._symbolic_trace.ProxyableClassMeta [] +torch.fx._symbolic_trace.Tracer ['call_module', 'create_arg', 'create_args_for_root', 'is_leaf_module', 'path_of_module', 'trace'] +torch.fx.graph.Graph ['call_function', 'call_method', 'call_module', 'create_node', 'eliminate_dead_code', 'erase_node', 'flatten_inps', 'get_attr', 'graph_copy', 'inserting_after', 'inserting_before', 'lint', 'node_copy', 'nodes', 'output', 'owning_module', 'placeholder', 'print_tabular', 'python_code', 'unflatten_outs'] +torch.fx.graph.PythonCode [] +torch.fx.graph_module.GraphModule ['add_submodule', 'code', 'delete_all_unused_submodules', 'delete_submodule', 'graph', 'recompile', 'to_folder'] +torch.fx.immutable_collections.immutable_dict ['clear', 'pop', 'popitem', 'update'] +torch.fx.immutable_collections.immutable_list ['append', 'clear', 'extend', 'insert', 'pop', 'remove'] +torch.fx.interpreter.Interpreter ['call_function', 'call_method', 'call_module', 'fetch_args_kwargs_from_env', 'fetch_attr', 'get_attr', 'map_nodes_to_values', 'output', 'placeholder', 'run', 'run_node'] +torch.fx.interpreter.Transformer ['call_function', 'call_module', 'get_attr', 'placeholder', 'transform'] +torch.fx.node.Node ['all_input_nodes', 'append', 'args', 'format_node', 'is_impure', 'kwargs', 'next', 'normalized_arguments', 'prepend', 'prev', 'replace_all_uses_with', 'replace_input_with', 'stack_trace', 'update_arg', 'update_kwarg'] +torch.fx.passes.shape_prop.ShapeProp ['propagate', 'run_node'] +torch.fx.passes.shape_prop.TensorMetadata ['dtype', 'is_quantized', 'memory_format', 'q_scale', 'q_zero_point', 'qscheme', 'requires_grad', 'shape', 'stride'] +torch.fx.passes.split_module.Partition [] +torch.fx.proxy.Attribute ['node'] +torch.fx.proxy.GraphAppendingTracer [] +torch.fx.proxy.Proxy ['keys'] +torch.fx.proxy.TraceError [] +torch.fx.proxy.TracerBase ['create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool'] +torch.fx.subgraph_rewriter.Match ['anchor', 'nodes_map'] \ No newline at end of file diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect new file mode 100644 index 0000000000000..a73fde735bc2d --- /dev/null +++ b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect @@ -0,0 +1,70 @@ +torch.fx._symbolic_trace.Tracer.__init__(self, autowrap_modules: Tuple[Callable] = (,), autowrap_functions: Tuple[Callable, ...] = (,), enable_cpatching: bool = False, param_shapes_constant: bool = False) -> None +torch.fx._symbolic_trace.Tracer.call_module(self, m: torch.nn.modules.module.Module, forward: Callable[..., Any], args: Tuple[Any, ...], kwargs: Dict[str, Any]) -> Any +torch.fx._symbolic_trace.Tracer.create_arg(self, a: Any) -> 'Argument' +torch.fx._symbolic_trace.Tracer.is_leaf_module(self, m: torch.nn.modules.module.Module, module_qualified_name: str) -> bool +torch.fx._symbolic_trace.Tracer.path_of_module(self, mod: torch.nn.modules.module.Module) -> str +torch.fx._symbolic_trace.Tracer.trace(self, root: Union[torch.nn.modules.module.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None) -> torch.fx.graph.Graph +torch.fx._symbolic_trace.symbolic_trace(root: Union[torch.nn.modules.module.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None, enable_cpatching: bool = False) -> torch.fx.graph_module.GraphModule +torch.fx._symbolic_trace.wrap(fn_or_name: Union[str, Callable]) +torch.fx.graph.Graph.__init__(self, owning_module: Optional[GraphModule] = None, tracer_cls: Optional[Type[Tracer]] = None) +torch.fx.graph.Graph.call_function(self, the_function: Callable[..., Any], args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node +torch.fx.graph.Graph.call_method(self, method_name: str, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node +torch.fx.graph.Graph.call_module(self, module_name: str, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node +torch.fx.graph.Graph.create_node(self, op: str, target: 'Target', args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, name: Optional[str] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node +torch.fx.graph.Graph.eliminate_dead_code(self) +torch.fx.graph.Graph.erase_node(self, to_erase: torch.fx.node.Node) -> None +torch.fx.graph.Graph.get_attr(self, qualified_name: str, type_expr: Optional[Any] = None) -> torch.fx.node.Node +torch.fx.graph.Graph.graph_copy(self, g: 'Graph', val_map: Dict[torch.fx.node.Node, torch.fx.node.Node], return_output_node = False) -> 'Optional[Argument]' +torch.fx.graph.Graph.inserting_after(self, n: Optional[torch.fx.node.Node] = None) +torch.fx.graph.Graph.inserting_before(self, n: Optional[torch.fx.node.Node] = None) +torch.fx.graph.Graph.lint(self) +torch.fx.graph.Graph.node_copy(self, node: torch.fx.node.Node, arg_transform: Callable[[torch.fx.node.Node], Argument] = >) -> torch.fx.node.Node +torch.fx.graph.Graph.output(self, result: 'Argument', type_expr: Optional[Any] = None) +torch.fx.graph.Graph.placeholder(self, name: str, type_expr: Optional[Any] = None) -> torch.fx.node.Node +torch.fx.graph.Graph.print_tabular(self) +torch.fx.graph.Graph.python_code(self, root_module: str) -> torch.fx.graph.PythonCode +torch.fx.graph_module.GraphModule.__init__(self, root: Union[torch.nn.modules.module.Module, Dict[str, Any]], graph: torch.fx.graph.Graph, class_name: str = 'GraphModule') +torch.fx.graph_module.GraphModule.add_submodule(self, target: str, m: torch.nn.modules.module.Module) -> bool +torch.fx.graph_module.GraphModule.delete_all_unused_submodules(self) -> None +torch.fx.graph_module.GraphModule.delete_submodule(self, target: str) -> bool +torch.fx.graph_module.GraphModule.recompile(self) -> torch.fx.graph.PythonCode +torch.fx.interpreter.Interpreter.__init__(self, module: torch.fx.graph_module.GraphModule, garbage_collect_values: bool = True) +torch.fx.interpreter.Interpreter.call_function(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any +torch.fx.interpreter.Interpreter.call_method(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any +torch.fx.interpreter.Interpreter.call_module(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any +torch.fx.interpreter.Interpreter.fetch_args_kwargs_from_env(self, n: torch.fx.node.Node) -> Tuple[Tuple, Dict] +torch.fx.interpreter.Interpreter.fetch_attr(self, target: str) +torch.fx.interpreter.Interpreter.get_attr(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any +torch.fx.interpreter.Interpreter.map_nodes_to_values(self, args: torch.fx.node.Argument, n: torch.fx.node.Node) -> torch.fx.node.Argument +torch.fx.interpreter.Interpreter.output(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any +torch.fx.interpreter.Interpreter.placeholder(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any +torch.fx.interpreter.Interpreter.run(self, *args, initial_env: Optional[Dict[torch.fx.node.Node, Any]] = None) -> Any +torch.fx.interpreter.Interpreter.run_node(self, n: torch.fx.node.Node) -> Any +torch.fx.interpreter.Transformer.__init__(self, module) +torch.fx.interpreter.Transformer.call_function(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any +torch.fx.interpreter.Transformer.call_module(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any +torch.fx.interpreter.Transformer.get_attr(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> torch.fx.proxy.Proxy +torch.fx.interpreter.Transformer.placeholder(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> torch.fx.proxy.Proxy +torch.fx.interpreter.Transformer.transform(self) -> torch.fx.graph_module.GraphModule +torch.fx.node.Node.__init__(self, graph: 'Graph', name: str, op: str, target: 'Target', args: Tuple[Argument, ...], kwargs: Dict[str, Argument], return_type: Optional[Any] = None) -> None +torch.fx.node.Node.append(self, x: 'Node') -> None +torch.fx.node.Node.format_node(self, placeholder_names: List[str] = None, maybe_return_typename: List[str] = None) -> Optional[str] +torch.fx.node.Node.prepend(self, x: 'Node') -> None +torch.fx.node.Node.replace_all_uses_with(self, replace_with: 'Node') -> List[Node] +torch.fx.node.Node.replace_input_with(self, old_input: 'Node', new_input: 'Node') +torch.fx.node.Node.update_arg(self, idx: int, arg: torch.fx.node.Argument) -> None +torch.fx.node.Node.update_kwarg(self, key: str, arg: torch.fx.node.Argument) -> None +torch.fx.node.map_aggregate(a: torch.fx.node.Argument, fn: Callable[[torch.fx.node.Argument], torch.fx.node.Argument]) -> torch.fx.node.Argument +torch.fx.node.map_arg(a: torch.fx.node.Argument, fn: Callable[[torch.fx.node.Node], torch.fx.node.Argument]) -> torch.fx.node.Argument +torch.fx.passes.split_module.split_module(m: torch.fx.graph_module.GraphModule, root_m: torch.nn.modules.module.Module, split_callback: Callable[[torch.fx.node.Node], int]) +torch.fx.proxy.Attribute.__init__(self, root: torch.fx.proxy.Proxy, attr: str) +torch.fx.proxy.Proxy.__init__(self, node: torch.fx.node.Node, tracer: 'Optional[TracerBase]' = None) +torch.fx.proxy.Proxy.keys(self) +torch.fx.proxy.TracerBase.create_arg(self, a: Any) -> torch.fx.node.Argument +torch.fx.proxy.TracerBase.create_node(self, kind: str, target: torch.fx.node.Target, args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, torch.fx.node.Argument], name: Optional[str] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node +torch.fx.proxy.TracerBase.create_proxy(self, kind: str, target: torch.fx.node.Target, args: Tuple[Any, ...], kwargs: Dict[str, Any], name: Optional[str] = None, type_expr: Optional[Any] = None, proxy_factory_fn: Callable[[torch.fx.node.Node], Proxy] = None) +torch.fx.proxy.TracerBase.iter(self, obj: 'Proxy') -> Iterator +torch.fx.proxy.TracerBase.keys(self, obj: 'Proxy') -> Any +torch.fx.proxy.TracerBase.proxy(self, node: torch.fx.node.Node) -> 'Proxy' +torch.fx.proxy.TracerBase.to_bool(self, obj: 'Proxy') -> bool +torch.fx.subgraph_rewriter.replace_pattern(gm: torch.fx.graph_module.GraphModule, pattern: Callable, replacement: Callable) -> List[torch.fx.subgraph_rewriter.Match] \ No newline at end of file diff --git a/test/test_fx.py b/test/test_fx.py index 47873d7ef9b41..eadcf6cc0b2f4 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -11,6 +11,8 @@ import sys import torch import traceback +import typing +import types import warnings import unittest from math import sqrt @@ -31,6 +33,7 @@ from collections import namedtuple from torch.fx.proxy import TraceError +from torch.fx._compatibility import _BACK_COMPAT_OBJECTS, _MARKED_WITH_COMATIBLITY from fx.test_subgraph_rewriter import TestSubgraphRewriter # noqa: F401 from fx.test_dce_pass import TestDCE # noqa: F401 @@ -3060,6 +3063,245 @@ def test_get_torch_func_signature_exhaustive(self, device, dtype, op): assert op.name in known_no_schema or "nn.functional" in op.name +class TestFXAPIBackwardCompatibility(JitTestCase): + def setUp(self): + self.maxDiff = None + + def _fn_to_stable_annotation_str(self, obj): + """ + Unfortunately we have to serialize function signatures manually since + serialization for `inspect.Signature` objects is not stable across + python versions + """ + fn_name = torch.typename(obj) + + signature = inspect.signature(obj) + + sig_str = f'{fn_name}{signature}' + + arg_strs = [] + for k, v in signature.parameters.items(): + maybe_type_annotation = f': {self._annotation_type_to_stable_str(v.annotation, sig_str)}'\ + if v.annotation is not inspect.Signature.empty else '' + + def default_val_str(val): + if isinstance(val, (tuple, list)): + str_pieces = ['(' if isinstance(val, tuple) else '['] + str_pieces.append(', '.join(default_val_str(v) for v in val)) + if isinstance(val, tuple) and len(str_pieces) == 2: + str_pieces.append(',') + str_pieces.append(')' if isinstance(val, tuple) else ']') + return ''.join(str_pieces) + + # Need to fix up some default value strings. + # First case: modules. Default module `repr` contains the FS path of the module. + # Don't leak that + if isinstance(val, types.ModuleType): + return f'' + + # Second case: callables. Callables (such as lambdas) encode their address in + # their string repr. Don't do that + if callable(val): + return f'' + + return str(val) + + if v.default is not inspect.Signature.empty: + default_val_str = default_val_str(v.default) if not isinstance(v.default, str) else f"'{v.default}'" + maybe_default = f' = {default_val_str}' + else: + maybe_default = '' + maybe_stars = '' + if v.kind == inspect.Parameter.VAR_POSITIONAL: + maybe_stars = '*' + elif v.kind == inspect.Parameter.VAR_KEYWORD: + maybe_stars = '**' + arg_strs.append(f'{maybe_stars}{k}{maybe_type_annotation}{maybe_default}') + + return_annot = f' -> {self._annotation_type_to_stable_str(signature.return_annotation, sig_str)}'\ + if signature.return_annotation is not inspect.Signature.empty else '' + + return f'{fn_name}({", ".join(arg_strs)}){return_annot}' + + def _annotation_type_to_stable_str(self, t, sig_str): + if t is inspect.Signature.empty: + return '' + + # Forward ref + if isinstance(t, str): + return f"'{t}'" + if hasattr(typing, 'ForwardRef') and isinstance(t, typing.ForwardRef): + return t.__forward_arg__ + if hasattr(typing, '_ForwardRef') and isinstance(t, typing._ForwardRef): + return t.__forward_arg__ + + trivial_mappings = { + str : 'str', + int : 'int', + float: 'float', + bool: 'bool', + torch.dtype: 'torch.dtype', + torch.Tensor: 'torch.Tensor', + torch.device: 'torch.device', + torch.memory_format: 'torch.memory_format', + slice: 'slice', + torch.nn.Module: 'torch.nn.modules.module.Module', + torch.fx.Graph : 'torch.fx.graph.Graph', + torch.fx.Node : 'torch.fx.node.Node', + torch.fx.Proxy : 'torch.fx.proxy.Proxy', + torch.fx.node.Target : 'torch.fx.node.Target', + torch.fx.node.Argument : 'torch.fx.node.Argument', + torch.fx.graph.PythonCode : 'torch.fx.graph.PythonCode', + torch.fx.graph_module.GraphModule: 'torch.fx.graph_module.GraphModule', + torch.fx.subgraph_rewriter.Match: 'torch.fx.subgraph_rewriter.Match', + Ellipsis : '...', + typing.Any: 'Any', + type(None): 'NoneType', + None: 'None', + typing.Iterator: 'Iterator', + } + + mapping = trivial_mappings.get(t, None) + if mapping: + return mapping + + # Handle types with contained types + contained = getattr(t, '__args__', None) or [] + + # Callables contain a bare List for arguments + contained = t if isinstance(t, list) else contained + + # Python 3.8 puts type vars into __args__ for unbound types such as Dict + if all(isinstance(ct, typing.TypeVar) for ct in contained): + contained = [] + + contained_type_annots = [self._annotation_type_to_stable_str(ct, sig_str) for ct in contained] + contained_type_str = f'[{", ".join(contained_type_annots)}]' if len(contained_type_annots) > 0 else '' + + + origin = getattr(t, '__origin__', None) + if origin is None: + # Unbound types don't have `__origin__` in some Python versions, so fix that up here. + origin = t if t in {typing.Tuple, typing.Union, typing.Dict, typing.List, typing.Type, typing.Callable} else origin + + if origin in {tuple, typing.Tuple}: + return f'Tuple{contained_type_str}' + if origin in {typing.Union}: + # Annoying hack to detect Optional + if len(contained) == 2 and (contained[0] is type(None)) ^ (contained[1] is type(None)): + not_none_param = contained[0] if contained[0] is not type(None) else contained[1] + return f'Optional[{self._annotation_type_to_stable_str(not_none_param, sig_str)}]' + return f'Union{contained_type_str}' + if origin in {dict, typing.Dict}: + return f'Dict{contained_type_str}' + if origin in {list, typing.List}: + return f'List{contained_type_str}' + if origin in {type, typing.Type}: + return f'Type{contained_type_str}' + if isinstance(t, typing.Callable): + if len(contained) > 0 and contained[0] is not Ellipsis: + return f'Callable[[{", ".join(contained_type_annots[:-1])}], {contained_type_annots[-1]}]' + else: + return f'Callable{contained_type_str}' + + raise RuntimeError(f'Unrecognized type {t} used in BC-compatible type signature {sig_str}.' + f'Please add support for this type and confirm with the ' + f'FX team that your signature change is valid.') + + + def test_function_back_compat(self): + """ + Test backward compatibility for function signatures with + @compatibility(is_backward_compatible=True). Currently this checks for + exact signature matches, which may lead to false positives. If this + becomes too annoying, we can refine this check to actually parse out + the saved schema strings and check if the change is truly backward- + incompatible. + """ + signature_strs = [] + + for obj in _BACK_COMPAT_OBJECTS: + if not isinstance(obj, type): + signature_strs.append(self._fn_to_stable_annotation_str(obj)) + + signature_strs.sort() + + try: + self.assertExpected('\n'.join(signature_strs), 'fx_backcompat_function_signatures') + except AssertionError as e: + msg = f"{e}\n****** ERROR ******\nAn FX function that has been marked " \ + f"as backwards-compatible has experienced a signature change. See the " \ + f"above exception context for more information. If this change was " \ + f"unintended, please revert it. If it was intended, check with the FX " \ + f"team to ensure that the proper deprecation protocols have been followed " \ + f"and subsequently --accept the change." + raise AssertionError(msg) + + def test_class_member_back_compat(self): + """ + Test backward compatibility for members of classes with + @compatibility(is_backward_compatible=True). Currently this checks for + exact matches on the publicly visible members of the class. + """ + class_method_strs = [] + + for obj in _BACK_COMPAT_OBJECTS: + if isinstance(obj, type): + public_members = [name for name in obj.__dict__ if not name.startswith('_')] + class_method_strs.append(f'{torch.typename(obj)} {sorted(public_members)}') + + class_method_strs.sort() + + try: + self.assertExpected('\n'.join(class_method_strs), 'fx_backcompat_class_members') + except AssertionError as e: + msg = f"{e}\n****** ERROR ******\nAn FX class that has been marked " \ + f"as backwards-compatible has experienced change in its public members. See the " \ + f"above exception context for more information. If this change was " \ + f"unintended, please revert it. If it was intended, check with the FX " \ + f"team to ensure that the proper deprecation protocols have been followed " \ + f"and subsequently --accept the change." + raise AssertionError(msg) + + def test_public_api_surface(self): + mod = torch.fx + + non_back_compat_objects = {} + + def check_symbols_have_bc_designation(m, prefix): + if not m.__name__.startswith('torch.fx'): + return + if m.__name__.startswith('torch.fx.experimental'): + return + for k, v in m.__dict__.items(): + if v is m: + continue + if k.startswith('_'): + continue + if isinstance(v, types.ModuleType): + check_symbols_have_bc_designation(v, prefix + [k]) + elif isinstance(v, type) or isinstance(v, types.FunctionType): + if v not in _MARKED_WITH_COMATIBLITY: + non_back_compat_objects.setdefault(v) + + check_symbols_have_bc_designation(mod, ['torch', 'fx']) + + + non_back_compat_strs = [torch.typename(obj) for obj in non_back_compat_objects.keys()] + # Only want objects in torch.fx + non_back_compat_strs = [ + s for s in non_back_compat_strs if s.startswith('torch.fx') and not s.startswith('torch.fx.experimental')] + # Only want objects in public namespaces + non_back_compat_strs = [ + s for s in non_back_compat_strs if all(not atom.startswith('_') for atom in s.split('.'))] + non_back_compat_strs.sort() + + if len(non_back_compat_strs) != 0: + raise AssertionError(f"Public FX API(s) {non_back_compat_strs} introduced but not given a " + f"backwards-compatibility classification! Please decorate these " + f"API(s) with `@torch.fx._compatibility.compatibility` to specify " + f"BC guarantees.") + class TestFunctionalTracing(JitTestCase): IGNORE_FUNCS = ("has_torch_function", "has_torch_function_unary", "has_torch_function_variadic", "handle_torch_function", diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index f000b0af59598..e723ee4622991 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -32,7 +32,7 @@ type_matches, create_type_hint, ) -from torch.fx.passes.shape_prop import extract_tensor_metadata, ShapeProp +from torch.fx.passes.shape_prop import _extract_tensor_metadata, ShapeProp from torch.fx.passes.split_module import split_module from torch.testing._internal.common_device_type import ( ops, @@ -96,13 +96,13 @@ def forward(self, a, b, c): # Fix for now to add type/shape to output for node in traced.graph.nodes: if node.op == "output": - node.meta["tensor_meta"] = extract_tensor_metadata(a) + node.meta["tensor_meta"] = _extract_tensor_metadata(a) for mod in module_with_submodules.modules(): if isinstance(mod, GraphModule): for node in mod.graph.nodes: - node.meta["tensor_meta"] = extract_tensor_metadata(a) + node.meta["tensor_meta"] = _extract_tensor_metadata(a) for node in module_with_submodules.graph.nodes: - node.meta["tensor_meta"] = extract_tensor_metadata(a) + node.meta["tensor_meta"] = _extract_tensor_metadata(a) weights1 = {} weights2 = {} diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py index 4ff795e632944..6524c2d1b8716 100644 --- a/torch/fx/__init__.py +++ b/torch/fx/__init__.py @@ -1,6 +1,4 @@ r''' -**This feature is under a Beta release and its API may change.** - FX is a toolkit for developers to use to transform ``nn.Module`` instances. FX consists of three main components: a **symbolic tracer,** an **intermediate representation**, and **Python code generation**. A @@ -28,12 +26,13 @@ def forward(self, x): # High-level intermediate representation (IR) - Graph representation print(symbolic_traced.graph) """ - graph(x): - %param : [#users=1] = self.param - %add_1 : [#users=1] = call_function[target=](args = (%x, %param), kwargs = {}) - %linear_1 : [#users=1] = call_module[target=linear](args = (%add_1,), kwargs = {}) - %clamp_1 : [#users=1] = call_method[target=clamp](args = (%linear_1,), kwargs = {min: 0.0, max: 1.0}) - return clamp_1 + graph(): + %x : [#users=1] = placeholder[target=x] + %param : [#users=1] = get_attr[target=param] + %add : [#users=1] = call_function[target=operator.add](args = (%x, %param), kwargs = {}) + %linear : [#users=1] = call_module[target=linear](args = (%add,), kwargs = {}) + %clamp : [#users=1] = call_method[target=clamp](args = (%linear,), kwargs = {min: 0.0, max: 1.0}) + return clamp """ # Code generation - valid Python code @@ -41,10 +40,10 @@ def forward(self, x): """ def forward(self, x): param = self.param - add_1 = x + param; x = param = None - linear_1 = self.linear(add_1); add_1 = None - clamp_1 = linear_1.clamp(min = 0.0, max = 1.0); linear_1 = None - return clamp_1 + add = x + param; x = param = None + linear = self.linear(add); add = None + clamp = linear.clamp(min = 0.0, max = 1.0); linear = None + return clamp """ The **symbolic tracer** performs "symbolic execution" of the Python diff --git a/torch/fx/_compatibility.py b/torch/fx/_compatibility.py new file mode 100644 index 0000000000000..2d33813200be2 --- /dev/null +++ b/torch/fx/_compatibility.py @@ -0,0 +1,34 @@ +from typing import Any, Dict +import textwrap + +_BACK_COMPAT_OBJECTS : Dict[Any, None] = {} +_MARKED_WITH_COMATIBLITY : Dict[Any, None] = {} + +def compatibility(is_backward_compatible : bool): + if is_backward_compatible: + + def mark_back_compat(fn): + docstring = textwrap.dedent(getattr(fn, '__doc__', None) or '') + docstring += """ +.. note:: + Backwards-compatibility for this API is guaranteed. +""" + fn.__doc__ = docstring + _BACK_COMPAT_OBJECTS.setdefault(fn) + _MARKED_WITH_COMATIBLITY.setdefault(fn) + return fn + + return mark_back_compat + else: + + def mark_not_back_compat(fn): + docstring = textwrap.dedent(getattr(fn, '__doc__', None) or '') + docstring += """ +.. warning:: + This API is experimental and is *NOT* backward-compatible. +""" + fn.__doc__ = docstring + _MARKED_WITH_COMATIBLITY.setdefault(fn) + return fn + + return mark_not_back_compat diff --git a/torch/fx/_symbolic_trace.py b/torch/fx/_symbolic_trace.py index 25f739e49f9ad..d38197322fab1 100644 --- a/torch/fx/_symbolic_trace.py +++ b/torch/fx/_symbolic_trace.py @@ -12,6 +12,7 @@ import torch.utils._pytree as pytree import sys +from ._compatibility import compatibility from .node import Argument, map_aggregate, base_types from .graph import Graph, _PyTreeInfo from .graph_module import GraphModule @@ -25,6 +26,7 @@ _proxyable_classes : Dict[Type, None] = {} +@compatibility(is_backward_compatible=True) class ProxyableClassMeta(type): """ ProxyableClassMeta allows you to make construction of a given Python class @@ -157,6 +159,7 @@ def __enter__(self): def __exit__(self, type, value, tb): sys.setprofile(None) +@compatibility(is_backward_compatible=False) class PHBase(object): """ Object representing an input placeholder to `concrete_args` @@ -166,6 +169,7 @@ def __repr__(self): PH = PHBase() +@compatibility(is_backward_compatible=True) class Tracer(TracerBase): # Reference: https://github.com/pytorch/pytorch/issues/54354 # The first line of this docstring overrides the one Sphinx generates for the @@ -182,6 +186,11 @@ class Tracer(TracerBase): process. The different behaviors that can be overridden are described in the docstrings of the methods on this class. """ + + # Not checking BC on this API because the default value for `autowrap_modules` + # includes the local filepath to the `math` module, which would jitter + # across machines. + @compatibility(is_backward_compatible=True) def __init__(self, autowrap_modules: Tuple[ModuleType] = (math, ), autowrap_functions: Tuple[Callable, ...] = (), enable_cpatching: bool = False, @@ -197,11 +206,19 @@ def __init__(self, autowrap_modules: Tuple[ModuleType] = (math, ), autowrap_modules (Tuple[ModuleType]): defaults to `(math, )`, Python modules whose functions should be wrapped automatically - without needing to use fx.wrap(). + without needing to use fx.wrap(). Backward-compatibility for + this parameter is guaranteed. autowrap_function (Tuple[Callable, ...]): defaults to `()`, Python functions that should be wrapped automatically without - needing to use fx.wrap(). + needing to use fx.wrap(). Backward compabilibility for this + parameter is guaranteed. + + param_shapes_constant (bool): When this flag is set, calls to shape, + size and a few other shape like attributes of a module's parameter + will be evaluted directly, rather than returning a new Proxy value + for an attribute access. Backward compatibility for this parameter + is guaranteed. enable_cpatching (bool): defaults to `False`, Allows you to enable/disable monkeypatching of torch functions at the @@ -210,12 +227,9 @@ def __init__(self, autowrap_modules: Tuple[ModuleType] = (math, ), C-level monkeypatching works by directly modifying the PyCFunctionObject* so that calling it returns a different function. - Turning this on is likely to slow down tracing by 1.5-3x. - - param_shapes_constant (bool): see https://github.com/pytorch/pytorch/issues/61733. When - this flag is set, calls to shape, size and a few other shape like attributes of a module's parameter - will be evaluted directly, rather than returning a new Proxy value for an attribute access. - + Turning this on is likely to slow down tracing by 1.5-3x. This + parameter is experimental and its backward-compatibility is NOT + guaranteed. """ super().__init__() @@ -235,6 +249,7 @@ def __init__(self, autowrap_modules: Tuple[ModuleType] = (math, ), self.submodule_paths: Optional[Dict[torch.nn.Module, str]] = None + @compatibility(is_backward_compatible=True) def create_arg(self, a: Any) -> 'Argument': """ A method to specify the behavior of tracing when preparing values to @@ -325,6 +340,7 @@ def create_arg(self, a: Any) -> 'Argument': return super().create_arg(a) + @compatibility(is_backward_compatible=True) def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool: """ A method to specify whether a given ``nn.Module`` is a "leaf" module. @@ -346,6 +362,7 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> boo """ return m.__module__.startswith('torch.nn') and not isinstance(m, torch.nn.Sequential) + @compatibility(is_backward_compatible=True) def path_of_module(self, mod : torch.nn.Module) -> str: """ Helper method to find the qualified name of ``mod`` in the Module hierarchy @@ -372,6 +389,7 @@ def path_of_module(self, mod : torch.nn.Module) -> str: return n raise NameError('module is not installed as a submodule') + @compatibility(is_backward_compatible=True) def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args : Tuple[Any, ...], kwargs : Dict[str, Any]) -> Any: """ Method that specifies the behavior of this ``Tracer`` when it encounters @@ -404,6 +422,8 @@ def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args : Tu return forward(*args, **kwargs) return self.create_proxy('call_module', module_qualified_name, args, kwargs) + # This method will be refactored + @compatibility(is_backward_compatible=False) def create_args_for_root(self, root_fn, is_module, concrete_args=None): """ Create ``placeholder`` nodes corresponding to the signature of the ``root`` @@ -509,8 +529,8 @@ def _module_getattr(self, attr, attr_val, parameter_proxy_cache): return attr_val - - def trace(self, root: Union[torch.nn.Module, Callable], concrete_args: Optional[Dict[str, Any]] = None) -> Graph: + @compatibility(is_backward_compatible=True) + def trace(self, root: Union[torch.nn.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None) -> Graph: """ Trace ``root`` and return the corresponding FX ``Graph`` representation. ``root`` can either be an ``nn.Module`` instance or a Python callable. @@ -524,8 +544,11 @@ def trace(self, root: Union[torch.nn.Module, Callable], concrete_args: Optional[ Args: root (Union[Module, Callable]): Either a ``Module`` or a function to be - traced through. - concrete_args (Optional[Dict[str, any]]): Concrete arguments that should not be treated as Proxies. + traced through. Backwards-compatibility for this parameter is + guaranteed. + concrete_args (Optional[Dict[str, any]]): Concrete arguments that should + not be treated as Proxies. This parameter is experimental and + its backwards-compatibility is *NOT* guaranteed. Returns: @@ -772,6 +795,7 @@ def _autowrap_check(patcher : _Patcher, frame_dict : Dict[str, Any], function_id patcher.patch(frame_dict, name, _create_wrapped_func(value)) +@compatibility(is_backward_compatible=True) def wrap(fn_or_name : Union[str, Callable]): """ This function can be called at module-level scope to register fn_or_name as a "leaf function". @@ -828,9 +852,11 @@ def my_custom_function(x, y): _wrapped_fns_to_patch.append((f.f_globals, fn_name)) return fn_or_name -def symbolic_trace(root : Union[torch.nn.Module, Callable], concrete_args: Optional[Dict[str, Any]] = None, +@compatibility(is_backward_compatible=True) +def symbolic_trace(root : Union[torch.nn.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None, enable_cpatching: bool = False) -> GraphModule: - """Symbolic tracing API + """ + Symbolic tracing API Given an ``nn.Module`` or function instance ``root``, this function will return a ``GraphModule`` constructed by recording operations seen while tracing through ``root``. @@ -876,7 +902,6 @@ def f(x): Returns: GraphModule: a Module created from the recorded operations from ``root``. - """ tracer = Tracer(enable_cpatching=enable_cpatching) graph = tracer.trace(root, concrete_args) diff --git a/torch/fx/annotate.py b/torch/fx/annotate.py index 6e0646a58ec52..032ce14b6ec70 100644 --- a/torch/fx/annotate.py +++ b/torch/fx/annotate.py @@ -1,6 +1,7 @@ from torch.fx.proxy import Proxy +from ._compatibility import compatibility - +@compatibility(is_backward_compatible=False) def annotate(val, type): # val could be either a regular value (not tracing) # or fx.Proxy (tracing) diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py index 692ca6304910f..1b4b4690f732d 100644 --- a/torch/fx/experimental/fx_acc/acc_ops.py +++ b/torch/fx/experimental/fx_acc/acc_ops.py @@ -10,7 +10,7 @@ register_acc_op_mapping, register_custom_acc_mapper_fn, ) -from torch.fx.passes.shape_prop import extract_tensor_metadata +from torch.fx.passes.shape_prop import _extract_tensor_metadata this_arg_is_optional = True @@ -1134,12 +1134,12 @@ def packed_quantized_linear_mapper( with node.graph.inserting_before(node): # Insert get_attr nodes for weight and bias get_weight = node.graph.get_attr(weight_name) - get_weight.meta["tensor_meta"] = extract_tensor_metadata(linear_module.weight()) + get_weight.meta["tensor_meta"] = _extract_tensor_metadata(linear_module.weight()) get_bias = None if linear_module.bias() is not None: get_bias = node.graph.get_attr(bias_name) - get_bias.meta["tensor_meta"] = extract_tensor_metadata(linear_module.bias()) + get_bias.meta["tensor_meta"] = _extract_tensor_metadata(linear_module.bias()) # Create kwargs for acc_op.quantized_linear kwargs = { @@ -1182,12 +1182,12 @@ def packed_quantized_conv2d_mapper( with node.graph.inserting_before(node): # Insert get_attr nodes for weight and bias get_weight = node.graph.get_attr(weight_name) - get_weight.meta["tensor_meta"] = extract_tensor_metadata(conv_module.weight()) + get_weight.meta["tensor_meta"] = _extract_tensor_metadata(conv_module.weight()) get_bias = None if conv_module.bias() is not None: get_bias = node.graph.get_attr(bias_name) - get_bias.meta["tensor_meta"] = extract_tensor_metadata(conv_module.bias()) + get_bias.meta["tensor_meta"] = _extract_tensor_metadata(conv_module.bias()) # Create kwargs for acc_op.conv kwargs = { diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 1ee6f05f79809..29ffc416715a7 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -1,6 +1,7 @@ from .node import Node, Argument, Target, map_arg, _type_repr, _get_qualified_name import torch.utils._pytree as pytree from . import _pytree as fx_pytree +from ._compatibility import compatibility from typing import TYPE_CHECKING, Callable, Any, List, Dict, NamedTuple, Optional, Tuple, Set, FrozenSet, Type from dataclasses import dataclass @@ -175,9 +176,12 @@ def _is_illegal_name(self, name: str, obj: Any) -> bool: return False +@compatibility(is_backward_compatible=True) @dataclass class PythonCode: - """Represents all the information necessary to exec or save a graph as Python code.""" + """ + Represents all the information necessary to exec or save a graph as Python code. + """ # Python source code for the forward function definition. src: str # Values in global scope during exection of `src_def`. @@ -240,6 +244,7 @@ class _PyTreeInfo(NamedTuple): in_spec: pytree.TreeSpec out_spec: Optional[pytree.TreeSpec] +@compatibility(is_backward_compatible=True) class Graph: """ ``Graph`` is the main data structure used in the FX Intermediate Representation. @@ -283,6 +288,8 @@ def forward(self, x): For the semantics of operations represented in the ``Graph``, please see :class:`Node`. """ + + @compatibility(is_backward_compatible=True) def __init__(self, owning_module: Optional["GraphModule"] = None, tracer_cls: Optional[Type["Tracer"]] = None): """ Construct an empty Graph. @@ -299,6 +306,11 @@ def __init__(self, owning_module: Optional["GraphModule"] = None, tracer_cls: Op @property def owning_module(self): + """ + Return the module that owns this ``GraphModule``, if there is one, + ``None`` if there is no owning module or if there are multiple owning + modules. + """ return self._owning_module @owning_module.setter @@ -322,6 +334,7 @@ def nodes(self) -> _node_list: """ return _node_list(self) + @compatibility(is_backward_compatible=True) def graph_copy(self, g : 'Graph', val_map : Dict[Node, Node], return_output_node=False) -> 'Optional[Argument]': """ Copy all nodes from a given graph into ``self``. @@ -354,7 +367,7 @@ def __deepcopy__(self, memo=None) -> 'Graph': from the default implementation. This uses graph_copy to copy the nodes in an iterative way, rather than recursive. It also populates the memoization table to prevent unnecessary copies (e.g. references to - nodes or other parts of the Graph from a custom GraphModule implementation + nodes or other parts of the Graph from a custom GraphModule implementation. """ memo = memo if memo else {} g = Graph(tracer_cls=self._tracer_cls) @@ -364,6 +377,7 @@ def __deepcopy__(self, memo=None) -> 'Graph': g.output(output_val, type_expr=getattr(old_output_val, 'type', None)) return g + @compatibility(is_backward_compatible=True) def create_node(self, op: str, target: 'Target', args: Optional[Tuple['Argument', ...]] = None, kwargs: Optional[Dict[str, 'Argument']] = None, @@ -410,10 +424,12 @@ def create_node(self, op: str, target: 'Target', self._len += 1 return n + @compatibility(is_backward_compatible=False) def flatten_inps(self, *args): flat_args, args_spec = pytree.tree_flatten(args) return flat_args + @compatibility(is_backward_compatible=False) def unflatten_outs(self, out): if self._pytree_info is None: return out @@ -422,6 +438,7 @@ def unflatten_outs(self, out): assert(self._pytree_info.out_spec is not None) return pytree.tree_unflatten(out, self._pytree_info.out_spec) + @compatibility(is_backward_compatible=True) def erase_node(self, to_erase : Node) -> None: """ Erases a ``Node`` from the ``Graph``. Throws an exception if @@ -448,6 +465,7 @@ def erase_node(self, to_erase : Node) -> None: assert isinstance(new_kwargs, dict) to_erase.kwargs = new_kwargs + @compatibility(is_backward_compatible=True) def inserting_before(self, n: Optional[Node] = None): """Set the point at which create_node and companion methods will insert into the graph. When used within a 'with' statement, this will temporary set the insert point and @@ -470,6 +488,7 @@ def inserting_before(self, n: Optional[Node] = None): assert n.graph == self, "Node to insert before is not in graph." return _InsertPoint(self, n.prepend) + @compatibility(is_backward_compatible=True) def inserting_after(self, n: Optional[Node] = None): """Set the point at which create_node and companion methods will insert into the graph. When used within a 'with' statement, this will temporary set the insert point and @@ -492,7 +511,7 @@ def inserting_after(self, n: Optional[Node] = None): assert n.graph == self, "Node to insert after is not in graph." return _InsertPoint(self, n.append) - # sugar for create_node when you know the op + @compatibility(is_backward_compatible=True) def placeholder(self, name: str, type_expr: Optional[Any] = None) -> Node: """ Insert a ``placeholder`` node into the Graph. A ``placeholder`` represents @@ -514,6 +533,7 @@ def placeholder(self, name: str, type_expr: Optional[Any] = None) -> Node: """ return self.create_node('placeholder', name, type_expr=type_expr) + @compatibility(is_backward_compatible=True) def get_attr(self, qualified_name: str, type_expr: Optional[Any] = None) -> Node: """ Insert a ``get_attr`` node into the Graph. A ``get_attr`` ``Node`` represents the @@ -571,6 +591,7 @@ def _get_attr_reference_exists(mod: torch.nn.Module, qualified_name: str) -> boo "necessary buffer") return self.create_node('get_attr', qualified_name, type_expr=type_expr) + @compatibility(is_backward_compatible=True) def call_module(self, module_name: str, args: Optional[Tuple['Argument', ...]] = None, @@ -615,6 +636,7 @@ def call_module(self, "necessary submodule") return self.create_node('call_module', module_name, args, kwargs, type_expr=type_expr) + @compatibility(is_backward_compatible=True) def call_method(self, method_name: str, args: Optional[Tuple['Argument', ...]] = None, @@ -649,6 +671,7 @@ def call_method(self, """ return self.create_node('call_method', method_name, args, kwargs, type_expr=type_expr) + @compatibility(is_backward_compatible=True) def call_function(self, the_function: Callable[..., Any], args: Optional[Tuple['Argument', ...]] = None, @@ -684,6 +707,7 @@ def call_function(self, """ return self.create_node('call_function', the_function, args, kwargs, type_expr=type_expr) + @compatibility(is_backward_compatible=True) def node_copy(self, node: Node, arg_transform: Callable[[Node], 'Argument'] = lambda x: x) -> Node: """ Copy a node from one graph into another. ``arg_transform`` needs to transform arguments from @@ -714,6 +738,7 @@ def node_copy(self, node: Node, arg_transform: Callable[[Node], 'Argument'] = la result_node.meta = copy.copy(node.meta) return result_node + @compatibility(is_backward_compatible=True) def output(self, result: 'Argument', type_expr: Optional[Any] = None): """ Insert an ``output`` ``Node`` into the ``Graph``. An ``output`` node represents @@ -745,6 +770,7 @@ def _target_to_str(self, target : Target) -> str: op = _snake_case(op) return op + @compatibility(is_backward_compatible=True) def python_code(self, root_module: str) -> PythonCode: """ Turn this ``Graph`` into valid Python code. @@ -995,7 +1021,7 @@ def forward({', '.join(orig_args)}){maybe_return_annotation[0]}: def __str__(self) -> str: """ - Print a human-readable (not machine-readable) string representation + Return a human-readable (not machine-readable) string representation of this Graph """ placeholder_names : List[str] = [] @@ -1011,10 +1037,12 @@ def __str__(self) -> str: s += '\n ' + node_str return s + @compatibility(is_backward_compatible=True) def print_tabular(self): """ Prints the intermediate representation of the graph in tabular - format. + format. Note that this API requires the ``tabulate`` module to be + installed. """ try: from tabulate import tabulate @@ -1027,6 +1055,7 @@ def print_tabular(self): print(tabulate(node_specs, headers=['opcode', 'name', 'target', 'args', 'kwargs'])) + @compatibility(is_backward_compatible=True) def lint(self): """ Runs various checks on this Graph to make sure it is well-formed. In @@ -1097,6 +1126,7 @@ def check_arg(arg : Node, n : Optional[Node] = None) -> None: else: m_itr = new_m_itr + @compatibility(is_backward_compatible=True) def eliminate_dead_code(self): """ Remove all dead code from the graph, based on each node's number of @@ -1124,7 +1154,6 @@ def forward(self, x): def forward(self, x): return x + self.attr_1 - """ # Lint the graph first to make sure its topologically sorted, otherwise # DCE below will not behave as expected. diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index c91857342ffcd..e7750db9353bd 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -6,6 +6,7 @@ import linecache from typing import Type, Dict, List, Any, Union, Optional, Set from .graph import Graph, _is_from_torch, _custom_builtins, PythonCode +from ._compatibility import compatibility from torch.package import Importer, sys_importer import copy import itertools @@ -17,9 +18,9 @@ # Normal exec loses the source code, however we can work with # the linecache module to recover it. -# Using exec_with_source will add it to our local cache +# Using _exec_with_source will add it to our local cache # and then tools like TorchScript will be able to get source info. -class EvalCacheLoader(object): +class _EvalCacheLoader(object): def __init__(self): self.eval_cache = {} self.next_id = 0 @@ -62,10 +63,10 @@ def _get_key(self): self.next_id += 1 return key -_loader = EvalCacheLoader() +_loader = _EvalCacheLoader() -def exec_with_source(src: str, globals: Dict[str, Any]): +def _exec_with_source(src: str, globals: Dict[str, Any]): key = _loader.cache(src, globals) exec(compile(src, key, 'exec'), globals) @@ -73,7 +74,7 @@ def exec_with_source(src: str, globals: Dict[str, Any]): def _forward_from_src(src: str, globals: Dict[str, Any]): # avoid mutating the passed in dict globals_copy = globals.copy() - exec_with_source(src, globals_copy) + _exec_with_source(src, globals_copy) forward_fn = globals_copy['forward'] del globals_copy['forward'] return forward_fn @@ -95,7 +96,7 @@ def _format_import_block(globals: Dict[str, Any], importer: Importer): return '\n'.join(import_strs) -def reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Module: +def _reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Module: # BC: attribute name was changed from `code` to `_code` to facilitate # making `code` into a property and adding a docstring to it fn_src = body.get('_code') or body['code'] @@ -103,14 +104,14 @@ def reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Mod return _deserialize_graph_module(forward, body) -def reduce_package_graph_module( +def _reduce_package_graph_module( importer: PackageImporter, body: Dict[Any, Any], generated_module_name: str ) -> torch.nn.Module: forward = importer.import_module(generated_module_name).forward return _deserialize_graph_module(forward, body) -def reduce_deploy_graph_module( +def _reduce_deploy_graph_module( importer: PackageImporter, body: Dict[Any, Any], import_block: str ) -> torch.nn.Module: ns = dict() @@ -219,6 +220,7 @@ def _assign_attr(from_obj: Any, to_module: torch.nn.Module, target: str): else: setattr(to_module, field, from_obj) +@compatibility(is_backward_compatible=True) class GraphModule(torch.nn.Module): """ GraphModule is an nn.Module generated from an fx.Graph. Graphmodule has a @@ -231,7 +233,6 @@ class GraphModule(torch.nn.Module): regenerated. However, if you edit the contents of the ``graph`` without reassigning the ``graph`` attribute itself, you must call ``recompile()`` to update the generated code. - """ def __new__(cls: 'Type[GraphModule]', *args, **kwargs): # each instance of a graph module needs its own forward method @@ -243,6 +244,7 @@ class GraphModuleImpl(cls): # type: ignore[misc, valid-type] pass return super().__new__(GraphModuleImpl) + @compatibility(is_backward_compatible=True) def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, @@ -266,7 +268,6 @@ def __init__(self, class_name (str): ``name`` denotes the name of this GraphModule for debugging purposes. If it's unset, all error messages will report as originating from ``GraphModule``. It may be helpful to set this to ``root``'s original name or a name that makes sense within the context of your transform. - """ super().__init__() self.__class__.__name__ = class_name @@ -334,6 +335,7 @@ def graph(self, g : Graph) -> None: g.owning_module = self self.recompile() + @compatibility(is_backward_compatible=False) def to_folder(self, folder: Union[str, os.PathLike], module_name : str = "FxModule"): """Dumps out module to ``folder`` with ``module_name`` so that it can be imported with ``from import `` @@ -398,6 +400,7 @@ def _gen_model_repr(module_name: str, module: torch.nn.Module) -> Optional[str]: warnings.warn("Was not able to save the following children modules as reprs -" f"saved as pickled files instead: {blobified_modules}") + @compatibility(is_backward_compatible=True) def add_submodule(self, target: str, m: torch.nn.Module) -> bool: """ Adds the given submodule to ``self``. @@ -418,7 +421,6 @@ def add_submodule(self, target: str, m: torch.nn.Module) -> bool: denoted by ``target`` must either a) not exist yet, or b) reference an ``nn.Module`` (not a parameter or other attribute) - """ *prefix, field = target.split('.') mod: torch.nn.Module = self @@ -439,6 +441,7 @@ def add_submodule(self, target: str, m: torch.nn.Module) -> bool: mod.add_module(field, m) return True + @compatibility(is_backward_compatible=True) def delete_submodule(self, target: str) -> bool: """ Deletes the given submodule from ``self``. @@ -481,6 +484,7 @@ def delete_submodule(self, target: str) -> bool: delattr(mod, target_submod) return True + @compatibility(is_backward_compatible=True) def delete_all_unused_submodules(self) -> None: """ Deletes all unused submodules from ``self``. @@ -535,6 +539,7 @@ def code(self) -> str: raise RuntimeError('Code has not been generated! Please report a bug to PyTorch') return self._code + @compatibility(is_backward_compatible=True) def recompile(self) -> PythonCode: """ Recompile this GraphModule from its ``graph`` attribute. This should be @@ -613,7 +618,7 @@ def __reduce_deploy__(self, importer: Importer): python_code = self.recompile() import_block = _format_import_block(python_code.globals, importer) - return (reduce_deploy_graph_module, (dict_without_graph, import_block)) + return (_reduce_deploy_graph_module, (dict_without_graph, import_block)) def __reduce_package__(self, exporter: PackageExporter): dict_without_graph = self.__dict__.copy() @@ -625,7 +630,7 @@ def __reduce_package__(self, exporter: PackageExporter): import_block = _format_import_block(python_code.globals, exporter.importer) module_code = import_block + self.code exporter.save_source_string(generated_module_name, module_code) - return (reduce_package_graph_module, (dict_without_graph, generated_module_name)) + return (_reduce_package_graph_module, (dict_without_graph, generated_module_name)) def __reduce__(self): """ @@ -639,7 +644,7 @@ def __reduce__(self): python_code = self.recompile() import_block = _format_import_block(python_code.globals, sys_importer) del dict_without_graph['_graph'] - return (reduce_graph_module, (dict_without_graph, import_block)) + return (_reduce_graph_module, (dict_without_graph, import_block)) # because __reduce__ is defined for serialization, # we need to define deepcopy otherwise it will call __reduce__ diff --git a/torch/fx/immutable_collections.py b/torch/fx/immutable_collections.py index 459c30e745dfd..1093a07c8d229 100644 --- a/torch/fx/immutable_collections.py +++ b/torch/fx/immutable_collections.py @@ -1,3 +1,4 @@ +from ._compatibility import compatibility _help_mutation = """\ If you are attempting to modify the kwargs or args of a torch.fx.Node object, @@ -20,5 +21,8 @@ def _create_immutable_container(base, mutable_functions): 'clear', 'extend', 'insert', 'pop', 'remove']) immutable_list.__reduce__ = lambda self: (immutable_list, (tuple(iter(self)),)) +compatibility(is_backward_compatible=True)(immutable_list) + immutable_dict = _create_immutable_container(dict, ['__delitem__', '__setitem__', 'clear', 'pop', 'popitem', 'update']) immutable_dict.__reduce__ = lambda self: (immutable_dict, (iter(self.items()),)) +compatibility(is_backward_compatible=True)(immutable_dict) diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py index 20dcf62e0c3cb..64233b4cf18b6 100644 --- a/torch/fx/interpreter.py +++ b/torch/fx/interpreter.py @@ -3,8 +3,10 @@ from .node import Argument, Node, Target, map_arg, map_aggregate from .proxy import Proxy from ._symbolic_trace import Tracer +from ._compatibility import compatibility from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +@compatibility(is_backward_compatible=True) class Interpreter: """ An Interpreter executes an FX graph Node-by-Node. This pattern @@ -59,6 +61,7 @@ def fn(x): execution. This can be disabled to, for example, examine all of the intermediate values in the execution by looking at the ``Interpreter.env`` attribute. """ + @compatibility(is_backward_compatible=True) def __init__(self, module : GraphModule, garbage_collect_values : bool = True): assert isinstance(module, GraphModule) self.module = module @@ -84,6 +87,7 @@ def register_last_uses(n : Node, user : Node): map_arg(node.args, lambda n: register_last_uses(n, node)) map_arg(node.kwargs, lambda n: register_last_uses(n, node)) + @compatibility(is_backward_compatible=True) def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None) -> Any: """ Run `module` via interpretation and return the result. @@ -123,6 +127,7 @@ def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None) -> Any: output_val = self.env[node] return output_val + @compatibility(is_backward_compatible=True) def run_node(self, n : Node) -> Any: """ Run a specific node ``n`` and return the result. @@ -142,7 +147,7 @@ def run_node(self, n : Node) -> Any: return getattr(self, n.op)(n.target, args, kwargs) # Main Node running APIs - + @compatibility(is_backward_compatible=True) def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: """ Execute a ``placeholder`` node. Note that this is stateful: @@ -168,6 +173,7 @@ def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : D else: return next(self.args_iter) + @compatibility(is_backward_compatible=True) def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: """ Execute a ``get_attr`` node. Will retrieve an attribute @@ -186,6 +192,7 @@ def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict assert isinstance(target, str) return self.fetch_attr(target) + @compatibility(is_backward_compatible=True) def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: """ Execute a ``call_function`` node and return the result. @@ -205,6 +212,7 @@ def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : # Execute the function and return the result return target(*args, **kwargs) + @compatibility(is_backward_compatible=True) def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: """ Execute a ``call_method`` node and return the result. @@ -226,6 +234,7 @@ def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : D assert isinstance(target, str) return getattr(self_obj, target)(*args_tail, **kwargs) + @compatibility(is_backward_compatible=True) def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: """ Execute a ``call_module`` node and return the result. @@ -248,6 +257,7 @@ def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : D return submod(*args, **kwargs) + @compatibility(is_backward_compatible=True) def output(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: """ Execute an ``output`` node. This really just retrieves @@ -266,7 +276,7 @@ def output(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[s return args[0] # Helper methods - + @compatibility(is_backward_compatible=True) def fetch_attr(self, target : str): """ Fetch an attribute from the ``Module`` hierarchy of ``self.module``. @@ -285,6 +295,7 @@ def fetch_attr(self, target : str): attr_itr = getattr(attr_itr, atom) return attr_itr + @compatibility(is_backward_compatible=True) def fetch_args_kwargs_from_env(self, n : Node) -> Tuple[Tuple, Dict]: """ Fetch the concrete values of ``args`` and ``kwargs`` of node ``n`` @@ -302,6 +313,7 @@ def fetch_args_kwargs_from_env(self, n : Node) -> Tuple[Tuple, Dict]: assert isinstance(kwargs, dict) return args, kwargs + @compatibility(is_backward_compatible=True) def map_nodes_to_values(self, args : Argument, n : Node) -> Argument: """ Recursively descend through ``args`` and look up the concrete value @@ -319,6 +331,7 @@ def load_arg(n_arg : Node) -> Any: return self.env[n_arg] return map_arg(args, load_arg) +@compatibility(is_backward_compatible=True) class Transformer(Interpreter): """ ``Transformer`` is a special type of interpreter that produces a @@ -357,6 +370,8 @@ def fn(x): Args: module (GraphModule): The ``Module`` to be transformed. """ + + @compatibility(is_backward_compatible=True) def __init__(self, module): super().__init__(module) self.new_graph = Graph() @@ -371,6 +386,7 @@ def is_leaf_module(self, _, __) -> bool: self.tracer = TransformerTracer(self.new_graph) self.tracer.root = module + @compatibility(is_backward_compatible=True) def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy: """ Execute a ``placeholder`` node. In ``Transformer``, this is @@ -387,6 +403,7 @@ def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : D assert isinstance(target, str) return Proxy(self.new_graph.placeholder(target), self.tracer) + @compatibility(is_backward_compatible=True) def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy: """ Execute a ``get_attr`` node. In ``Transformer``, this is @@ -403,16 +420,19 @@ def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict assert isinstance(target, str) return Proxy(self.new_graph.get_attr(target), self.tracer) + @compatibility(is_backward_compatible=True) def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: # Override so that the leaf module policy from `self.tracer` is respected. assert isinstance(target, str) submod = self.fetch_attr(target) return self.tracer.call_module(submod, submod.forward, args, kwargs) + @compatibility(is_backward_compatible=True) def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any: # Override so that functions that were wrapped are still wrapped. return self.tracer.create_proxy('call_function', target, args, kwargs) + @compatibility(is_backward_compatible=True) def transform(self) -> GraphModule: """ Transform ``self.module`` and return the transformed diff --git a/torch/fx/node.py b/torch/fx/node.py index 8c4faf7d4fa27..61dfba7acb03f 100644 --- a/torch/fx/node.py +++ b/torch/fx/node.py @@ -1,5 +1,6 @@ # Nodes represent a definition of a value in our graph of operators. from typing import TYPE_CHECKING, Union, Callable, Any, Tuple, List, Optional, Dict, Set +from ._compatibility import compatibility from .immutable_collections import immutable_dict, immutable_list import torch import builtins @@ -85,6 +86,7 @@ def _format_arg(arg) -> str: else: return str(arg) +@compatibility(is_backward_compatible=True) class Node: """ ``Node`` is the data structure that represents individual operations within @@ -112,9 +114,37 @@ class Node: - ``output`` contains the output of the traced function in its ``args[0]`` attribute. This corresponds to the "return" statement in the Graph printout. """ + + @compatibility(is_backward_compatible=True) def __init__(self, graph: 'Graph', name: str, op: str, target: 'Target', args: Tuple['Argument', ...], kwargs: Dict[str, 'Argument'], return_type : Optional[Any] = None) -> None: + """ + Instantiate an instance of ``Node``. Note: most often, you want to use the + Graph APIs, i.e. ``Graph.call_module``, ``Graph.call_method``, etc. rather + than instantiating a ``Node`` directly. + + Args: + graph (Graph): The ``Graph`` to which this ``Node`` should belong. + + name (str): The name to which the output of this ``Node`` should be assigned + + op (str): The opcode for this ``Node``. Can be one of 'placeholder', + 'call_method', 'call_module', 'call_function', 'get_attr', + 'output' + + target ('Target'): The target this op should call. See the broader + ``Node`` docstring for more details. + + args (Tuple['Argument']): The args to be passed to ``target`` + + kwargs (Dict[str, 'Argument']): The kwargs to be passed to ``target`` + + return_type (Optional[Any]): The python type expression representing the + type of the output of this node. This field can be used for + annotation of values in the generated code or for other types + of analyses. + """ self.graph = graph self.name = name # unique name of value being created assert op in ['placeholder', 'call_method', 'call_module', 'call_function', 'get_attr', 'output', 'root'] @@ -187,6 +217,7 @@ def prev(self) -> 'Node': """ return self._prev + @compatibility(is_backward_compatible=True) def prepend(self, x: 'Node') -> None: """ Insert x before this node in the list of nodes in the graph. Example:: @@ -205,6 +236,7 @@ def prepend(self, x: 'Node') -> None: p._next, x._prev = x, p x._next, self._prev = self, x + @compatibility(is_backward_compatible=True) def append(self, x: 'Node') -> None: """ Insert x after this node in the list of nodes in the graph. @@ -279,6 +311,7 @@ def all_input_nodes(self) -> List['Node']: """ return list(self._input_nodes.keys()) + @compatibility(is_backward_compatible=True) def update_arg(self, idx : int, arg : Argument) -> None: """ Update an existing positional argument to contain the new value @@ -293,6 +326,7 @@ def update_arg(self, idx : int, arg : Argument) -> None: args[idx] = arg self.args = tuple(args) + @compatibility(is_backward_compatible=True) def update_kwarg(self, key : str, arg : Argument) -> None: """ Update an existing keyword argument to contain the new value @@ -365,6 +399,7 @@ def _pretty_print_target(self, target): return f'operator.{target.__name__}' return _get_qualified_name(target) + @compatibility(is_backward_compatible=True) def format_node(self, placeholder_names: List[str] = None, maybe_return_typename: List[str] = None) -> Optional[str]: @@ -420,6 +455,7 @@ def format_node(self, f'{self.op}[target={self._pretty_print_target(self.target)}](' \ f'args = {_format_arg(self.args)}, kwargs = {_format_arg(self.kwargs)})' + @compatibility(is_backward_compatible=True) def replace_all_uses_with(self, replace_with : 'Node') -> List['Node']: """ Replace all uses of ``self`` in the Graph with the Node ``replace_with``. @@ -449,6 +485,7 @@ def maybe_replace_node(n : Node) -> Node: assert len(self.users) == 0 return to_process + @compatibility(is_backward_compatible=False) def is_impure(self): """ Returns whether this op is impure, i.e. if its op is a placeholder or @@ -478,6 +515,7 @@ def is_impure(self): return False + @compatibility(is_backward_compatible=False) def normalized_arguments( self, root : torch.nn.Module, arg_types : Optional[Tuple[Any]] = None, kwarg_types : Optional[Dict[str, Any]] = None, @@ -513,7 +551,7 @@ def normalized_arguments( return None - + @compatibility(is_backward_compatible=True) def replace_input_with(self, old_input: 'Node', new_input: 'Node'): """ Loop through input nodes of ``self``, and replace all instances of @@ -523,7 +561,6 @@ def replace_input_with(self, old_input: 'Node', new_input: 'Node'): old_input (Node): The old input node to be replaced. new_input (Node): The new input node to replace ``old_input``. - """ def maybe_replace_node(n : Node) -> Node: return new_input if n == old_input else n @@ -535,13 +572,19 @@ def maybe_replace_node(n : Node) -> Node: self.__update_args_kwargs(new_args, new_kwargs) +@compatibility(is_backward_compatible=True) def map_arg(a: Argument, fn: Callable[[Node], Argument]) -> Argument: - """ Apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys. """ + """ + Apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys. + """ assert callable(fn), "torch.fx.map_arg(a, fn): fn must be a callable" return map_aggregate(a, lambda x: fn(x) if isinstance(x, Node) else x) +@compatibility(is_backward_compatible=True) def map_aggregate(a: Argument, fn: Callable[[Argument], Argument]) -> Argument: - """ Apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys. """ + """ + Apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys. + """ if isinstance(a, tuple): return tuple(map_aggregate(elem, fn) for elem in a) elif isinstance(a, list): diff --git a/torch/fx/operator_schemas.py b/torch/fx/operator_schemas.py index 5f61ebe718ff1..ac559b19530c7 100644 --- a/torch/fx/operator_schemas.py +++ b/torch/fx/operator_schemas.py @@ -6,7 +6,9 @@ import warnings from typing import Any, Callable, Dict, List, Optional, Tuple, NamedTuple, cast from torch._jit_internal import boolean_dispatched +from ._compatibility import compatibility +@compatibility(is_backward_compatible=False) class ArgsKwargsPair(NamedTuple): """ Simple named tuple for wrapping args/kwargs pairs. @@ -76,6 +78,7 @@ def _torchscript_schema_to_signature(ts_schema : torch._C.FunctionSchema) -> ins return inspect.Signature(parameters, return_annotation=return_type) +@compatibility(is_backward_compatible=False) def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature]]: """ Given an operator on the `torch` namespace, return a list of `inspect.Signature` @@ -103,6 +106,7 @@ def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature return signatures +@compatibility(is_backward_compatible=False) def create_type_hint(x): try: if isinstance(x, list) or isinstance(x, tuple): @@ -130,6 +134,7 @@ def ret_type(x): pass return x +@compatibility(is_backward_compatible=False) def type_matches(signature_type : Any, argument_type : Any): sig_origin_type = getattr(signature_type, '__origin__', signature_type) @@ -177,6 +182,7 @@ def is_homogeneous_tuple(t): return False +@compatibility(is_backward_compatible=False) def normalize_function( target: Callable, args: Tuple[Any], kwargs : Optional[Dict[str, Any]] = None, arg_types : Optional[Tuple[Any]] = None, kwarg_types : Optional[Dict[str, Any]] = None, @@ -272,6 +278,7 @@ def normalize_function( return new_args_and_kwargs +@compatibility(is_backward_compatible=False) def normalize_module( root: torch.nn.Module, target: str, args: Tuple[Any], kwargs : Optional[Dict[str, Any]] = None, normalize_to_only_use_kwargs : bool = False) -> Optional[ArgsKwargsPair]: diff --git a/torch/fx/passes/shape_prop.py b/torch/fx/passes/shape_prop.py index 6f0f72d38c75f..816fbe7aaac6c 100644 --- a/torch/fx/passes/shape_prop.py +++ b/torch/fx/passes/shape_prop.py @@ -2,7 +2,9 @@ import torch.fx from torch.fx.node import Node, map_aggregate from typing import Any, Tuple, NamedTuple, Optional +from torch.fx._compatibility import compatibility +@compatibility(is_backward_compatible=True) class TensorMetadata(NamedTuple): # TensorMetadata is a structure containing pertinent information # about a tensor within a PyTorch program. @@ -20,7 +22,7 @@ class TensorMetadata(NamedTuple): q_scale : Optional[float] q_zero_point : Optional[int] -def extract_tensor_metadata(result : torch.Tensor) -> TensorMetadata: +def _extract_tensor_metadata(result : torch.Tensor) -> TensorMetadata: """ Extract a TensorMetadata NamedTuple describing `result`. """ @@ -58,7 +60,7 @@ def extract_tensor_metadata(result : torch.Tensor) -> TensorMetadata: return TensorMetadata( shape, dtype, requires_grad, stride, memory_format, is_quantized, qscheme, q_scale, q_zero_point) - +@compatibility(is_backward_compatible=True) class ShapeProp(torch.fx.Interpreter): """ Execute an FX graph Node-by-Node and @@ -113,7 +115,7 @@ def extract_tensor_meta(obj): if isinstance(obj, torch.Tensor): nonlocal found_tensor found_tensor = True - return extract_tensor_metadata(obj) + return _extract_tensor_metadata(obj) else: return obj diff --git a/torch/fx/passes/split_module.py b/torch/fx/passes/split_module.py index 989ec92777cc3..c42af7e9c2d9b 100644 --- a/torch/fx/passes/split_module.py +++ b/torch/fx/passes/split_module.py @@ -1,7 +1,9 @@ import torch from torch.fx.graph_module import GraphModule from typing import Callable, List, Dict, Any, Optional +from torch.fx._compatibility import compatibility +@compatibility(is_backward_compatible=True) class Partition: def __init__(self, name: str): self.name: str = name @@ -23,6 +25,7 @@ def __repr__(self) -> str: f" parition dependents: {self.partition_dependents}" # Creates subgraphs out of main graph +@compatibility(is_backward_compatible=True) def split_module( m: GraphModule, root_m: torch.nn.Module, diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py index c0b83bc5c3734..61b039f8b7219 100644 --- a/torch/fx/proxy.py +++ b/torch/fx/proxy.py @@ -7,11 +7,14 @@ from .graph import magic_methods, reflectable_magic_methods, Graph from typing import Tuple, Dict, Optional, Iterable, Any, Iterator, Callable from .node import Target, Node, Argument, base_types, map_aggregate +from ._compatibility import compatibility +@compatibility(is_backward_compatible=True) class TracerBase: graph: Graph record_stack_traces : bool = False + @compatibility(is_backward_compatible=True) def create_node(self, kind : str, target : Target, args : Tuple[Argument, ...], kwargs : Dict[str, Argument], name : Optional[str] = None, type_expr : Optional[Any] = None) -> Node: @@ -24,11 +27,11 @@ def create_node(self, kind : str, target : Target, """ return self.graph.create_node(kind, target, args, kwargs, name, type_expr) + @compatibility(is_backward_compatible=True) def proxy(self, node: Node) -> 'Proxy': return Proxy(node, self) - - + @compatibility(is_backward_compatible=True) def create_proxy(self, kind: str, target: Target, args: Tuple[Any, ...], kwargs: Dict[str, Any], name: Optional[str] = None, type_expr : Optional[Any] = None, proxy_factory_fn: Callable[[Node], 'Proxy'] = None): @@ -86,6 +89,7 @@ def _find_user_frame(self): return frame + @compatibility(is_backward_compatible=True) def create_arg(self, a: Any) -> Argument: """ A method that lowers the objects seen as arguments during symbolic evaluation @@ -131,6 +135,7 @@ def no_node(arg): raise NotImplementedError(f"argument of type: {type(a)}") + @compatibility(is_backward_compatible=True) def to_bool(self, obj: 'Proxy') -> bool: """Called when a proxy object is being converted to a boolean, such as when used in control flow. Normally we don't know what to do because @@ -139,6 +144,7 @@ def to_bool(self, obj: 'Proxy') -> bool: """ raise TraceError('symbolically traced variables cannot be used as inputs to control flow') + @compatibility(is_backward_compatible=True) def iter(self, obj: 'Proxy') -> Iterator: """Called when a proxy object is being iterated over, such as when used in control flow. Normally we don't know what to do because @@ -154,6 +160,7 @@ def iter(self, obj: 'Proxy') -> Iterator: ' Proxy docstring for help troubleshooting ' 'Proxy iteration errors') + @compatibility(is_backward_compatible=True) def keys(self, obj: 'Proxy') -> Any: """Called when a proxy object is has the keys() method called. This is what happens when ** is called on a proxy. This should return an @@ -163,15 +170,17 @@ def keys(self, obj: 'Proxy') -> Any: # used in Proxy object when just appending to the graph while not tracing. +@compatibility(is_backward_compatible=True) class GraphAppendingTracer(TracerBase): def __init__(self, graph: Graph): super().__init__() self.graph = graph +@compatibility(is_backward_compatible=True) class TraceError(ValueError): pass - +@compatibility(is_backward_compatible=True) class Proxy: """ ``Proxy`` objects are ``Node`` wrappers that flow through the @@ -200,6 +209,8 @@ class Proxy: For a more detailed description into the Proxy internals, check out the "Proxy" section in `torch/fx/OVERVIEW.md` """ + + @compatibility(is_backward_compatible=True) def __init__(self, node: Node, tracer: 'Optional[TracerBase]' = None): if tracer is None: # This allows you to create a Proxy object around a raw Node @@ -232,6 +243,7 @@ def __iter__(self) -> Iterable['Proxy']: def __bool__(self) -> bool: return self.tracer.to_bool(self) + @compatibility(is_backward_compatible=True) def keys(self): return self.tracer.keys(self) @@ -253,7 +265,9 @@ def __torch_function__(self, orig_method, types, args=None, kwargs=None): return self.tracer.create_proxy('call_function', orig_method, args, kwargs, name=self.tracer.graph._target_to_str(orig_method.__name__)) +@compatibility(is_backward_compatible=True) class Attribute(Proxy): + @compatibility(is_backward_compatible=True) def __init__(self, root: Proxy, attr: str): self.root = root self.attr = attr @@ -272,9 +286,10 @@ def __call__(self, *args, **kwargs): return self.tracer.create_proxy('call_method', self.attr, (self.root,) + args, kwargs) +@compatibility(is_backward_compatible=False) class ParameterProxy(Proxy): """ - a special proxy which lets "shape", "size", "dim", and a few other + A special proxy which lets "shape", "size", "dim", and a few other attribute accesses pass through to the underlying module parameter object, so that conditional tests on these attributes will not throw exception during tracing """ @@ -309,7 +324,7 @@ def nelement(self): for method in magic_methods: - def scope(method): + def _scope(method): def impl(*args, **kwargs): tracer = args[0].tracer target = getattr(operator, method) @@ -317,7 +332,7 @@ def impl(*args, **kwargs): impl.__name__ = method as_magic = f'__{method}__' setattr(Proxy, as_magic, impl) - scope(method) + _scope(method) def _define_reflectable(orig_method_name): method_name = f'__r{orig_method_name}__' diff --git a/torch/fx/subgraph_rewriter.py b/torch/fx/subgraph_rewriter.py index e779f6ca9e6b1..72ea56aa31196 100644 --- a/torch/fx/subgraph_rewriter.py +++ b/torch/fx/subgraph_rewriter.py @@ -2,22 +2,24 @@ from .graph import Graph from .node import Node from ._symbolic_trace import symbolic_trace +from ._compatibility import compatibility import copy from typing import Callable, Dict, List, NamedTuple, Optional, Set import torch +@compatibility(is_backward_compatible=True) class Match(NamedTuple): # Node from which the match was found anchor: Node # Maps nodes in the pattern subgraph to nodes in the larger graph nodes_map: Dict[Node, Node] -class SubgraphMatcher: +class _SubgraphMatcher: def __init__(self, pattern: Graph) -> None: self.pattern = pattern if len(pattern.nodes) == 0: - raise ValueError("SubgraphMatcher cannot be initialized with an " + raise ValueError("_SubgraphMatcher cannot be initialized with an " "empty pattern") # `self.pattern_anchor` is the output Node in `pattern` self.pattern_anchor = next(iter(reversed(pattern.nodes))) @@ -129,6 +131,7 @@ def try_get_submodule(mod: torch.nn.Module, target: str) -> Optional[torch.nn.Mo gm.graph.lint() +@compatibility(is_backward_compatible=True) def replace_pattern(gm: GraphModule, pattern: Callable, replacement: Callable) -> List[Match]: """ Matches all possible non-overlapping sets of operators and their @@ -242,7 +245,6 @@ def forward(self, x, w1, w2): max_2 = torch.max(sum_2) add_2 = add_1 + max_2 return add_2 - """ # Get the graphs for `gm`, `pattern`, `replacement` original_graph = gm.graph @@ -251,7 +253,7 @@ def forward(self, x, w1, w2): # Find all possible pattern matches in original_graph. Note that # pattern matches may overlap with each other. - matcher = SubgraphMatcher(pattern_graph) + matcher = _SubgraphMatcher(pattern_graph) matches: List[Match] = [] # Consider each node as an "anchor" (deepest matching graph node) diff --git a/torch/fx/tensor_type.py b/torch/fx/tensor_type.py index 18387ee3c78f7..0840122a9b168 100644 --- a/torch/fx/tensor_type.py +++ b/torch/fx/tensor_type.py @@ -1,6 +1,9 @@ from torch.fx.experimental.unification import Var # type: ignore[attr-defined] +from ._compatibility import compatibility + +@compatibility(is_backward_compatible=False) class TensorType: """ TensorType defines a type for tensors, which consists of a list of dimensions. @@ -48,7 +51,7 @@ def __repr__(self): Dyn = _DynType() - +@compatibility(is_backward_compatible=False) def is_consistent(t1, t2): """ A binary relation denoted by ~ that determines if t1 is consistent with t2. @@ -74,6 +77,7 @@ def is_consistent(t1, t2): return False +@compatibility(is_backward_compatible=False) def is_more_precise(t1, t2): """ A binary relation denoted by <= that determines if t1 is more precise than t2. diff --git a/torch/quantization/ns/graph_passes.py b/torch/quantization/ns/graph_passes.py index 36e737e3baf4b..51eb6c24ef3fb 100644 --- a/torch/quantization/ns/graph_passes.py +++ b/torch/quantization/ns/graph_passes.py @@ -361,6 +361,7 @@ def _insert_copy_of_subgraph_a_after_input_node_c( if isinstance(input_node_c, Node): graph_c = input_node_c.graph else: + assert isinstance(input_node_c, list) graph_c = input_node_c[0].graph # create a sequential list of the subgraphs' nodes from start to end, @@ -450,6 +451,7 @@ def _insert_copy_of_node_a_after_input_node_c( if isinstance(input_node_c, Node): graph_c = input_node_c.graph else: + assert isinstance(input_node_c, list) graph_c = input_node_c[0].graph # generically handle all args and kwargs except for the input From e7fb35021aca4b29bbf6f7120c5bff1400179175 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Mon, 30 Aug 2021 20:08:15 -0700 Subject: [PATCH 380/530] [nnc] Enable fusion of bfloat16 ops (#64196) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64196 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D30643864 Pulled By: bertmaher fbshipit-source-id: e95edeaf7089464d713ea1d1f951743d3e5f61c5 --- test/test_jit_fuser_te.py | 30 ++++++------- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 7 ++-- torch/csrc/jit/tensorexpr/block_codegen.cpp | 2 + torch/csrc/jit/tensorexpr/codegen.cpp | 2 +- torch/csrc/jit/tensorexpr/codegen.h | 4 +- torch/csrc/jit/tensorexpr/cpp_codegen.cpp | 2 +- torch/csrc/jit/tensorexpr/cuda_codegen.cpp | 31 +++++++------- torch/csrc/jit/tensorexpr/eval.cpp | 29 ++++++++----- torch/csrc/jit/tensorexpr/eval.h | 16 +++---- torch/csrc/jit/tensorexpr/expr.cpp | 2 +- torch/csrc/jit/tensorexpr/expr.h | 2 +- torch/csrc/jit/tensorexpr/fwd_decls.h | 2 +- torch/csrc/jit/tensorexpr/half_support.h | 42 +++++++++++++++---- torch/csrc/jit/tensorexpr/hash_provider.h | 10 ++++- torch/csrc/jit/tensorexpr/ir.cpp | 2 +- torch/csrc/jit/tensorexpr/ir.h | 10 ++--- torch/csrc/jit/tensorexpr/ir_cloner.cpp | 2 +- torch/csrc/jit/tensorexpr/ir_cloner.h | 2 +- torch/csrc/jit/tensorexpr/ir_mutator.cpp | 2 +- torch/csrc/jit/tensorexpr/ir_mutator.h | 2 +- torch/csrc/jit/tensorexpr/ir_printer.cpp | 2 +- torch/csrc/jit/tensorexpr/ir_printer.h | 2 +- torch/csrc/jit/tensorexpr/ir_simplifier.h | 2 +- torch/csrc/jit/tensorexpr/ir_visitor.cpp | 2 +- torch/csrc/jit/tensorexpr/ir_visitor.h | 2 +- torch/csrc/jit/tensorexpr/kernel.cpp | 4 +- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 6 ++- torch/csrc/jit/tensorexpr/reduction.h | 4 +- torch/csrc/jit/tensorexpr/tensorexpr_init.cpp | 4 +- torch/csrc/jit/tensorexpr/types.cpp | 8 ++-- torch/csrc/jit/tensorexpr/types.h | 4 +- 31 files changed, 147 insertions(+), 94 deletions(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 918cc702d83d6..a6cc085b27c70 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -97,6 +97,7 @@ def setUp(self): torch.float16, torch.float32, torch.float64, + torch.bfloat16, ] self.dtypes = self.int_dtypes + self.fp_dtypes @@ -1145,7 +1146,7 @@ def forward(self, x): bad_dtypes = [] for dtype, output_dtype, device, size in product(dtypes, dtypes, self.devices, sizes): # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue if dtype == output_dtype: continue @@ -1210,7 +1211,7 @@ def test_isnan(self): for inp, device, dtype in product(inputs, self.devices, dtypes): # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue inp = inp.to(device=device, dtype=dtype) try: @@ -1263,7 +1264,8 @@ def apply(fn): torch.round, torch.trunc, torch.frac, - F.hardshrink, + # TODO: broken on ROCm? + # F.hardshrink, F.leaky_relu, lambda x: torch.threshold(x, 0, -10), lambda x: torch.clamp(x, -10, 10), @@ -1272,7 +1274,7 @@ def apply(fn): sizes = [(1,), (2,), (4, 4)] for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes): # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue if op in gpu_only and device == "cpu": continue @@ -1325,7 +1327,7 @@ def apply(fn): ] devices = self.devices for dtype, op, device in product(self.dtypes, binary_ops, devices): - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue try: x = self.data_for(dtype, device) @@ -1377,7 +1379,7 @@ def fn(x, y): "[[10, 3, 4], [4, 5]]", ] for dtype, size, device in product(self.dtypes, sizes, devices): - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue try: size_x, size_y = size @@ -1423,7 +1425,7 @@ def apply_with_scalar(fn, scalar): # only using scalar values relevant to particular ops scalars = [1.5, 3, 0, -2.0, -1] for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars): - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue try: x = self.data_for(dtype, device) @@ -1457,7 +1459,7 @@ def apply_with_scalar(fn, scalar): # only using scalar values relevant to particular ops scalars = [1.5, 3, -2.0, -1] # skip 0 for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars): - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue try: x = self.data_for(dtype, device) @@ -1494,7 +1496,7 @@ def apply_with_scalar(fn, scalar): # only using scalar values relevant to particular ops scalars = [1.5, 3, 0, -2.0, -1] for dtype, op, device, scalar in product(dtypes, binary_ops, self.devices, scalars): - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue try: x = self.data_for(dtype, device) @@ -1524,7 +1526,7 @@ def apply(fn): ] devices = self.devices for dtype, op, device in product(self.dtypes, ternary_ops, devices): - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue try: x = self.data_for(dtype, device) @@ -1555,7 +1557,7 @@ def apply(fn): ] devices = self.devices for dtype, op, device in product(self.dtypes, ternary_ops, devices): - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue try: x = self.data_for(dtype, device, size=[5, 3, 128, 128]) @@ -1588,7 +1590,7 @@ def apply(fn): torch.cat, ] for dtype, op, device in product(self.dtypes, list_ops, devices): - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue try: x = self.data_for(dtype, device, size=[5, 4, 1, 7]) @@ -1621,7 +1623,7 @@ def apply(fn): ] devices = self.devices for dtype, op, device in product(self.dtypes, ops, devices): - if dtype == torch.float16 and device == "cpu": + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue try: cond = self.data_for(torch.bool, device) @@ -1650,7 +1652,6 @@ def fn(x): unsupported_dtypes = [ torch.uint8, - torch.bfloat16, torch.complex32, torch.complex64, torch.complex128, @@ -1791,6 +1792,7 @@ def eager(x, y): dtypes = self.dtypes.copy() # CPU fuser doesn't support float16. dtypes.remove(torch.float16) + dtypes.remove(torch.bfloat16) for dtype1, dtype2 in product(dtypes, dtypes): x = torch.randint(2, (1, 13,)).to(dtype1) zero = torch.tensor([[0]]).to(dtype2) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index a3e37072a032d..75305d63e072f 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -966,7 +966,9 @@ class TensorExprFuser { // but on top of that Float16 has a few kinks on LLVM. Thus, on CPU we // additionally disable it until we either move to a more stable version // or find workarounds. - if (*st == c10::ScalarType::Half && *device == c10::kCPU) { + if ((*st == c10::ScalarType::Half || + *st == c10::ScalarType::BFloat16) && + *device == c10::kCPU) { return false; } @@ -1098,8 +1100,7 @@ class TensorExprFuser { // All tensor types should be known. return false; } - if (c10::isComplexType(*st) || c10::isQIntType(*st) || - *st == c10::ScalarType::BFloat16) { + if (c10::isComplexType(*st) || c10::isQIntType(*st)) { return false; } } diff --git a/torch/csrc/jit/tensorexpr/block_codegen.cpp b/torch/csrc/jit/tensorexpr/block_codegen.cpp index 51b7b77f6d39d..b42d37428208b 100644 --- a/torch/csrc/jit/tensorexpr/block_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/block_codegen.cpp @@ -16,6 +16,8 @@ std::string blockDtypeCppString(const Dtype& dtype) { return "1"; case ScalarType::Half: return "2"; + case ScalarType::BFloat16: + return "2"; // NOLINTNEXTLINE(bugprone-branch-clone) case ScalarType::Char: return "1"; diff --git a/torch/csrc/jit/tensorexpr/codegen.cpp b/torch/csrc/jit/tensorexpr/codegen.cpp index 0bbc3378b0323..b2b077b9771d1 100644 --- a/torch/csrc/jit/tensorexpr/codegen.cpp +++ b/torch/csrc/jit/tensorexpr/codegen.cpp @@ -67,7 +67,7 @@ void* CodeGen::argToPtr(const BufferArg& bufferArg, const CallArg& callArg) { case ScalarType::Name: \ return callArg.Name##Ptr(); - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h index 29255aac07df2..0504f9a8b0b0b 100644 --- a/torch/csrc/jit/tensorexpr/codegen.h +++ b/torch/csrc/jit/tensorexpr/codegen.h @@ -153,7 +153,7 @@ class CodeGen::CallArg { memcpy(&data_, &v, sizeof(Type)); \ } // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, ARG_TYPE_CTOR); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ARG_TYPE_CTOR); #undef ARG_TYPE_CTOR void* data() const { @@ -165,7 +165,7 @@ class CodeGen::CallArg { return (Type*)&data_; \ } // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, ARG_PTR_DEFINE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ARG_PTR_DEFINE); #undef ARG_PTR_DEFINE private: diff --git a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp index 20795e43dd57b..6c02f7f7e09df 100644 --- a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp @@ -149,7 +149,7 @@ void dispatch_binary_op(std::ostream& os, const BinaryOpNode* v) { case ScalarType::Name: \ visit_binary_op(os, v->lhs(), v->rhs(), v->expr_type()); \ break; - AT_FORALL_SCALAR_TYPES_AND2(Half, Bool, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: throw unsupported_dtype(); diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index 30d42075189fb..c23eda31204de 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -98,6 +98,8 @@ std::string CudaPrinter::dtypeToCppString(const Dtype& dtype) { return "bool"; case ScalarType::Half: return "half"; + case ScalarType::BFloat16: + return "__nv_bfloat16"; case ScalarType::Char: return "char"; case ScalarType::Byte: @@ -251,20 +253,15 @@ void CudaPrinter::visit(ForPtr v) { } void CudaPrinter::visit(CastPtr v) { - if (v->dtype().scalar_type() == ScalarType::Half) { - os() << "__float2half("; - v->src_value()->accept(this); - os() << ")"; - return; - } else if (v->src_value()->dtype().scalar_type() == ScalarType::Half) { - os() << "__half2float("; - v->src_value()->accept(this); - os() << ")"; - return; - } - - os() << "(" << dtypeToCppString(v->dtype()) << ")"; - os() << "("; + std::string castFn = v->dtype().scalar_type() == ScalarType::Half + ? "__float2half" + : v->dtype().scalar_type() == ScalarType::BFloat16 ? "__float2bfloat16" + : v->src_value()->dtype().scalar_type() == ScalarType::Half + ? "__half2float" + : v->src_value()->dtype().scalar_type() == ScalarType::BFloat16 + ? "__bfloat162float" + : ("(" + dtypeToCppString(v->dtype()) + ")"); + os() << castFn << "("; v->src_value()->accept(this); os() << ")"; } @@ -320,7 +317,8 @@ void CudaPrinter::visit(LoadPtr v) { return; } if (v->dtype().scalar_type() == ScalarType::Bool || - v->dtype().scalar_type() == ScalarType::Half) { + v->dtype().scalar_type() == ScalarType::Half || + v->dtype().scalar_type() == ScalarType::BFloat16) { // There's no __ldg overload for bool or half. os() << *v->base_handle() << "[" << *v->flat_index() << "]"; return; @@ -944,6 +942,9 @@ void CudaCodeGen::Initialize() { if (halfChecker.hasHalf()) { os() << fuser::cuda::half_support_literal << std::endl; } + if (halfChecker.hasBFloat16()) { + os() << fuser::cuda::bfloat16_support_literal << std::endl; + } std::string func_name = GetUniqueFuncName(kernel_func_name()); os() << "extern \"C\" __global__" << std::endl; diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp index e42ce77820e11..4582433d95697 100644 --- a/torch/csrc/jit/tensorexpr/eval.cpp +++ b/torch/csrc/jit/tensorexpr/eval.cpp @@ -62,6 +62,10 @@ inline c10::Half div_value(c10::Half lhs, c10::Half rhs) { return lhs / rhs; } +inline c10::BFloat16 div_value(c10::BFloat16 lhs, c10::BFloat16 rhs) { + return lhs / rhs; +} + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) class SimpleIREvaluatorImpl : public IRVisitor { public: @@ -347,7 +351,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { case ScalarType::Name: \ value_ = binary_op(lhs_v, rhs_v, expr_type); \ break; - AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE); #undef TYPE_CASE case ScalarType::Bool: value_ = binary_op(lhs_v, rhs_v, expr_type); @@ -370,7 +374,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { case ScalarType::Name: \ value = compare_select_op(lhs, rhs, retval1, retval2, cmp_op); \ break; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: throw unsupported_dtype(); @@ -402,7 +406,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { value_ = compare_select_op_helper( \ lhs_v, rhs_v, ret_val1_v, ret_val2_v, cmp_op); \ break; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: throw unsupported_dtype(); @@ -413,7 +417,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { TORCH_API void visit(Name##ImmPtr v) override { \ value_ = Value(v->value()); \ } - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT); #undef IMM_VISIT TORCH_API void visit(BlockPtr v) override { @@ -464,7 +468,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { case ScalarType::Name: \ this->value_ = Value(castValues(src_dtype, v)); \ break; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, DST_TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DST_TYPE_CASE); #undef DST_TYPE_CASE default: throw unsupported_dtype(); @@ -486,7 +490,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { case ScalarType::Name: \ doCastFromSrc(src_dtype, dst_dtype, value_); \ break; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, SRC_TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, SRC_TYPE_CASE); #undef SRC_TYPE_CASE default: throw unsupported_dtype(); @@ -590,7 +594,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { std::vector v(lanes, value.as()); \ value_ = Value(v); \ } break; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: throw unsupported_dtype(); @@ -610,6 +614,9 @@ class SimpleIREvaluatorImpl : public IRVisitor { #undef TYPE_CASE case ScalarType::Half: throw unsupported_dtype("IfThenElse condition can't have Half dtype"); + case ScalarType::BFloat16: + throw unsupported_dtype( + "IfThenElse condition can't have BFloat16 dtype"); default: throw unsupported_dtype(); } @@ -660,7 +667,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { } \ value_ = Value(v); \ } break; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: throw unsupported_dtype(); @@ -693,7 +700,7 @@ class SimpleIREvaluatorImpl : public IRVisitor { ptr##Name[index[i]] = value[i]; \ } \ } break; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: throw unsupported_dtype(); @@ -801,6 +808,8 @@ class SimpleIREvaluatorImpl : public IRVisitor { visit_intrinsics_helper(v); } else if (inp_dtype == ScalarType::Half) { throw unsupported_dtype(); // TODO + } else if (inp_dtype == ScalarType::BFloat16) { + throw unsupported_dtype(); // TODO } } else { switch (ty) { @@ -1039,7 +1048,7 @@ void SimpleIREvaluator::bindArg(const BufferArg& bufArg, void* data) { impl_->bindVar(bufArg.var(), typed_data); \ break; \ } - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: throw unsupported_dtype(); diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h index 494ba283ea902..e11bb169484f6 100644 --- a/torch/csrc/jit/tensorexpr/eval.h +++ b/torch/csrc/jit/tensorexpr/eval.h @@ -36,7 +36,7 @@ class Value { Name##values.push_back(v); \ return; \ } - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE throw unsupported_dtype(); } @@ -46,14 +46,14 @@ class Value { Name##values.push_back(v); \ } // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_CTOR); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_CTOR); #undef VALUE_CTOR #define VALUE_VEC_CTOR(Type, Name) \ Value(const std::vector& v) \ : dtype_(Dtype(k##Name, v.size())), Name##values(v) {} // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_VEC_CTOR); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_VEC_CTOR); #undef VALUE_VEC_CTOR template @@ -72,7 +72,7 @@ class Value { Dtype dtype_; #define VALUE_STORAGE(Type, Name) std::vector Name##values; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_STORAGE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_STORAGE); #undef VALUE_STORAGE void* ptr; }; @@ -85,7 +85,7 @@ class Value { } \ return Name##values[0]; \ } -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_AS_DISPATCH); +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_AS_DISPATCH); #undef VALUE_AS_DISPATCH #define VALUE_AS_VEC_DISPATCH(Type, Name) \ @@ -96,7 +96,7 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_AS_DISPATCH); } \ return Name##values; \ } -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_AS_VEC_DISPATCH); +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_AS_VEC_DISPATCH); #undef VALUE_AS_VEC_DISPATCH template @@ -206,7 +206,7 @@ class ExprEval { ret_value_ = Value(ret_val_arg[0]); \ } break; // NOLINTNEXTLINE(modernize-use-emplace) - AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE); #undef TYPE_CASE case ScalarType::Bool: { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -231,7 +231,7 @@ class ExprEval { codegen_->call_raw(args_extended); \ ret_value_ = Value(ret_val_arg[0]); \ } break; - AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE); #undef TYPE_CASE case ScalarType::Bool: { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) diff --git a/torch/csrc/jit/tensorexpr/expr.cpp b/torch/csrc/jit/tensorexpr/expr.cpp index cbf5ddd9f1d6d..c757d4b0ca201 100644 --- a/torch/csrc/jit/tensorexpr/expr.cpp +++ b/torch/csrc/jit/tensorexpr/expr.cpp @@ -89,7 +89,7 @@ ExprHandle ExprHandle::operator>>(const ExprHandle& other) const { // NOLINTNEXTLINE #define IMM_EXPR_DECLARE(Type, Name) \ ExprHandle::ExprHandle(Type v) : ExprHandle(Name##Imm::make(v)) {} -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_EXPR_DECLARE); +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_EXPR_DECLARE); #undef IMM_EXPR_DECLARE ExprHandle sin(const ExprHandle& v) { diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index 4947bfdc36be9..41ce99a085179 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -110,7 +110,7 @@ class TORCH_API ExprHandle { } #define IMM_EXPR_DECLARE(Type, Name) ExprHandle(Type v); - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_EXPR_DECLARE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_EXPR_DECLARE); #undef IMM_EXPR_DECLARE template diff --git a/torch/csrc/jit/tensorexpr/fwd_decls.h b/torch/csrc/jit/tensorexpr/fwd_decls.h index 1b3dde560b427..119308b053442 100644 --- a/torch/csrc/jit/tensorexpr/fwd_decls.h +++ b/torch/csrc/jit/tensorexpr/fwd_decls.h @@ -113,7 +113,7 @@ using SyncThreadsPtr = NodePtr; #define IMM_DECLARE(Type, Name) \ class Name##Imm; \ using Name##ImmPtr = NodePtr; -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE); +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_DECLARE); #undef IMM_DECLARE } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/half_support.h b/torch/csrc/jit/tensorexpr/half_support.h index 674af8a764928..8ecf956d6d75b 100644 --- a/torch/csrc/jit/tensorexpr/half_support.h +++ b/torch/csrc/jit/tensorexpr/half_support.h @@ -18,17 +18,23 @@ class HalfChecker : public IRVisitor { } } - bool hasHalf() { + bool hasHalf() const { return hasHalf_; } + bool hasBFloat16() const { + return hasBFloat16_; + } + void visit(LoadPtr v) override { hasHalf_ |= v->dtype().scalar_type() == ScalarType::Half; + hasBFloat16_ |= v->dtype().scalar_type() == ScalarType::BFloat16; IRVisitor::visit(v); } void visit(StorePtr v) override { hasHalf_ |= v->buf()->dtype().scalar_type() == ScalarType::Half; + hasBFloat16_ |= v->buf()->dtype().scalar_type() == ScalarType::BFloat16; IRVisitor::visit(v); } @@ -36,20 +42,26 @@ class HalfChecker : public IRVisitor { hasHalf_ = true; } + void visit(BFloat16ImmPtr v) override { + hasBFloat16_ = true; + } + void visit(CastPtr v) override { hasHalf_ |= v->dtype().scalar_type() == ScalarType::Half; + hasBFloat16_ |= v->dtype().scalar_type() == ScalarType::BFloat16; IRVisitor::visit(v); } private: bool hasHalf_{false}; + bool hasBFloat16_{false}; }; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) class HalfRewriter : public IRMutator { ExprPtr mutate(LoadPtr v) override { ExprPtr child = IRMutator::mutate(v); - if (child->dtype().scalar_type() != ScalarType::Half) { + if (!isHalf(child)) { return child; } @@ -63,12 +75,11 @@ class HalfRewriter : public IRMutator { StmtPtr mutate(StorePtr v) override { // Since mutation changes the `value()` expression in-place, we need to // get the dtype of the `value()` before that is mutated. - Dtype newType = v->value()->dtype(); + auto newType = v->value()->dtype(); ExprPtr new_val = v->value()->accept_mutator(this); - if (newType.scalar_type() == ScalarType::Half) { - new_val = - alloc(newType.cloneWithScalarType(ScalarType::Half), new_val); + if (isHalf(newType.scalar_type())) { + new_val = alloc(newType, new_val); inserted_half_casts_.insert(new_val); } @@ -80,11 +91,15 @@ class HalfRewriter : public IRMutator { return alloc(kFloat, v); } + ExprPtr mutate(BFloat16ImmPtr v) override { + return alloc(kFloat, v); + } + ExprPtr mutate(CastPtr v) override { ExprPtr child = v->src_value()->accept_mutator(this); // just don't allow half casts we didn't insert. - if (v->dtype().scalar_type() == ScalarType::Half) { + if (isHalf(v)) { if (inserted_half_casts_.count(v) < 1) { return child; } @@ -105,8 +120,9 @@ class HalfRewriter : public IRMutator { return alloc(v->dtype(), child); } + StmtPtr mutate(LetPtr v) override { - if (v->dtype().scalar_type() == ScalarType::Half) { + if (isHalf(v->dtype().scalar_type())) { VarPtr load_new_var = alloc(v->var()->name_hint(), kFloat); ExprPtr new_value = alloc( v->dtype().cloneWithScalarType(ScalarType::Float), @@ -131,7 +147,7 @@ class HalfRewriter : public IRMutator { template ExprPtr mutateArithmetic(T v) { IRMutator::mutate(v); - if (v->dtype().scalar_type() == c10::kHalf) { + if (isHalf(v)) { v->set_dtype(v->dtype().cloneWithScalarType(c10::kFloat)); } return v; @@ -169,6 +185,14 @@ class HalfRewriter : public IRMutator { } private: + static bool isHalf(ScalarType st) { + return st == ScalarType::Half || st == ScalarType::BFloat16; + } + + static bool isHalf(ExprPtr v) { + return isHalf(v->dtype().scalar_type()); + } + std::unordered_set inserted_half_casts_; std::unordered_map var_map; }; diff --git a/torch/csrc/jit/tensorexpr/hash_provider.h b/torch/csrc/jit/tensorexpr/hash_provider.h index 91ce269edeb5c..35d493a0025b4 100644 --- a/torch/csrc/jit/tensorexpr/hash_provider.h +++ b/torch/csrc/jit/tensorexpr/hash_provider.h @@ -92,7 +92,7 @@ class TORCH_API HashProvider : public IRVisitor { CACHE_GUARD(); \ putHash(v, hash_combine(#Name, v->value())); \ } - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT); #undef IMM_VISIT void visit(CastPtr v) override; @@ -287,6 +287,14 @@ class TORCH_API HashProvider : public IRVisitor { std::memcpy(&n, &d, sizeof d); return te_hash(n); } + + size_t te_hash(at::BFloat16 d) { + // memcpy as type punning. Should be optimized out. + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int16_t n; + std::memcpy(&n, &d, sizeof d); + return te_hash(n); + } }; } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/ir.cpp b/torch/csrc/jit/tensorexpr/ir.cpp index 2680f5366b46e..439993c481903 100644 --- a/torch/csrc/jit/tensorexpr/ir.cpp +++ b/torch/csrc/jit/tensorexpr/ir.cpp @@ -231,7 +231,7 @@ bool immediateIsNegative(ExprPtr e) { if (Name##ImmPtr imm = to(e)) { \ return imm->value() < 0; \ } - AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE); #undef TYPE_CASE return false; } diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h index 1218082e6af98..65a362ef023fe 100644 --- a/torch/csrc/jit/tensorexpr/ir.h +++ b/torch/csrc/jit/tensorexpr/ir.h @@ -320,7 +320,7 @@ class Min : public BinaryOpNode { private: \ Type value_; \ }; -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE); +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_DECLARE); #undef IMM_DECLARE // Get immediate by ScalarType. @@ -329,9 +329,9 @@ ExprPtr getImmediateByType(ScalarType immType, T initialVal) { switch (immType) { #define TYPE_CASE(Type, Name) \ case ScalarType::Name: \ - return alloc(initialVal); + return alloc(Type(initialVal)); // NOLINTNEXTLINE(bugprone-branch-clone) - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: throw unsupported_dtype(); @@ -374,7 +374,7 @@ T immediateAs(ExprPtr e) { if (Name##ImmPtr imm = to(e)) { \ return imm->value(); \ } - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE throw unsupported_dtype(); return 0; @@ -391,7 +391,7 @@ bool immediateEquals(ExprPtr e, T val) { if (Name##ImmPtr imm = to(e)) { \ return imm->value() == val; \ } - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE throw unsupported_dtype(); return false; diff --git a/torch/csrc/jit/tensorexpr/ir_cloner.cpp b/torch/csrc/jit/tensorexpr/ir_cloner.cpp index e225826df66e2..1144833c7990e 100644 --- a/torch/csrc/jit/tensorexpr/ir_cloner.cpp +++ b/torch/csrc/jit/tensorexpr/ir_cloner.cpp @@ -119,7 +119,7 @@ ExprPtr IRCloner::mutate(CompareSelectPtr v) { ExprPtr IRCloner::mutate(Name##ImmPtr v) { \ return v; \ } -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DEFINE); +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DEFINE); #undef IMM_MUTATE_DEFINE ExprPtr IRCloner::mutate(CastPtr v) { diff --git a/torch/csrc/jit/tensorexpr/ir_cloner.h b/torch/csrc/jit/tensorexpr/ir_cloner.h index f03e12886eabe..5f516a02ffadb 100644 --- a/torch/csrc/jit/tensorexpr/ir_cloner.h +++ b/torch/csrc/jit/tensorexpr/ir_cloner.h @@ -26,7 +26,7 @@ class TORCH_API IRCloner : public IRMutator { ExprPtr mutate(RshiftPtr v) override; ExprPtr mutate(CompareSelectPtr v) override; #define IMM_MUTATE_DECLARE(Type, Name) ExprPtr mutate(Name##ImmPtr v) override; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DECLARE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DECLARE); #undef IMM_MUTATE_DECLARE ExprPtr mutate(CastPtr v) override; ExprPtr mutate(BitCastPtr v) override; diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp index 45121581eebf0..71a40a134e0b5 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.cpp +++ b/torch/csrc/jit/tensorexpr/ir_mutator.cpp @@ -115,7 +115,7 @@ ExprPtr IRMutator::mutate(CompareSelectPtr v) { ExprPtr IRMutator::mutate(Name##ImmPtr v) { \ return v; \ } -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DEFINE); +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DEFINE); #undef IMM_MUTATE_DEFINE ExprPtr IRMutator::mutate(CastPtr v) { diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.h b/torch/csrc/jit/tensorexpr/ir_mutator.h index fb6c420af46a0..0a96876606dfb 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.h +++ b/torch/csrc/jit/tensorexpr/ir_mutator.h @@ -25,7 +25,7 @@ class TORCH_API IRMutator { virtual ExprPtr mutate(RshiftPtr v); virtual ExprPtr mutate(CompareSelectPtr v); #define IMM_MUTATE_DECLARE(Type, Name) virtual ExprPtr mutate(Name##ImmPtr v); - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DECLARE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DECLARE); #undef IMM_MUTATE_DECLARE virtual ExprPtr mutate(CastPtr v); virtual ExprPtr mutate(BitCastPtr v); diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index ca90d9995e0d2..4a10c282e60b1 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -226,7 +226,7 @@ static void formatImm(std::ostream& os, T v) { void IRPrinter::visit(Name##ImmPtr v) { \ formatImm(os(), v->value()); \ } -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_PRINT_VISIT); +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_PRINT_VISIT); #undef IMM_PRINT_VISIT void IRPrinter::visit(CastPtr v) { diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h index 327119dcc74e6..fb357a8fb79fa 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.h +++ b/torch/csrc/jit/tensorexpr/ir_printer.h @@ -34,7 +34,7 @@ class TORCH_API IRPrinter : public IRVisitor { void visit(RshiftPtr v) override; void visit(CompareSelectPtr v) override; #define IMM_PRINT_VISIT(Type, Name) void visit(Name##ImmPtr v) override; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_PRINT_VISIT); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_PRINT_VISIT); #undef IMM_PRINT_VISIT void visit(CastPtr v) override; void visit(BitCastPtr v) override; diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.h b/torch/csrc/jit/tensorexpr/ir_simplifier.h index 1df8b5d8f3501..11d004f395ed1 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.h +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.h @@ -97,7 +97,7 @@ inline ExprPtr evaluateOp(ExprPtr v) { Type val = eval.value(); \ return getImmediateByType(v->dtype().scalar_type(), val); \ } - AT_FORALL_SCALAR_TYPES_AND2(Half, Bool, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: LOG(FATAL) << "Unsupported datatype: " << v->dtype(); diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.cpp b/torch/csrc/jit/tensorexpr/ir_visitor.cpp index eb2a4280c4f88..9489422b66ebe 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.cpp +++ b/torch/csrc/jit/tensorexpr/ir_visitor.cpp @@ -79,7 +79,7 @@ void IRVisitor::visit(CompareSelectPtr v) { // NOLINTNEXTLINE #define IMM_VISIT(Type, Name) \ void IRVisitor::visit(Name##ImmPtr v) {} -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT); +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT); #undef IMM_VISIT void IRVisitor::visit(CastPtr v) { diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.h b/torch/csrc/jit/tensorexpr/ir_visitor.h index 001725f961619..e54786b2f9036 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.h +++ b/torch/csrc/jit/tensorexpr/ir_visitor.h @@ -26,7 +26,7 @@ class TORCH_API IRVisitor { #define IMM_PRINT_VISIT(Type, Name) virtual void visit(Name##ImmPtr v); - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_PRINT_VISIT) + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_PRINT_VISIT) #undef IMM_PRINT_VISIT virtual void visit(CastPtr v); diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index e4136d85c0a50..78cbb822bfbff 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -52,7 +52,7 @@ static ExprHandle promoteToDtype(ExprHandle e, ScalarType dt) { case ScalarType::Name: \ e = cast(e); \ break; - AT_FORALL_SCALAR_TYPES_AND2(Half, Bool, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: throw unsupported_dtype(); @@ -520,7 +520,7 @@ ExprHandle demoteOutput( #define TYPE_CASE(Type, Name) \ case ScalarType::Name: \ return cast(e); - AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE); #undef TYPE_CASE case ScalarType::Bool: return cast(e); diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index 026d52bfc938c..b9ea70806ee1a 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -231,7 +231,7 @@ class LLVMCodeGenImpl : public IRVisitor { void visit(CompareSelectPtr v) override; #define IMM_VISIT_DECLARE(_1, Name) void visit(Name##ImmPtr v) override; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT_DECLARE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT_DECLARE); #undef IMM_VISIT_DECLARE void visit(CastPtr v) override; @@ -902,6 +902,10 @@ void LLVMCodeGenImpl::visit(HalfImmPtr v) { value_ = llvm::ConstantFP::get(HalfTy_, v->value()); } +void LLVMCodeGenImpl::visit(BFloat16ImmPtr v) { + TORCH_INTERNAL_ASSERT(false, "llvm codegen does not support bfloat16"); +} + void LLVMCodeGenImpl::visit(BoolImmPtr v) { value_ = llvm::ConstantInt::get(BoolTy_, v->value()); } diff --git a/torch/csrc/jit/tensorexpr/reduction.h b/torch/csrc/jit/tensorexpr/reduction.h index 08aef01c7d310..22d90b9981b82 100644 --- a/torch/csrc/jit/tensorexpr/reduction.h +++ b/torch/csrc/jit/tensorexpr/reduction.h @@ -171,7 +171,7 @@ inline ExprHandle maximumVal(ScalarType type) { #define MAX_BY_TYPE_CASE(Type, Name) \ case ScalarType::Name: \ return ExprHandle(std::numeric_limits::max()); - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, MAX_BY_TYPE_CASE) + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, MAX_BY_TYPE_CASE) #undef MAX_BY_TYPE_CASE default: throw unsupported_dtype(); @@ -184,7 +184,7 @@ inline ExprHandle minimumVal(ScalarType type) { #define MAX_BY_TYPE_CASE(Type, Name) \ case ScalarType::Name: \ return ExprHandle(std::numeric_limits::min()); - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, MAX_BY_TYPE_CASE) + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, MAX_BY_TYPE_CASE) #undef MAX_BY_TYPE_CASE default: throw unsupported_dtype(); diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp index c7f48824303d4..c924bded3543c 100644 --- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp +++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp @@ -69,7 +69,7 @@ void initTensorExprBindings(PyObject* module) { #define DTYPE_SINGLETON_ACCESSOR(ctype, name) \ dtype_class.def_property_readonly_static( \ #name, [](py::object) { return k##name; }); // NOLINT - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, DTYPE_SINGLETON_ACCESSOR) + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DTYPE_SINGLETON_ACCESSOR) #undef DTYPE_SINGLETON_ACCESSOR auto expr_handle_class = @@ -144,7 +144,7 @@ void initTensorExprBindings(PyObject* module) { #define EXPRHANDLE_CTOR(ctype, name) \ expr_handle_class.def_static(#ctype, [](ctype v) { return ExprHandle(v); }); - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, EXPRHANDLE_CTOR) + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, EXPRHANDLE_CTOR) #undef EXPRHANDLE_CTOR py::class_(te, "VarHandle") diff --git a/torch/csrc/jit/tensorexpr/types.cpp b/torch/csrc/jit/tensorexpr/types.cpp index 5cef86a2dfe26..e75ecd9744d61 100644 --- a/torch/csrc/jit/tensorexpr/types.cpp +++ b/torch/csrc/jit/tensorexpr/types.cpp @@ -16,7 +16,7 @@ Dtype Dtype::scalar_dtype() const { // NOLINTNEXTLINE #define DTYPE_DEFINE(_1, n) TORCH_API Dtype k##n(ScalarType::n, 1); -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, DTYPE_DEFINE) +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DTYPE_DEFINE) #undef DTYPE_DEFINE @@ -28,7 +28,7 @@ Dtype ToDtype(ScalarType type) { #define TYPE_CASE(_1, n) \ case ScalarType::n: \ return k##n; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE) + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE) #undef TYPE_CASE case ScalarType::Undefined: @@ -56,7 +56,7 @@ int Dtype::byte_size() const { scalar_size = sizeof(Type); \ break; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE); + AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE); #undef TYPE_CASE default: throw std::runtime_error( @@ -77,6 +77,8 @@ std::string Dtype::ToCppString() const { return "bool"; case ScalarType::Half: return "half"; + case ScalarType::BFloat16: + return "__nv_bfloat16"; default: throw unsupported_dtype(); } diff --git a/torch/csrc/jit/tensorexpr/types.h b/torch/csrc/jit/tensorexpr/types.h index 00cd50db288b3..3716a0a1cd559 100644 --- a/torch/csrc/jit/tensorexpr/types.h +++ b/torch/csrc/jit/tensorexpr/types.h @@ -75,7 +75,7 @@ extern TORCH_API Dtype kHandle; #define NNC_DTYPE_DECLARATION(ctype, name) extern TORCH_API Dtype k##name; -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, NNC_DTYPE_DECLARATION) +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, NNC_DTYPE_DECLARATION) #undef NNC_DTYPE_DECLARATION template @@ -86,7 +86,7 @@ TORCH_API Dtype ToDtype(); inline Dtype ToDtype() { \ return k##name; \ } -AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, NNC_TODTYPE_DECLARATION) +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, NNC_TODTYPE_DECLARATION) #undef NNC_TODTYPE_DECLARATION TORCH_API Dtype ToDtype(ScalarType type); From 7ca4728e6dd4f3a706082dedf33c21771116070a Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 30 Aug 2021 20:17:12 -0700 Subject: [PATCH 381/530] Compile BatchLinearAlgebra without nvcc (#64146) Summary: These files only use cuda libraries interfaces, so don't actually need to be compiled with nvcc. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64146 Reviewed By: ezyang Differential Revision: D30633189 Pulled By: ngimel fbshipit-source-id: c9d0ae5259a10cb49332d31f0da89ad758736ea8 --- ...inearAlgebra.cu => BatchLinearAlgebra.cpp} | 29 +++++++++---------- ...lgebraLib.cu => BatchLinearAlgebraLib.cpp} | 18 ------------ caffe2/CMakeLists.txt | 2 +- 3 files changed, 15 insertions(+), 34 deletions(-) rename aten/src/ATen/native/cuda/{BatchLinearAlgebra.cu => BatchLinearAlgebra.cpp} (99%) rename aten/src/ATen/native/cuda/{BatchLinearAlgebraLib.cu => BatchLinearAlgebraLib.cpp} (98%) diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp similarity index 99% rename from aten/src/ATen/native/cuda/BatchLinearAlgebra.cu rename to aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp index 4e806f000c5ae..7fdc55d818084 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp @@ -1701,7 +1701,7 @@ static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) #endif // USE_CUSOLVER } -REGISTER_DISPATCH(cholesky_stub, &cholesky_kernel) +REGISTER_CUDA_DISPATCH(cholesky_stub, &cholesky_kernel) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cholesky_inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1773,7 +1773,7 @@ Tensor& cholesky_inverse_kernel_impl(Tensor &result, Tensor& infos, bool upper) } -REGISTER_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl); +REGISTER_CUDA_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1945,7 +1945,7 @@ static void apply_lu(const Tensor& input, const Tensor& pivots, const Tensor& in } } -REGISTER_DISPATCH(lu_stub, &apply_lu); +REGISTER_CUDA_DISPATCH(lu_stub, &apply_lu); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangular_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2039,7 +2039,7 @@ void triangular_solve_kernel(Tensor& A, Tensor& B, Tensor& infos, bool upper, bo } } -REGISTER_DISPATCH(triangular_solve_stub, &triangular_solve_kernel); +REGISTER_CUDA_DISPATCH(triangular_solve_stub, &triangular_solve_kernel); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ orgqr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2057,7 +2057,7 @@ Tensor& orgqr_kernel_impl(Tensor& result, const Tensor& tau) { #endif } -REGISTER_DISPATCH(orgqr_stub, &orgqr_kernel_impl); +REGISTER_CUDA_DISPATCH(orgqr_stub, &orgqr_kernel_impl); void ormqr_kernel(const Tensor& input, const Tensor& tau, const Tensor& other, bool left, bool transpose) { #if defined(USE_CUSOLVER) @@ -2069,7 +2069,7 @@ void ormqr_kernel(const Tensor& input, const Tensor& tau, const Tensor& other, b #endif } -REGISTER_DISPATCH(ormqr_stub, &ormqr_kernel); +REGISTER_CUDA_DISPATCH(ormqr_stub, &ormqr_kernel); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2148,7 +2148,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) { } } -REGISTER_DISPATCH(geqrf_stub, &geqrf_kernel); +REGISTER_CUDA_DISPATCH(geqrf_stub, &geqrf_kernel); template static void apply_qr(Tensor& Q, Tensor& R, int64_t q_size_minus_2, int64_t r_size_minus_1, int64_t n_columns, @@ -2423,7 +2423,7 @@ void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, c #endif } -REGISTER_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel); +REGISTER_CUDA_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2513,7 +2513,7 @@ std::tuple eig_kernel_impl(const Tensor& self, bool& eigenvector return std::tuple(out_eigvals, out_eigvecs); } -REGISTER_DISPATCH(eig_stub, &eig_kernel_impl); +REGISTER_CUDA_DISPATCH(eig_stub, &eig_kernel_impl); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2599,7 +2599,7 @@ void linalg_eig_kernel(Tensor& eigenvalues, Tensor& eigenvectors, Tensor& infos, }); } -REGISTER_DISPATCH(linalg_eig_stub, &linalg_eig_kernel); +REGISTER_CUDA_DISPATCH(linalg_eig_stub, &linalg_eig_kernel); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ svd ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2671,8 +2671,7 @@ AT_ERROR("svd: MAGMA library not found in " std::tuple _svd_helper_cuda_legacy(const Tensor& self, bool some, bool compute_uv) { std::vector infos(batchCount(self), 0); - int64_t m = self.size(-2), n = self.size(-1); - int64_t k = std::min(m, n); + int64_t m = self.size(-2); char jobchar = compute_uv ? (some ? 'S' : 'A') : 'N'; @@ -2922,13 +2921,13 @@ static void lu_solve_trans_dispatch(const Tensor& b, const Tensor& lu, const Ten } } -REGISTER_DISPATCH(lu_solve_trans_stub, &lu_solve_trans_dispatch); +REGISTER_CUDA_DISPATCH(lu_solve_trans_stub, &lu_solve_trans_dispatch); static void lu_solve_dispatch(const Tensor& b, const Tensor& lu, const Tensor& pivots) { lu_solve_trans_dispatch(b, lu, pivots, 'N'); } -REGISTER_DISPATCH(lu_solve_stub, &lu_solve_dispatch); +REGISTER_CUDA_DISPATCH(lu_solve_stub, &lu_solve_dispatch); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -3112,7 +3111,7 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul } } -REGISTER_DISPATCH(lstsq_stub, &lstsq_kernel); +REGISTER_CUDA_DISPATCH(lstsq_stub, &lstsq_kernel); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ legacy_lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp similarity index 98% rename from aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu rename to aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp index bb9af142955f0..13d67e571e7dc 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp @@ -143,10 +143,6 @@ static void apply_triangular_solve_batched(Tensor& A, Tensor& B, bool upper, boo cublasDiagType_t diag = unitriangular ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; cublasSideMode_t side = CUBLAS_SIDE_LEFT; - auto A_data = A.data_ptr(); - auto B_data = B.data_ptr(); - auto A_mat_stride = matrixStride(A); - auto B_mat_stride = matrixStride(B); auto batch_size = cuda_int_cast(batchCount(A), "batch_size"); auto m = cuda_int_cast(A.size(-2), "m"); auto n = cuda_int_cast(A.size(-1), "n"); @@ -329,8 +325,6 @@ Tensor& _linalg_inv_out_helper_cuda_lib(Tensor& result, Tensor& infos_getrf, Ten result.zero_(); result.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(1); - const int batch_size = cuda_int_cast(batchCount(result), "batchCount"); - if (result.dim() > 2) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cuda", [&]{ apply_batched_inverse_lib( @@ -435,10 +429,6 @@ inline static void _apply_svd_lib_gesvdjBatched(const Tensor& self, Tensor& U, T auto U_data = U.data_ptr(); auto S_data = S.data_ptr(); auto VT_data = VT.data_ptr(); - auto self_stride = matrixStride(self); - auto U_stride = matrixStride(U); - auto S_stride = S.size(-1); - auto VT_stride = matrixStride(VT); int batchsize = cuda_int_cast(batchCount(self), "batch size"); int m = cuda_int_cast(self.size(-2), "m"); @@ -481,7 +471,6 @@ std::tuple _svd_helper_cuda_lib(const Tensor& self, bool at::Tensor infos = at::zeros({batch_size}, self.options().dtype(at::kInt)); const int64_t m = self.size(-2); const int64_t n = self.size(-1); - const int64_t k = std::min(m, n); Tensor U_working_copy, S_working_copy, VT_working_copy; std::tie(U_working_copy, S_working_copy, VT_working_copy) = \ @@ -686,11 +675,7 @@ inline static void apply_cholesky_cusolver_potrsBatched(Tensor& self_working_cop const int64_t nrhs = self_working_copy.size(-1); const int64_t lda = std::max(1, n); const int64_t batch_size = batchCount(self_working_copy); - const int64_t self_matrix_stride = matrixStride(self_working_copy); - scalar_t* self_working_copy_ptr = self_working_copy.data_ptr(); - const scalar_t* A_ptr = A_column_major_copy.data_ptr(); - const int64_t A_matrix_stride = matrixStride(A_column_major_copy); const int64_t ldb = std::max(1, A_column_major_copy.size(-1)); int* infos_ptr = infos.data_ptr(); @@ -882,8 +867,6 @@ void geqrf_cusolver(const Tensor& input, const Tensor& tau) { */ template static void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& other, bool left, bool transpose) { - using value_t = typename c10::scalar_value_type::type; - auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; auto trans = transpose ? (input.is_complex() ? CUBLAS_OP_C : CUBLAS_OP_T) : CUBLAS_OP_N; @@ -957,7 +940,6 @@ void ormqr_cusolver(const Tensor& input, const Tensor& tau, const Tensor& other, */ template inline static void apply_orgqr(Tensor& self, const Tensor& tau) { - using value_t = typename c10::scalar_value_type::type; auto self_data = self.data_ptr(); auto tau_data = tau.data_ptr(); auto self_matrix_stride = matrixStride(self); diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 1662a92268d37..8b403a7c4014e 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -175,7 +175,7 @@ endif() if(BUILD_SPLIT_CUDA) # Splitting the source files that'll be in torch_cuda between torch_cuda_cu and torch_cuda_cpp foreach(tmp ${Caffe2_GPU_SRCS}) - if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*") + if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA|BatchLinearAlgebra).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*") # Currently, torch_cuda_cu will have all the .cu files in aten, as well as some others that depend on those files list(APPEND Caffe2_GPU_SRCS_CU ${tmp}) else() From bc9277dca3a40d99147d4a1a3e0160a4a8e91f9f Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Mon, 30 Aug 2021 20:53:50 -0700 Subject: [PATCH 382/530] [Pytorch lite predictor] Use KinetoEdgeCPUProfiler for operator profiling. (#63367) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63367 This diff changes the way operator profiling is done in lite predictor benchmarking binary. Instead of using custom callbacks it uses KinetoEdgeCPUProfiler to profile events and then generate operator level metric from it. Since KinetoEvents do not contain cpu clock time, now we report only wallclock time. This unifies various profiling effort that we have for benchmarking purpose. In production we will still use observer based mechanism, but the advantage of using kineto profiler is that we get few other things for free, such as: - chrome trace generation. - operator level memory profiling (to be added) - flop counts (to be added) Furthermore possible we can use python post processing script to parse chrome trace and generate output similar to torch.profiler. (To be done) Test Plan: aibench run Model without debug info: https://www.internalfb.com/intern/aibench/details/219598441154763 Model with debug info and `--print_module_info true` (see Operator summary has now module hierarchy information). https://www.internalfb.com/intern/aibench/details/617154236292985 Reviewed By: raziel Differential Revision: D30327514 fbshipit-source-id: 3bb2f2daaaedfb04bd6f5d9c91292783f9c4344f --- test/cpp/jit/test_lite_interpreter.cpp | 165 ------------------------ tools/build_variables.bzl | 4 +- torch/csrc/jit/mobile/debug_info.cpp | 15 ++- torch/csrc/jit/mobile/import.cpp | 3 + torch/csrc/jit/mobile/interpreter.cpp | 3 + torch/csrc/jit/mobile/module.cpp | 3 +- torch/csrc/jit/mobile/module.h | 11 +- torch/csrc/jit/mobile/profiler_edge.cpp | 45 ++++++- torch/csrc/jit/mobile/profiler_edge.h | 5 + 9 files changed, 72 insertions(+), 182 deletions(-) diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index 3bd2becd8779d..8fb5fe2c4ec11 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -455,171 +455,6 @@ TEST(LiteInterpreterTest, BuiltinFunction) { AT_ASSERT(str == expected); } -#if !defined FB_XPLAT_BUILD -TEST(LiteInterpreterTest, ModuleInfoBasic) { - Module m("M"); - m.define(R"JIT( - def forward(self, x): - return 2 * x - )JIT"); - - std::stringstream ss; - m._save_for_mobile(ss, {}, true); - mobile::Module bc = _load_for_mobile(ss); - - std::unordered_set module_debug_info_set; - size_t pc = 0; - while (true) { - try { - std::string module_info = bc.get_forward_method_debug_info(pc); - if (!module_info.empty() && - (module_info.find("debug_handle") == std::string::npos)) { - module_debug_info_set.insert(module_info); - } - ++pc; - } catch (const std::exception& e) { - break; - } - } - - AT_ASSERT(module_debug_info_set.count("top(M)::.aten::mul")); -} - -TEST(LiteInterpreterTest, NotSaveModuleInfo) { - Module m("M"); - m.define(R"JIT( - def forward(self, x): - return x + 5 - )JIT"); - - std::stringstream ss; - m._save_for_mobile(ss); - mobile::Module bc = _load_for_mobile(ss); - - size_t pc = 0; - while (true) { - try { - std::string module_info = bc.get_forward_method_debug_info(pc); - AT_ASSERT( - module_info.empty() || - (module_info.find("debug_handle") != std::string::npos)); - ++pc; - } catch (const std::exception& e) { - break; - } - } -} - -TEST(LiteInterpreterTest, OneSubmoduleModuleInfo) { - Module a("A"); - a.define(R"JIT( - def forward(self, x): - return 2 * x + 5 - )JIT"); - Module b("B"); - b.register_module("A0", a); - b.define(R"JIT( - def forward(self, x): - return self.A0.forward(x) + 1 - )JIT"); - - std::stringstream ss; - b._save_for_mobile(ss, {}, true); - mobile::Module bc = _load_for_mobile(ss); - - std::set module_debug_info_set; - size_t pc = 0; - while (true) { - try { - std::string module_info = bc.get_forward_method_debug_info(pc); - if (!module_info.empty() && - (module_info.find("debug_handle") == std::string::npos)) { - module_debug_info_set.insert(module_info); - } - ++pc; - } catch (const std::exception& e) { - break; - } - } - - AT_ASSERT(module_debug_info_set.count("top(B)::.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(B)::.A0(A)::forward.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(B)::.A0(A)::forward.aten::mul")); -} - -TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) { - Module a("A"); - a.define(R"JIT( - def forward(self, x): - return x + 1 - )JIT"); - Module b("B"); - b.define(R"JIT( - def forward(self, x): - return x + 2 - )JIT"); - Module c("C"); - c.register_module("A0", a); - c.register_module("B0", b); - c.define(R"JIT( - def forward(self, x): - return self.A0.forward(x) + self.B0.forward(x) - )JIT"); - - std::stringstream ss; - c._save_for_mobile(ss, {}, true); - mobile::Module bc = _load_for_mobile(ss); - - std::set module_debug_info_set; - size_t pc = 0; - while (true) { - try { - std::string module_info = bc.get_forward_method_debug_info(pc); - if (!module_info.empty() && - (module_info.find("debug_handle") == std::string::npos)) { - module_debug_info_set.insert(module_info); - } - ++pc; - } catch (const std::exception& e) { - break; - } - } - - AT_ASSERT(module_debug_info_set.count("top(C)::.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(C)::.A0(A)::forward.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(C)::.B0(B)::forward.aten::add")); -} - -TEST(LiteInterpreterTest, GetRuntimeByteCodeVersion) { - auto runtime_bytecode_version = _get_runtime_bytecode_version(); - AT_ASSERT( - runtime_bytecode_version == - caffe2::serialize::kMaxSupportedBytecodeVersion); -} - -/** - * The test below is disarmed for FB internal xplat builds since - * BUCK requires us to pass in the script_module_v4.ptl file in - * as a resource dependency of the build rule for this file, and - * we would need to access it via the C++ Resources API instead - * of directly reading from disk (which is what the open source - * build/run does). - */ -TEST(LiteInterpreterTest, GetByteCodeVersion) { - std::string filePath(__FILE__); - auto test_model_file_v4 = - filePath.substr(0, filePath.find_last_of("/\\") + 1); - test_model_file_v4.append("script_module_v4.ptl"); - - auto version_v4 = _get_model_bytecode_version(test_model_file_v4); - AT_ASSERT(version_v4 == 4); -} -#endif // !defined(FB_XPLAT_BUILD) - namespace { void compareModelOutput( diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index dd89981094d4f..e0c43d2f8e97f 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -319,7 +319,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/testing/hooks_for_testing.cpp", "torch/csrc/utils/tensor_flatten.cpp", "torch/csrc/utils/variadic.cpp", -] + libtorch_profiler_sources +] core_sources_full_mobile = core_sources_full_mobile_no_backend_interface + [ "torch/csrc/jit/backends/backend_debug_info.cpp", @@ -337,7 +337,7 @@ core_sources_full = core_sources_full_mobile + [ "torch/csrc/jit/tensorexpr/external_functions_codegen.cpp", ] -libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources) +libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources + libtorch_profiler_sources) # These files are the only ones that are supported on Windows. libtorch_distributed_base_sources = [ diff --git a/torch/csrc/jit/mobile/debug_info.cpp b/torch/csrc/jit/mobile/debug_info.cpp index 41ce3c6d46d52..a75ffe16c61f5 100644 --- a/torch/csrc/jit/mobile/debug_info.cpp +++ b/torch/csrc/jit/mobile/debug_info.cpp @@ -13,6 +13,12 @@ namespace jit { namespace { +C10_ALWAYS_INLINE std::string debugHandlesNotFoundMessage( + const std::string& debug_handles_string) { + return "Debug info for handle(s): " + debug_handles_string + + ", was not found."; +} + std::pair, std::string> getStackTraceWithModuleHierarchy( const DebugInfoTuple& source_callstack, const std::string& caller_name) { @@ -152,8 +158,7 @@ std::string MobileDebugTable::getModuleHierarchyInfo( const std::string& top_module_type_name) const { const auto it = callstack_ptr_map_.find(debug_handle); if (it == callstack_ptr_map_.end()) { - return "Module info for handle, " + std::to_string(debug_handle) + - ", not found."; + return debugHandlesNotFoundMessage(std::to_string(debug_handle)); } return (getStackTraceWithModuleHierarchy( {it->second}, "top", top_module_type_name)) @@ -172,8 +177,7 @@ std::string MobileDebugTable::getSourceDebugString( const std::string& top_module_type_name) const { const auto it = callstack_ptr_map_.find(debug_handle); if (it == callstack_ptr_map_.end()) { - return "Debug info for handle, " + std::to_string(debug_handle) + - ", not found."; + return debugHandlesNotFoundMessage(std::to_string(debug_handle)); } return (getStackTraceWithModuleHierarchy( {it->second}, "top", top_module_type_name)) @@ -208,8 +212,7 @@ std::pair MobileDebugTable:: debug_handles_string += std::to_string(debug_handle); } debug_handles_string += "}"; - debug_handles_string = - "Debug info for handles: " + debug_handles_string + ", was not found."; + debug_handles_string = debugHandlesNotFoundMessage(debug_handles_string); return {debug_handles_string, debug_handles_string}; } return (getStackTraceWithModuleHierarchy( diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index 6a548103f6965..99be225255ffb 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -517,12 +517,15 @@ mobile::Module BytecodeDeserializer::deserialize( auto bvals = std::move(*readArchive("bytecode", mcu).toTuple()).elements(); c10::optional> debug_handles; + bool has_debug_handles{false}; if (reader_->hasRecord("mobile_debug_handles.pkl")) { debug_handles = readArchive("mobile_debug_handles", mcu).toTuple()->elements(); + has_debug_handles = true; } parseMethods(bvals, debug_handles, *mcu); auto m = mobile::Module(readArchive("data", mcu).toObject(), mcu); + m.setHasDebugHandles(has_debug_handles); #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE) MobileDebugTable debug_table = MobileDebugTable(reader_, compilation_unit_); m.setDebugTable(std::move(debug_table)); diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp index 02e7c35792693..ab558cd2bf5e0 100644 --- a/torch/csrc/jit/mobile/interpreter.cpp +++ b/torch/csrc/jit/mobile/interpreter.cpp @@ -57,6 +57,9 @@ bool InterpreterState::run(Stack& stack) { auto inst_with_handle = code_->instructions_with_handles_.at(pc); Instruction inst = inst_with_handle.instruction; DebugHandle debug_handle = inst_with_handle.debug_handle; + // If no valid debug handle found then just log pc. + // This is possible when we did not save debug handles + debug_handle = debug_handle == -1 ? pc : debug_handle; // std::cout << "RUNNING " << pc << " " // << code_->instructions_with_handles_[pc].instruction; diff --git a/torch/csrc/jit/mobile/module.cpp b/torch/csrc/jit/mobile/module.cpp index c04d9f74b7378..c74ca138d848a 100644 --- a/torch/csrc/jit/mobile/module.cpp +++ b/torch/csrc/jit/mobile/module.cpp @@ -145,8 +145,7 @@ std::string Module::getCallStack(const int64_t debug_handle) const { // We really need to change this part, so in the next step for profiling support // for delegates, the first thing will be to rewrite how profiling is done // for lite interpreter. -std::string Module::get_forward_method_debug_info(size_t pc) const { - auto debug_handle = find_method("forward")->get_debug_handle(pc); +std::string Module::get_forward_method_debug_info(int64_t debug_handle) const { #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE) return getDebugTable().getModuleHierarchyInfo( debug_handle, getTopModuleTypeName(*this)); diff --git a/torch/csrc/jit/mobile/module.h b/torch/csrc/jit/mobile/module.h index 73637aa4584a0..6102aa517df66 100644 --- a/torch/csrc/jit/mobile/module.h +++ b/torch/csrc/jit/mobile/module.h @@ -78,7 +78,7 @@ class TORCH_API Module { } const std::vector parameters() const; const std::map named_parameters() const; - std::string get_forward_method_debug_info(size_t pc) const; + std::string get_forward_method_debug_info(int64_t debug_handle) const; std::string getModuleHierarchy(const int64_t debug_handle) const; std::string getCallStack(const int64_t debug_handle) const; /// Enables "training" mode. @@ -115,11 +115,20 @@ class TORCH_API Module { return debug_table_; } + void setHasDebugHandles(bool has_debug_handles) { + has_debug_handles_ = has_debug_handles; + } + + bool hasDebugHandles() const { + return has_debug_handles_; + } + private: c10::intrusive_ptr object_; std::unordered_map metadata_; std::shared_ptr cu_; MobileDebugTable debug_table_; + bool has_debug_handles_; }; } // namespace mobile } // namespace jit diff --git a/torch/csrc/jit/mobile/profiler_edge.cpp b/torch/csrc/jit/mobile/profiler_edge.cpp index bcd5a6258ee7c..162e43f0982a6 100644 --- a/torch/csrc/jit/mobile/profiler_edge.cpp +++ b/torch/csrc/jit/mobile/profiler_edge.cpp @@ -2,7 +2,6 @@ #include #include -namespace profiler = torch::autograd::profiler; namespace torch { namespace jit { namespace mobile { @@ -27,17 +26,26 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler( if (with_modules || with_stack) { auto post_processing = [this, with_stack, with_modules]( std::vector& events) { + std::string no_debug_info("Model was not saved with debug information"); for (auto& e : events) { if (with_modules) { // Since KinetoEvents's module hierarchy takes vector of strings we // just construct a temporary vector using one string element - e.moduleHierarchy(std::vector( - {this->m_.getModuleHierarchy(e.debugHandle())})); + if (this->m_.hasDebugHandles()) { + e.moduleHierarchy(std::vector( + {this->m_.getModuleHierarchy(e.debugHandle())})); + } else { + e.moduleHierarchy(std::vector({no_debug_info})); + } } else if (with_stack) { // Since KinetoEvents's stack trace takes vector of strings we just // construct a temporary vector using one string element - e.stack(std::vector( - {this->m_.getCallStack(e.debugHandle())})); + if (this->m_.hasDebugHandles()) { + e.stack(std::vector( + {this->m_.getCallStack(e.debugHandle())})); + } else { + e.stack(std::vector({no_debug_info})); + } } } }; @@ -55,8 +63,33 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler( trace_file_name_ = fname; } +const std::unique_ptr& KinetoEdgeCPUProfiler:: + disableProfiler() { + TORCH_CHECK( + !profiler_result_, + "KinetoEdgeCPUProfiler already disabled. " + "To get list of events use getProfilerResults()"); + profiler_result_ = profiler::disableProfiler(); + return profiler_result_; +} + +const std::unique_ptr& KinetoEdgeCPUProfiler:: + getProfilerResult() { + TORCH_CHECK( + profiler_result_, + "KinetoEdgeCPUProfiler has not been disabled. " + "use disableProfiler() API first, which returns the ProfilerResult."); + return profiler_result_; +} + KinetoEdgeCPUProfiler::~KinetoEdgeCPUProfiler() { - profiler::disableProfiler()->save(trace_file_name_); + if (!trace_file_name_.empty()) { + if (profiler_result_) { + profiler_result_->save(trace_file_name_); + } else { + profiler::disableProfiler()->save(trace_file_name_); + } + } } } // namespace mobile } // namespace jit diff --git a/torch/csrc/jit/mobile/profiler_edge.h b/torch/csrc/jit/mobile/profiler_edge.h index a245034e34f9b..ef37e01ed4c71 100644 --- a/torch/csrc/jit/mobile/profiler_edge.h +++ b/torch/csrc/jit/mobile/profiler_edge.h @@ -2,6 +2,7 @@ #include #include +namespace profiler = torch::autograd::profiler; namespace torch { namespace jit { namespace mobile { @@ -53,6 +54,9 @@ class TORCH_API KinetoEdgeCPUProfiler { const bool with_flops = false, const bool with_modules = false); + const std::unique_ptr& disableProfiler(); + const std::unique_ptr& getProfilerResult(); + ~KinetoEdgeCPUProfiler(); private: @@ -62,6 +66,7 @@ class TORCH_API KinetoEdgeCPUProfiler { */ const mobile::Module& m_; std::string trace_file_name_; + std::unique_ptr profiler_result_; }; } // namespace mobile } // namespace jit From a3d6dae319f03e9b5450c875349dbd65cb437767 Mon Sep 17 00:00:00 2001 From: Facebook Community Bot Date: Mon, 30 Aug 2021 21:31:11 -0700 Subject: [PATCH 383/530] Automated submodule update: FBGEMM (#64213) Summary: This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM). New submodule commit: https://github.com/pytorch/FBGEMM/commit/9d69998df6236d6714aa37ae6142a2a2d4fb2bf6 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64213 Test Plan: Ensure that CI jobs succeed on GitHub before landing. Reviewed By: jspark1105 Differential Revision: D30647878 fbshipit-source-id: b903b39441b4e28dda7eab226ac874e2227e750a --- third_party/fbgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/fbgemm b/third_party/fbgemm index 9f4078a7bb92b..e6f80ee6570bb 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 9f4078a7bb92b88cdcfc913398ffade158160c91 +Subproject commit e6f80ee6570bb8a7ed15a5ad0d496fdfb8927470 From 3c15822f5f4ab616eb6a519a0ff9b82fc7a3dc63 Mon Sep 17 00:00:00 2001 From: Harut Movsisyan Date: Tue, 31 Aug 2021 00:49:39 -0700 Subject: [PATCH 384/530] [Static Runtime] Implement aten::nonzero out variant (#64126) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64126 Test Plan: Confirm out variant is called: ``` > buck run //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1 ``` Reviewed By: mikeiovine Differential Revision: D30617729 fbshipit-source-id: 752749638c8f467815efa57021cb3de5c728ab1b --- benchmarks/static_runtime/test_scripts.h | 6 ++++++ .../static_runtime/test_static_runtime.cc | 9 +++++++- torch/csrc/jit/runtime/static/ops.cpp | 21 +++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index e26437fe4a6f9..37bb222f6a3d4 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -752,6 +752,12 @@ const auto append_tensor_script = R"JIT( return lst )JIT"; +const auto nonzero_tensor = R"JIT( + def forward(self, input: Tensor): + a = torch.nonzero(input).clone() + return (a) +)JIT"; + const std::string quantize_script = R"IR( graph(%input: Tensor, %weights: Tensor): %scale: float = prim::Constant[value=1.]() diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index aa5cd35e38e56..8e498dbbc664e 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1312,7 +1312,6 @@ TEST(StaticRuntime, IndividualOps_Cat) { testStaticRuntime(cat_script, args0, args1); } - TEST(StaticRuntime, IndividualOps_Cumsum) { auto a = at::randn({2, 3}); std::vector args0{a, 0}; @@ -1333,3 +1332,11 @@ TEST(StaticRuntime, IndividualOps_CumsumDtype) { std::vector args1{b, 1, dtype}; testStaticRuntime(cumsum_script_dtype, args0, args1); } + +TEST(StaticRuntime, IndividualOps_Nonzero) { + auto a = at::randint(0, 2, {2, 3}); + testStaticRuntime(nonzero_tensor, {a}); + + auto b = at::randint(0, 2, {4, 3, 2}); + testStaticRuntime(nonzero_tensor, {a}, {b}); +} diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index a73872b540258..0cc38b0812f11 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -1755,6 +1755,27 @@ REGISTER_OPERATOR_FUNCTOR(aten::cumsum, aten_cumsum, [](Node* n) -> SROperator { }; }); +REGISTER_OPERATOR_FUNCTOR( + aten::nonzero, + aten_nonzero, + [](Node* n) -> SROperator { + if (!n->matches(torch::schema("aten::nonzero(Tensor self) -> Tensor"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto& input = p_node->Input(0).toTensor(); + if (p_node->Output(0).isNone()) { + p_node->Output(0) = at::native::nonzero_cpu(input); + return; + } + + auto& output = p_node->Output(0).toTensor(); + fastResizeToZero(output); + at::native::nonzero_out_cpu(input, output); + }; + }); + namespace { void check_cat_no_zero_dim(const std::vector& tensors) { From 67cb131458bc299c5362cefa021d527852b3f683 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Tue, 31 Aug 2021 07:36:53 -0700 Subject: [PATCH 385/530] Revert D30327514: [Pytorch lite predictor] Use KinetoEdgeCPUProfiler for operator profiling. Test Plan: revert-hammer Differential Revision: D30327514 (https://github.com/pytorch/pytorch/commit/bc9277dca3a40d99147d4a1a3e0160a4a8e91f9f) Original commit changeset: 3bb2f2daaaed fbshipit-source-id: 0b2aa7c57d08de77c9aaa75e546a7d0938610f64 --- test/cpp/jit/test_lite_interpreter.cpp | 165 ++++++++++++++++++++++++ tools/build_variables.bzl | 4 +- torch/csrc/jit/mobile/debug_info.cpp | 15 +-- torch/csrc/jit/mobile/import.cpp | 3 - torch/csrc/jit/mobile/interpreter.cpp | 3 - torch/csrc/jit/mobile/module.cpp | 3 +- torch/csrc/jit/mobile/module.h | 11 +- torch/csrc/jit/mobile/profiler_edge.cpp | 45 +------ torch/csrc/jit/mobile/profiler_edge.h | 5 - 9 files changed, 182 insertions(+), 72 deletions(-) diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index 8fb5fe2c4ec11..3bd2becd8779d 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -455,6 +455,171 @@ TEST(LiteInterpreterTest, BuiltinFunction) { AT_ASSERT(str == expected); } +#if !defined FB_XPLAT_BUILD +TEST(LiteInterpreterTest, ModuleInfoBasic) { + Module m("M"); + m.define(R"JIT( + def forward(self, x): + return 2 * x + )JIT"); + + std::stringstream ss; + m._save_for_mobile(ss, {}, true); + mobile::Module bc = _load_for_mobile(ss); + + std::unordered_set module_debug_info_set; + size_t pc = 0; + while (true) { + try { + std::string module_info = bc.get_forward_method_debug_info(pc); + if (!module_info.empty() && + (module_info.find("debug_handle") == std::string::npos)) { + module_debug_info_set.insert(module_info); + } + ++pc; + } catch (const std::exception& e) { + break; + } + } + + AT_ASSERT(module_debug_info_set.count("top(M)::.aten::mul")); +} + +TEST(LiteInterpreterTest, NotSaveModuleInfo) { + Module m("M"); + m.define(R"JIT( + def forward(self, x): + return x + 5 + )JIT"); + + std::stringstream ss; + m._save_for_mobile(ss); + mobile::Module bc = _load_for_mobile(ss); + + size_t pc = 0; + while (true) { + try { + std::string module_info = bc.get_forward_method_debug_info(pc); + AT_ASSERT( + module_info.empty() || + (module_info.find("debug_handle") != std::string::npos)); + ++pc; + } catch (const std::exception& e) { + break; + } + } +} + +TEST(LiteInterpreterTest, OneSubmoduleModuleInfo) { + Module a("A"); + a.define(R"JIT( + def forward(self, x): + return 2 * x + 5 + )JIT"); + Module b("B"); + b.register_module("A0", a); + b.define(R"JIT( + def forward(self, x): + return self.A0.forward(x) + 1 + )JIT"); + + std::stringstream ss; + b._save_for_mobile(ss, {}, true); + mobile::Module bc = _load_for_mobile(ss); + + std::set module_debug_info_set; + size_t pc = 0; + while (true) { + try { + std::string module_info = bc.get_forward_method_debug_info(pc); + if (!module_info.empty() && + (module_info.find("debug_handle") == std::string::npos)) { + module_debug_info_set.insert(module_info); + } + ++pc; + } catch (const std::exception& e) { + break; + } + } + + AT_ASSERT(module_debug_info_set.count("top(B)::.aten::add")); + AT_ASSERT(module_debug_info_set.count( + "top(B)::.A0(A)::forward.aten::add")); + AT_ASSERT(module_debug_info_set.count( + "top(B)::.A0(A)::forward.aten::mul")); +} + +TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) { + Module a("A"); + a.define(R"JIT( + def forward(self, x): + return x + 1 + )JIT"); + Module b("B"); + b.define(R"JIT( + def forward(self, x): + return x + 2 + )JIT"); + Module c("C"); + c.register_module("A0", a); + c.register_module("B0", b); + c.define(R"JIT( + def forward(self, x): + return self.A0.forward(x) + self.B0.forward(x) + )JIT"); + + std::stringstream ss; + c._save_for_mobile(ss, {}, true); + mobile::Module bc = _load_for_mobile(ss); + + std::set module_debug_info_set; + size_t pc = 0; + while (true) { + try { + std::string module_info = bc.get_forward_method_debug_info(pc); + if (!module_info.empty() && + (module_info.find("debug_handle") == std::string::npos)) { + module_debug_info_set.insert(module_info); + } + ++pc; + } catch (const std::exception& e) { + break; + } + } + + AT_ASSERT(module_debug_info_set.count("top(C)::.aten::add")); + AT_ASSERT(module_debug_info_set.count( + "top(C)::.A0(A)::forward.aten::add")); + AT_ASSERT(module_debug_info_set.count( + "top(C)::.B0(B)::forward.aten::add")); +} + +TEST(LiteInterpreterTest, GetRuntimeByteCodeVersion) { + auto runtime_bytecode_version = _get_runtime_bytecode_version(); + AT_ASSERT( + runtime_bytecode_version == + caffe2::serialize::kMaxSupportedBytecodeVersion); +} + +/** + * The test below is disarmed for FB internal xplat builds since + * BUCK requires us to pass in the script_module_v4.ptl file in + * as a resource dependency of the build rule for this file, and + * we would need to access it via the C++ Resources API instead + * of directly reading from disk (which is what the open source + * build/run does). + */ +TEST(LiteInterpreterTest, GetByteCodeVersion) { + std::string filePath(__FILE__); + auto test_model_file_v4 = + filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file_v4.append("script_module_v4.ptl"); + + auto version_v4 = _get_model_bytecode_version(test_model_file_v4); + AT_ASSERT(version_v4 == 4); +} +#endif // !defined(FB_XPLAT_BUILD) + namespace { void compareModelOutput( diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index e0c43d2f8e97f..dd89981094d4f 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -319,7 +319,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/testing/hooks_for_testing.cpp", "torch/csrc/utils/tensor_flatten.cpp", "torch/csrc/utils/variadic.cpp", -] +] + libtorch_profiler_sources core_sources_full_mobile = core_sources_full_mobile_no_backend_interface + [ "torch/csrc/jit/backends/backend_debug_info.cpp", @@ -337,7 +337,7 @@ core_sources_full = core_sources_full_mobile + [ "torch/csrc/jit/tensorexpr/external_functions_codegen.cpp", ] -libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources + libtorch_profiler_sources) +libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources) # These files are the only ones that are supported on Windows. libtorch_distributed_base_sources = [ diff --git a/torch/csrc/jit/mobile/debug_info.cpp b/torch/csrc/jit/mobile/debug_info.cpp index a75ffe16c61f5..41ce3c6d46d52 100644 --- a/torch/csrc/jit/mobile/debug_info.cpp +++ b/torch/csrc/jit/mobile/debug_info.cpp @@ -13,12 +13,6 @@ namespace jit { namespace { -C10_ALWAYS_INLINE std::string debugHandlesNotFoundMessage( - const std::string& debug_handles_string) { - return "Debug info for handle(s): " + debug_handles_string + - ", was not found."; -} - std::pair, std::string> getStackTraceWithModuleHierarchy( const DebugInfoTuple& source_callstack, const std::string& caller_name) { @@ -158,7 +152,8 @@ std::string MobileDebugTable::getModuleHierarchyInfo( const std::string& top_module_type_name) const { const auto it = callstack_ptr_map_.find(debug_handle); if (it == callstack_ptr_map_.end()) { - return debugHandlesNotFoundMessage(std::to_string(debug_handle)); + return "Module info for handle, " + std::to_string(debug_handle) + + ", not found."; } return (getStackTraceWithModuleHierarchy( {it->second}, "top", top_module_type_name)) @@ -177,7 +172,8 @@ std::string MobileDebugTable::getSourceDebugString( const std::string& top_module_type_name) const { const auto it = callstack_ptr_map_.find(debug_handle); if (it == callstack_ptr_map_.end()) { - return debugHandlesNotFoundMessage(std::to_string(debug_handle)); + return "Debug info for handle, " + std::to_string(debug_handle) + + ", not found."; } return (getStackTraceWithModuleHierarchy( {it->second}, "top", top_module_type_name)) @@ -212,7 +208,8 @@ std::pair MobileDebugTable:: debug_handles_string += std::to_string(debug_handle); } debug_handles_string += "}"; - debug_handles_string = debugHandlesNotFoundMessage(debug_handles_string); + debug_handles_string = + "Debug info for handles: " + debug_handles_string + ", was not found."; return {debug_handles_string, debug_handles_string}; } return (getStackTraceWithModuleHierarchy( diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index 99be225255ffb..6a548103f6965 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -517,15 +517,12 @@ mobile::Module BytecodeDeserializer::deserialize( auto bvals = std::move(*readArchive("bytecode", mcu).toTuple()).elements(); c10::optional> debug_handles; - bool has_debug_handles{false}; if (reader_->hasRecord("mobile_debug_handles.pkl")) { debug_handles = readArchive("mobile_debug_handles", mcu).toTuple()->elements(); - has_debug_handles = true; } parseMethods(bvals, debug_handles, *mcu); auto m = mobile::Module(readArchive("data", mcu).toObject(), mcu); - m.setHasDebugHandles(has_debug_handles); #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE) MobileDebugTable debug_table = MobileDebugTable(reader_, compilation_unit_); m.setDebugTable(std::move(debug_table)); diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp index ab558cd2bf5e0..02e7c35792693 100644 --- a/torch/csrc/jit/mobile/interpreter.cpp +++ b/torch/csrc/jit/mobile/interpreter.cpp @@ -57,9 +57,6 @@ bool InterpreterState::run(Stack& stack) { auto inst_with_handle = code_->instructions_with_handles_.at(pc); Instruction inst = inst_with_handle.instruction; DebugHandle debug_handle = inst_with_handle.debug_handle; - // If no valid debug handle found then just log pc. - // This is possible when we did not save debug handles - debug_handle = debug_handle == -1 ? pc : debug_handle; // std::cout << "RUNNING " << pc << " " // << code_->instructions_with_handles_[pc].instruction; diff --git a/torch/csrc/jit/mobile/module.cpp b/torch/csrc/jit/mobile/module.cpp index c74ca138d848a..c04d9f74b7378 100644 --- a/torch/csrc/jit/mobile/module.cpp +++ b/torch/csrc/jit/mobile/module.cpp @@ -145,7 +145,8 @@ std::string Module::getCallStack(const int64_t debug_handle) const { // We really need to change this part, so in the next step for profiling support // for delegates, the first thing will be to rewrite how profiling is done // for lite interpreter. -std::string Module::get_forward_method_debug_info(int64_t debug_handle) const { +std::string Module::get_forward_method_debug_info(size_t pc) const { + auto debug_handle = find_method("forward")->get_debug_handle(pc); #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE) return getDebugTable().getModuleHierarchyInfo( debug_handle, getTopModuleTypeName(*this)); diff --git a/torch/csrc/jit/mobile/module.h b/torch/csrc/jit/mobile/module.h index 6102aa517df66..73637aa4584a0 100644 --- a/torch/csrc/jit/mobile/module.h +++ b/torch/csrc/jit/mobile/module.h @@ -78,7 +78,7 @@ class TORCH_API Module { } const std::vector parameters() const; const std::map named_parameters() const; - std::string get_forward_method_debug_info(int64_t debug_handle) const; + std::string get_forward_method_debug_info(size_t pc) const; std::string getModuleHierarchy(const int64_t debug_handle) const; std::string getCallStack(const int64_t debug_handle) const; /// Enables "training" mode. @@ -115,20 +115,11 @@ class TORCH_API Module { return debug_table_; } - void setHasDebugHandles(bool has_debug_handles) { - has_debug_handles_ = has_debug_handles; - } - - bool hasDebugHandles() const { - return has_debug_handles_; - } - private: c10::intrusive_ptr object_; std::unordered_map metadata_; std::shared_ptr cu_; MobileDebugTable debug_table_; - bool has_debug_handles_; }; } // namespace mobile } // namespace jit diff --git a/torch/csrc/jit/mobile/profiler_edge.cpp b/torch/csrc/jit/mobile/profiler_edge.cpp index 162e43f0982a6..bcd5a6258ee7c 100644 --- a/torch/csrc/jit/mobile/profiler_edge.cpp +++ b/torch/csrc/jit/mobile/profiler_edge.cpp @@ -2,6 +2,7 @@ #include #include +namespace profiler = torch::autograd::profiler; namespace torch { namespace jit { namespace mobile { @@ -26,26 +27,17 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler( if (with_modules || with_stack) { auto post_processing = [this, with_stack, with_modules]( std::vector& events) { - std::string no_debug_info("Model was not saved with debug information"); for (auto& e : events) { if (with_modules) { // Since KinetoEvents's module hierarchy takes vector of strings we // just construct a temporary vector using one string element - if (this->m_.hasDebugHandles()) { - e.moduleHierarchy(std::vector( - {this->m_.getModuleHierarchy(e.debugHandle())})); - } else { - e.moduleHierarchy(std::vector({no_debug_info})); - } + e.moduleHierarchy(std::vector( + {this->m_.getModuleHierarchy(e.debugHandle())})); } else if (with_stack) { // Since KinetoEvents's stack trace takes vector of strings we just // construct a temporary vector using one string element - if (this->m_.hasDebugHandles()) { - e.stack(std::vector( - {this->m_.getCallStack(e.debugHandle())})); - } else { - e.stack(std::vector({no_debug_info})); - } + e.stack(std::vector( + {this->m_.getCallStack(e.debugHandle())})); } } }; @@ -63,33 +55,8 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler( trace_file_name_ = fname; } -const std::unique_ptr& KinetoEdgeCPUProfiler:: - disableProfiler() { - TORCH_CHECK( - !profiler_result_, - "KinetoEdgeCPUProfiler already disabled. " - "To get list of events use getProfilerResults()"); - profiler_result_ = profiler::disableProfiler(); - return profiler_result_; -} - -const std::unique_ptr& KinetoEdgeCPUProfiler:: - getProfilerResult() { - TORCH_CHECK( - profiler_result_, - "KinetoEdgeCPUProfiler has not been disabled. " - "use disableProfiler() API first, which returns the ProfilerResult."); - return profiler_result_; -} - KinetoEdgeCPUProfiler::~KinetoEdgeCPUProfiler() { - if (!trace_file_name_.empty()) { - if (profiler_result_) { - profiler_result_->save(trace_file_name_); - } else { - profiler::disableProfiler()->save(trace_file_name_); - } - } + profiler::disableProfiler()->save(trace_file_name_); } } // namespace mobile } // namespace jit diff --git a/torch/csrc/jit/mobile/profiler_edge.h b/torch/csrc/jit/mobile/profiler_edge.h index ef37e01ed4c71..a245034e34f9b 100644 --- a/torch/csrc/jit/mobile/profiler_edge.h +++ b/torch/csrc/jit/mobile/profiler_edge.h @@ -2,7 +2,6 @@ #include #include -namespace profiler = torch::autograd::profiler; namespace torch { namespace jit { namespace mobile { @@ -54,9 +53,6 @@ class TORCH_API KinetoEdgeCPUProfiler { const bool with_flops = false, const bool with_modules = false); - const std::unique_ptr& disableProfiler(); - const std::unique_ptr& getProfilerResult(); - ~KinetoEdgeCPUProfiler(); private: @@ -66,7 +62,6 @@ class TORCH_API KinetoEdgeCPUProfiler { */ const mobile::Module& m_; std::string trace_file_name_; - std::unique_ptr profiler_result_; }; } // namespace mobile } // namespace jit From eee054e6ead98fb872b264e092955b87964db75d Mon Sep 17 00:00:00 2001 From: Kevin Tse Date: Tue, 31 Aug 2021 08:07:23 -0700 Subject: [PATCH 386/530] [DataPipe] implementing fork() (#63649) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63649 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30493945 Pulled By: NivekT fbshipit-source-id: 40db7d4134facd266d86bc0dc2edf2729c4e5842 --- test/test_datapipe.py | 110 ++++++++++++++++++- torch/utils/data/datapipes/iter/__init__.py | 1 + torch/utils/data/datapipes/iter/combining.py | 108 +++++++++++++++++- 3 files changed, 209 insertions(+), 10 deletions(-) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index c35698e057c89..842e4424e9169 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -591,6 +591,105 @@ def test_concat_datapipe(self): self.assertEqual(list(concat_dp), list(range(10)) + list(range(5))) + + def test_fork_datapipe(self): + input_dp = IDP(range(10)) + + # Test Case: making sure all child DataPipe shares the same reference + dp1, dp2, dp3 = input_dp.fork(num_instances=3) + self.assertTrue(all(n1 is n2 for n1, n2 in zip(dp1, dp2))) + self.assertTrue(all(n1 is n3 for n1, n3 in zip(dp1, dp3))) + + # Test Case: one child DataPipe yields all value at a time + output1, output2, output3 = list(dp1), list(dp2), list(dp3) + self.assertEqual(list(range(10)), output1) + self.assertEqual(list(range(10)), output2) + self.assertEqual(list(range(10)), output3) + + # Test Case: two child DataPipes yield value together + dp1, dp2 = input_dp.fork(num_instances=2) + output = [] + for n1, n2 in zip(dp1, dp2): + output.append((n1, n2)) + self.assertEqual([(i, i) for i in range(10)], output) + + # Test Case: one child DataPipe yields all value first, but buffer_size = 5 being too small + dp1, dp2 = input_dp.fork(num_instances=2, buffer_size=5) + it1 = iter(dp1) + for _ in range(5): + next(it1) + with self.assertRaises(BufferError): + next(it1) + + # Test Case: two child DataPipes yield value together with buffer size 1 + dp1, dp2 = input_dp.fork(num_instances=2, buffer_size=1) + output = [] + for n1, n2 in zip(dp1, dp2): + output.append((n1, n2)) + self.assertEqual([(i, i) for i in range(10)], output) + + # Test Case: make sure logic related to slowest_ptr is working properly + dp1, dp2, dp3 = input_dp.fork(num_instances=3) + output1, output2 , output3 = [], [], [] + for i, (n1, n2) in enumerate(zip(dp1, dp2)): + output1.append(n1) + output2.append(n2) + if i == 4: # yield all of dp3 when halfway through dp1, dp2 + output3 = list(dp3) + break + self.assertEqual(list(range(5)), output1) + self.assertEqual(list(range(5)), output2) + self.assertEqual(list(range(10)), output3) + + # Test Case: DataPipe doesn't reset if this pipe hasn't been read + dp1, dp2 = input_dp.fork(num_instances=2) + i1, i2 = iter(dp1), iter(dp2) + output2 = [] + for i, n2 in enumerate(i2): + output2.append(n2) + if i == 4: + i1 = iter(dp1) # Doesn't reset because i1 hasn't been read + self.assertEqual(list(range(10)), output2) + + # Test Case: DataPipe reset when some of it have been read + dp1, dp2 = input_dp.fork(num_instances=2) + i1, i2 = iter(dp1), iter(dp2) + output1, output2 = [], [] + for i, (n1, n2) in enumerate(zip(i1, i2)): + output1.append(n1) + output2.append(n2) + if i == 4: + with warnings.catch_warnings(record=True) as wa: + i1 = iter(dp1) # Reset both all child DataPipe + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + self.assertEqual(list(range(5)) + list(range(10)), output1) + self.assertEqual(list(range(5)) + list(range(10)), output2) + + # Test Case: DataPipe reset, even when some other child DataPipes are not read + dp1, dp2, dp3 = input_dp.fork(num_instances=3) + output1, output2 = list(dp1), list(dp2) + self.assertEqual(list(range(10)), output1) + self.assertEqual(list(range(10)), output2) + output1, output2 = list(dp1), list(dp2) + with warnings.catch_warnings(record=True) as wa: + self.assertEqual(list(range(10)), list(dp1)) # Resets even though dp3 has not been read + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + output3 = [] + for i, n3 in enumerate(dp3): + output3.append(n3) + if i == 4: + with warnings.catch_warnings(record=True) as wa: + output1 = list(dp1) # Resets even though dp3 is only partially read + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + self.assertEqual(list(range(5)), output3) + self.assertEqual(list(range(10)), output1) + break + self.assertEqual(list(range(10)), list(dp3)) # dp3 has to read from the start again + + def test_map_datapipe(self): input_dp = IDP(range(10)) @@ -1333,24 +1432,25 @@ def test_simple_traverse(self): expected: Dict[Any, Any] = {mapped_dp: {numbers_dp: {}}} self.assertEqual(expected, graph) - # TODO(VitalyFedyunin): This test is incorrect because of 'buffer' nature - # of the fork fake implementation, update fork first and fix this test too @skipIfNoDill def test_traverse_forked(self): numbers_dp = NumbersDataset(size=50) - dp0, dp1, dp2 = numbers_dp.fork(3) + dp0, dp1, dp2 = numbers_dp.fork(num_instances=3) dp0_upd = dp0.map(lambda x: x * 10) dp1_upd = dp1.filter(lambda x: x % 3 == 1) combined_dp = dp0_upd.mux(dp1_upd, dp2) graph = torch.utils.data.graph.traverse(combined_dp) - expected = {combined_dp: {dp0_upd: {dp0: {}}, dp1_upd: {dp1: {}}, dp2: {}}} + expected = {combined_dp: {dp0_upd: {dp0: {dp0.main_datapipe: {dp0.main_datapipe.main_datapipe: {}}}}, + dp1_upd: {dp1: {dp1.main_datapipe: {dp1.main_datapipe.main_datapipe: {}}}}, + dp2: {dp2.main_datapipe: {dp2.main_datapipe.main_datapipe: {}}}}} self.assertEqual(expected, graph) class TestSharding(TestCase): + def _get_pipeline(self): numbers_dp = NumbersDataset(size=10) - dp0, dp1 = numbers_dp.fork(2) + dp0, dp1 = numbers_dp.fork(num_instances=2) dp0_upd = dp0.map(lambda x: x * 10) dp1_upd = dp1.filter(lambda x: x % 3 == 1) combined_dp = dp0_upd.mux(dp1_upd) diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py index b55bbf6667509..b460d4d77cf5a 100644 --- a/torch/utils/data/datapipes/iter/__init__.py +++ b/torch/utils/data/datapipes/iter/__init__.py @@ -8,6 +8,7 @@ ) from torch.utils.data.datapipes.iter.combining import ( ConcaterIterDataPipe as Concater, + ForkerIterDataPipe as Forker, ZipperIterDataPipe as Zipper, ) from torch.utils.data.datapipes.iter.filelister import ( diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py index 879e8be27ff0c..85b37324992fc 100644 --- a/torch/utils/data/datapipes/iter/combining.py +++ b/torch/utils/data/datapipes/iter/combining.py @@ -1,7 +1,9 @@ import functools +import warnings from torch.utils.data import IterDataPipe, functional_datapipe -from typing import Iterator, Optional, Sized, Tuple, TypeVar +from typing import Any, Iterator, Optional, Sized, Tuple, TypeVar, Deque +from collections import deque T_co = TypeVar('T_co', covariant=True) @@ -46,6 +48,7 @@ def __len__(self) -> int: # This is fake class to show API, going to be replaced by the copy from torchdata # TODO(VitalyFedyunin): Replace with valid version, documentation and tests class IterateBuffer(IterDataPipe): + def __init__(self, buffer): self.buffer = buffer @@ -56,11 +59,106 @@ def __iter__(self): @functional_datapipe('fork') class ForkerIterDataPipe(IterDataPipe): + r""" :class:`ForkerIterDataPipe`. + + Iterable DataPipe to create multiple instances of the same Iterable DataPipe. + args: + datapipe: Iterable DataPipe being copied + num_instances: number of instances of the datapipe to create + buffer_size: this restricts how far ahead the leading child DataPipe + can read relative to the slowest child DataPipe + """ + def __new__(cls, datapipe: IterDataPipe, num_instances: int, buffer_size: int = 1000): + container = _ForkerIterDataPipe(datapipe, num_instances, buffer_size) + return [_ChildDataPipe(container, i) for i in range(num_instances)] - def __new__(cls, datapipe, instances): - result = [] - buffer = list(datapipe) - return [IterateBuffer(buffer) for i in range(instances)] + +class _ForkerIterDataPipe(IterDataPipe): + r""" :class:`_ForkerIterDataPipe`. + + Container to hold instance-specific information on behalf of ForkerIterDataPipe. It tracks + the state of its child DataPipes, maintains the buffer, and yields the next value + as requested by the child DataPipes. + """ + def __init__(self, datapipe: IterDataPipe, num_instances: int, buffer_size: int = 1000): + self.main_datapipe = datapipe + self._datapipe_iterator: Optional[Iterator[Any]] = None + self.num_instances = num_instances + self.buffer: Deque = deque() + self.buffer_size = buffer_size + self.child_pointers = [0] * num_instances # Indicate the indices of the next element to get + self.slowest_ptr = 0 + self.leading_ptr = 0 + self.end_ptr: Optional[int] = None + + def get_next_element_by_instance(self, instance_id: int): + if self._datapipe_iterator is None: + self._datapipe_iterator = iter(self.main_datapipe) + while self.end_ptr is None or self.child_pointers[instance_id] < self.end_ptr: + if not self.buffer or self.child_pointers[instance_id] > self.leading_ptr: + self.leading_ptr = self.child_pointers[instance_id] + if self.leading_ptr - self.slowest_ptr + 1 > self.buffer_size: + raise BufferError("ForkerIterDataPipe buffer overflow," + + f"buffer size {self.buffer_size} is insufficient.") + try: + self.buffer.append(next(self._datapipe_iterator)) + self.child_pointers[instance_id] += 1 + yield self.buffer[-1] + except StopIteration: + self.end_ptr = self.leading_ptr + else: # Child pointer is slower than or equal to the leading_ptr + buffer_index = self.child_pointers[instance_id] - self.slowest_ptr + return_val = self.buffer[buffer_index] + self.child_pointers[instance_id] += 1 + if self.child_pointers[instance_id] - 1 == self.slowest_ptr: + new_min = min(self.child_pointers) # Can optimize by avoiding the call to min() + if self.slowest_ptr < new_min: + self.slowest_ptr = new_min + self.buffer.popleft() + yield return_val + + def is_instance_started(self, instance_id: int) -> bool: + return self.child_pointers[instance_id] != 0 + + def is_every_instance_exhausted(self) -> bool: + return all(self.end_ptr == ptr for ptr in self.child_pointers) + + def reset(self): + self._datapipe_iterator = iter(self.main_datapipe) + self.buffer = deque() + self.child_pointers = [0] * self.num_instances + self.slowest_ptr = 0 + self.leading_ptr = 0 + self.end_ptr = None + +class _ChildDataPipe(IterDataPipe): + r""" :class:`_ChildDataPipe`. + + Iteratable Datapipe that is a child of a main DataPipe. The instance of this class + will pass its instance_id to get the next value from its main DataPipe. + args: + main_datapipe: Main DataPipe with a method 'get_next_element_by_instance(instance_id)' + instance_id: integer identifier of this instance + """ + def __init__(self, main_datapipe, instance_id: int): + required_attrs = ["get_next_element_by_instance", "is_instance_started", "is_every_instance_exhausted", "reset"] + required_ops = [getattr(main_datapipe, attr) for attr in required_attrs] + if any(not callable(op) for op in required_ops): + raise NotImplementedError(f"Main Datapipe must have methods {required_attrs} implemented.") + self.main_datapipe = main_datapipe + self.instance_id = instance_id + + def __iter__(self): + if self.main_datapipe.is_instance_started(self.instance_id): # Only reset if the DataPipe started to read + if not self.main_datapipe.is_every_instance_exhausted(): + warnings.warn("Some child DataPipes are not exhausted when __iter__ is called. We are resetting " + "the buffer and each child DataPipe will read from the start again.", UserWarning) + self.main_datapipe.reset() + # We want to separate the code for reset and yield, so that 'reset' exeutes before __next__ is called + return self.get_generator_by_instance(self.instance_id) + + def get_generator_by_instance(self, instance_id: int): + yield from self.main_datapipe.get_next_element_by_instance(self.instance_id) @functional_datapipe('demux') From 0deb7a0bc08bf4e65f346f2956de4e5227f3a12e Mon Sep 17 00:00:00 2001 From: Kevin Tse Date: Tue, 31 Aug 2021 08:07:23 -0700 Subject: [PATCH 387/530] [DataPipe] implementing demux() (#63650) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63650 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30493944 Pulled By: NivekT fbshipit-source-id: 0aa06dee8c7fb1744975b8f6a0694b90c11ef80d --- test/test_datapipe.py | 99 ++++++++++++++++++++ torch/utils/data/datapipes/iter/__init__.py | 2 + torch/utils/data/datapipes/iter/combining.py | 94 +++++++++++++++++-- 3 files changed, 187 insertions(+), 8 deletions(-) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 842e4424e9169..b6e3513622e13 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -690,6 +690,105 @@ def test_fork_datapipe(self): self.assertEqual(list(range(10)), list(dp3)) # dp3 has to read from the start again + def test_demux_datapipe(self): + input_dp = IDP(range(10)) + + # Test Case: split into 2 DataPipes and output them one at a time + dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) + output1, output2 = list(dp1), list(dp2) + self.assertEqual(list(range(0, 10, 2)), output1) + self.assertEqual(list(range(1, 10, 2)), output2) + + # Test Case: split into 2 DataPipes and output them together + dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) + output = [] + for n1, n2 in zip(dp1, dp2): + output.append((n1, n2)) + self.assertEqual([(i, i + 1) for i in range(0, 10, 2)], output) + + # Test Case: values of the same classification are lumped together, and buffer_size = 3 being too small + dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: 0 if x >= 5 else 1, buffer_size=4) + it1 = iter(dp1) + with self.assertRaises(BufferError): + next(it1) # Buffer raises because first 5 elements all belong to the a different child + + # Test Case: values of the same classification are lumped together, and buffer_size = 5 is just enough + dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: 0 if x >= 5 else 1, buffer_size=5) + output1, output2 = list(dp1), list(dp2) + self.assertEqual(list(range(5, 10)), output1) + self.assertEqual(list(range(0, 5)), output2) + + # Test Case: classifer returns a value outside of [0, num_instance - 1] + dp = input_dp.demux(num_instances=1, classifier_fn=lambda x: x % 2) + it = iter(dp[0]) + with self.assertRaises(ValueError): + next(it) + next(it) + + # Test Case: DataPipe doesn't reset when it has not been read + dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) + i1 = iter(dp1) + output2 = [] + i = 0 + for i, n2 in enumerate(dp2): + output2.append(n2) + if i == 4: + i1 = iter(dp1) + self.assertEqual(list(range(1, 10, 2)), output2) + + # Test Case: DataPipe reset when some of it has been read + dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) + output1, output2 = [], [] + for n1, n2 in zip(dp1, dp2): + output1.append(n1) + output2.append(n2) + if n1 == 4: + break + with warnings.catch_warnings(record=True) as wa: + i1 = iter(dp1) # Reset all child DataPipes + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + for n1, n2 in zip(dp1, dp2): + output1.append(n1) + output2.append(n2) + self.assertEqual([0, 2, 4] + list(range(0, 10, 2)), output1) + self.assertEqual([1, 3, 5] + list(range(1, 10, 2)), output2) + + # Test Case: DataPipe reset, even when not all child DataPipes are exhausted + dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) + output1 = list(dp1) + self.assertEqual(list(range(0, 10, 2)), output1) + with warnings.catch_warnings(record=True) as wa: + self.assertEqual(list(range(0, 10, 2)), list(dp1)) # Reset even when dp2 is not read + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + output2 = [] + for i, n2 in enumerate(dp2): + output2.append(n2) + if i == 1: + self.assertEqual(list(range(1, 5, 2)), output2) + with warnings.catch_warnings(record=True) as wa: + self.assertEqual(list(range(0, 10, 2)), list(dp1)) # Can reset even when dp2 is partially read + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + break + output2 = list(dp2) # output2 has to read from beginning again + self.assertEqual(list(range(1, 10, 2)), output2) + + # Test Case: drop_none = True + dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2 if x % 5 != 0 else None, + drop_none=True) + self.assertEqual([2, 4, 6, 8], list(dp1)) + self.assertEqual([1, 3, 7, 9], list(dp2)) + + # Test Case: drop_none = False + dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2 if x % 5 != 0 else None, + drop_none=False) + it1 = iter(dp1) + with self.assertRaises(ValueError): + next(it1) + + def test_map_datapipe(self): input_dp = IDP(range(10)) diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py index b460d4d77cf5a..d4baef788ecca 100644 --- a/torch/utils/data/datapipes/iter/__init__.py +++ b/torch/utils/data/datapipes/iter/__init__.py @@ -8,7 +8,9 @@ ) from torch.utils.data.datapipes.iter.combining import ( ConcaterIterDataPipe as Concater, + DemultiplexerIterDataPipe as Demultiplexer, ForkerIterDataPipe as Forker, + MultiplexerIterDataPipe as Multiplexer, ZipperIterDataPipe as Zipper, ) from torch.utils.data.datapipes.iter.filelister import ( diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py index 85b37324992fc..f44db96c15743 100644 --- a/torch/utils/data/datapipes/iter/combining.py +++ b/torch/utils/data/datapipes/iter/combining.py @@ -1,8 +1,7 @@ -import functools import warnings from torch.utils.data import IterDataPipe, functional_datapipe -from typing import Any, Iterator, Optional, Sized, Tuple, TypeVar, Deque +from typing import Any, Callable, Iterator, List, Optional, Sized, Tuple, TypeVar, Deque from collections import deque T_co = TypeVar('T_co', covariant=True) @@ -163,14 +162,93 @@ def get_generator_by_instance(self, instance_id: int): @functional_datapipe('demux') class DemultiplexerIterDataPipe(IterDataPipe): + r""" :class:`DemultiplexerIterDataPipe`. - def __new__(cls, datapipe, instances, classifier_fn): - result = [] - buffer = list(datapipe) + Iterable DataPipe to split the input DataPipe into multiple child DataPipes, using the given + classification function. A list of the child DataPipes is returned from this operation. + args: + datapipe: Iterable DataPipe being filtered + num_instances: number of instances of the DataPipe to create + classifier_fn: a function that maps values to an integer within the range [0, num_instances - 1] or None + drop_none: defaults to False, if True, the function will skip over elements classified as None + buffer_size: this defines the maximum number of inputs that the buffer can hold across all child + DataPipes while waiting for their values to be yielded + """ + def __new__(cls, datapipe: IterDataPipe, num_instances: int, + classifier_fn: Callable[[T_co], int], drop_none: bool = False, buffer_size: int = 1000): + container = _DemultiplexerIterDataPipe(datapipe, num_instances, classifier_fn, drop_none, buffer_size) + return [_ChildDataPipe(container, i) for i in range(num_instances)] + + +class _DemultiplexerIterDataPipe(IterDataPipe): + r""" :class:`_DemultiplexerIterDataPipe`. + + Container to hold instance-specific information on behalf of DemultiplexerIterDataPipe. It tracks + the state of its child DataPipes, maintains the buffer, classifies and yields the next correct value + as requested by the child DataPipes. + """ + + def __init__(self, datapipe: IterDataPipe[T_co], num_instances: int, + classifier_fn: Callable[[T_co], int], drop_none: bool, buffer_size: int): + self.main_datapipe = datapipe + self._datapipe_iterator: Optional[Iterator[Any]] = None + self.num_instances = num_instances + self.max_buffer_size = buffer_size + self.current_buffer_usage = 0 + self.child_buffers: List[Deque[T_co]] = [deque() for _ in range(num_instances)] + self.instance_started: List[bool] = [False] * num_instances + self.classifier_fn = classifier_fn + self.drop_none = drop_none + self.main_datapipe_exhausted = False + + def _find_next(self, instance_id: int) -> T_co: + while True: + if self._datapipe_iterator is None: + raise ValueError("_datapipe_iterator has not been set, likely because this private method is called directly " + "without invoking get_next_element_by_instance() first.") + value = next(self._datapipe_iterator) + classification = self.classifier_fn(value) + if classification is None and self.drop_none: + continue + if classification is None or classification >= self.num_instances or classification < 0: + raise ValueError(f"Output of the classification fn should be between 0 and {self.num_instances - 1}. " + + f"{classification} is returned.") + if classification == instance_id: + return value + self.child_buffers[classification].append(value) + self.current_buffer_usage += 1 + if self.current_buffer_usage > self.max_buffer_size: + raise BufferError( + f"DemultiplexerIterDataPipe buffer overflow, buffer size {self.max_buffer_size} is insufficient.") - def filter_fn(classifier_fn, i, x): - return classifier_fn(x) == i - return [IterateBuffer(buffer).filter(functools.partial(filter_fn, classifier_fn, i)) for i in range(instances)] + def get_next_element_by_instance(self, instance_id: int): + if self._datapipe_iterator is None: + self._datapipe_iterator = iter(self.main_datapipe) + stop = False + self.instance_started[instance_id] = True + while not stop: + if self.child_buffers[instance_id]: + self.current_buffer_usage -= 1 + yield self.child_buffers[instance_id].popleft() + else: + try: + yield self._find_next(instance_id) + except StopIteration: + stop = True + self.main_datapipe_exhausted = True + + def is_instance_started(self, instance_id: int) -> bool: + return self.instance_started[instance_id] + + def is_every_instance_exhausted(self) -> bool: + return self.main_datapipe_exhausted and all(not child_buffer for child_buffer in self.child_buffers) + + def reset(self): + self._datapipe_iterator = iter(self.main_datapipe) + self.current_buffer_usage = 0 + self.child_buffers = [deque() for _ in range(self.num_instances)] + self.instance_started = [False] * self.num_instances + self.main_datapipe_exhausted = False @functional_datapipe('mux') class MultiplexerIterDataPipe(IterDataPipe): From 0ef8760bf6b3e8098ef42df60f1e451234151f32 Mon Sep 17 00:00:00 2001 From: Kevin Tse Date: Tue, 31 Aug 2021 08:07:23 -0700 Subject: [PATCH 388/530] [DataPipe] implementing __len__ for fork (no valid length for demux) (#64215) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64215 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30648672 Pulled By: NivekT fbshipit-source-id: 4780f2f6a79ae15a4009092475e7d92f96dd09a2 --- test/test_datapipe.py | 13 +++++++++++++ torch/utils/data/datapipes/iter/combining.py | 15 ++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index b6e3513622e13..4e37f41565226 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -689,6 +689,12 @@ def test_fork_datapipe(self): break self.assertEqual(list(range(10)), list(dp3)) # dp3 has to read from the start again + # Test Case: Each DataPipe inherits the source datapipe's length + dp1, dp2, dp3 = input_dp.fork(num_instances=3) + self.assertEqual(len(input_dp), len(dp1)) + self.assertEqual(len(input_dp), len(dp2)) + self.assertEqual(len(input_dp), len(dp3)) + def test_demux_datapipe(self): input_dp = IDP(range(10)) @@ -788,6 +794,13 @@ def test_demux_datapipe(self): with self.assertRaises(ValueError): next(it1) + # Test Case: __len__ not implemented + dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) + with self.assertRaises(TypeError): + len(dp1) # It is not implemented as we do not know length for each child in advance + with self.assertRaises(TypeError): + len(dp2) + def test_map_datapipe(self): input_dp = IDP(range(10)) diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py index f44db96c15743..a837c5bb101c7 100644 --- a/torch/utils/data/datapipes/iter/combining.py +++ b/torch/utils/data/datapipes/iter/combining.py @@ -61,7 +61,8 @@ class ForkerIterDataPipe(IterDataPipe): r""" :class:`ForkerIterDataPipe`. Iterable DataPipe to create multiple instances of the same Iterable DataPipe. - args: + + Args: datapipe: Iterable DataPipe being copied num_instances: number of instances of the datapipe to create buffer_size: this restricts how far ahead the leading child DataPipe @@ -90,6 +91,9 @@ def __init__(self, datapipe: IterDataPipe, num_instances: int, buffer_size: int self.leading_ptr = 0 self.end_ptr: Optional[int] = None + def __len__(self): + return len(self.main_datapipe) + def get_next_element_by_instance(self, instance_id: int): if self._datapipe_iterator is None: self._datapipe_iterator = iter(self.main_datapipe) @@ -135,7 +139,8 @@ class _ChildDataPipe(IterDataPipe): Iteratable Datapipe that is a child of a main DataPipe. The instance of this class will pass its instance_id to get the next value from its main DataPipe. - args: + + Args: main_datapipe: Main DataPipe with a method 'get_next_element_by_instance(instance_id)' instance_id: integer identifier of this instance """ @@ -156,6 +161,9 @@ def __iter__(self): # We want to separate the code for reset and yield, so that 'reset' exeutes before __next__ is called return self.get_generator_by_instance(self.instance_id) + def __len__(self): + return len(self.main_datapipe) + def get_generator_by_instance(self, instance_id: int): yield from self.main_datapipe.get_next_element_by_instance(self.instance_id) @@ -166,7 +174,8 @@ class DemultiplexerIterDataPipe(IterDataPipe): Iterable DataPipe to split the input DataPipe into multiple child DataPipes, using the given classification function. A list of the child DataPipes is returned from this operation. - args: + + Args: datapipe: Iterable DataPipe being filtered num_instances: number of instances of the DataPipe to create classifier_fn: a function that maps values to an integer within the range [0, num_instances - 1] or None From ca8dd296ee42fd68b8c9360d10916e02e009eeff Mon Sep 17 00:00:00 2001 From: Kushashwa Ravi Shrimali Date: Tue, 31 Aug 2021 09:45:09 -0700 Subject: [PATCH 389/530] Add OpInfo for `nn.functional.cosine_similarity` (#62959) Summary: Please see https://github.com/facebookresearch/functorch/issues/78 and https://github.com/pytorch/pytorch/issues/54261. Notes: * Some redundant tests from `test_nn.py` have been removed. I'm unsure about precision checks if they can be removed as well. * Broadcasting is also checked in the OpInfo for `cosine_similarity`. cc: mruberry zou3519 Chillee Pull Request resolved: https://github.com/pytorch/pytorch/pull/62959 Reviewed By: heitorschueroff Differential Revision: D30520176 Pulled By: zou3519 fbshipit-source-id: 14e902eb4bcce875edab28a1669a2ea021052b9b --- test/test_nn.py | 20 -------------- .../_internal/common_methods_invocations.py | 27 +++++++++++++++++++ 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/test/test_nn.py b/test/test_nn.py index 96321ba183be0..5008c7256acf7 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -9617,25 +9617,6 @@ def test_huber_loss_zero_delta(): test_huber_loss_zero_delta() def test_cosine_similarity(self): - input1 = torch.randn(4, 4, requires_grad=True) - input2 = torch.randn(4, 4, requires_grad=True) - self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y), (input1, input2))) - - input1 = torch.randn(4, 5, 6, requires_grad=True) - input2 = torch.randn(4, 5, 6, requires_grad=True) - self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y, dim=0), (input1, input2))) - self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y, dim=-1), (input1, input2))) - - input1 = torch.randn((), requires_grad=True) - input2 = torch.randn((), requires_grad=True) - self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y, dim=0), (input1, input2))) - self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y, dim=-1), (input1, input2))) - - # Check broadcasting - input1 = torch.randn(2, 1, 3, requires_grad=True) - input2 = torch.randn(1, 2, 3, requires_grad=True) - self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y, dim=-1), (input1, input2))) - # Check cosine_similarity input/output shapes input_size = (1, 3, 2, 1) expected_size = (1, 2, 1) @@ -9662,7 +9643,6 @@ def test_cosine_similarity(self): with self.assertRaises(RuntimeError): F.cosine_similarity(input1, input2) - # Check type promotion, issue #61454 input = torch.tensor(12.) out = F.cosine_similarity(input.to(torch.int8), input, dim=-1) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index e7d93807511a5..3579310dc68c9 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1249,6 +1249,26 @@ def sample_inputs_linalg_norm(op_info, device, dtype, requires_grad): dim=(0, 1)))) return inputs +def sample_inputs_cosine_similarity(op_info, device, dtype, requires_grad, **kwargs): + make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) + + # Ordered as input_shape, dict of dim and eps + cases: Tuple[tuple, dict] = ( # type: ignore[assignment] + ((S, S), {'dim': 1}), + ((S, 2), {'dim': -1}), + ((S,), {'dim': 0, 'eps': 0.5}), + ((), {'dim': 0}), + ((S, S, M), {'dim': 2}), + ((S, S), {}) + ) + + def generator(): + for input_shape, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=(make_arg(input_shape),), kwargs=kwargs) + # Test for Broadcasting + yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) + + return list(generator()) def sample_inputs_nn_activation_relu(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -7175,6 +7195,13 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): # FIXME: aminmax does not check for safe casting to output SkipInfo('TestCommon', 'test_out'), )), + OpInfo('nn.functional.cosine_similarity', + aten_name="cosine_similarity", + dtypes=floating_types_and(torch.bfloat16), + dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + supports_out=False, + supports_forward_ad=True, + sample_inputs_func=sample_inputs_cosine_similarity), OpInfo('nn.functional.adaptive_avg_pool2d', dtypes=floating_types(), dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16), From b9275a40034377a99f245cde36f63462a9dc0995 Mon Sep 17 00:00:00 2001 From: Raghuraman Krishnamoorthi Date: Tue, 31 Aug 2021 09:45:28 -0700 Subject: [PATCH 390/530] [ao][docs] Add description of qconfig and qengine to quantization page (#63582) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63582 Current quantization docs do not define qconfig and qengine. Added text to define these concepts before they are used. ghstack-source-id: 137051719 Test Plan: Imported from OSS Reviewed By: HDCharles Differential Revision: D30658656 fbshipit-source-id: a45a0fcdf685ca1c3f5c3506337246a430f8f506 --- docs/source/quantization.rst | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index eb6c74c72facd..7053ca68d920e 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -35,6 +35,13 @@ that perform all or part of the computation in lower precision. Higher-level APIs are provided that incorporate typical workflows of converting FP32 model to lower precision with minimal accuracy loss. +Quantization requires users to be aware of three concepts: + +#. Quantization Config (Qconfig): Specifies how weights and activations are to be quantized. Qconfig is needed to create a quantized model. +#. Backend: Refers to kernels that support quantization, usually with different numerics. +#. Quantization engine (torch.backends.quantization.engine): When a quantized model is executed, the qengine specifies which backend is to be used for execution. It is important to ensure that the qengine is consistent with the Qconfig. + + Natively supported backends --------------------------- @@ -45,7 +52,8 @@ Today, PyTorch supports the following backends for running quantized operators e * ARM CPUs (typically found in mobile/embedded devices), via `qnnpack` (``_). -The corresponding implementation is chosen automatically based on the PyTorch build mode. +The corresponding implementation is chosen automatically based on the PyTorch build mode, though users +have the option to override this by setting `torch.backends.quantization.engine` to `fbgemm` or `qnnpack`. .. note:: @@ -58,7 +66,7 @@ The corresponding implementation is chosen automatically based on the PyTorch bu When preparing a quantized model, it is necessary to ensure that qconfig -and the qengine used for quantized computations match the backend on which +and the engine used for quantized computations match the backend on which the model will be executed. The qconfig controls the type of observers used during the quantization passes. The qengine controls whether `fbgemm` or `qnnpack` specific packing function is used when packing weights for linear From 83e28a7d281c91a6d1a12b86bd5fb212dd424a85 Mon Sep 17 00:00:00 2001 From: Saketh Are Date: Tue, 31 Aug 2021 10:59:57 -0700 Subject: [PATCH 391/530] Use stacklevel for floordiv deprecation warnings (#64034) Summary: Fixes https://github.com/pytorch/pytorch/issues/60548 `Tensor.__floordiv__` was indirectly deprecated by deprecation of `torch.floor_divide` (see https://github.com/pytorch/pytorch/issues/43874). Deprecating it directly provides clearer feedback. Repro: ``` import torch x = torch.tensor(0) x // 1 ``` Before this change, a deprecation warning was triggered within the C++ implementation of floor_divide: ``` UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at ../aten/src/ATen/native/BinaryOps.cpp:571.) return torch.floor_divide(self, other) ``` After this change, the warning instead cites the user's offending line of Python code: ``` UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). x // 1 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/64034 Reviewed By: mruberry Differential Revision: D30658010 Pulled By: saketh-are fbshipit-source-id: b0e6c5008d741897509d102f4a89efb47de4aa2a --- test/test_binary_ufuncs.py | 4 ++-- test/test_sparse.py | 4 ++-- torch/_tensor.py | 14 ++++++++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index 1e9e804ab86d1..2695ab6a86115 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -1622,7 +1622,7 @@ def test_floor_divide_tensor(self, device, dtype): x = torch.randn(10, device=device).mul(30).to(dtype) y = torch.arange(1, 11, dtype=dtype, device=device) - with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): + with self.assertWarnsOnceRegex(UserWarning, "__floordiv__"): z = x // y z_alt = torch.trunc(x.double() / y.double()).to(dtype) @@ -1634,7 +1634,7 @@ def test_floor_divide_tensor(self, device, dtype): def test_floor_divide_scalar(self, device, dtype): x = torch.randn(100, device=device).mul(10).to(dtype) - with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): + with self.assertWarnsOnceRegex(UserWarning, "__floordiv__"): z = x // 3 z_alt = torch.tensor([math.trunc(v.item() / 3.) for v in x], dtype=x.dtype, device=device) diff --git a/test/test_sparse.py b/test/test_sparse.py index aaf045c4b0ea0..8fa32edbc5e8a 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -1562,7 +1562,7 @@ def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v, dtype, device, self.assertEqual(self.safeToDense(y1), expected) self.assertEqual(self.safeToDense(y2), expected) - with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'): + with self.assertWarnsOnceRegex(UserWarning, '__floordiv__'): y1 = x1 // 37.5 y2 = x1.clone() with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'): @@ -2915,7 +2915,7 @@ def test_div_by_sparse_error(self, device): / torch.tensor(1., device=device).to_sparse()) def test_floor_divide_by_sparse_error(self, device): - self.assertRaisesRegex(RuntimeError, 'Sparse floor division requires', + self.assertRaisesRegex(RuntimeError, 'Sparse division requires', lambda: torch.tensor(1., device=device).to_sparse() // torch.tensor(1., device=device).to_sparse()) diff --git a/torch/_tensor.py b/torch/_tensor.py index b4cee9aa2a32c..e7bc4ed9165a2 100644 --- a/torch/_tensor.py +++ b/torch/_tensor.py @@ -582,11 +582,21 @@ def __rpow__(self, other): @_wrap_type_error_to_not_implemented def __floordiv__(self, other): - return torch.floor_divide(self, other) + warnings.warn("__floordiv__ is deprecated, and its behavior will change in a future version of pytorch. " + "It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). " + "This results in incorrect rounding for negative values. " + "To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), " + "or for actual floor division, use torch.div(a, b, rounding_mode='floor').", stacklevel=3) + return torch.div(self, other, rounding_mode='trunc') @_wrap_type_error_to_not_implemented def __rfloordiv__(self, other): - return torch.floor_divide(other, self) + warnings.warn("__rfloordiv__ is deprecated, and its behavior will change in a future version of pytorch. " + "It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). " + "This results in incorrect rounding for negative values. " + "To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), " + "or for actual floor division, use torch.div(a, b, rounding_mode='floor').", stacklevel=3) + return torch.div(other, self, rounding_mode='trunc') @_wrap_type_error_to_not_implemented def __rlshift__(self, other): From 845bc89811f59822fe585cf44e774857adefcff7 Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Tue, 31 Aug 2021 11:29:07 -0700 Subject: [PATCH 392/530] [fx2trt] Add acc_ops.sign and converter for it (#63876) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63876 Add `acc_ops.sign` which maps from `torch.sign`. Add a plugin (not support dynamic shape currently) for `acc_ops.sign`. The plugin calls `at::sign` directly. Test Plan: buck test mode/opt -c python.package_style=inplace -c fbcode.nvcc_arch=a100 caffe2/torch/fb/fx2trt:test_unary_ops Reviewed By: yinghai Differential Revision: D30518081 fbshipit-source-id: a0b9e6c30deac0b04b8cb09a162579e229985330 --- .../fx2trt/converters/acc_ops_converters.py | 1 - torch/fx/experimental/fx2trt/fx2trt.py | 18 +++++++++++------- torch/fx/experimental/fx_acc/acc_ops.py | 6 ++++++ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index ba370b2b067d4..e101b6b7f22ff 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -1098,7 +1098,6 @@ def acc_ops_clamp(network, target, args, kwargs, name): return input_val - @tensorrt_converter(acc_ops.tuple_construct) def acc_ops_tuple_construct(network, target, args, kwargs, name): return kwargs["tensors"] diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py index 72497a7d2aafc..f1d17e701790d 100644 --- a/torch/fx/experimental/fx2trt/fx2trt.py +++ b/torch/fx/experimental/fx2trt/fx2trt.py @@ -4,6 +4,7 @@ import tensorrt as trt import torch import torch.fx +from torch.fx.node import _get_qualified_name # Borrowed from torch2trt @@ -226,14 +227,15 @@ def __init__( else: self.network = self.builder.create_network() + missing_ops = self.validate_conversion() + if missing_ops: + warnings.warn("Interpretation will fail due to missing operations \n" + + "\n".join(f"{i}" for i in missing_ops)) + self.optimization_profiles: Optional[List] = None self.input_specs = input_specs self.input_specs_iter = 0 self.validate_input_specs() - missing_ops = self.validate_conversion - if not missing_ops: - warnings.warn("Interpretation may fail due to missing operations \n" - + "\n".join(f"{i}" for i in missing_ops)) self._cur_node_name: Optional[str] = None self._input_names: List[str] = [] self._output_names: List[str] = [] @@ -299,13 +301,15 @@ def validate_conversion(self): missing_converter = set() for node in self.module.graph.nodes: - if node.op in ["call_function", "call_method"] and not CONVERTERS.get(node.target): - missing_converter.add(f"{node.op} {node.target}") + if node.op == "call_function" and not CONVERTERS.get(node.target): + missing_converter.add(f"{node.op} {_get_qualified_name(node.target)}") + elif node.op == "call_method" and not CONVERTERS.get(node.target): + missing_converter.add(f"{node.op} torch.Tensor.{node.target}") elif node.op == "call_module": submod = self.fetch_attr(node.target) submod_type = getattr(submod, "_base_class_origin", type(submod)) if not CONVERTERS.get(submod_type): - missing_converter.add(f"{node.op} {submod_type}") + missing_converter.add(f"{node.op} {torch.typename(submod_type)}") return missing_converter diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py index 1b4b4690f732d..b10d35edd5baa 100644 --- a/torch/fx/experimental/fx_acc/acc_ops.py +++ b/torch/fx/experimental/fx_acc/acc_ops.py @@ -95,6 +95,12 @@ def avg_pool2d( return nn.functional.avg_pool2d(**locals()) +@register_acc_op_mapping(op_and_target=("call_function", torch.sign)) +@register_acc_op +def sign(*, input): + return torch.sign(input) + + @register_acc_op def size(*, input): return input.size() From 3a46edb8d8fa1fdb120102a9af4517c08864c580 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 31 Aug 2021 12:09:59 -0700 Subject: [PATCH 393/530] ns for fx: make layer types more readable (#64270) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64270 Before this PR, layer types were populated by doing `str(module_instance)` and `str(function)`. This resulted in moderately readable strings for modules, and poorly readable strings for functions. This PR switches the logic to use `torch.typename` utility instead. The results are significantly more readable. Example function type: ``` # before '' # after 'torch._ops.quantized.PyCapsule.linear' ``` Example module type: ``` # before "" # after 'torch.nn.quantized.modules.conv.Conv2d' ``` Test Plan: Manually inspect NS results for modules and functions, verify they are more readable. Manually inspect NS results for modules and functions, verify they are more readable. Imported from OSS Differential Revision: D30669545 D30669545 Reviewed By: jerryzh168 Pulled By: vkuzo fbshipit-source-id: 60959e5cafa0a4992b083bf99f5d8260f9acdac0 --- torch/quantization/ns/utils.py | 6 +++--- torch/quantization/ns/weight_utils.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/torch/quantization/ns/utils.py b/torch/quantization/ns/utils.py index 678f60a00c8cc..62397d0de0f94 100644 --- a/torch/quantization/ns/utils.py +++ b/torch/quantization/ns/utils.py @@ -317,15 +317,15 @@ def get_arg_indices_of_inputs_to_log(node: Node) -> List[int]: def get_target_type_str(node: Node, gm: GraphModule) -> str: """ Returns a string representation of the type of the function or module - pointed to by this node, or '' for other op types. + pointed to by this node, or '' for other node types. """ target_type = "" if node.op in ("call_function", "call_method"): - target_type = str(node.target) + target_type = torch.typename(node.target) elif node.op == "call_module": assert isinstance(node.target, str) target_mod = getattr_from_fqn(gm, node.target) - target_type = str(type(target_mod)) + target_type = torch.typename(target_mod) return target_type diff --git a/torch/quantization/ns/weight_utils.py b/torch/quantization/ns/weight_utils.py index 724cdc7a40ae6..36e183efe1d8e 100644 --- a/torch/quantization/ns/weight_utils.py +++ b/torch/quantization/ns/weight_utils.py @@ -231,6 +231,8 @@ def extract_weight_from_node( op_to_type_to_weight_extraction_fn = get_op_to_type_to_weight_extraction_fn() ref_node_type = get_target_type_str(node, gm) + # for extracting weights, these are always the same + prev_node_type = ref_node_type if node.op == 'call_function': function_mapping = op_to_type_to_weight_extraction_fn['call_function'] @@ -241,7 +243,7 @@ def extract_weight_from_node( 'type': res_type, 'values': [weight], 'prev_node_name': node.name, - 'prev_node_target_type': str(node.target), + 'prev_node_target_type': prev_node_type, 'ref_node_name': node.name, 'ref_node_target_type': ref_node_type, 'index_within_arg': 0, @@ -261,7 +263,7 @@ def extract_weight_from_node( 'type': res_type, 'values': [weight], 'prev_node_name': node.name, - 'prev_node_target_type': str(type(mod)), + 'prev_node_target_type': prev_node_type, 'ref_node_name': node.name, 'ref_node_target_type': ref_node_type, 'index_within_arg': 0, From 347ef69529ae2bc38878ed97345b8fc8039dfa56 Mon Sep 17 00:00:00 2001 From: Raghuraman Krishnamoorthi Date: Tue, 31 Aug 2021 12:22:13 -0700 Subject: [PATCH 394/530] [ao][docs] Clarify operator support for quantization (#63270) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63270 Add table to quantization main page showing supported modules for static and dynamic quantization. ghstack-source-id: 137087204 Test Plan: Imported from OSS Reviewed By: HDCharles Differential Revision: D30658654 fbshipit-source-id: a82c998e1db6370596d5b0ca4c7cc96c1c90f30e --- docs/source/quantization.rst | 41 +++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index 7053ca68d920e..a86368ef8d660 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -147,16 +147,13 @@ The following table compares the differences between Eager Mode Quantization and +-----------------+-------------------+-------------------+ -Eager Mode Quantization -^^^^^^^^^^^^^^^^^^^^^^^ - -There are three types of quantization supported in Eager Mode Quantization: +There are three types of quantization supported: 1. dynamic quantization (weights quantized with activations read/stored in floating point and quantized for compute.) 2. static quantization (weights quantized, activations quantized, calibration required post training) -3. quantization aware training (weights quantized, activations quantized, +3. static quantization aware training (weights quantized, activations quantized, quantization numerics modeled during training) Please see our `Introduction to Quantization on Pytorch @@ -164,6 +161,40 @@ Please see our `Introduction to Quantization on Pytorch for a more comprehensive overview of the tradeoffs between these quantization types. +Operator coverage varies between dynamic and static quantization and is captured in the table below. +Note that for FX quantization, the corresponding functionals are also supported. + ++---------------------------+-------------------+--------------------+ +| |Static | Dynamic | +| |Quantization | Quantization | ++---------------------------+-------------------+--------------------+ +| | nn.Linear | | Y | | Y | +| | nn.Conv1d/2d/3d | | Y | | N | ++---------------------------+-------------------+--------------------+ +| | nn.LSTM | | N | | Y | +| | nn.GRU | | N | | Y | ++---------------------------+-------------------+--------------------+ +| | nn.RNNCell | | N | | Y | +| | nn.GRUCell | | N | | Y | +| | nn.LSTMCell | | N | | Y | ++---------------------------+-------------------+--------------------+ +|nn.EmbeddingBag | Y (activations | | +| | are in fp32) | Y | ++---------------------------+-------------------+--------------------+ +|nn.Embedding | Y | N | ++---------------------------+-------------------+--------------------+ +|nn.MultiheadAttention |Not Supported | Not supported | ++---------------------------+-------------------+--------------------+ +|Activations |Broadly supported | Un-changed, | +| | | computations | +| | | stay in fp32 | ++---------------------------+-------------------+--------------------+ + + +Eager Mode Quantization +^^^^^^^^^^^^^^^^^^^^^^^ + + Dynamic Quantization ~~~~~~~~~~~~~~~~~~~~ From 555171a273ef7bbea65b517508141192c83c95c5 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Tue, 31 Aug 2021 12:50:11 -0700 Subject: [PATCH 395/530] .circleci: Remove migrated jobs, move docs builds (#64222) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64222 Removes both backwards_compat as well as docs_test from the general gcc5.4 config and moves the docs build from being run on every PR to only being run on master. We can remove docs builds when we migrate the docs push job (including all secrets associated with that) Signed-off-by: Eli Uriegas cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D30650953 Pulled By: seemethere fbshipit-source-id: ac11da6a551a6c81f3dc1d47fd81846cbfe9975a --- .../cimodel/data/pytorch_build_definitions.py | 29 ++----------------- .circleci/config.yml | 28 ++++++++++-------- 2 files changed, 18 insertions(+), 39 deletions(-) diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py index d7b20158759d0..305bbb4d354bb 100644 --- a/.circleci/cimodel/data/pytorch_build_definitions.py +++ b/.circleci/cimodel/data/pytorch_build_definitions.py @@ -214,7 +214,7 @@ def gen_docs_configs(xenial_parent_config): HiddenConf( "pytorch_python_doc_build", parent_build=xenial_parent_config, - filters=gen_filter_dict(branches_list=r"/.*/", + filters=gen_filter_dict(branches_list=["master"], tags_list=RC_PATTERN), ) ) @@ -230,7 +230,7 @@ def gen_docs_configs(xenial_parent_config): HiddenConf( "pytorch_cpp_doc_build", parent_build=xenial_parent_config, - filters=gen_filter_dict(branches_list=r"/.*/", + filters=gen_filter_dict(branches_list=["master"], tags_list=RC_PATTERN), ) ) @@ -241,13 +241,6 @@ def gen_docs_configs(xenial_parent_config): branch="master", ) ) - - configs.append( - HiddenConf( - "pytorch_doc_test", - parent_build=xenial_parent_config - ) - ) return configs @@ -396,24 +389,6 @@ def instantiate_configs(only_slow_gradcheck): if cuda_version == "10.2" and python_version == "3.6" and not is_libtorch and not is_slow_gradcheck: c.dependent_tests = gen_dependent_configs(c) - if ( - compiler_name == "gcc" - and compiler_version == "5.4" - and not is_libtorch - and not is_vulkan - and not is_pure_torch - and parallel_backend is None - ): - bc_breaking_check = Conf( - "backward-compatibility-check", - [], - is_xla=False, - restrict_phases=["test"], - is_libtorch=False, - is_important=True, - parent_build=c, - ) - c.dependent_tests.append(bc_breaking_check) if ( compiler_name != "clang" diff --git a/.circleci/config.yml b/.circleci/config.yml index 1bb32b5cc0a3d..324e5fdd9b3cb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7112,7 +7112,8 @@ workflows: - pytorch_python_doc_build: filters: branches: - only: /.*/ + only: + - master tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: @@ -7132,7 +7133,8 @@ workflows: - pytorch_cpp_doc_build: filters: branches: - only: /.*/ + only: + - master tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: @@ -7149,16 +7151,6 @@ workflows: name: pytorch_cpp_doc_push requires: - pytorch_cpp_doc_build - - pytorch_doc_test: - requires: - - pytorch_linux_xenial_py3_6_gcc5_4_build - - pytorch_linux_test: - name: pytorch_linux_backward_compatibility_check_test - requires: - - pytorch_linux_xenial_py3_6_gcc5_4_build - build_environment: "pytorch-linux-backward-compatibility-check-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - resource_class: large - pytorch_linux_test: name: pytorch_linux_pytorch_linux_xenial_py3_6_gcc5_4_distributed_test requires: @@ -9386,6 +9378,18 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-xenial-py3.6-gcc7" image_name: "pytorch-linux-xenial-py3.6-gcc7" + - pytorch_linux_build: + name: pytorch_linux_xenial_py3_6_gcc5_4_build + requires: + - "docker-pytorch-linux-xenial-py3.6-gcc5.4" + build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + - pytorch_python_doc_build: + requires: + - pytorch_linux_xenial_py3_6_gcc5_4_build + - pytorch_cpp_doc_build: + requires: + - pytorch_linux_xenial_py3_6_gcc5_4_build - pytorch_linux_build: name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build requires: From 1c2b5e59ae53ef3042ad1b5cf9aa275391c92971 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 31 Aug 2021 12:51:20 -0700 Subject: [PATCH 396/530] Remove ref to test_distributed_fork (#64197) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64197 Removes this line as test is gone. ghstack-source-id: 136986275 Test Plan: CI Reviewed By: walterddr Differential Revision: D30642929 fbshipit-source-id: a0c7dfdfb35a4a7f7ec1b881dbea53d85136012c --- test/run_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index 77e7f150c16e9..d0871fa2a0d30 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -345,7 +345,6 @@ ] DISTRIBUTED_TESTS = [ - "distributed/test_distributed_fork", "distributed/test_distributed_spawn", ] From 8d08b103be936d78d5d4ed90c0547aeccb8ce166 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 31 Aug 2021 13:29:39 -0700 Subject: [PATCH 397/530] [CUDA graphs] Prototype API and documentation (#63269) Summary: RFC: https://github.com/pytorch/pytorch/issues/61880 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63269 Reviewed By: mruberry Differential Revision: D30596643 Pulled By: ngimel fbshipit-source-id: b1f8061406364b667e2c2d4d30fbce1f0d8456be --- aten/src/ATen/cuda/CUDAGraphsUtils.cuh | 13 + aten/src/ATen/native/cudnn/Conv_v7.cpp | 4 + c10/cuda/CUDACachingAllocator.cpp | 2 + docs/source/cuda.rst | 11 + docs/source/notes/cuda.rst | 453 ++++++++++++++++++++++++- test/test_cuda.py | 116 +++++-- torch/_C/__init__.pyi.in | 9 +- torch/csrc/cuda/Graph.cpp | 25 +- torch/cuda/__init__.py | 3 +- torch/cuda/graphs.py | 408 ++++++++++++++++++++++ torch/cuda/streams.py | 5 - 11 files changed, 996 insertions(+), 53 deletions(-) create mode 100644 torch/cuda/graphs.py diff --git a/aten/src/ATen/cuda/CUDAGraphsUtils.cuh b/aten/src/ATen/cuda/CUDAGraphsUtils.cuh index c25ba88a6537c..9d42ed759939b 100644 --- a/aten/src/ATen/cuda/CUDAGraphsUtils.cuh +++ b/aten/src/ATen/cuda/CUDAGraphsUtils.cuh @@ -42,5 +42,18 @@ inline void assertNotCapturing(std::string attempt) { status); } +inline void errorIfCapturingCudnnBenchmark(std::string version_specific) { + auto status = currentStreamCaptureStatus(); + TORCH_CHECK(status == CaptureStatus::None, + "Current cudaStreamCaptureStatus: ", + status, + "\nCapturing ", + version_specific, + "is prohibited. Possible causes of this error:\n" + "1. No warmup iterations occurred before capture.\n" + "2. The convolutions you're trying to capture use dynamic shapes, " + "in which case capturing them is generally prohibited."); +} + } // namespace cuda } // namespace at diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index 7d16f0a9a910f..27863d060d2dd 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -292,6 +293,7 @@ struct algorithm_search { } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); + at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind"); AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionForwardAlgorithmEx( args.handle, args.idesc.desc(), args.input.data_ptr(), @@ -362,6 +364,7 @@ struct algorithm_search { } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); + at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind"); AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardDataAlgorithmEx( args.handle, args.wdesc.desc(), args.weight.data_ptr(), @@ -434,6 +437,7 @@ struct algorithm_search { } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); + at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind"); AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardFilterAlgorithmEx( args.handle, args.idesc.desc(), args.input.data_ptr(), diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 055375352ee08..659fea351d467 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -308,6 +308,8 @@ cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) { } else { // It's ok to capture cudaMallocs, as long as we never cudaFree those // addresses before replay. + // Capturing cudaMalloc behaves nicely: it gives the graph new VA, + // but is ignored (won't leakily allocate new memory) in replays. at::cuda::CUDAStreamCaptureModeGuard g{cudaStreamCaptureModeRelaxed}; return cudaMalloc(p, size); } diff --git a/docs/source/cuda.rst b/docs/source/cuda.rst index d4783c867b82a..75029332aa481 100644 --- a/docs/source/cuda.rst +++ b/docs/source/cuda.rst @@ -71,6 +71,17 @@ Streams and events Stream Event +Graphs (prototype) +------------------ +.. autosummary:: + :toctree: generated + :nosignatures: + + graph_pool_handle + CUDAGraph + graph + make_graphed_callables + Memory management ----------------- .. autosummary:: diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst index 264017f0203cc..5d7c0ea48f669 100644 --- a/docs/source/notes/cuda.rst +++ b/docs/source/notes/cuda.rst @@ -262,7 +262,7 @@ have the same stream-semantics relationship as any group of ops:: BC note: Using grads on the default stream ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In prior versions of Pytorch (1.9 and earlier), the autograd engine always synced +In prior versions of PyTorch (1.9 and earlier), the autograd engine always synced the default stream with all backward ops, so the following pattern:: with torch.cuda.stream(s): @@ -270,7 +270,7 @@ the default stream with all backward ops, so the following pattern:: use grads was safe as long as ``use grads`` happened on the default stream. -In present Pytorch, that pattern is no longer safe. If ``backward()`` +In present PyTorch, that pattern is no longer safe. If ``backward()`` and ``use grads`` are in different stream contexts, you must sync the streams:: with torch.cuda.stream(s): @@ -513,3 +513,452 @@ by GIL of Python interpreter. If you use :class:`~torch.nn.parallel.DistributedDataParallel`, you could use `torch.distributed.launch` utility to launch your program, see :ref:`distributed-launch`. + +.. _cuda-graph-semantics: + +CUDA Graphs +----------- + +A CUDA graph is a record of the work (mostly kernels and their arguments) that a +CUDA stream and its dependent streams perform. +For general principles and details on the underlying CUDA API, see +`Getting Started with CUDA Graphs`_ and the +`Graphs section`_ of the CUDA C Programming Guide. + +PyTorch supports the construction of CUDA graphs using `stream capture`_, which puts a +CUDA stream in *capture mode*. CUDA work issued to a capturing stream doesn't actually +run on the GPU. Instead, the work is recorded in a graph. + +After capture, the graph can be *launched* to run the GPU work as many times as needed. +Each replay runs the same kernels with the same arguments. For pointer arguments this +means the same memory addresses are used. +By filling input memory with new data (e.g., from a new batch) before each replay, +you can rerun the same work on new data. + +Why CUDA Graphs? +^^^^^^^^^^^^^^^^ + +Replaying a graph sacrifices the dynamic flexibility of typical eager execution in exchange for +**greatly reduced CPU overhead**. A graph's arguments and kernels are fixed, so a graph replay +skips all layers of argument setup and kernel dispatch, including Python, C++, and CUDA driver +overheads. Under the hood, a replay submits the entire graph's work to the GPU with +a single call to `cudaGraphLaunch`_. Kernels in a replay also execute slightly faster +on the GPU, but eliding CPU overhead is the main benefit. + +You should try CUDA graphs if all or part of your network is graph-safe (usually this means +static shapes and static control flow, but see the other :ref:`constraints`) +and you suspect its runtime is at least somewhat CPU-limited. + +.. _Getting Started with CUDA Graphs: + https://developer.nvidia.com/blog/cuda-graphs/ +.. _Graphs section: + https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-graphs +.. _stream capture: + https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture +.. _cudaGraphLaunch: + https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597 + +PyTorch API +^^^^^^^^^^^ + +.. warning:: + This API is a prototype and may change in future releases. + +PyTorch exposes graphs via a raw :class:`torch.cuda.CUDAGraph` class +and two convenience wrappers, +:class:`torch.cuda.graph` and +:class:`torch.cuda.make_graphed_callables`. + +:class:`torch.cuda.graph` is a simple, versatile context manager that +captures CUDA work in its context. +Before capture, warm up the workload to be captured by running +a few eager iterations. Warmup must occur on a side stream. +Because the graph reads from and writes to the same memory addresses in every +replay, you must maintain long-lived references to tensors that hold +input and output data during capture. +To run the graph on new input data, copy new data to the capture's input tensor(s), +replay the graph, then read the new output from the capture's output tensor(s). +Example:: + + g = torch.cuda.CUDAGraph() + + # Placeholder input used for capture + static_input = torch.empty((5,), device="cuda") + + # Warmup before capture + s = torch.cuda.Stream() + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + for _ in range(3): + static_output = static_input * 2 + torch.cuda.current_stream().wait_stream(s) + + # Captures the graph + # To allow capture, automatically sets a side stream as the current stream in the context + with torch.cuda.graph(g): + static_output = static_input * 2 + + # Fills the graph's input memory with new data to compute on + static_input.copy_(torch.full((5,), 3, device="cuda")) + g.replay() + # static_output holds the results + print(static_output) # full of 3 * 2 = 6 + + # Fills the graph's input memory with more data to compute on + static_input.copy_(torch.full((5,), 4, device="cuda")) + g.replay() + print(static_output) # full of 4 * 2 = 8 + +See +:ref:`Whole-network capture`, +:ref:`Usage with torch.cuda.amp`, and +:ref:`Usage with multiple streams` +for realistic and advanced patterns. + +:class:`~torch.cuda.make_graphed_callables` is more sophisticated. +:class:`~torch.cuda.make_graphed_callables` accepts Python functions and +:class:`torch.nn.Module`\s. For each passed function or Module, +it creates separate graphs of the forward-pass and backward-pass work. See +:ref:`Partial-network capture`. + +.. _capture-constraints: + +Constraints +~~~~~~~~~~~ + +A set of ops is *capturable* if it doesn't violate any of the following constraints. + +Constraints apply to all work in a +:class:`torch.cuda.graph` context and all work in the forward and backward passes +of any callable you pass to :func:`torch.cuda.make_graphed_callables`. + +Violating any of these will likely cause a runtime error: + +* Capture must occur on a non-default stream. (This is only a concern if you use the raw + :meth:`CUDAGraph.capture_begin` and + :meth:`CUDAGraph.capture_end` calls. + :class:`~torch.cuda.graph` and + :func:`~torch.cuda.make_graphed_callables` set a side stream for you.) +* Ops that sychronize the CPU with the GPU (e.g., ``.item()`` calls) are prohibited. +* CUDA RNG ops are allowed, but must use default generators. For example, explicitly constructing a + new :class:`torch.Generator` instance and passing it as the ``generator`` argument to an RNG function + is prohibited. + +Violating any of these will likely cause silent numerical errors or undefined behavior: + +* Within a process, only one capture may be underway at a time. +* No non-captured CUDA work may run in this process (on any thread) while capture is underway. +* CPU work is not captured. If the captured ops include CPU work, that work will be elided during replay. +* Every replay reads from and writes to the same (virtual) memory addresses. +* Dynamic control flow (based on CPU or GPU data) is prohibited. +* Dynamic shapes are prohibited. The graph assumes every tensor in the captured op sequence + has the same size and layout in every replay. +* Using multiple streams in a capture is allowed, but there are :ref:`restrictions`. + +Non-constraints +~~~~~~~~~~~~~~~ + +* Once captured, the graph may be replayed on any stream. + +.. _whole-network-capture: + +Whole-network capture +^^^^^^^^^^^^^^^^^^^^^^ + +If your entire network is capturable, you can capture and replay an entire iteration:: + + N, D_in, H, D_out = 640, 4096, 2048, 1024 + model = torch.nn.Sequential(torch.nn.Linear(D_in, H), + torch.nn.Dropout(p=0.2), + torch.nn.Linear(H, D_out), + torch.nn.Dropout(p=0.1)).cuda() + loss_fn = torch.nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.1) + + # Placeholders used for capture + static_input = torch.randn(N, D_in, device='cuda') + static_target = torch.randn(N, D_out, device='cuda') + + # warmup + # Uses static_input and static_target here for convenience, + # but in a real setting, because the warmup includes optimizer.step() + # you must use a few batches of real data. + s = torch.cuda.Stream() + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + for i in range(3): + optimizer.zero_grad(set_to_none=True) + y_pred = model(static_input) + loss = loss_fn(y_pred, static_target) + loss.backward() + optimizer.step() + torch.cuda.current_stream().wait_stream(s) + + # capture + g = torch.cuda.CUDAGraph() + # Sets grads to None before capture, so backward() will create + # .grad attributes with allocations from the graph's private pool + optimizer.zero_grad(set_to_none=True) + with torch.cuda.graph(g): + static_y_pred = model(static_input) + static_loss = loss_fn(static_y_pred, static_target) + static_loss.backward() + optimizer.step() + + real_inputs = [torch.rand_like(static_input) for _ in range(10)] + real_targets = [torch.rand_like(static_target) for _ in range(10)] + + for data, target in zip(real_inputs, real_targets): + # Fills the graph's input memory with new data to compute on + static_input.copy_(data) + static_target.copy_(target) + # replay() includes forward, backward, and step. + # You don't even need to call optimizer.zero_grad() between iterations + # because the captured backward refills static .grad tensors in place. + g.replay() + # Params have been updated. static_y_pred, static_loss, and .grad + # attributes hold values from computing on this iteration's data. + +.. _partial-network-capture: + +Partial-network capture +^^^^^^^^^^^^^^^^^^^^^^^^^ + +If some of your network is unsafe to capture (e.g., due to dynamic control flow, +dynamic shapes, CPU syncs, or essential CPU-side logic), you can run the unsafe +part(s) eagerly and use :func:`torch.cuda.make_graphed_callables` to graph only +the capture-safe part(s). + +By default, callables returned by :func:`~torch.cuda.make_graphed_callables` +are autograd-aware, and can be used in the training loop as direct replacements +for the functions or :class:`nn.Module`\ s you passed. + +:func:`~torch.cuda.make_graphed_callables` internally creates +:class:`~torch.cuda.CUDAGraph` objects, runs warmup iterations, and maintains +static inputs and outputs as needed. Therefore (unlike with +:class:`torch.cuda.graph`) you don't need to handle those manually. + +In the following example, data-dependent dynamic control flow means the +network isn't capturable end-to-end, but +:func:`~torch.cuda.make_graphed_callables` +lets us capture and run graph-safe sections as graphs regardless:: + + N, D_in, H, D_out = 640, 4096, 2048, 1024 + + module1 = torch.nn.Linear(D_in, H).cuda() + module2 = torch.nn.Linear(H, D_out).cuda() + module3 = torch.nn.Linear(H, D_out).cuda() + + loss_fn = torch.nn.MSELoss() + optimizer = torch.optim.SGD(chain(module1.parameters() + + module2.parameters() + + module3.parameters()), + lr=0.1) + + # Sample inputs used for capture + # requires_grad state of sample inputs must match + # requires_grad state of real inputs each callable will see. + x = torch.randn(N, D_in, device='cuda') + h = torch.randn(N, H, device='cuda', requires_grad=True) + + module1 = torch.cuda.make_graphed_callables(module1, (x,)) + module2 = torch.cuda.make_graphed_callables(module2, (h,)) + module3 = torch.cuda.make_graphed_callables(module3, (h,)) + + real_inputs = [torch.rand_like(x) for _ in range(10)] + real_targets = [torch.randn(N, D_out, device="cuda") for _ in range(10)] + + for data, target in zip(real_inputs, real_targets): + optimizer.zero_grad(set_to_none=True) + + tmp = module1(data) # forward ops run as a graph + + if tmp.sum().item() > 0: + tmp = module2(tmp) # forward ops run as a graph + else: + tmp = module3(tmp) # forward ops run as a graph + + loss = loss_fn(tmp, y) + # module2's or module3's (whichever was chosen) backward ops, + # as well as module1's backward ops, run as graphs + loss.backward() + optimizer.step() + +.. _graphs-with-amp: + +Usage with torch.cuda.amp +^^^^^^^^^^^^^^^^^^^^^^^^^ + +For typical optimizers, :meth:`GradScaler.step` syncs +the CPU with the GPU, which is prohibited during capture. To avoid errors, either use +:ref:`partial-network capture`, or (if forward, loss, +and backward are capture-safe) capture forward, loss, and backward but not the +optimizer step:: + + # warmup + # In a real setting, use a few batches of real data. + s = torch.cuda.Stream() + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + for i in range(3): + optimizer.zero_grad(set_to_none=True) + with torch.cuda.amp.autocast(): + y_pred = model(static_input) + loss = loss_fn(y_pred, static_target) + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + torch.cuda.current_stream().wait_stream(s) + + # capture + g = torch.cuda.CUDAGraph() + optimizer.zero_grad(set_to_none=True) + with torch.cuda.graph(g): + with torch.cuda.amp.autocast(): + static_y_pred = model(static_input) + static_loss = loss_fn(static_y_pred, static_target) + scaler.scale(static_loss).backward() + # don't capture scaler.step(optimizer) or scaler.update() + + real_inputs = [torch.rand_like(static_input) for _ in range(10)] + real_targets = [torch.rand_like(static_target) for _ in range(10)] + + for data, target in zip(real_inputs, real_targets): + static_input.copy_(data) + static_target.copy_(target) + g.replay() + # Runs scaler.step and scaler.update eagerly + scaler.step(optimizer) + scaler.update() + +.. _multistream-capture: + +Usage with multiple streams +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Capture mode automatically propagates to any streams that sync with a capturing stream. +Within capture, you may expose parallelism by issuing calls to different streams, +but the overall stream dependency DAG must branch out from the +initial capturing stream after capture begins and rejoin the initial stream +before capture ends:: + + with torch.cuda.graph(g): + # at context manager entrance, torch.cuda.current_stream() + # is the initial capturing stream + + # INCORRECT (does not branch out from or rejoin initial stream) + with torch.cuda.stream(s): + cuda_work() + + # CORRECT: + # branches out from initial stream + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + cuda_work() + # rejoins initial stream before capture ends + torch.cuda.current_stream().wait_stream(s) + +.. note:: + + To avoid confusion for power users looking at replays in nsight systems or nvprof: + Unlike eager execution, the graph interprets a nontrivial stream DAG in capture + as a hint, not a command. During replay, the graph may reorganize independent ops + onto different streams or enqueue them in a different order (while respecting your + original DAG's overall dependencies). + +Usage with DistributedDataParallel +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +NCCL < 2.9.6 +~~~~~~~~~~~~ + +NCCL versions earlier than 2.9.6 don't allow collectives to be captured. +You must use :ref:`partial-network capture`, +which defers allreduces to happen outside graphed sections of backward. + +Call :func:`~torch.cuda.make_graphed_callables` on graphable network sections +*before* wrapping the network with DDP. + +NCCL >= 2.9.6 +~~~~~~~~~~~~~ + +NCCL versions 2.9.6 or later allow collectives in the graph. +Approaches that capture an :ref:`entire backward pass` +are a viable option, but need three setup steps. + +1. Disable DDP's internal async error handling:: + + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + torch.distributed.init_process_group(...) + +2. Before full-backward capture, DDP must be constructed in a side-stream context:: + + with torch.cuda.stream(s): + model = DistributedDataParallel(model) + +3. Your warmup must run at least 11 DDP-enabled eager iterations before capture. + +.. _graph-memory-management: + +Graph memory management +^^^^^^^^^^^^^^^^^^^^^^^ + +A captured graph acts on the same virtual addresses every time it replays. +If PyTorch frees the memory, a later replay can hit an illegal memory access. +If PyTorch reassigns the memory to new tensors, the replay can corrupt the values +seen by those tensors. Therefore, the virtual addresses used by the graph must be +reserved for the graph across replays. The PyTorch caching allocator achieves this +by detecting when capture is underway and satisfying the capture's allocations +from a graph-private memory pool. The private pool stays alive until its +:class:`~torch.cuda.CUDAGraph` object and all tensors created during capture +go out of scope. + +Private pools are maintained automatically. By default, the allocator creates a +separate private pool for each capture. If you capture multiple graphs, +this conservative approach ensures graph replays never corrupt each other's values, +but sometimes needlessly wastes memory. + +To economize the memory stashed in private pools, :class:`torch.cuda.graph` +and :func:`torch.cuda.make_graphed_callables` optionally allow different +captures to share the same private pool. +It's safe for a set of graphs to share a private pool if you know they'll always +be replayed in the same order they were captured, +and never be replayed concurrently. + +Sharing memory across captures with torch.cuda.graph +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:class:`torch.cuda.graph`'s ``pool`` argument is a hint to use a particular private pool, +and can be used to share memory across graphs as shown:: + + g1 = torch.cuda.CUDAGraph() + g2 = torch.cuda.CUDAGraph() + + # (create static inputs for g1 and g2, run warmups of their workloads...) + + # Captures g1 + with torch.cuda.graph(g1): + static_out_1 = g1_workload(static_in_1) + + # Captures g2, hinting that g2 may share a memory pool with g1 + with torch.cuda.graph(g2, pool=g1.pool()): + static_out_2 = g2_workload(static_in_2) + + static_in_1.copy_(real_data_1) + static_in_2.copy_(real_data_2) + g1.replay() + g2.replay() + +Sharing memory across captures with torch.cuda.make_graphed_callables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With :func:`torch.cuda.make_graphed_callables`, if you want to graph several +callables and you know they'll always run in the same order (and never concurrently) +pass them as a tuple in the same order they'll run in the live workload, and +:func:`~torch.cuda.make_graphed_callables` will capture their graphs using a shared +private pool. + +If, in the live workload, your callables will run in an order that occasionally changes, +or if they'll run concurrently, passing them as a tuple to a single invocation of +:func:`~torch.cuda.make_graphed_callables` is not allowed. Instead, you must call +:func:`~torch.cuda.make_graphed_callables` separately for each one. diff --git a/test/test_cuda.py b/test/test_cuda.py index e90cb1703c06e..70f5a6ee4f586 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -3089,7 +3089,7 @@ def test_graph_capture_simple(self): with torch.cuda.stream(s): a = torch.full((1000,), 1, device="cuda") - g = torch.cuda._Graph() + g = torch.cuda.CUDAGraph() torch.cuda.empty_cache() g.capture_begin() b = a @@ -3125,7 +3125,7 @@ def run(op, kwargs): with torch.cuda.stream(stream): torch.cuda.manual_seed(5) - g = torch.cuda._Graph() + g = torch.cuda.CUDAGraph() torch.cuda.empty_cache() g.capture_begin() graph_out = graph_in @@ -3212,7 +3212,7 @@ def run(module, op, args, kwargs): with torch.cuda.stream(stream): torch.cuda.manual_seed(5) - g = torch.cuda._Graph() + g = torch.cuda.CUDAGraph() torch.cuda.empty_cache() if (module == "torch"): g.capture_begin() @@ -3279,14 +3279,14 @@ def func_with_temps(t, val): s = torch.cuda.Stream() for share_mem in ("Don't share", "via pool()", "via graph_pool_handle()"): - g0 = torch.cuda._Graph() - g1 = torch.cuda._Graph() + g0 = torch.cuda.CUDAGraph() + g1 = torch.cuda.CUDAGraph() a = torch.ones((size,), device="cuda") s.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(s): - g0_args = (torch.cuda._graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else () + g0_args = (torch.cuda.graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else () g0.capture_begin(*g0_args) b = a.clone() for _ in range(5): @@ -3343,8 +3343,8 @@ def func_with_temps(t, val): s = torch.cuda.Stream() for share_mem in ("Don't share", "via pool()", "via graph_pool_handle()"): - g0 = torch.cuda._Graph() - g1 = torch.cuda._Graph() + g0 = torch.cuda.CUDAGraph() + g1 = torch.cuda.CUDAGraph() s0 = torch.cuda.Stream() s1 = torch.cuda.Stream() @@ -3353,7 +3353,7 @@ def func_with_temps(t, val): s.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(s): - g0_args = (torch.cuda._graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else () + g0_args = (torch.cuda.graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else () g0.capture_begin(*g0_args) b = a.clone() for _ in range(5): @@ -3407,13 +3407,13 @@ def test_graph_three_successive(self): for share_mem in ("Don't share", "via pool()", "via graph_pool_handle()"): a = torch.ones((size,), device="cuda") - g0 = torch.cuda._Graph() - g1 = torch.cuda._Graph() - g2 = torch.cuda._Graph() + g0 = torch.cuda.CUDAGraph() + g1 = torch.cuda.CUDAGraph() + g2 = torch.cuda.CUDAGraph() s.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(s): - g0_args = (torch.cuda._graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else () + g0_args = (torch.cuda.graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else () g0.capture_begin(*g0_args) b = a.clone() c = b + 1 @@ -3499,7 +3499,7 @@ def test_graph_memory_stats_and_use_result_after_destroy_graph(self): delta_active_blocks = 1 # We only check the large pool, which isn't affected by rng offset holder delta_active_bytes = numel * elem - g = torch.cuda._Graph() + g = torch.cuda.CUDAGraph() s.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(s): # Allocation stat estimates assume input is created on the same stream as capture_begin() @@ -3573,7 +3573,7 @@ def test_graph_record_stream(self): s0 = torch.cuda.Stream() s1 = torch.cuda.Stream() s2 = torch.cuda.Stream() - g = torch.cuda._Graph() + g = torch.cuda.CUDAGraph() torch.cuda.synchronize() with torch.cuda.stream(s0): @@ -3620,7 +3620,7 @@ def test_graph_cudnn_dropout(self): y = model(x) - g = torch.cuda._Graph() + g = torch.cuda.CUDAGraph() s = torch.cuda.Stream() s.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(s): @@ -3638,7 +3638,7 @@ def test_graph_grad_scaling(self): torch.cuda.empty_cache() scaler = torch.cuda.amp.GradScaler(init_scale=4.) - g = torch.cuda._Graph() + g = torch.cuda.CUDAGraph() s = torch.cuda.Stream() weight = torch.ones((100,), device="cuda", requires_grad=True) @@ -3646,18 +3646,15 @@ def test_graph_grad_scaling(self): static_input = torch.ones_like(weight) static_grad = torch.ones_like(weight) - s.wait_stream(torch.cuda.current_stream()) - with torch.cuda.stream(s): - # warmup - loss = (weight.half() * static_input).sum() - scaler.scale(loss).backward() - opt.zero_grad(set_to_none=True) - # capture - g.capture_begin() + # warmup + loss = (weight.half() * static_input).sum() + scaler.scale(loss).backward() + opt.zero_grad(set_to_none=True) + + # capture + with torch.cuda.graph(g): loss = (weight.half() * static_input).sum() scaler.scale(loss).backward() - g.capture_end() - torch.cuda.current_stream().wait_stream(s) input_vals = [5, 20000, 5, 40000] # If the scale gets updated properly, these are the scale, growth tracker, @@ -3678,6 +3675,71 @@ def test_graph_grad_scaling(self): self.assertEqual(scaler._scale, scale) self.assertEqual(scaler._growth_tracker, growth_tracker) + @unittest.skipIf((not TEST_CUDA) or + TEST_WITH_ROCM or + int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") + def test_graph_make_graphed_callables(self): + torch.manual_seed(5) + torch.cuda.manual_seed(5) + + N, D_in, H, D_out = 640, 4096, 2048, 1024 + + models = [] + for _ in range(2): + model_section1 = torch.nn.Sequential(torch.nn.Linear(D_in, H), + torch.nn.Dropout(p=0.1)).cuda() + model_section2 = torch.nn.Sequential(torch.nn.Linear(H, D_out), + torch.nn.Dropout(p=0.2)).cuda() + models.append(torch.nn.Sequential(model_section1, model_section2)) + + model_graphed = models[0] + model_control = models[1] + + model_graphed.load_state_dict(model_control.state_dict()) + + opt_graphed = torch.optim.SGD(model_graphed.parameters(), lr=0.1) + opt_control = torch.optim.SGD(model_control.parameters(), lr=0.1) + + x = torch.randn(N, D_in, device='cuda') + h = torch.randn(N, H, device='cuda', requires_grad=True) + y_pred = torch.randn(N, D_out, device='cuda', requires_grad=True) + y = torch.randn(N, D_out, device='cuda') + + loss_fn_control = torch.nn.functional.mse_loss + relu_control = torch.nn.functional.relu + + # This is a good stress test. It graphs four callables: two Modules and two python functions. + model_graphed[0], model_graphed[1], relu_graphed, loss_fn_graphed = \ + torch.cuda.make_graphed_callables((model_graphed[0], model_graphed[1], relu_control, loss_fn_control), + ((x,), (h,), (y_pred,), (y_pred, y))) + + real_inputs = [torch.rand_like(x) for _ in range(10)] + real_targets = [torch.rand_like(y) for _ in range(10)] + + for m, opt, relu, loss_fn in zip((model_graphed, model_control), + (opt_graphed, opt_control), + (relu_graphed, relu_control), + (loss_fn_graphed, loss_fn_control)): + # Resets RNC states before iterations for graphed and ungraphed models, + # so dropout math should be bitwise identical for both. + torch.manual_seed(5) + torch.cuda.manual_seed(5) + for data, target in zip(real_inputs, real_targets): + opt.zero_grad(set_to_none=True) + y_pred = m(data) + y_pred = relu(y_pred) + loss = loss_fn(y_pred, target) + loss.backward() + opt.step() + + for p, pc in zip(model_graphed.parameters(), model_control.parameters()): + self.assertEqual(p, pc) + + # We graphed the models in training mode. Eval should still run ungraphed. + model_graphed.eval() + model_control.eval() + self.assertEqual(model_graphed(real_inputs[0]), model_control(real_inputs[0])) + def test_batch_norm_gather_stats(self): input = torch.randn(1, 3, 3, 3, device='cuda') mean, invstd = torch.batch_norm_gather_stats( diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index c847e8deced62..352edbee6bc5e 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -888,8 +888,13 @@ class _CudaEventBase: def ipc_handle(self) -> bytes: ... # Defined in torch/csrc/cuda/Graph.cpp -class _CudaGraphBase: - ... +class _CUDAGraph: + def capture_begin(self, + pool: Optional[Tuple[_int, _int]]=...) -> None: ... + def capture_end(self) -> None: ... + def replay(self) -> None: ... + def reset(self) -> None: ... + def pool(self) -> Tuple[_int, _int]: ... def _graph_pool_handle() -> Tuple[_int, _int]: ... diff --git a/torch/csrc/cuda/Graph.cpp b/torch/csrc/cuda/Graph.cpp index 123abb9666ee5..beacefa3f8878 100644 --- a/torch/csrc/cuda/Graph.cpp +++ b/torch/csrc/cuda/Graph.cpp @@ -23,36 +23,29 @@ void THCPGraph_init(PyObject *module) { auto torch_C_m = py::handle(module).cast(); torch_C_m - .def("_graph_pool_handle", &::at::cuda::graph_pool_handle); + .def("_graph_pool_handle", + &::at::cuda::graph_pool_handle); - shared_ptr_class_<::at::cuda::CUDAGraph>(torch_C_m, "_CudaGraphBase") + shared_ptr_class_<::at::cuda::CUDAGraph> + (torch_C_m, + "_CUDAGraph") .def(py::init<>()) // I'm not sure this is the correct order of all the arguments. Pybind11 docs // aren't clear. But it works. .def("capture_begin", &::at::cuda::CUDAGraph::capture_begin, py::call_guard(), - R"(``capture_begin`` begins Cuda graph capture on the current stream.)", py::arg("pool") = c10::cuda::MempoolId_t{0, 0}) .def("capture_end", &::at::cuda::CUDAGraph::capture_end, - py::call_guard(), - R"(``capture_end`` ends Cuda graph capture on the current stream. - After ``capture_end``, ``replay`` may be called on this instance.)") + py::call_guard()) .def("replay", &::at::cuda::CUDAGraph::replay, - py::call_guard(), - R"(``replay`` replays the Cuda graph captured by this instance.)") - // reset is called in __del__ on the Python side - // (see class Graph in torch/cuda/streams.py for reasons and caveats) + py::call_guard()) .def("reset", &::at::cuda::CUDAGraph::reset, - py::call_guard(), - R"(``reset`` deletes the graph currently held by this instance.)") + py::call_guard()) .def("pool", &::at::cuda::CUDAGraph::pool, - py::call_guard(), - R"(``pool`` retrieves the id of this graph's memory pool. - This id can optionally be passed to another graph's capture_begin, - which hints that other graph may share the same memory pool.)"); + py::call_guard()); } diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index d5a9cbb52f34f..924782de8d024 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -16,7 +16,8 @@ import threading from typing import List, Optional, Tuple, Union, Any from ._utils import _get_device_index, _dummy_type -from .streams import Stream, Event, _Graph, _graph_pool_handle +from .graphs import CUDAGraph, graph_pool_handle, graph, make_graphed_callables +from .streams import Stream, Event from .. import device as _device import torch._C diff --git a/torch/cuda/graphs.py b/torch/cuda/graphs.py new file mode 100644 index 0000000000000..ff8a07f989f9d --- /dev/null +++ b/torch/cuda/graphs.py @@ -0,0 +1,408 @@ +import gc +import torch + +from ._utils import _dummy_type + + +if not hasattr(torch._C, '_CudaStreamBase'): + # Define dummy base classes + torch._C.__dict__['_CUDAGraph'] = _dummy_type('_CUDAGraph') + torch._C.__dict__['_graph_pool_handle'] = _dummy_type('_graph_pool_handle') + +from torch._C import _CUDAGraph # noqa: F401 +from torch._C import _graph_pool_handle + + +# Python shim helps Sphinx process docstrings more reliably. +def graph_pool_handle(): + r""" + Returns an opaque token representing the id of a graph memory pool. + See :ref:`Graph memory management`. + + .. warning:: + This API is a prototype and may change in future releases. + """ + return _graph_pool_handle() + + +# Python shim helps Sphinx process docstrings more reliably. +class CUDAGraph(torch._C._CUDAGraph): + r""" + Wrapper around a CUDA graph. + + .. warning:: + This API is a prototype and may change in future releases. + """ + def __new__(cls): + return super(CUDAGraph, cls).__new__(cls) + + def __init__(self): + super(CUDAGraph, self).__init__() + + def capture_begin(self, pool=None): + r""" + Begins capturing CUDA work on the current stream. + + Typically, you shouldn't call ``capture_begin`` yourself. + Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`, + which call ``capture_begin`` internally. + + Arguments: + pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or + :meth:`other_Graph_instance.pool()`) that hints this graph may share memory + with the indicated pool. See :ref:`Graph memory management`. + """ + # I'm not sure if pybind11 converts a None arg to the default defined on the C++ side, + # so I'm not taking any chances. + if pool is None: + super(CUDAGraph, self).capture_begin() + else: + super(CUDAGraph, self).capture_begin(pool) + + def capture_end(self): + r""" + Ends CUDA graph capture on the current stream. + After ``capture_end``, ``replay`` may be called on this instance. + + Typically, you shouldn't call ``capture_end`` yourself. + Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`, + which call ``capture_end`` internally. + """ + super(CUDAGraph, self).capture_end() + + def replay(self): + r""" + Replays the CUDA work captured by this graph. + """ + super(CUDAGraph, self).replay() + + def reset(self): + r""" + Deletes the graph currently held by this instance. + """ + super(CUDAGraph, self).reset() + + def pool(self): + r""" + Returns an opaque token representing the id of this graph's memory pool. + This id can optionally be passed to another graph's ``capture_begin``, + which hints the other graph may share the same memory pool. + """ + return super(CUDAGraph, self).pool() + + +class graph(object): + r""" + Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` + object for later replay. + + See :ref:`CUDA Graphs ` for a general introduction, + detailed use, and constraints. + + Arguments: + cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture. + pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or + :meth:`other_Graph_instance.pool()`) hinting this graph's capture + may share memory from the specified pool. See :ref:`Graph memory management`. + stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context. + If not supplied, ``graph`` sets its own internal side stream as the current stream in the context. + + .. note:: + For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture + used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture. + + .. warning:: + This API is a prototype and may change in future releases. + """ + default_capture_stream = None + + def __init__(self, + cuda_graph, + pool=None, + stream=None): + # Lazy-init of default_capture_stream helps avoid circular-import errors. + # Not thread safe, but graphs already have the general (explicitly documented) + # restriction that only one capture may be underway at a time in the process. + if self.__class__.default_capture_stream is None: + self.__class__.default_capture_stream = torch.cuda.Stream() + + self.pool = () if pool is None else (pool,) + self.capture_stream = stream if stream is not None else self.__class__.default_capture_stream + assert self.capture_stream is not None + self.stream_ctx = torch.cuda.stream(self.capture_stream) + self.cuda_graph = cuda_graph + + def __enter__(self): + # Free as much memory as we can for the graph + torch.cuda.synchronize() + gc.collect() + torch.cuda.empty_cache() + + # Stackoverflow seems comfortable with this pattern + # https://stackoverflow.com/questions/26635684/calling-enter-and-exit-manually#39172487 + self.stream_ctx.__enter__() + + self.cuda_graph.capture_begin(*self.pool) + + + def __exit__(self, exc_type, exc_value, traceback): + self.cuda_graph.capture_end() + self.stream_ctx.__exit__(exc_type, exc_value, traceback) + # returning None should propagate exceptions from either capture_end or stream_ctx.__exit__() + + +def make_graphed_callables(callables, sample_args): + r""" + Accepts callables (functions or :class:`nn.Module`\ s) + and returns graphed versions. + + Each graphed callable's forward pass runs its source callable's + forward CUDA work as a CUDA graph inside a single autograd node. + + The graphed callable's forward pass also appends + a backward node to the autograd graph. During backward, this node runs the + callable's backward work as a CUDA graph. + + Therefore, each graphed callable should be a drop-in replacement for its source callable + in an autograd-enabled training loop. + + See :ref:`Partial-network capture` for detailed use and constraints. + + If you pass a tuple of several callables, their captures will use the same memory pool. + See :ref:`Graph memory management` for when this is appropriate. + + Arguments: + callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph. + See :ref:`Graph memory management` for when passing a tuple of callables + is appropriate. If you pass a tuple of callables, their order in the tuple must be the same order + they'll run in the live workload. + sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable. + If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors. + If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors. + + .. note:: + The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state + that's expected for the corresponding real input in the training loop. + + .. warning:: + This API is a prototype and may change in future releases. + + .. warning:: + ``sample_args`` for each callable must be a tuple of Tensors. Other types and keyword args + are not allowed. + + .. warning:: + Returned callables do not support higher order differentiation (e.g., double backward). + + .. warning:: + In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters + may be trainable. Buffers must have ``requires_grad=False``. + + .. warning:: + After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`, + you may not add or remove any of that Module's parameters or buffers. + + .. warning:: + :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks + registered on them at the time they are passed. However, registering hooks on modules *after* passing them + through :func:`~torch.cuda.make_graphed_callables` is allowed. + + .. warning:: + When running a graphed callable, you must pass its arguments in the same order and format + they appeared in that callable's ``sample_args``. + + .. warning:: + All Tensor outputs of graphed callables must require grad. + """ + just_one_callable = False + + if not isinstance(callables, tuple): + just_one_callable = True + callables = (callables,) + sample_args = (sample_args,) + + for c, args in zip(callables, sample_args): + if isinstance(c, torch.nn.Module): + assert len(c._backward_hooks) == 0 and len(c._forward_hooks) == 0 and len(c._forward_pre_hooks) == 0, \ + "Modules must not have hooks registered at the time they are passed. However, registering hooks " + \ + "on modules after passing them through make_graphed_callables is allowed." + assert all(b.requires_grad is False for b in c.buffers()), "In any :class:`~torch.nn.Module` passed to " + \ + ":func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have " + \ + "``requires_grad=False``." + assert all(isinstance(arg, torch.Tensor) for arg in args), "In the prototype API, sample_args " + \ + "for each callable must be a tuple of Tensors. Other types and keyword args are not allowed." + + + # If a callable is an nn.Module, its graph's full input surface is the args the user explicitly + # passes to forward (ie, its sample_args) AND the module's parameter attributes. + per_callable_len_user_args = [len(args) for args in sample_args] + per_callable_module_params = [tuple(c.parameters()) if isinstance(c, torch.nn.Module) else () + for c in callables] + per_callable_static_input_surfaces = [sample_args[i] + per_callable_module_params[i] + for i in range(len(callables))] + + fwd_graphs = [torch.cuda.CUDAGraph() for _ in range(len(callables))] + bwd_graphs = [torch.cuda.CUDAGraph() for _ in range(len(callables))] + + mempool = graph_pool_handle() + + # Warmup + # Hopefully prevents cudnn benchmarking and other lazy-initialization cuda work + # from ending up in any captures. + torch.cuda.synchronize() + with torch.cuda.stream(torch.cuda.Stream()): + for func, args, static_input_surface in zip(callables, + sample_args, + per_callable_static_input_surfaces): + for _ in range(3): + outputs = func(*args) + outputs = (outputs,) if isinstance(outputs, torch.Tensor) else outputs + grad_inputs = torch.autograd.grad(outputs=outputs, + inputs=tuple(i for i in static_input_surface if i.requires_grad), + grad_outputs=tuple(torch.empty_like(o) for o in outputs), + only_inputs=True, + allow_unused=False) + del outputs, grad_inputs + torch.cuda.synchronize() + + # All captures here share a mempool. To avoid replays corrupting each other's memory, + # the safest approach is to capture all passes in the same order they'll run: + # fwd 1, fwd 2, ... fwd N, then bwd N, bwd N-1, ... bwd 1. + + # Capture forward graphs + per_callable_static_outputs = [] + per_callable_output_was_tensor = [] + for func, args, fwd_graph in zip(callables, + sample_args, + fwd_graphs): + with torch.cuda.graph(fwd_graph, pool=mempool): + outputs = func(*args) + + # Assumes model output is a tensor or tuple of tensors + if isinstance(outputs, torch.Tensor): + per_callable_output_was_tensor.append(True) + outputs = (outputs,) + else: + per_callable_output_was_tensor.append(False) + + per_callable_static_outputs.append(outputs) + + # Capture backward graphs in reverse order + per_callable_static_grad_outputs = [] + per_callable_static_grad_inputs = [] + for static_input_surface, static_outputs, bwd_graph, module_params in \ + zip(reversed(per_callable_static_input_surfaces), + reversed(per_callable_static_outputs), + reversed(bwd_graphs), + reversed(per_callable_module_params)): + + # For now, assumes all static_outputs require grad + assert all(o.requires_grad for o in static_outputs), "Outputs of graphed callables must require grad." + static_grad_outputs = tuple(torch.empty_like(o) for o in static_outputs) + + with torch.cuda.graph(bwd_graph, pool=mempool): + grad_inputs = torch.autograd.grad(outputs=static_outputs, + inputs=tuple(i for i in static_input_surface if i.requires_grad), + grad_outputs=static_grad_outputs, + only_inputs=True, + allow_unused=False) + + # Constructs a tuple suitable for returning from Graphed.backward: + # Pads out the actually-needed grads with Nones in gradient slots for inputs that don't require grad. + # I couldn't think of a slick one-liner for this pattern. + static_grad_inputs = [] + grad_idx = 0 + for arg in static_input_surface: + if arg.requires_grad: + static_grad_inputs.append(grad_inputs[grad_idx]) + grad_idx += 1 + else: + static_grad_inputs.append(None) # type: ignore[arg-type] + static_grad_inputs = tuple(static_grad_inputs) # type: ignore[assignment] + + per_callable_static_grad_outputs.append(static_grad_outputs) + per_callable_static_grad_inputs.append(static_grad_inputs) + + # Reverses the most recent two lists + per_callable_static_grad_outputs = list(reversed(per_callable_static_grad_outputs)) + per_callable_static_grad_inputs = list(reversed(per_callable_static_grad_inputs)) + # Now for every per_callable list, per_callable_*[i] holds the stuff for the ith callable. + + def make_graphed_autograd_function(fwd_graph, + bwd_graph, + module_params, + len_user_args, + output_was_tensor, + static_input_surface, + static_outputs, + static_grad_outputs, + static_grad_inputs): + class Graphed(torch.autograd.Function): + @staticmethod + def forward(ctx, *inputs): + # At this stage, only the user args may (potentially) be new tensors. + for i in range(len_user_args): + if static_input_surface[i].data_ptr() != inputs[i].data_ptr(): + static_input_surface[i].copy_(inputs[i]) + fwd_graph.replay() + assert isinstance(static_outputs, tuple) + return tuple(o.detach() for o in static_outputs) + + @staticmethod + @torch.autograd.function.once_differentiable + def backward(ctx, *grads): + for g, grad in zip(static_grad_outputs, grads): + if g is None: + assert grad is None + else: + # don't copy if autograd gods have been kind and the + # incoming grad is already in the right place + if g.data_ptr() != grad.data_ptr(): + g.copy_(grad) + bwd_graph.replay() + + # Input args that didn't require grad expect a None gradient. + assert isinstance(static_grad_inputs, tuple) + return tuple(b.detach() if b is not None else b for b in static_grad_inputs) + + def functionalized(*user_args): + # Runs the autograd function with inputs == all inputs to the graph that might require grad + # (explicit user args + module parameters) + # Assumes module params didn't change since capture. + out = Graphed.apply(*(user_args + module_params)) + return out[0] if output_was_tensor else out + + return functionalized + + # Put together the final graphed callables + ret = [] + for i, func in enumerate(callables): + graphed = make_graphed_autograd_function(fwd_graphs[i], + bwd_graphs[i], + per_callable_module_params[i], + per_callable_len_user_args[i], + per_callable_output_was_tensor[i], + per_callable_static_input_surfaces[i], + per_callable_static_outputs[i], + per_callable_static_grad_outputs[i], + per_callable_static_grad_inputs[i]) + + if isinstance(func, torch.nn.Module): + def make_graphed_forward(func, graph_training_state, graphed, orig_fwd): + def new_fwd(*user_args): + # If the module's training-or-eval state matches what we graphed, + # run the graph, otherwise run the original forward method + if func.training == graph_training_state: + return graphed(*user_args) + else: + return orig_fwd(*user_args) + return new_fwd + func.forward = make_graphed_forward(func, func.training, graphed, func.forward) # type: ignore[assignment] + ret.append(func) + else: + ret.append(graphed) + + if just_one_callable: + return ret[0] + + return tuple(ret) diff --git a/torch/cuda/streams.py b/torch/cuda/streams.py index 0f983728f630a..2b4cc479e095f 100644 --- a/torch/cuda/streams.py +++ b/torch/cuda/streams.py @@ -8,8 +8,6 @@ # Define dummy base classes torch._C.__dict__['_CudaStreamBase'] = _dummy_type('_CudaStreamBase') torch._C.__dict__['_CudaEventBase'] = _dummy_type('_CudaEventBase') - torch._C.__dict__['_CudaGraphBase'] = _dummy_type('_CudaGraphBase') - torch._C.__dict__['_graph_pool_handle'] = _dummy_type('_graph_pool_handle') class Stream(torch._C._CudaStreamBase): r"""Wrapper around a CUDA stream. @@ -226,6 +224,3 @@ def __repr__(self): return ''.format(self._as_parameter_.value) else: return '' - -_Graph = torch._C._CudaGraphBase -_graph_pool_handle = torch._C._graph_pool_handle From 13484084a64df5f2c5deea26b8cc2f30833038f3 Mon Sep 17 00:00:00 2001 From: Rishi Puri Date: Tue, 31 Aug 2021 13:47:29 -0700 Subject: [PATCH 398/530] fix syntax error in bfloat16 PR (#64122) Summary: fixes prior syntax error from PR ngimel Pull Request resolved: https://github.com/pytorch/pytorch/pull/64122 Reviewed By: H-Huang Differential Revision: D30643596 Pulled By: ngimel fbshipit-source-id: 0a2d5a40fb6dc7339cd03112e57ef0e1bf8a000e --- test/test_cuda.py | 40 ++++++++++++++++++++++++++++++++++++++-- torch/cuda/__init__.py | 3 ++- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/test/test_cuda.py b/test/test_cuda.py index 70f5a6ee4f586..6f742ec59f931 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -46,12 +46,15 @@ TEST_LARGE_TENSOR = TEST_CUDA TEST_MEDIUM_TENSOR = TEST_CUDA TEST_CUDNN = TEST_CUDA +TEST_BF16 = False if TEST_CUDA: torch.ones(1).cuda() # initialize cuda context TEST_CUDNN = TEST_CUDA and (TEST_WITH_ROCM or torch.backends.cudnn.is_acceptable(torch.tensor(1., device=torch.device('cuda:0')))) TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 12e9 TEST_MEDIUM_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 6e9 + TEST_BF16 = torch.cuda.is_bf16_supported() + types = [ torch.FloatTensor, @@ -2707,9 +2710,9 @@ def cast(val, to_type): if add_kwargs is None: add_kwargs = {} - + fast_dtype = torch.bfloat16 if run_as_type == torch.bfloat16 else torch.float16 self.assertFalse(torch.is_autocast_enabled()) - with torch.autocast('cuda', ): + with torch.autocast('cuda', dtype=fast_dtype): self.assertTrue(torch.is_autocast_enabled()) out_type = out_type if out_type is not None else run_as_type @@ -2784,6 +2787,27 @@ def test_autocast_torch_fp16(self): if not skip_test: self._run_autocast_outofplace(op, args, torch.float16) + @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') + def test_autocast_torch_bf16(self): + with torch.backends.cudnn.flags(enabled=True, deterministic=True): + for op_with_args in self.autocast_lists.torch_fp16: + skip_test = False + op, args = op_with_args[0], op_with_args[1] + if len(op_with_args) == 3: + skip_test = op_with_args[2] # TEST_WITH_ROCM + should_error_from_not_implemented = 'cudnn' in op or 'prelu' in op or 'thnn' in op \ + or 'fused' in op or 'gru' in op or op == '_thnn_fused_lstm_cell' or op == 'lstm_cell' + if not skip_test: + if should_error_from_not_implemented: + with self.assertRaises(RuntimeError, msg=str(op) + ' should not be supported for bfloat16!'): + self._run_autocast_outofplace(op, args, torch.bfloat16) + else: + if torch.cuda.is_bf16_supported(): + self._run_autocast_outofplace(op, args, torch.bfloat16) + else: + with self.assertRaisesRegex(RuntimeError, 'Device does not support bfloat16'): + self._run_autocast_outofplace(op, args, torch.bfloat16) + @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') def test_autocast_torch_fp32(self): for op_with_args in self.autocast_lists.torch_fp32: @@ -2806,6 +2830,18 @@ def test_autocast_nn_fp16(self): for op, args in self.autocast_lists.nn_fp16: self._run_autocast_outofplace(op, args, torch.float16, module=torch._C._nn) + + + @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') + def test_autocast_nn_bf16(self): + with torch.backends.cudnn.flags(enabled=True, deterministic=True): + for op, args in self.autocast_lists.nn_fp16: + if torch.cuda.is_bf16_supported(): + self._run_autocast_outofplace(op, args, torch.bfloat16, module=torch._C._nn) + else: + with self.assertRaisesRegex(RuntimeError, 'Device does not support bfloat16'): + self._run_autocast_outofplace(op, args, torch.bfloat16, module=torch._C._nn) + @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') def test_autocast_nn_fp32(self): for op, args in self.autocast_lists.nn_fp32: diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 924782de8d024..80d9e108643b4 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -83,7 +83,8 @@ def is_bf16_supported(): r"""Returns a bool indicating if the current CUDA device supports dtype bfloat16""" cu_vers = torch.version.cuda if cu_vers is not None: - cuda_maj_decide = int(cu_vers.split(',')[0]) >= 11 + cuda_maj_decide = int(cu_vers.split('.')[0]) >= 11 + else: cuda_maj_decide = False return torch.cuda.get_device_properties(torch.cuda.current_device()).major >= 8 and cuda_maj_decide From 9a0456939b4ce6173b32714513faceed102b229c Mon Sep 17 00:00:00 2001 From: zhouzhuojie Date: Tue, 31 Aug 2021 13:48:28 -0700 Subject: [PATCH 399/530] Try the forked checkout action with retry (#64120) Summary: Fixes #{issue number} The main difference is: https://github.com/zhouzhuojie/checkout/commit/ffc6f93ad4b6e3cdcdd1a34e8c896765002f9b34 Can test multiple times in this PR to see if it works, will make the `retry` number configurable if it's usable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64120 Reviewed By: malfet Differential Revision: D30656099 Pulled By: zhouzhuojie fbshipit-source-id: a89932196bb0c44e412a34664ed6a061b02ef92e --- .github/templates/bazel_ci_workflow.yml.j2 | 2 +- .github/templates/linux_ci_workflow.yml.j2 | 10 +++++----- .github/templates/windows_ci_workflow.yml.j2 | 6 +++--- .github/workflows/build_linux_conda.yml | 6 +++--- .github/workflows/build_linux_libtorch.yml | 6 +++--- .github/workflows/build_linux_wheels.yml | 6 +++--- .github/workflows/create_release.yml | 2 +- ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml | 4 ++-- ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml | 4 ++-- ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 8 ++++---- ...rated-linux-bionic-py3.8-gcc9-coverage.yml | 8 ++++---- ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 8 ++++---- ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 8 ++++---- .../generated-linux-xenial-py3.6-gcc5.4.yml | 10 +++++----- ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml | 4 ++-- ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml | 4 ++-- ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 8 ++++---- ...rated-periodic-win-vs2019-cuda11.1-py3.yml | 6 +++--- .../generated-win-vs2019-cpu-py3.yml | 6 +++--- .../generated-win-vs2019-cuda10.1-py3.yml | 6 +++--- .../generated-win-vs2019-cuda11.3-py3.yml | 6 +++--- .github/workflows/lint.yml | 20 +++++++++---------- .../workflows/push_nightly_docker_ghcr.yml | 2 +- .github/workflows/run_torchbench.yml | 4 ++-- .github/workflows/test_tools.yml | 2 +- tools/test/test_extract_scripts.py | 2 +- 26 files changed, 79 insertions(+), 79 deletions(-) diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2 index f4e0034a0f5d6..e9907ed679e3f 100644 --- a/.github/templates/bazel_ci_workflow.yml.j2 +++ b/.github/templates/bazel_ci_workflow.yml.j2 @@ -41,7 +41,7 @@ on: # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index d7be808898476..d5de86b1bbcfd 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -86,7 +86,7 @@ jobs: rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -163,7 +163,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive @@ -270,7 +270,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -308,7 +308,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive @@ -470,7 +470,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 84a30bda92a36..38c346c1134f8 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -95,7 +95,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: recursive path: pytorch-${{ github.run_id }} @@ -177,7 +177,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -206,7 +206,7 @@ jobs: working-directory: pytorch-${{ github.run_id }} steps: - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: recursive path: pytorch-${{ github.run_id }} diff --git a/.github/workflows/build_linux_conda.yml b/.github/workflows/build_linux_conda.yml index 2037f0c1cf561..536a18771831e 100644 --- a/.github/workflows/build_linux_conda.yml +++ b/.github/workflows/build_linux_conda.yml @@ -16,7 +16,7 @@ jobs: image: python:3.9 steps: - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating build matrix id: set-matrix run: | @@ -57,12 +57,12 @@ jobs: - name: Clean runner workspace run: rm -rf "$GITHUB_WORKSPACE" - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: path: pytorch submodules: recursive - name: Clone pytorch/builder - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: repository: pytorch/builder path: builder diff --git a/.github/workflows/build_linux_libtorch.yml b/.github/workflows/build_linux_libtorch.yml index 9d4964a8594b1..9321c6ac8bf88 100644 --- a/.github/workflows/build_linux_libtorch.yml +++ b/.github/workflows/build_linux_libtorch.yml @@ -16,7 +16,7 @@ jobs: image: python:3.9 steps: - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating build matrix id: set-matrix run: | @@ -51,12 +51,12 @@ jobs: - name: Clean runner workspace run: rm -rf "$GITHUB_WORKSPACE" - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: path: pytorch submodules: recursive - name: Clone pytorch/builder - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: repository: pytorch/builder path: builder diff --git a/.github/workflows/build_linux_wheels.yml b/.github/workflows/build_linux_wheels.yml index c32eee6892033..15a38f6cee0fe 100644 --- a/.github/workflows/build_linux_wheels.yml +++ b/.github/workflows/build_linux_wheels.yml @@ -16,7 +16,7 @@ jobs: image: python:3.9 steps: - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating build matrix id: set-matrix run: | @@ -46,12 +46,12 @@ jobs: - name: Clean runner workspace run: rm -rf "$GITHUB_WORKSPACE" - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: path: pytorch submodules: recursive - name: Clone pytorch/builder - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: repository: pytorch/builder path: builder diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index fa65168a4709c..4cd0568be5aad 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -14,7 +14,7 @@ jobs: name: Create Release runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: 'recursive' - name: Fake name for PRs diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml index 72a9c4effeee3..27b21bc6b523b 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -60,7 +60,7 @@ jobs: rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -140,7 +140,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index 937a531c977e5..b90a497441802 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -60,7 +60,7 @@ jobs: rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -140,7 +140,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index f34765c98160b..274e68e38bdb4 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -60,7 +60,7 @@ jobs: rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -140,7 +140,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive @@ -246,7 +246,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -284,7 +284,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index 6162b3cac1604..e03a019fbe7b9 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -60,7 +60,7 @@ jobs: rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -140,7 +140,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive @@ -246,7 +246,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -284,7 +284,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index 2fe24a515ea2a..6aea843037eae 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -60,7 +60,7 @@ jobs: rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -140,7 +140,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive @@ -246,7 +246,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -284,7 +284,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 59b0e2535b3bb..1fdae9d1a0320 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -60,7 +60,7 @@ jobs: rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -140,7 +140,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive @@ -246,7 +246,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -284,7 +284,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index 1fa72f51255dd..e932b488d0c71 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -60,7 +60,7 @@ jobs: rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -140,7 +140,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive @@ -246,7 +246,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -284,7 +284,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive @@ -458,7 +458,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index 233144210dbcd..89deda0704df1 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -60,7 +60,7 @@ jobs: rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -135,7 +135,7 @@ jobs: # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml index 09989ef516a7e..63b462a19cf25 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -58,7 +58,7 @@ jobs: rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -138,7 +138,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 246d5cabd86de..768146ee8cab2 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -58,7 +58,7 @@ jobs: rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -138,7 +138,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive @@ -244,7 +244,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -282,7 +282,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow sharding to use git rev-list submodules: recursive diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index 6c87f40accd64..aaf2d26e05af5 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -56,7 +56,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: recursive path: pytorch-${{ github.run_id }} @@ -143,7 +143,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -169,7 +169,7 @@ jobs: working-directory: pytorch-${{ github.run_id }} steps: - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: recursive path: pytorch-${{ github.run_id }} diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 30f328ae71fdd..08656c9dd99d8 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -56,7 +56,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: recursive path: pytorch-${{ github.run_id }} @@ -135,7 +135,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -161,7 +161,7 @@ jobs: working-directory: pytorch-${{ github.run_id }} steps: - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: recursive path: pytorch-${{ github.run_id }} diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index 72dd21dce3899..2a8570d1e84b1 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -58,7 +58,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: recursive path: pytorch-${{ github.run_id }} @@ -145,7 +145,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -171,7 +171,7 @@ jobs: working-directory: pytorch-${{ github.run_id }} steps: - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: recursive path: pytorch-${{ github.run_id }} diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index eb6e02fb5c2f3..7235db7f3c9f5 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -58,7 +58,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: recursive path: pytorch-${{ github.run_id }} @@ -145,7 +145,7 @@ jobs: - name: Install dependencies run: pip install typing-extensions - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Generating test matrix id: set-matrix run: .github/scripts/generate_pytorch_test_matrix.py @@ -171,7 +171,7 @@ jobs: working-directory: pytorch-${{ github.run_id }} steps: - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: recursive path: pytorch-${{ github.run_id }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index f036bc17d2a2d..f69d2b01ab35a 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -16,7 +16,7 @@ jobs: python-version: 3.x architecture: x64 - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Install requirements id: requirements run: pip3 install -r requirements.txt --user @@ -101,7 +101,7 @@ jobs: python-version: 3.x architecture: x64 - name: Fetch PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow us to use git merge-base - name: Run clang-format @@ -140,7 +140,7 @@ jobs: python-version: 2.x architecture: x64 - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Attempt to run setup.py run: | if ! python2 setup.py | grep -q "Python 2 has reached end-of-life and is no longer supported by PyTorch."; then @@ -159,7 +159,7 @@ jobs: python-version: 3.x architecture: x64 - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Install requirements id: requirements run: | @@ -168,7 +168,7 @@ jobs: run: | pip3 install Jinja2==3.0.1 --user - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Regenerate workflows id: generate_workflows run: .github/scripts/generate_ci_workflows.py @@ -238,7 +238,7 @@ jobs: - name: Setup Node uses: actions/setup-node@v2 - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Install markdown-toc run: npm install -g markdown-toc - name: Regenerate ToCs and check that they didn't change @@ -274,7 +274,7 @@ jobs: python-version: 3.x architecture: x64 - name: Fetch PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 2 # to allow us to use github.event.pull_request.head.sha - name: Prepare output dir with HEAD commit SHA @@ -326,7 +326,7 @@ jobs: image: ghcr.io/pytorch/cilint-clang-tidy:d8f0c777964d0dd8a147360de80aed1a13eb613a steps: - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # to allow tools/linter/clang_tidy.py to do its thing - name: Prepare output dir with HEAD commit SHA @@ -412,7 +412,7 @@ jobs: python-version: 3.x architecture: x64 - name: Fetch PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Install dependencies run: | set -eux @@ -434,7 +434,7 @@ jobs: python-version: 3.8 architecture: x64 - name: Fetch PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Install dependencies run: | set -eux diff --git a/.github/workflows/push_nightly_docker_ghcr.yml b/.github/workflows/push_nightly_docker_ghcr.yml index 311aa94601d6a..892cb5c17aa86 100644 --- a/.github/workflows/push_nightly_docker_ghcr.yml +++ b/.github/workflows/push_nightly_docker_ghcr.yml @@ -14,7 +14,7 @@ jobs: GHCR_PAT: ${{ secrets.GHCR_PAT }} steps: - name: Checkout - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: ref: master - name: Build and upload nightly docker diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml index 0ae189e99f06a..786d25f4e3b0f 100644 --- a/.github/workflows/run_torchbench.yml +++ b/.github/workflows/run_torchbench.yml @@ -18,11 +18,11 @@ jobs: timeout-minutes: 720 steps: - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: path: pytorch - name: Checkout TorchBench - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: repository: pytorch/benchmark path: benchmark diff --git a/.github/workflows/test_tools.yml b/.github/workflows/test_tools.yml index 19a0fd9d4e7e7..afc790bb10e2b 100644 --- a/.github/workflows/test_tools.yml +++ b/.github/workflows/test_tools.yml @@ -16,7 +16,7 @@ jobs: python-version: 3.x architecture: x64 - name: Checkout PyTorch - uses: actions/checkout@v2 + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: fetch-depth: 0 # deep clone, to allow us to use git log - name: Install dependencies diff --git a/tools/test/test_extract_scripts.py b/tools/test/test_extract_scripts.py index 29802517963b3..3126893c4bb39 100644 --- a/tools/test/test_extract_scripts.py +++ b/tools/test/test_extract_scripts.py @@ -20,7 +20,7 @@ def test_extract_none(self) -> None: self.assertEqual( extract_scripts.extract({ 'name': 'Checkout PyTorch', - 'uses': 'actions/checkout@v2', + 'uses': 'zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9', }), None, ) From 491bf7cb7474d82e4349ea4687b544840e591b50 Mon Sep 17 00:00:00 2001 From: Kevin Tse Date: Tue, 31 Aug 2021 13:55:59 -0700 Subject: [PATCH 400/530] [DataPipe] adding description, __len__, tests for mux() (#64224) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64224 cc VitalyFedyunin ejguan Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30651551 Pulled By: NivekT fbshipit-source-id: f8af98ba71a592900b992a8077432062ec57bb48 --- test/test_datapipe.py | 42 ++++++++++++++++++++ torch/utils/data/datapipes/iter/combining.py | 33 +++++++++++---- 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 4e37f41565226..24d0ce20d63dd 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -354,6 +354,15 @@ def test_demux_mux_datapipe(self): n = n1.mux(n2, n3) self.assertEqual(list(range(10)), list(n)) + # Test Case: Uneven DataPipes + source_numbers = list(range(0, 10)) + [10, 12] + numbers_dp = IDP(source_numbers) + n1, n2 = numbers_dp.demux(2, lambda x: x % 2) + self.assertEqual([0, 2, 4, 6, 8, 10, 12], list(n1)) + self.assertEqual([1, 3, 5, 7, 9], list(n2)) + n = n1.mux(n2) + self.assertEqual(source_numbers, list(n)) + class FileLoggerSimpleHTTPRequestHandler(http.server.SimpleHTTPRequestHandler): def __init__(self, *args, logfile=None, **kwargs): @@ -1221,6 +1230,39 @@ def fn(item, dtype=torch.float, *, sum=False): map_dp[index], torch.tensor(input_dp[index], dtype=torch.int).sum() ) + def test_mux_datapipe(self): + + # Test Case: Elements are yielded one at a time from each DataPipe, until they are all exhausted + input_dp1 = IDP(range(4)) + input_dp2 = IDP(range(4, 8)) + input_dp3 = IDP(range(8, 12)) + output_dp = input_dp1.mux(input_dp2, input_dp3) + expected_output = [0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11] + self.assertEqual(len(expected_output), len(output_dp)) + self.assertEqual(expected_output, list(output_dp)) + + # Test Case: Uneven input Data Pipes + input_dp1 = IDP([1, 2, 3, 4]) + input_dp2 = IDP([10]) + input_dp3 = IDP([100, 200, 300]) + output_dp = input_dp1.mux(input_dp2, input_dp3) + expected_output = [1, 10, 100, 2, 200, 3, 300, 4] + self.assertEqual(len(expected_output), len(output_dp)) + self.assertEqual(expected_output, list(output_dp)) + + # Test Case: Empty Data Pipe + input_dp1 = IDP([0, 1, 2, 3]) + input_dp2 = IDP([]) + output_dp = input_dp1.mux(input_dp2) + self.assertEqual(len(input_dp1), len(output_dp)) + self.assertEqual(list(input_dp1), list(output_dp)) + + # Test Case: raises TypeError when __len__ is called and an input doesn't have __len__ + input_dp1 = IDP(range(10)) + input_dp_no_len = IDP_NoLen(range(10)) + output_dp = input_dp1.mux(input_dp_no_len) + with self.assertRaises(TypeError): + len(output_dp) # Metaclass conflict for Python 3.6 # Multiple inheritance with NamedTuple is not supported for Python 3.9 diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py index a837c5bb101c7..ed1256fa1e757 100644 --- a/torch/utils/data/datapipes/iter/combining.py +++ b/torch/utils/data/datapipes/iter/combining.py @@ -1,7 +1,7 @@ import warnings from torch.utils.data import IterDataPipe, functional_datapipe -from typing import Any, Callable, Iterator, List, Optional, Sized, Tuple, TypeVar, Deque +from typing import Any, Callable, Iterator, List, Optional, Set, Sized, Tuple, TypeVar, Deque from collections import deque T_co = TypeVar('T_co', covariant=True) @@ -261,24 +261,41 @@ def reset(self): @functional_datapipe('mux') class MultiplexerIterDataPipe(IterDataPipe): + r""" :class:`MultiplexerIterDataPipe`. + Iterable DataPipe that yields one element at a time from each input Iterable DataPipe + (i.e. one element from the 1st input DataPipe, then one element from the 2nd DataPipe in the next iteration, + and so on). It skips over DataPipes that are exhausted, and ends when all input DataPipes are exhausted. + + Args: + datapipes: Iterable DataPipes that will take turn to yield their elements, until they are all exhausted + """ def __init__(self, *datapipes): self.datapipes = datapipes + self.length: Optional[int] = None def __iter__(self): iterators = [iter(x) for x in self.datapipes] - finished = {} - had_more = True - while had_more: - had_more = False + finished: Set[int] = set() + while len(finished) < len(iterators): for i in range(len(iterators)): if i not in finished: try: - value = iterators[i].__next__() - had_more = True + value = next(iterators[i]) yield value except StopIteration: - finished[i] = 1 + finished.add(i) + + def __len__(self): + if self.length is not None: + if self.length == -1: + raise TypeError("{} instance doesn't have valid length".format(type(self).__name__)) + return self.length + if all(isinstance(dp, Sized) for dp in self.datapipes): + self.length = sum(len(dp) for dp in self.datapipes) + else: + self.length = -1 + return len(self) @functional_datapipe('zip') From 6c8cb9bd76299be00d787be2ba02daed0b3921a6 Mon Sep 17 00:00:00 2001 From: Kevin Tse Date: Tue, 31 Aug 2021 13:55:59 -0700 Subject: [PATCH 401/530] [DataPipe] export fork, mux, demux for public usage (#64279) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64279 cc VitalyFedyunin ejguan Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30671971 Pulled By: NivekT fbshipit-source-id: 056ac12ef7183b254d1eec341145594639e47ef6 --- torch/utils/data/datapipes/iter/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py index d4baef788ecca..26d715d310234 100644 --- a/torch/utils/data/datapipes/iter/__init__.py +++ b/torch/utils/data/datapipes/iter/__init__.py @@ -54,14 +54,17 @@ 'BucketBatcher', 'Collator', 'Concater', + 'Demultiplexer', 'FileLister', 'FileLoader', 'Filter', + 'Forker', 'Grouper', 'HttpReader', 'IterableWrapper', 'LineReader', 'Mapper', + 'Multiplexer', 'RoutedDecoder', 'Sampler', 'Shuffler', From 0457a85d459479881ad07e84a8e9f53bf82bb48d Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 31 Aug 2021 14:53:01 -0700 Subject: [PATCH 402/530] Revert D30543236: Add python mode Test Plan: revert-hammer Differential Revision: D30543236 (https://github.com/pytorch/pytorch/commit/4bd03b02424d93b72f15e28c542ede13f88ea929) Original commit changeset: ef5444d96a5a fbshipit-source-id: b0042ac2c22765fa11d6d00bf751f6a4489eb6d8 --- aten/src/ATen/PythonModeTLS.cpp | 26 ------- aten/src/ATen/PythonModeTLS.h | 17 ----- aten/src/ATen/ThreadLocalState.cpp | 3 - aten/src/ATen/ThreadLocalState.h | 3 - aten/src/ATen/core/PythonFallbackKernel.cpp | 13 +--- c10/core/TensorImpl.cpp | 20 +---- c10/core/TensorImpl.h | 35 +-------- test/run_test.py | 1 - test/test_python_dispatch.py | 81 +-------------------- tools/build_variables.bzl | 2 - torch/_C/__init__.pyi.in | 2 - torch/csrc/autograd/init.cpp | 17 ----- torch/csrc/autograd/python_mode.cpp | 27 ------- torch/csrc/autograd/python_mode.h | 17 ----- torch/csrc/autograd/python_variable.cpp | 38 ++-------- torch/csrc/utils/python_arg_parser.cpp | 39 ++-------- torch/csrc/utils/python_arg_parser.h | 11 +-- torch/csrc/utils/tensor_new.cpp | 1 - torch/utils/_python_dispatch.py | 34 --------- 19 files changed, 21 insertions(+), 366 deletions(-) delete mode 100644 aten/src/ATen/PythonModeTLS.cpp delete mode 100644 aten/src/ATen/PythonModeTLS.h delete mode 100644 torch/csrc/autograd/python_mode.cpp delete mode 100644 torch/csrc/autograd/python_mode.h delete mode 100644 torch/utils/_python_dispatch.py diff --git a/aten/src/ATen/PythonModeTLS.cpp b/aten/src/ATen/PythonModeTLS.cpp deleted file mode 100644 index b53043ca84147..0000000000000 --- a/aten/src/ATen/PythonModeTLS.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include - -namespace at { namespace impl { - -thread_local std::shared_ptr pythonModeState; - -void PythonModeTLS::set_state(const std::shared_ptr& state) { - pythonModeState = state; - if (state) { - c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); - } else { - PythonModeTLS::reset_state(); - } -} - -const std::shared_ptr& PythonModeTLS::get_state() { - return pythonModeState; -} - -void PythonModeTLS::reset_state() { - pythonModeState.reset((TorchDispatchTypeObject*)nullptr); - c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); -} - -} // namespace impl -} // namespace at diff --git a/aten/src/ATen/PythonModeTLS.h b/aten/src/ATen/PythonModeTLS.h deleted file mode 100644 index be52b182c659b..0000000000000 --- a/aten/src/ATen/PythonModeTLS.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace at { -namespace impl { - -struct TORCH_API PythonModeTLS { - static void set_state(const std::shared_ptr& state); - static const std::shared_ptr& get_state(); - static void reset_state(); -}; - -} // namespace impl -} // namespace at diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 19cfa89967ccb..98c2519e045ce 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -17,7 +17,6 @@ ThreadLocalState::ThreadLocalState() saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks(); bumped_record_all_functions_ = at::checkRecordAllFunctions(); - python_mode_state_ = at::impl::PythonModeTLS::get_state(); } void ThreadLocalState::set_grad_mode(bool enabled) { @@ -31,8 +30,6 @@ void ThreadLocalState::setThreadLocalState( // restore the dispatch key set TLS at the same time. c10::AutogradState::set_tls_state(state.autograd_tls_); - at::impl::PythonModeTLS::set_state(state.python_mode_state_); - at::set_record_function_tls_(state.rf_tls_); SavedTensorDefaultHooks::set_hooks( diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index c99ca6158ffa5..41146912819b4 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -6,7 +6,6 @@ #include #include -#include namespace at { @@ -41,8 +40,6 @@ class TORCH_API ThreadLocalState { // TLS for AutogradModes AutogradState autograd_tls_; - std::shared_ptr python_mode_state_; - // TLS for saved tensors default hooks std::pair saved_tensors_default_hooks_; diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index 8e77d0952ec75..276eabfe458c0 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -1,18 +1,9 @@ #include #include -#include namespace { void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - // If Python Mode is active, use its PyInterpreter for dispatch - const auto& maybe_python_mode_state = at::impl::PythonModeTLS::get_state(); - if (maybe_python_mode_state) { - maybe_python_mode_state->pyinterpreter()->dispatch(op, stack, maybe_python_mode_state); - return; - } - - // Otherwise, find a PyInterpreter on a Tensor const auto& schema = op.schema(); const auto num_arguments = schema.arguments().size(); // It is safe to dispatch on the very first Tensor with a pyobj_interpreter @@ -24,7 +15,7 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { if (ivalue.isTensor()) { auto* interpreter = ivalue.unsafeToTensorImpl()->pyobj_interpreter(); if (interpreter) { - interpreter->dispatch(op, stack, nullptr); + interpreter->dispatch(op, stack); return; } } else if (ivalue.isTensorList()) { @@ -33,7 +24,7 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { for (const auto& nv : ivalue.toListRef()) { auto* interpreter = nv.unsafeToTensorImpl()->pyobj_interpreter(); if (interpreter) { - interpreter->dispatch(op, stack, nullptr); + interpreter->dispatch(op, stack); return; } } diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 9a72659711743..de829c493732d 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -40,8 +40,7 @@ static c10::intrusive_ptr noop_detach_fn( static void noop_dispatch_fn( const PyInterpreter*, const c10::OperatorHandle& op, - torch::jit::Stack* stack, - const std::shared_ptr& type) { + torch::jit::Stack* stack) { TORCH_INTERNAL_ASSERT( 0, "attempted to dispatch (__torch_dispatch__) an operator on Tensor with nontrivial PyObject after corresponding interpreter died"); @@ -609,23 +608,6 @@ void TensorImpl::copy_tensor_metadata( } } -TorchDispatchTypeObject::TorchDispatchTypeObject( - PyObject* type_object, - c10::impl::PyInterpreter* pyinterpreter) - : data_(type_object), pyinterpreter_(pyinterpreter) {} - -TorchDispatchTypeObject::~TorchDispatchTypeObject() { - pyinterpreter_->decref(data_); -} - -c10::impl::PyInterpreter* TorchDispatchTypeObject::pyinterpreter() const { - return pyinterpreter_; -} - -PyObject* TorchDispatchTypeObject::ptr() const { - return data_; -} - namespace impl { namespace { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index d110a17b46590..7051e36b35516 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -161,9 +161,6 @@ struct C10_API AutogradMetaInterface { virtual ~AutogradMetaInterface(); }; -// forward declared -struct TorchDispatchTypeObject; - namespace impl { // Unfortunately, the definition of AutogradMeta lives in a separate @@ -258,8 +255,7 @@ struct C10_API PyInterpreter { using dispatch_sig = void( const PyInterpreter*, const c10::OperatorHandle&, - torch::jit::Stack* stack, - const std::shared_ptr& type); + torch::jit::Stack* stack); PyInterpreter( name_sig* name_fn, @@ -303,9 +299,8 @@ struct C10_API PyInterpreter { // Invoke the Python boxed fallback dispatch to go back into Python __ubsan_ignore_function__ void dispatch( const c10::OperatorHandle& op, - torch::jit::Stack* stack, - const std::shared_ptr& type) const { - return (*dispatch_fn_)(this, op, stack, type); + torch::jit::Stack* stack) const { + return (*dispatch_fn_)(this, op, stack); } // Disarm this PyInterpreter, making all of its methods noops. @@ -353,30 +348,6 @@ struct C10_API NamedTensorMetaInterface { }; }; -// NOTE [What is TorchDispatchTypeObject?] -// A TorchDispatchTypeObject represents the type of a Tensor subclass that has -// a __torch_dispatch__ classmethod. Concretely, it holds the class as a -// PyObject* and a PyInterpreter* that says which python interpreter the class -// came from. -// -// See NOTE [dispatch_fn's type argument] for more details -struct C10_API TorchDispatchTypeObject { - // Steals a reference to type_object - TorchDispatchTypeObject( - PyObject* type_object, - c10::impl::PyInterpreter* pyinterpreter); - - // Releases the stolen reference to type_object - ~TorchDispatchTypeObject(); - - c10::impl::PyInterpreter* pyinterpreter() const; - PyObject* ptr() const; - - private: - PyObject* data_; - c10::impl::PyInterpreter* pyinterpreter_; -}; - // NOTE [ Version Counter Sharing ] // // Every Tensor has a version counter. Version counters are incremented whenever diff --git a/test/run_test.py b/test/run_test.py index d0871fa2a0d30..55b2f3841d7b5 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -104,7 +104,6 @@ "test_optim", "test_functional_optim", "test_pytree", - "test_python_dispatch", "test_mobile_optimizer", "test_set_default_mobile_cpu_allocator", "test_xnnpack_integration", diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index e474f1f4783f0..0f5b6b9cbd70e 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -1,7 +1,6 @@ import torch from torch.testing._internal.common_utils import TestCase, run_tests from torch.utils._pytree import tree_map -from torch.utils._python_dispatch import enable_python_mode from typing import Iterator, List import logging @@ -51,10 +50,7 @@ def unwrap(e): def wrap(e): return LoggingTensor(e) if isinstance(e, torch.Tensor) else e - # no_dispatch is only needed if you use enable_python_mode. - # It prevents infinite recursion. - with no_dispatch(): - rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) + rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) logging.getLogger("LoggingTensor").info(f"{func.__module__}.{func.__name__}", args, kwargs, rs) return rs @@ -339,81 +335,6 @@ def backward(ctx, grad_output): $5 = torch._ops.aten.mul($4, $0) $6 = torch._ops.aten.add_($1, $5)''') - def test_enable_python_mode_error(self) -> None: - with self.assertRaisesRegex(ValueError, "__torch_dispatch__"): - with enable_python_mode(torch.Tensor): - pass - z = LoggingTensor(torch.empty([])) - with self.assertRaisesRegex(ValueError, "must be the type"): - with enable_python_mode(z): - pass - - def test_enable_python_mode_basic(self) -> None: - with enable_python_mode(LoggingTensor): - z = torch.empty([]) - self.assertTrue(isinstance(z, LoggingTensor)) - - def test_enable_python_mode_unrelated_tensors(self) -> None: - x = torch.randn([]) - y = torch.randn([]) - with enable_python_mode(LoggingTensor): - z = x + y - self.assertTrue(isinstance(z, LoggingTensor)) - - def test_enable_python_mode_subclass_priority(self) -> None: - class ErrorA(RuntimeError): - pass - - class ErrorB(RuntimeError): - pass - - class A(torch.Tensor): - @staticmethod - def __new__(cls, elem): - return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) - - @classmethod - def __torch_dispatch__(cls, func, types, args=(), kwargs=None): - raise ErrorA - - class B(A): - @staticmethod - def __new__(cls, elem): - return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) - - @classmethod - def __torch_dispatch__(cls, func, types, args=(), kwargs=None): - raise ErrorB - - a = A(torch.empty(1)) - b = B(torch.empty(1)) - with self.assertRaises(ErrorA): - a + a - - # B has precedence over A due to the subclass relationship - with self.assertRaises(ErrorB): - with enable_python_mode(A): - b + b - with self.assertRaises(ErrorB): - with enable_python_mode(B): - a + a - with self.assertRaises(ErrorB): - with enable_python_mode(B): - a + b - - def test_enable_python_mode_respects_no_dispatch(self) -> None: - with enable_python_mode(LoggingTensor): - z = torch.ones([2, 3]) - self.assertTrue(isinstance(z, LoggingTensor)) - with no_dispatch(): - expected = torch.ones([2, 3]) - self.assertEqual(z.elem, expected) - - def test_nested_enable_python_mode(self) -> None: - with self.assertRaisesRegex(RuntimeError, "has already been set"): - with enable_python_mode(LoggingTensor): - with enable_python_mode(LoggingTensor): - pass if __name__ == '__main__': run_tests() diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index dd89981094d4f..34846b5d6c7b3 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -666,7 +666,6 @@ libtorch_python_core_sources = [ "torch/csrc/autograd/init.cpp", "torch/csrc/autograd/python_anomaly_mode.cpp", "torch/csrc/autograd/python_saved_variable_hooks.cpp", - "torch/csrc/autograd/python_mode.cpp", "torch/csrc/autograd/python_cpp_function.cpp", "torch/csrc/autograd/python_engine.cpp", "torch/csrc/autograd/python_function.cpp", @@ -794,7 +793,6 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/ParallelNativeTBB.cpp", "aten/src/ATen/ParallelOpenMP.cpp", "aten/src/ATen/ParallelThreadPoolNative.cpp", - "aten/src/ATen/PythonModeTLS.cpp", "aten/src/ATen/ScalarOps.cpp", "aten/src/ATen/SequenceNumber.cpp", "aten/src/ATen/SparseTensorImpl.cpp", diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 352edbee6bc5e..01fdf9e12500a 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -652,8 +652,6 @@ def __set_forward_AD_enabled(enabled: _bool) -> None: ... def __is_forward_AD_enabled() -> _bool: ... def _register_default_hooks(pack_hook: Callable, unpack_hook: Callable) -> None: ... def _reset_default_hooks() -> None: ... -def _enter_python_mode(cls: Type) -> None: ... -def _exit_python_mode() -> None: ... class _InferenceMode(object): def __init__(self, mode: _bool) -> None: ... diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 860aaec466218..697ca871f83c5 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -495,20 +494,6 @@ static PyObject * python_exit_dual_level(PyObject* _unused, PyObject* args, PyOb END_HANDLE_TH_ERRORS } -static PyObject * enter_python_mode(PyObject* _unused, PyObject* arg) { - HANDLE_TH_ERRORS - PythonMode::enter(arg); - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -static PyObject * exit_python_mode(PyObject* _unused, PyObject* arg) { - HANDLE_TH_ERRORS - PythonMode::exit(); - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - // autograd methods on torch._C static PyMethodDef methods[] = { // NOLINT {"_set_grad_enabled", set_grad_enabled, METH_O, nullptr}, @@ -529,8 +514,6 @@ static PyMethodDef methods[] = { // NOLINT {"is_anomaly_enabled", is_anomaly_mode_enabled, METH_NOARGS, nullptr}, {"_enter_dual_level", python_enter_dual_level, METH_NOARGS, nullptr}, {"_exit_dual_level", castPyCFunctionWithKeywords(python_exit_dual_level), METH_VARARGS | METH_KEYWORDS, nullptr}, - {"_enter_python_mode", enter_python_mode, METH_O, nullptr}, - {"_exit_python_mode", exit_python_mode, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr} }; diff --git a/torch/csrc/autograd/python_mode.cpp b/torch/csrc/autograd/python_mode.cpp deleted file mode 100644 index 435842631a5bb..0000000000000 --- a/torch/csrc/autograd/python_mode.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#include -#include -#include -#include -#include - -namespace torch { namespace autograd { - -void PythonMode::enter(PyObject* type) { - if (at::impl::PythonModeTLS::get_state()) { - TORCH_CHECK( - false, - "python mode has already been set. We do not yet support nested python ", - "mode. Please file us an issue and reset it before setting it again.") - } - // TorchDispatchTypeObject steals a reference, See NOTE [What is TorchDispatchTypeObject?] - Py_INCREF(type); - auto state = std::make_shared(type, getPyInterpreter()); - at::impl::PythonModeTLS::set_state(state); -} - -void PythonMode::exit() { - TORCH_INTERNAL_ASSERT(at::impl::PythonModeTLS::get_state(), "exiting Python Mode but it wasn't set!"); - at::impl::PythonModeTLS::reset_state(); -} - -}} diff --git a/torch/csrc/autograd/python_mode.h b/torch/csrc/autograd/python_mode.h deleted file mode 100644 index 03da51c1c49e0..0000000000000 --- a/torch/csrc/autograd/python_mode.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include -#include - -namespace torch { namespace autograd { - -struct TORCH_API PythonMode { - // Enter python mode, causing all operators to dispatch to the type's __torch_dispatch__. - // `type` is the type of a Tensor subclass that has __torch_dispatch__. - static void enter(PyObject* type); - - // Exit the current python mode. - static void exit(); -}; - -}} diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index abe90105cde2b..50d6eb9ab7e05 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -32,7 +32,6 @@ #include #include -#include #include @@ -65,12 +64,7 @@ void concrete_decref_fn(const c10::impl::PyInterpreter* self, PyObject* pyobj) { return; pybind11::gil_scoped_acquire gil; - // Two possibilities: - // 1. We are decref-ing a tensor. Then we must be careful about - // PyObject resurrection (this only applies to Tensors, see THPVariable_clear). - // 2. We are decref-ing some other Python object. We don't do - // PyObject resurrection on non-Tensors, so we just carry on as usual - if (THPVariable_Check(pyobj) && Py_REFCNT(pyobj) > 1) { + if (Py_REFCNT(pyobj) > 1) { // It's still alive! This can happen if a weak ref resurrected // the PyObject without flipping ownership. At this point it is // too late to rescue the object, so just stub out the PyObject @@ -88,11 +82,7 @@ void concrete_decref_fn(const c10::impl::PyInterpreter* self, PyObject* pyobj) { }; c10::intrusive_ptr concrete_detach_fn(const c10::impl::PyInterpreter*, const c10::TensorImpl* self); -void concrete_dispatch_fn( - const c10::impl::PyInterpreter*, - const c10::OperatorHandle& op, - torch::jit::Stack* stack, - const std::shared_ptr& type); +void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack); class PyInterpreterHolder { public: @@ -1501,19 +1491,7 @@ bool isPythonTensor(const Tensor& tensor) { return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Python); } -// NOTE [dispatch_fn's type argument] -// `type` is nullable and represents the PythonMode going on. -// Right now we only support a single PythonMode, but in the future we could -// change this to a stack of PythonModes. -// -// If `type` isn't null, then we consider the type for dispatch by prepending -// it to the overloaded_args list. `handle_torch_funciton_no_python_arg_parser` -// is responsible for doing overload resolution. -void concrete_dispatch_fn( - const c10::impl::PyInterpreter*, - const c10::OperatorHandle& op, - torch::jit::Stack* stack, - const std::shared_ptr& type) { +void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack) { const auto& schema = op.schema(); const auto num_returns = schema.returns().size(); @@ -1590,17 +1568,13 @@ void concrete_dispatch_fn( auto args = py::reinterpret_steal(PyTuple_New(positional_default_start)); py::dict kwargs; - if (type) { - append_overloaded_type(&overloaded_args, type->ptr()); - } - // Find overloaded tensors for (int64_t idx = 0; idx < arguments.size(); idx++) { const auto& ivalue = arguments[idx]; if (ivalue.isTensor()) { const auto& tensor = ivalue.toTensor(); if (isPythonTensor(tensor)) { - append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr()); + append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr()); } } else if (ivalue.isList()) { const auto& list = ivalue.toListRef(); @@ -1609,7 +1583,7 @@ void concrete_dispatch_fn( if (nv.isTensor()) { const auto& tensor = nv.toTensor(); if (isPythonTensor(tensor)) { - append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr()); + append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr()); } } } @@ -1659,7 +1633,7 @@ c10::intrusive_ptr concrete_detach_fn(const c10::impl::PyInterpreter Tensor self_t = Tensor(c10::intrusive_ptr::unsafe_reclaim_from_nonowning(const_cast(self))); auto self_p = py::reinterpret_steal(THPVariable_Wrap(self_t)); TORCH_INTERNAL_ASSERT(isPythonTensor(self_t)); - append_overloaded_tensor(&overloaded_args, self_p.ptr()); + append_overloaded_arg(&overloaded_args, self_p.ptr()); auto args = py::reinterpret_steal(PyTuple_New(1)); PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr()); diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index 3ee20c055bf94..6115dcdfbe61a 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -200,28 +200,12 @@ auto handle_torch_function(PyObject* self, const std::string& func_name, PyObjec return ret.release().ptr(); } -// Note: [Overloaded args] -// An overloaded arg may be one of the following: -// - an instance of an object that has a __torch_function__ method -// - an instance of an object that has a __torch_dispatch__ classmethod -// - a class type that has a __torch_dispatch__ classmethod -// -// This function returns the type of the arg (if the arg is an instance), -// otherwise, it returns the arg. -static PyObject* get_type_of_overloaded_arg(PyObject* obj_or_type) { - if (PyType_Check(obj_or_type)) { - return obj_or_type; - } - return (PyObject*)Py_TYPE(obj_or_type); -} - -// See Note: [Overloaded args] for what they hold auto handle_torch_function_no_python_arg_parser(const std::vector &overloaded_args, PyObject* args, PyObject* kwargs, const char* func_name, PyObject* torch_api_function, const char* module_name, const char* torch_function_name) -> PyObject* { // overloaded_args already all have unique types std::vector overloaded_types; overloaded_types.reserve(overloaded_args.size()); for (auto &arg : overloaded_args) { - overloaded_types.push_back(py::reinterpret_borrow(get_type_of_overloaded_arg(arg.ptr()))); + overloaded_types.push_back(py::reinterpret_borrow((PyObject *) Py_TYPE(arg.ptr()))); } py::tuple py_types = py::cast(overloaded_types); py::object ret; @@ -247,7 +231,7 @@ auto handle_torch_function_no_python_arg_parser(const std::vector &o ss << "no implementation found for '" << module_name << "." << func_name << "' on types that implement " << torch_function_name << ": ["; for (auto &arg : overloaded_args) { - ss << PyObject_Repr(get_type_of_overloaded_arg(arg.ptr())); + ss << arg.ptr()->ob_type->tp_name; if (!arg.is(overloaded_args.back())) { ss << ", "; } @@ -344,11 +328,10 @@ auto handle_torch_function_indexing(PyObject* self, PyObject* index, PyObject* v * */ -static void append_overloaded_arg(std::vector* overloaded_args, PyObject* obj, bool obj_is_type) { +void append_overloaded_arg(std::vector* overloaded_args, PyObject* obj) { bool class_not_seen_yet = true; - PyObject* obj_type = obj_is_type ? obj : (PyObject*)Py_TYPE(obj); for (auto &arg : *overloaded_args) { - if (obj_type == get_type_of_overloaded_arg(arg.ptr())) { + if (Py_TYPE(obj) == Py_TYPE(arg.ptr())) { // obj is the same type as another parameter we've seen in a prior // iteration of the loop over parameters so we already have an entry // with the proper __torch_function__ implementation to call, so skip @@ -360,7 +343,7 @@ static void append_overloaded_arg(std::vector* overloaded_args, PyOb if (class_not_seen_yet) { int arg_index = overloaded_args->size(); for(const auto j : c10::irange(arg_index)) { - if (PyObject_IsSubclass(obj_type, (PyObject*)(get_type_of_overloaded_arg((*overloaded_args)[j].ptr())))) { + if (PyObject_IsInstance(obj, (PyObject*)(Py_TYPE((*overloaded_args)[j].ptr())))) { // obj is a subclass of another object we've seen already so its // __torch_function__ should be called first, therefore we // insert it into overloaded_args before the superclass @@ -375,14 +358,6 @@ static void append_overloaded_arg(std::vector* overloaded_args, PyOb } } -void append_overloaded_tensor(std::vector* overloaded_args, PyObject* obj) { - append_overloaded_arg(overloaded_args, obj, /*obj_is_type*/false); -} - -void append_overloaded_type(std::vector* overloaded_args, PyObject* obj) { - append_overloaded_arg(overloaded_args, obj, /*obj_is_type*/true); -} - bool is_tensor_and_append_overloaded(PyObject* obj, std::vector* overloaded_args) { if (THPVariable_CheckExact(obj)) { // torch.Tensor instances (not subclasses, except for Parameter) @@ -391,7 +366,7 @@ bool is_tensor_and_append_overloaded(PyObject* obj, std::vector* ove if (check_has_torch_function(obj)) { // tensor subclasses and unrelated objects with __torch_function__ - append_overloaded_tensor(overloaded_args, obj); + append_overloaded_arg(overloaded_args, obj); return true; } else if (THPVariable_Check(obj)) { // tensor subclasses without __torch_function__ @@ -930,7 +905,7 @@ bool FunctionSignature::parse(PyObject* self, PyObject* args, PyObject* kwargs, int i = 0; if (self != nullptr && check_has_torch_function(self)) { - append_overloaded_tensor(&this->overloaded_args, self); + append_overloaded_arg(&this->overloaded_args, self); } for (auto& param : params) { PyObject* obj = nullptr; diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 6a05807e5a314..d132185ccaefb 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -818,15 +818,6 @@ bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector * 'overloaded_args': the vector to append the overloaded args * 'obj': the input tensor that is overloaded */ -void append_overloaded_tensor(std::vector* overloaded_args, PyObject* obj); - -/* Given an argument that is definitely a type and is definitely overloaded, - * append it to the overloaded arguments list. Use this only with __torch_dispatch__, - * where we operate on classes that have a __torch_dispatch__ classmethod. - * - * 'overloaded_args': the vector to append the overloaded type - * 'obj': the input class that has a __torch_dispatch__ classmethod. - */ -void append_overloaded_type(std::vector* overloaded_args, PyObject* obj); +void append_overloaded_arg(std::vector* overloaded_args, PyObject* obj); } // namespace torch diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 25e9a5962614f..17d7acc37640c 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -267,7 +267,6 @@ Tensor internal_new_from_data( { at::AutoDispatchBelowADInplaceOrView guard; // TODO: remove at::tracer::impl::NoTracerDispatchMode tracer_guard; - c10::impl::ExcludeDispatchKeyGuard pythonmode_guard(c10::DispatchKey::Python); // functorch uses FuncTorchDynamicLayerBackMode as a mode key to wrap all // tensors returned from operators in special TensorWrapper tensor extension // The problem with this is that TensorWrapper does not have storage so diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py deleted file mode 100644 index a7cfae10c37c1..0000000000000 --- a/torch/utils/_python_dispatch.py +++ /dev/null @@ -1,34 +0,0 @@ -import torch -import contextlib -from typing import Iterator - -# Context manager that causes all pytorch operators to dispatch to the passed-in -# type's __torch_dispatch__ function. -# operation that accepts no tensors but returns a tensor. -# -# enable_python_mode is affected by torch._C._DisableTorchDispatch. -# -# NB: Calling an operator inside __torch_dispatch__ does go through -# __torch_dispatch__ again. Please use _DisableTorchDispatch inside -# __torch_dispatch__ to prevent infinite recursion. -# -# TODO: Limitations and things about enable_python_mode we should fix before exposing it: -# - it currently cannot be nested. This should be simple to implement; we need a -# stack of TorchDispatchTypeObjects and the next bullet point. -# - We need a better user-facing api for torch._C._DisableTorchDispatch that -# is able to selectively disable __torch_dispatch__ of a particular class. -# - It doesn't work with the tensor constructors (torch.tensor, torch.Tensor) -# - Better name (see https://github.com/pytorch/pytorch/pull/63496#discussion_r694091694) -@contextlib.contextmanager -def enable_python_mode(cls) -> Iterator[None]: - if not hasattr(cls, '__torch_dispatch__'): - raise ValueError('The class passed to enable_python_mode ' - 'must have a __torch_dispatch__ classmethod') - if not isinstance(cls, type) or not issubclass(cls, (torch.Tensor,)): - raise ValueError('The argument passed to enable_python_mode ' - 'must be the type of a Tensor subclass') - torch._C._enter_python_mode(cls) - try: - yield - finally: - torch._C._exit_python_mode() From c2da103fe69d493dfce48b7075e56643ac8fce58 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 31 Aug 2021 17:19:11 -0700 Subject: [PATCH 403/530] Discover new tests in run_tests.py (#64246) Summary: Introduce `discover_tests` function that globs for all Python files starting with `test_` in test folder excluding subfolders which are executed differently Fixes https://github.com/pytorch/pytorch/issues/64178 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64246 Reviewed By: walterddr, seemethere Differential Revision: D30661652 Pulled By: malfet fbshipit-source-id: a52e78ec717b6846add267579dd8d9ae75326bf9 --- test/run_test.py | 220 +++++++++++++++++------------------------------ 1 file changed, 81 insertions(+), 139 deletions(-) mode change 100755 => 100644 test/run_test.py diff --git a/test/run_test.py b/test/run_test.py old mode 100755 new mode 100644 index 55b2f3841d7b5..5953919b16323 --- a/test/run_test.py +++ b/test/run_test.py @@ -50,145 +50,87 @@ ) -TESTS = [ - "test_import_time", - "test_public_bindings", - "test_type_hints", - "test_ao_sparsity", - "test_autograd", - "benchmark_utils/test_benchmark_utils", - "test_binary_ufuncs", - "test_buffer_protocol", - "test_bundled_inputs", - "test_complex", - "test_cpp_api_parity", - "test_cpp_extensions_aot_no_ninja", - "test_cpp_extensions_aot_ninja", - "test_cpp_extensions_jit", - "distributed/test_c10d_common", - "distributed/test_c10d_gloo", - "distributed/test_c10d_nccl", - "distributed/test_jit_c10d", - "distributed/test_c10d_spawn_gloo", - "distributed/test_c10d_spawn_nccl", - "distributed/test_store", - "distributed/test_pg_wrapper", - "distributed/algorithms/test_join", - "test_cuda", - "test_autocast", - "test_jit_cuda_fuser", - "test_cuda_primary_ctx", - "test_dataloader", - "test_datapipe", - "distributed/test_data_parallel", - "distributed/test_distributed_spawn", - "distributions/test_constraints", - "distributions/test_distributions", - "test_dispatch", - "test_foreach", - "test_indexing", - "test_jit", - "test_linalg", - "test_logging", - "test_mkldnn", - "test_model_dump", - "test_module_init", - "test_modules", - "test_multiprocessing", - "test_multiprocessing_spawn", - "distributed/test_nccl", - "test_native_functions", - "test_numba_integration", - "test_nn", - "test_ops", - "test_optim", - "test_functional_optim", - "test_pytree", - "test_mobile_optimizer", - "test_set_default_mobile_cpu_allocator", - "test_xnnpack_integration", - "test_vulkan", - "test_sparse", - "test_sparse_csr", - "test_quantization", - "test_pruning_op", - "test_spectral_ops", - "test_serialization", - "test_shape_ops", - "test_show_pickle", - "test_sort_and_select", - "test_tensor_creation_ops", - "test_testing", - "test_torch", - "test_type_info", - "test_unary_ufuncs", - "test_utils", - "test_view_ops", - "test_vmap", - "test_namedtuple_return_api", - "test_numpy_interop", - "test_jit_profiling", - "test_jit_legacy", - "test_jit_fuser_legacy", - "test_tensorboard", - "test_namedtensor", - "test_reductions", - "test_type_promotion", - "test_jit_disabled", - "test_function_schema", - "test_overrides", - "test_jit_fuser_te", - "test_tensorexpr", - "test_tensorexpr_pybind", - "test_openmp", - "test_profiler", - "distributed/test_launcher", - "distributed/nn/jit/test_instantiator", - "distributed/rpc/test_faulty_agent", - "distributed/rpc/test_tensorpipe_agent", - "distributed/rpc/cuda/test_tensorpipe_agent", - "test_determination", - "test_futures", - "test_fx", - "test_fx_experimental", - "test_functional_autograd_benchmark", - "test_package", - "test_license", - "distributed/pipeline/sync/skip/test_api", - "distributed/pipeline/sync/skip/test_gpipe", - "distributed/pipeline/sync/skip/test_inspect_skip_layout", - "distributed/pipeline/sync/skip/test_leak", - "distributed/pipeline/sync/skip/test_portal", - "distributed/pipeline/sync/skip/test_stash_pop", - "distributed/pipeline/sync/skip/test_tracker", - "distributed/pipeline/sync/skip/test_verify_skippables", - "distributed/pipeline/sync/test_balance", - "distributed/pipeline/sync/test_bugs", - "distributed/pipeline/sync/test_checkpoint", - "distributed/pipeline/sync/test_copy", - "distributed/pipeline/sync/test_deferred_batch_norm", - "distributed/pipeline/sync/test_dependency", - "distributed/pipeline/sync/test_inplace", - "distributed/pipeline/sync/test_microbatch", - "distributed/pipeline/sync/test_phony", - "distributed/pipeline/sync/test_pipe", - "distributed/pipeline/sync/test_pipeline", - "distributed/pipeline/sync/test_stream", - "distributed/pipeline/sync/test_transparency", - "distributed/pipeline/sync/test_worker", - "distributed/optim/test_zero_redundancy_optimizer", - "distributed/elastic/timer/api_test", - "distributed/elastic/timer/local_timer_example", - "distributed/elastic/timer/local_timer_test", - "distributed/elastic/events/lib_test", - "distributed/elastic/metrics/api_test", - "distributed/elastic/utils/logging_test", - "distributed/elastic/utils/util_test", - "distributed/elastic/utils/distributed_test", - "distributed/elastic/multiprocessing/api_test", - "distributed/_sharding_spec/test_sharding_spec", - "distributed/_sharded_tensor/test_sharded_tensor", -] +def discover_tests( + base_dir: Optional[pathlib.Path] = None, + blocklisted_patterns: Optional[List[str]] = None, + blocklisted_tests: Optional[List[str]] = None, + extra_tests: Optional[List[str]] = None) -> List[str]: + """ + Searches for all python files starting with test_ excluding one specified by patterns + """ + def skip_test_p(name: str) -> bool: + rc = False + if blocklisted_patterns is not None: + rc |= any(name.startswith(pattern) for pattern in blocklisted_patterns) + if blocklisted_tests is not None: + rc |= name in blocklisted_tests + return rc + cwd = pathlib.Path(__file__).resolve().parent if base_dir is None else base_dir + all_py_files = list(cwd.glob('**/test_*.py')) + rc = [str(fname.relative_to(cwd))[:-3] for fname in all_py_files] + # Invert slashes on Windows + if sys.platform == "win32": + rc = [name.replace('\\', '/') for name in rc] + rc = [test for test in rc if not skip_test_p(test)] + if extra_tests is not None: + rc += extra_tests + return sorted(rc) + + +TESTS = discover_tests( + blocklisted_patterns=[ + 'ao', + 'bottleneck_test', + 'custom_backend', + 'custom_operator', + 'fx', # executed by test_fx.py + 'jit', # executed by test_jit.py + 'mobile', + 'onnx', + 'package', # executed by test_package.py + 'quantization', # executed by test_quantization.py + ], + blocklisted_tests=[ + 'test_bundled_images', + 'test_cpp_extensions_aot', + 'test_gen_backend_stubs', + 'test_jit_fuser', + 'test_jit_simple', + 'test_jit_string', + 'test_kernel_launch_checks', + 'test_metal', + 'test_nnapi', + 'test_python_dispatch', + 'test_segment_reductions', + 'test_static_runtime', + 'test_throughput_benchmark', + 'test_typing', + "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks", + "distributed/algorithms/quantization/test_quantization", + "distributed/bin/test_script", + "distributed/elastic/multiprocessing/bin/test_script", + "distributed/launcher/bin/test_script", + "distributed/launcher/bin/test_script_init_method", + "distributed/launcher/bin/test_script_is_torchelastic_launched", + "distributed/launcher/bin/test_script_local_rank", + "distributed/test_c10d_spawn", + 'distributions/test_transforms', + 'distributions/test_utils', + ], + extra_tests=[ + "test_cpp_extensions_aot_ninja", + "test_cpp_extensions_aot_no_ninja", + "distributed/elastic/timer/api_test", + "distributed/elastic/timer/local_timer_example", + "distributed/elastic/timer/local_timer_test", + "distributed/elastic/events/lib_test", + "distributed/elastic/metrics/api_test", + "distributed/elastic/utils/logging_test", + "distributed/elastic/utils/util_test", + "distributed/elastic/utils/distributed_test", + "distributed/elastic/multiprocessing/api_test", + ] +) # Tests need to be run with pytest. USE_PYTEST_LIST = [ From 44fcb00a569231be09419a97e6933152d64d92b7 Mon Sep 17 00:00:00 2001 From: Jay Leverett Date: Tue, 31 Aug 2021 17:28:42 -0700 Subject: [PATCH 404/530] Fix redundant class definition in GraphModule singleton constructor (#64274) Summary: Fixes https://github.com/pytorch/pytorch/issues/63883 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64274 Reviewed By: jamesr66a Differential Revision: D30675970 Pulled By: jayleverett fbshipit-source-id: e74ef2a28013f0fa7c58d14f38e66cfe48d26b74 --- test/test_fx.py | 13 +++++++++++++ torch/fx/graph_module.py | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/test/test_fx.py b/test/test_fx.py index eadcf6cc0b2f4..f4e4ab203a7bc 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -188,6 +188,19 @@ def forward(self, A, b=4, *args, c=5, **kwargs): t = T() symbolic_trace(t) + # test for issue described at https://github.com/pytorch/pytorch/issues/63883 + class M3(torch.nn.Module): + def forward(self, x): + return torch.relu(x) + + m3 = M3() + gm3 = symbolic_trace(m3) + new_instance = gm3.__new__(type(gm3)) + new_instance.__init__(gm3, gm3.graph) + + x = torch.randn(5, 3) + torch.testing.assert_allclose(new_instance(x), torch.relu(x)) + def test_custom_import(self): graph = torch.fx.Graph() a = graph.placeholder('x') diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index e7750db9353bd..89685bf3953a0 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -240,6 +240,14 @@ def __new__(cls: 'Type[GraphModule]', *args, **kwargs): # it is a subclass of the user-defined class, the only difference # is an extra layer to install the forward method + # address issue described at https://github.com/pytorch/pytorch/issues/63883 + # in other words, traverse class hierarchy to fix the redundant class definition problem + for t in cls.__mro__: + c = t.__qualname__.split('.')[-1] + if c != 'GraphModuleImpl': + cls = t + break + class GraphModuleImpl(cls): # type: ignore[misc, valid-type] pass return super().__new__(GraphModuleImpl) From 9e25634833735f26f7090d0e5341d6ad38b7eebb Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Tue, 31 Aug 2021 17:32:00 -0700 Subject: [PATCH 405/530] [TensorExpr] Move declaration of buildErrorMessage to exception.h (#64301) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64301 Test Plan: Imported from OSS Reviewed By: navahgar, huiguoo Differential Revision: D30678215 Pulled By: ZolotukhinM fbshipit-source-id: 599c83b3890450a0fb6526815f037eec9563661c --- torch/csrc/jit/tensorexpr/exceptions.h | 2 ++ torch/csrc/jit/tensorexpr/kernel.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/exceptions.h b/torch/csrc/jit/tensorexpr/exceptions.h index cf23bbc2289c4..7194dfe166aa8 100644 --- a/torch/csrc/jit/tensorexpr/exceptions.h +++ b/torch/csrc/jit/tensorexpr/exceptions.h @@ -84,6 +84,8 @@ class malformed_ir : public std::runtime_error { "MALFORMED IR: " + err + " - " + std::to_string(stmt)) {} }; +TORCH_API std::string buildErrorMessage(const std::string& s); + } // namespace tensorexpr } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index bdb9802ccdc3a..4b92b020fce31 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -300,8 +300,6 @@ TORCH_API void annotateInputShapes( TORCH_API std::shared_ptr removeUnusedSelfArgument( const std::shared_ptr& graph); -TORCH_API std::string buildErrorMessage(const std::string& s); - } // namespace tensorexpr } // namespace jit } // namespace torch From 5ecb966e0ff383d65531c8f6de23e704b9cafc54 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 31 Aug 2021 17:33:11 -0700 Subject: [PATCH 406/530] Add ciflow-tracking issue to pytorch-probot (#64125) Summary: Doesn't do anything yet... Pull Request resolved: https://github.com/pytorch/pytorch/pull/64125 Reviewed By: zhouzhuojie Differential Revision: D30620283 Pulled By: malfet fbshipit-source-id: 91869d35c1b70a55e32261d2c32fb0136ec33960 --- .github/pytorch-probot.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index c1e1218b76091..627b2648ad426 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1 +1,2 @@ tracking_issue: 24422 +ciflow_tracking_issue: 64124 From 23da90ab84d4fa6006e8f65e6d795d2016d37f3b Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Tue, 31 Aug 2021 17:38:42 -0700 Subject: [PATCH 407/530] .github: Consolidate linux setup / teardown (#64229) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64229 Consolidates linux setup / teardown into easy to use jinja2 macros Signed-off-by: Eli Uriegas cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra Test Plan: Imported from OSS Reviewed By: zhouzhuojie, driazati Differential Revision: D30683810 Pulled By: seemethere fbshipit-source-id: 2578630df3e212fb79392a699090553baef44cc2 --- .github/scripts/display_ec2_information.sh | 14 -- .github/templates/bazel_ci_workflow.yml.j2 | 23 +-- .github/templates/common.yml.j2 | 67 +++++++- .github/templates/linux_ci_workflow.yml.j2 | 128 ++------------ ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml | 73 ++++++-- ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml | 73 ++++++-- ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 116 ++++++++++--- ...rated-linux-bionic-py3.8-gcc9-coverage.yml | 116 ++++++++++--- ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 116 ++++++++++--- ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 116 ++++++++++--- .../generated-linux-xenial-py3.6-gcc5.4.yml | 157 ++++++++++++++---- ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml | 75 +++++++-- ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml | 73 ++++++-- ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 116 ++++++++++--- ...rated-periodic-win-vs2019-cuda11.1-py3.yml | 22 ++- .../generated-win-vs2019-cpu-py3.yml | 22 ++- .../generated-win-vs2019-cuda10.1-py3.yml | 22 ++- .../generated-win-vs2019-cuda11.3-py3.yml | 22 ++- 18 files changed, 991 insertions(+), 360 deletions(-) delete mode 100755 .github/scripts/display_ec2_information.sh diff --git a/.github/scripts/display_ec2_information.sh b/.github/scripts/display_ec2_information.sh deleted file mode 100755 index be47418966025..0000000000000 --- a/.github/scripts/display_ec2_information.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" -} - -echo "ami-id: $(get_ec2_metadata ami-id)" -echo "instance-id: $(get_ec2_metadata instance-id)" -echo "instance-type: $(get_ec2_metadata instance-type)" diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2 index e9907ed679e3f..7f9d5230e0d9f 100644 --- a/.github/templates/bazel_ci_workflow.yml.j2 +++ b/.github/templates/bazel_ci_workflow.yml.j2 @@ -31,21 +31,8 @@ on: NUM_TEST_SHARDS: !{{ num_test_shards }} CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list - submodules: recursive - !{{ common.display_ec2_information() }} + !{{ common.setup_ec2_linux() }} + !{{ common.checkout_pytorch("recursive") }} - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -158,9 +145,5 @@ on: path: test-reports-*.zip !{{ common.upload_test_statistics(build_environment) }} - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af + !{{ common.teardown_ec2_linux() }} {%- endblock %} diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index bf72898d04c25..f9296e017a1cf 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -2,7 +2,16 @@ - name: Display EC2 information shell: bash run: | - .github/scripts/display_ec2_information.sh + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" {%- endmacro -%} {%- macro parse_ref() -%} @@ -29,3 +38,59 @@ python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test {%- endmacro -%} + +{%- macro setup_ec2_linux() -%} + !{{ display_ec2_information() }} + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" +{%- endmacro -%} + +{%- macro teardown_ec2_linux() -%} + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . +{%- endmacro -%} + +{%- macro checkout_pytorch(submodules) -%} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: !{{ submodules }} +{%- endmacro -%} diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index d5de86b1bbcfd..49b6d7dd68c77 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -72,25 +72,8 @@ jobs: outputs: docker_image: ${{ steps.calculate-tag.outputs.docker_image }} steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - rm -f ~/.ssh/authorized_keys - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - !{{ common.display_ec2_information() }} + !{{ common.setup_ec2_linux() }} + !{{ common.checkout_pytorch("false") }} - name: Calculate docker image tag id: calculate-tag run: | @@ -145,35 +128,11 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: !{{ build_environment }}-build steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - rm -f ~/.ssh/authorized_keys - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list - submodules: recursive - !{{ common.display_ec2_information() }} + !{{ common.setup_ec2_linux() }} + !{{ common.checkout_pytorch("recursive") }} - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Build PyTorch run: | docker run \ @@ -228,6 +187,7 @@ jobs: path: artifacts.zip {%- endif %} + !{{ common.teardown_ec2_linux() }} - name: Hold runner for 2 hours or until ssh sessions have drained # Always hold for active ssh sessions if: always() @@ -290,29 +250,8 @@ jobs: PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - rm -f ~/.ssh/authorized_keys - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list - submodules: recursive - !{{ common.display_ec2_information() }} + !{{ common.setup_ec2_linux() }} + !{{ common.checkout_pytorch("recursive") }} - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -343,9 +282,6 @@ jobs: - name: Output disk space left run: | sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: PR_NUMBER: ${{ github.event.pull_request.number }} @@ -428,17 +364,7 @@ jobs: test-reports-*.zip !{{ common.parse_ref() }} !{{ common.upload_test_statistics(build_environment) }} - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - # Prune all of the docker images - docker system prune -af + !{{ common.teardown_ec2_linux() }} {% endblock %} {%- endif -%} {%- if enable_doc_jobs %} @@ -452,35 +378,11 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} DOCS_TYPE: ${{ matrix.docs_type }} steps: - - name: Log in to ECR - run: | - aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh - bash /tmp/ecr-login.sh - rm /tmp/ecr-login.sh - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE:?}/*" - rm -f ~/.ssh/authorized_keys - - name: "[FB EMPLOYEES] Enables SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list - submodules: recursive - !{{ common.display_ec2_information() }} + !{{ common.setup_ec2_linux() }} + !{{ common.checkout_pytorch("recursive") }} - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b name: Download PyTorch Build Artifacts with: @@ -542,13 +444,5 @@ jobs: name: docs_${{ matrix.docs_type }} path: docs_${{ matrix.docs_type }}.zip if-no-files-found: error - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af + !{{ common.teardown_ec2_linux() }} {%- endif -%} diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml index 27b21bc6b523b..e621bee2ad666 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -46,12 +46,27 @@ jobs: outputs: docker_image: ${{ steps.calculate-tag.outputs.docker_image }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -59,15 +74,19 @@ jobs: run: | rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh + submodules: false - name: Calculate docker image tag id: calculate-tag run: | @@ -122,12 +141,27 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: libtorch-linux-xenial-cuda10.2-py3.6-gcc7-build steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -139,21 +173,18 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Build PyTorch run: | docker run \ @@ -201,6 +232,24 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh - name: Clean up docker images if: always() run: | diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index b90a497441802..9daf916ae2642 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -46,12 +46,27 @@ jobs: outputs: docker_image: ${{ steps.calculate-tag.outputs.docker_image }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -59,15 +74,19 @@ jobs: run: | rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh + submodules: false - name: Calculate docker image tag id: calculate-tag run: | @@ -122,12 +141,27 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-build steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -139,21 +173,18 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Build PyTorch run: | docker run \ @@ -201,6 +232,24 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh - name: Clean up docker images if: always() run: | diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 274e68e38bdb4..4821c1e306715 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -46,12 +46,27 @@ jobs: outputs: docker_image: ${{ steps.calculate-tag.outputs.docker_image }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -59,15 +74,19 @@ jobs: run: | rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh + submodules: false - name: Calculate docker image tag id: calculate-tag run: | @@ -122,12 +141,27 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-build steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -139,21 +173,18 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Build PyTorch run: | docker run \ @@ -212,6 +243,24 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh - name: Clean up docker images if: always() run: | @@ -266,15 +315,30 @@ jobs: PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Clean workspace run: | rm -rf "${GITHUB_WORKSPACE:?}/*" @@ -283,15 +347,15 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -322,9 +386,6 @@ jobs: - name: Output disk space left run: | sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: PR_NUMBER: ${{ github.event.pull_request.number }} @@ -422,10 +483,17 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images + - name: Kill containers, clean up images if: always() run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index e03a019fbe7b9..79edf0d741950 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -46,12 +46,27 @@ jobs: outputs: docker_image: ${{ steps.calculate-tag.outputs.docker_image }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -59,15 +74,19 @@ jobs: run: | rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh + submodules: false - name: Calculate docker image tag id: calculate-tag run: | @@ -122,12 +141,27 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: linux-bionic-py3.8-gcc9-coverage-build steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -139,21 +173,18 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Build PyTorch run: | docker run \ @@ -212,6 +243,24 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh - name: Clean up docker images if: always() run: | @@ -266,15 +315,30 @@ jobs: PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Clean workspace run: | rm -rf "${GITHUB_WORKSPACE:?}/*" @@ -283,15 +347,15 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -322,9 +386,6 @@ jobs: - name: Output disk space left run: | sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: PR_NUMBER: ${{ github.event.pull_request.number }} @@ -426,10 +487,17 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images + - name: Kill containers, clean up images if: always() run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index 6aea843037eae..316da3604fc91 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -46,12 +46,27 @@ jobs: outputs: docker_image: ${{ steps.calculate-tag.outputs.docker_image }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -59,15 +74,19 @@ jobs: run: | rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh + submodules: false - name: Calculate docker image tag id: calculate-tag run: | @@ -122,12 +141,27 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-cuda10.2-py3.6-gcc7-build steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -139,21 +173,18 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Build PyTorch run: | docker run \ @@ -212,6 +243,24 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh - name: Clean up docker images if: always() run: | @@ -266,15 +315,30 @@ jobs: PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Clean workspace run: | rm -rf "${GITHUB_WORKSPACE:?}/*" @@ -283,15 +347,15 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -322,9 +386,6 @@ jobs: - name: Output disk space left run: | sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: PR_NUMBER: ${{ github.event.pull_request.number }} @@ -422,10 +483,17 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images + - name: Kill containers, clean up images if: always() run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 1fdae9d1a0320..6c9e67d380c29 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -46,12 +46,27 @@ jobs: outputs: docker_image: ${{ steps.calculate-tag.outputs.docker_image }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -59,15 +74,19 @@ jobs: run: | rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh + submodules: false - name: Calculate docker image tag id: calculate-tag run: | @@ -122,12 +141,27 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-build steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -139,21 +173,18 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Build PyTorch run: | docker run \ @@ -212,6 +243,24 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh - name: Clean up docker images if: always() run: | @@ -266,15 +315,30 @@ jobs: PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Clean workspace run: | rm -rf "${GITHUB_WORKSPACE:?}/*" @@ -283,15 +347,15 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -322,9 +386,6 @@ jobs: - name: Output disk space left run: | sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: PR_NUMBER: ${{ github.event.pull_request.number }} @@ -422,10 +483,17 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images + - name: Kill containers, clean up images if: always() run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index e932b488d0c71..2337b4f5bf429 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -46,12 +46,27 @@ jobs: outputs: docker_image: ${{ steps.calculate-tag.outputs.docker_image }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -59,15 +74,19 @@ jobs: run: | rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh + submodules: false - name: Calculate docker image tag id: calculate-tag run: | @@ -122,12 +141,27 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-py3.6-gcc5.4-build steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -139,21 +173,18 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Build PyTorch run: | docker run \ @@ -212,6 +243,24 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh - name: Clean up docker images if: always() run: | @@ -266,15 +315,30 @@ jobs: PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Clean workspace run: | rm -rf "${GITHUB_WORKSPACE:?}/*" @@ -283,15 +347,15 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -322,9 +386,6 @@ jobs: - name: Output disk space left run: | sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: PR_NUMBER: ${{ github.event.pull_request.number }} @@ -422,13 +483,20 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images + - name: Kill containers, clean up images if: always() run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . pytorch_doc_build: runs-on: linux.2xlarge @@ -440,12 +508,27 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} DOCS_TYPE: ${{ matrix.docs_type }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -453,25 +536,22 @@ jobs: run: | rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys - - name: "[FB EMPLOYEES] Enables SSH (Click me for login details)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b name: Download PyTorch Build Artifacts with: @@ -537,8 +617,17 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images + - name: Kill containers, clean up images if: always() run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index 89deda0704df1..17dc3a6742d73 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -46,12 +46,27 @@ jobs: outputs: docker_image: ${{ steps.calculate-tag.outputs.docker_image }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -59,15 +74,19 @@ jobs: run: | rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh + submodules: false - name: Calculate docker image tag id: calculate-tag run: | @@ -125,24 +144,47 @@ jobs: NUM_TEST_SHARDS: 1 CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -272,8 +314,21 @@ jobs: python3 -m pip install -r requirements.txt python3 -m pip install boto3==1.16.34 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Clean up docker images + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images if: always() run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml index 63b462a19cf25..2f5cab7538601 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -44,12 +44,27 @@ jobs: outputs: docker_image: ${{ steps.calculate-tag.outputs.docker_image }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -57,15 +72,19 @@ jobs: run: | rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh + submodules: false - name: Calculate docker image tag id: calculate-tag run: | @@ -120,12 +139,27 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-build steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -137,21 +171,18 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Build PyTorch run: | docker run \ @@ -199,6 +230,24 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh - name: Clean up docker images if: always() run: | diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 768146ee8cab2..8c81ab1717221 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -44,12 +44,27 @@ jobs: outputs: docker_image: ${{ steps.calculate-tag.outputs.docker_image }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -57,15 +72,19 @@ jobs: run: | rm -rf "${GITHUB_WORKSPACE:?}/*" rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: # deep clone, to allow use of git merge-base fetch-depth: 0 - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh + submodules: false - name: Calculate docker image tag id: calculate-tag run: | @@ -120,12 +139,27 @@ jobs: DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.6-gcc7-build steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . @@ -137,21 +171,18 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Build PyTorch run: | docker run \ @@ -210,6 +241,24 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh - name: Clean up docker images if: always() run: | @@ -264,15 +313,30 @@ jobs: PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Log in to ECR run: | aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh bash /tmp/ecr-login.sh rm /tmp/ecr-login.sh - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" run: | # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - name: Clean workspace run: | rm -rf "${GITHUB_WORKSPACE:?}/*" @@ -281,15 +345,15 @@ jobs: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - fetch-depth: 0 # deep clone, to allow sharding to use git rev-list + # deep clone, to allow use of git merge-base + fetch-depth: 0 submodules: recursive - - name: Display EC2 information - shell: bash - run: | - .github/scripts/display_ec2_information.sh - name: Pull docker image run: | docker pull "${DOCKER_IMAGE}" @@ -320,9 +384,6 @@ jobs: - name: Output disk space left run: | sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - name: Test PyTorch env: PR_NUMBER: ${{ github.event.pull_request.number }} @@ -420,10 +481,17 @@ jobs: # Always hold for active ssh sessions if: always() run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images + - name: Kill containers, clean up images if: always() run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index aaf2d26e05af5..306e93aca7990 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -65,7 +65,16 @@ jobs: - name: Display EC2 information shell: bash run: | - .github/scripts/display_ec2_information.sh + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -178,7 +187,16 @@ jobs: - name: Display EC2 information shell: bash run: | - .github/scripts/display_ec2_information.sh + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" uses: seemethere/add-github-ssh-key@v1 with: diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 08656c9dd99d8..f79cad7b04c00 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -65,7 +65,16 @@ jobs: - name: Display EC2 information shell: bash run: | - .github/scripts/display_ec2_information.sh + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -170,7 +179,16 @@ jobs: - name: Display EC2 information shell: bash run: | - .github/scripts/display_ec2_information.sh + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" uses: seemethere/add-github-ssh-key@v1 with: diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index 2a8570d1e84b1..35c6cede0eefe 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -67,7 +67,16 @@ jobs: - name: Display EC2 information shell: bash run: | - .github/scripts/display_ec2_information.sh + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -180,7 +189,16 @@ jobs: - name: Display EC2 information shell: bash run: | - .github/scripts/display_ec2_information.sh + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" uses: seemethere/add-github-ssh-key@v1 with: diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index 7235db7f3c9f5..4bfc5654186f7 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -67,7 +67,16 @@ jobs: - name: Display EC2 information shell: bash run: | - .github/scripts/display_ec2_information.sh + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: Install Visual Studio 2019 toolchain shell: powershell run: | @@ -180,7 +189,16 @@ jobs: - name: Display EC2 information shell: bash run: | - .github/scripts/display_ec2_information.sh + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" uses: seemethere/add-github-ssh-key@v1 with: From 0b9cdeb2956d3dd281a04d30e8cee55bf2101ba9 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Tue, 31 Aug 2021 17:38:42 -0700 Subject: [PATCH 408/530] .circleci: Remove already migrated CUDA configs (#64231) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64231 This migrates over the CUDA 11.1 and CUDA 10.2 configs that we had previously migrated to GHA Signed-off-by: Eli Uriegas cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra Test Plan: Imported from OSS Reviewed By: zhouzhuojie Differential Revision: D30683811 Pulled By: seemethere fbshipit-source-id: 71b0761461557d871c26eb02f665a2e4d9b1d9fb --- .circleci/cimodel/data/pytorch_build_data.py | 18 ------ .circleci/config.yml | 58 -------------------- 2 files changed, 76 deletions(-) diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index 5a85674d74fe9..df0cfa0027554 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -47,17 +47,6 @@ # ]), ]), ]), - ("11.1", [ - ("3.8", [ - ("shard_test", [XImportant(True)]), - # UNCOMMENT THE BELOW TO REENABLE LIBTORCH - # ("libtorch", [ - # (True, [ - # ('build_only', [X(True)]), - # ]), - # ]), - ]), - ]), ]), ]), ("bionic", [ @@ -74,13 +63,6 @@ ]), ]), ]), - ("cuda", [ - ("10.2", [ - ("3.9", [ - ("shard_test", [XImportant(True)]), - ]), - ]), - ]), ("rocm", [ ("3.9", [ ("3.6", [ diff --git a/.circleci/config.yml b/.circleci/config.yml index 324e5fdd9b3cb..9989f1a289b7d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7407,35 +7407,6 @@ workflows: build_environment: "pytorch-linux-pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_distributed-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7" resource_class: large - - pytorch_linux_build: - name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build - requires: - - "docker-pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" - build_environment: "pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" - - pytorch_linux_test: - name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test1 - requires: - - pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build - build_environment: "pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-test1" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" - use_cuda_docker_runtime: "1" - resource_class: gpu.medium - - pytorch_linux_test: - name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test2 - requires: - - pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build - build_environment: "pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-test2" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" - use_cuda_docker_runtime: "1" - resource_class: gpu.medium - - pytorch_linux_test: - name: pytorch_linux_pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_distributed_test - requires: - - pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build - build_environment: "pytorch-linux-pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_distributed-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7" - resource_class: large - pytorch_linux_build: name: pytorch_linux_bionic_py3_6_clang9_noarch_build requires: @@ -7475,35 +7446,6 @@ workflows: build_environment: "pytorch-vulkan-linux-bionic-py3.6-clang9-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9" resource_class: large - - pytorch_linux_build: - name: pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_build - requires: - - "docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7" - build_environment: "pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7" - - pytorch_linux_test: - name: pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_test1 - requires: - - pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_build - build_environment: "pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7-test1" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7" - use_cuda_docker_runtime: "1" - resource_class: gpu.medium - - pytorch_linux_test: - name: pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_test2 - requires: - - pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_build - build_environment: "pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7-test2" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7" - use_cuda_docker_runtime: "1" - resource_class: gpu.medium - - pytorch_linux_test: - name: pytorch_linux_pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_distributed_test - requires: - - pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_build - build_environment: "pytorch-linux-pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_distributed-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7" - resource_class: large - pytorch_linux_build: name: pytorch_linux_bionic_rocm3_9_py3_6_build requires: From 09e610e36d0106410e37e129fd0cd5749c74ad5f Mon Sep 17 00:00:00 2001 From: Ray Peng Date: Tue, 31 Aug 2021 17:45:50 -0700 Subject: [PATCH 409/530] [Static Runtime] Out version for softmax (#64243) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64243 Test Plan: ``` > buck run //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1 ... V0830 16:35:22.524479 613839 impl.cpp:1410] Switch to out variant for node: %5 : Tensor = aten::softmax(%a.1, %dim.1, %dtype.1) ... [ OK ] StaticRuntime.IndividualOps_Softmax (803 ms) ``` Reviewed By: hlu1 Differential Revision: D30656149 fbshipit-source-id: 115b7b4a75448fd6a5c526808080ca9a4251302c --- benchmarks/static_runtime/test_scripts.h | 10 ++++++++ .../static_runtime/test_static_runtime.cc | 16 +++++++++++++ torch/csrc/jit/runtime/static/ops.cpp | 24 +++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 37bb222f6a3d4..99b73db79f3d1 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -676,6 +676,16 @@ const auto argmin_with_keep_dim_script = R"JIT( return torch.argmin(a, dim, True).clone() )JIT"; +const auto softmax_script = R"JIT( + def forward(self, a: Tensor, dim: int): + return torch.softmax(a, dim).clone() +)JIT"; + +const auto softmax_script_with_dtype = R"JIT( + def forward(self, a: Tensor, dim: int, dtype: int): + return torch.softmax(a, dim, dtype=dtype).clone() +)JIT"; + const auto getitem_dict_tensor_script = R"JIT( def forward(self, key: Tensor): d = {key: 1} diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 8e498dbbc664e..16941dab84760 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1,3 +1,4 @@ +#include #include #include #include @@ -1083,6 +1084,21 @@ TEST(StaticRuntime, IndividualOps_Argmin) { testStaticRuntime(argmin_with_keep_dim_script, args_a, args_b); } +TEST(StaticRuntime, IndividualOps_Softmax) { + auto a = at::randn({2, 3}); + auto b = at::randn({3, 3, 3}); + + testStaticRuntime(softmax_script, {a, 0}); + testStaticRuntime(softmax_script, {a, 1}); + + testStaticRuntime(softmax_script, {b, 0}); + testStaticRuntime(softmax_script, {b, 1}); + testStaticRuntime(softmax_script, {b, 2}); + + testStaticRuntime(softmax_script_with_dtype, {a, 1, at::ScalarType::Float}); + testStaticRuntime(softmax_script_with_dtype, {b, 1, at::ScalarType::Float}); +} + TEST(StaticRuntime, IndividualOps_GetItem_Dict) { int int_key = 0; std::string str_key = "str"; diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 0cc38b0812f11..7ede15c524296 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1338,6 +1339,29 @@ REGISTER_OPERATOR_FUNCTOR(aten::argmin, aten_argmin, [](Node* n) -> SROperator { }; }); +REGISTER_OPERATOR_FUNCTOR(aten::softmax, aten_softmax, [](Node* n) -> SROperator { + if (!n->matches(torch::schema( + "aten::softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto& in_t = p_node->Input(0).toTensor(); + const auto& dim = p_node->Input(1).toInt(); + const auto& dtype = p_node->Input(2).toOptional(); + if (p_node->Output(0).isNone()) { + p_node->Output(0) = at::native::softmax(in_t, dim, dtype); + } else { + auto& out_t = p_node->Output(0).toTensor(); + fastResizeToZero(out_t); + + auto half_to_float = in_t.scalar_type() == at::ScalarType::Half && + dtype == at::ScalarType::Float; + at::cpu::_softmax_out(out_t, in_t, dim, half_to_float); + } + }; +}); + REGISTER_OPERATOR_FUNCTOR( static_runtime::layer_norm, aten_layer_norm, From 0733582087b08dc9b3136768a1e4a704fda5b5bb Mon Sep 17 00:00:00 2001 From: soulitzer Date: Tue, 31 Aug 2021 17:51:55 -0700 Subject: [PATCH 410/530] Use the correct overloaded name to skip boxed autograd not implemented kernel registration (#64182) Summary: Some internal use_count tests are failing for `dequantize_self` because we only compare the skip list with the base name `dequantize` when we should be comparing with the full name including the overload Pull Request resolved: https://github.com/pytorch/pytorch/pull/64182 Reviewed By: albanD Differential Revision: D30639909 Pulled By: soulitzer fbshipit-source-id: d4d22dd1a5c8f7180251ce7739830764cce6f151 --- tools/autograd/gen_variable_type.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index d0a9048df47f3..8591a6800605c 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -411,9 +411,9 @@ def gen_variable_type_func( if fn.info is None and not get_base_name(f) in RESET_GRAD_ACCUMULATOR \ and not get_base_name(f) in DONT_REQUIRE_DERIVATIVE \ and len(gen_differentiable_outputs(fn)) > 0 \ - and not get_base_name(f) in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE \ - and not get_base_name(f) in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT \ - and not get_base_name(f) in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT: + and not cpp.name(f.func) in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE \ + and not type_wrapper_name(f) in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT \ + and not type_wrapper_name(f) in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT: # NOTE: [ Registering AutogradNotImplemented boxed kernel ] # # When there is no derivatives.yaml entry, we register a generic boxed From b23e4f6086f3174ae66243d77c032aef876c0246 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 31 Aug 2021 17:55:23 -0700 Subject: [PATCH 411/530] Convert mul to use opmath_gpu_kernel_with_scalars (#64019) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64019 Note that previously the functor operated on scalar_t and this modifies it to operate on opmath_t, but this is not a problem as half precision was implemented by performing the compute in float anyway. Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D30575282 Pulled By: ezyang fbshipit-source-id: cc6900ef996e755740afe48f9cb4d0366858dd47 --- .../ATen/native/cuda/BinaryMulDivKernel.cu | 45 +++++-------------- 1 file changed, 10 insertions(+), 35 deletions(-) diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu index da615fe12221b..e6a5300780e57 100644 --- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu @@ -14,16 +14,6 @@ namespace at { namespace native { -template -struct MulScalarFunctor { - MulScalarFunctor(accscalar_t b_): b(b_) {} - __device__ scalar_t operator() (scalar_t a) const { - return a * b; - } - private: - accscalar_t b; -}; - template struct DivFunctor { __device__ scalar_t operator() (scalar_t a, scalar_t b) const { @@ -31,9 +21,9 @@ struct DivFunctor { } }; -template +template struct MulFunctor { - __device__ scalar_t operator() (scalar_t a, scalar_t b) const { + __device__ T operator() (T a, T b) const { return a * b; } }; @@ -53,11 +43,11 @@ void div_true_kernel_cuda(TensorIteratorBase& iter) { // scalar, compute a * reciprocal(b). Note that this may lose one bit of // precision compared to computing the division. AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_true_cuda", [&]() { - using accscalar_t = at::acc_type; - auto inv_b = accscalar_t(1.0) / iter.scalar_value(2); + using opmath_t = at::opmath_type; + auto inv_b = opmath_t(1.0) / iter.scalar_value(2); iter.remove_operand(2); - MulScalarFunctor f(inv_b); - gpu_kernel(iter, f); + gpu_kernel(iter, BUnaryFunctor>( + MulFunctor(), inv_b)); }); } else { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_true_cuda", [&]() { @@ -180,25 +170,10 @@ void div_floor_kernel_cuda(TensorIteratorBase& iter) { } void mul_kernel_cuda(TensorIteratorBase& iter) { - if (!isIntegralType(iter.common_dtype(), /*includeBool*/ true) && - (iter.is_cpu_scalar(1) || iter.is_cpu_scalar(2))) { - //if common dtype is half the scalar constant can overflow in half precision, and yet the result can - //still be representable in the half dtype. Cast scalar to acc_type to have better accuracy - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "mul_cuda", [&]() { - using accscalar_t = at::acc_type; - int scalar_arg = iter.is_cpu_scalar(1) ? 1 : 2; - auto b = iter.scalar_value(scalar_arg); - iter.remove_operand(scalar_arg); - const cuda::OptionalCUDAGuard device_guard(device_of(iter.tensor(1))); - MulScalarFunctor f(b); - gpu_kernel(iter, f); - }); - } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_cuda", [&]() { - MulFunctor f; - gpu_kernel_with_scalars(iter, f); - }); - } + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_cuda", [&]() { + using opmath_t = at::opmath_type; + opmath_gpu_kernel_with_scalars(iter, MulFunctor()); + }); } REGISTER_DISPATCH(div_true_stub, &div_true_kernel_cuda); From c59970db6b7831c34053ecd5f86ce688bf68df7d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 31 Aug 2021 18:22:23 -0700 Subject: [PATCH 412/530] [caffe2][easy] Save heap allocation in ConcatOp (#63529) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63529 Output() takes an IntArrayRef, so we can just use a std::initializer_list (stack-allocated array) instead of std::vector here. ghstack-source-id: 137085908 Test Plan: existing CI Reviewed By: mruberry Differential Revision: D29687400 fbshipit-source-id: 9f2a7c6679f2552c098bb1bf7befaca18e0e5d4d --- caffe2/operators/concat_split_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/operators/concat_split_op.h b/caffe2/operators/concat_split_op.h index bbe355e50420f..f1e8f10d4d3dc 100644 --- a/caffe2/operators/concat_split_op.h +++ b/caffe2/operators/concat_split_op.h @@ -282,7 +282,7 @@ bool ConcatOp::RunOnDevice() { // We can override default options(Context::GetDeviceType()) // by explicitly passing in device type we want Tensor* split = Output( - 1, std::vector(1, InputSize()), at::dtype().device(CPU)); + 1, at::IntArrayRef({InputSize()}), at::dtype().device(CPU)); int* axis_data = split->template mutable_data(); auto& input_zero = Input(0); int adj_size = input_zero.dim() + (add_axis_ ? 1 : 0); From 6bb4b5d150ab51ed15d15ed270471848bb84d4e3 Mon Sep 17 00:00:00 2001 From: Matti Picus Date: Tue, 31 Aug 2021 18:54:44 -0700 Subject: [PATCH 413/530] disallow empty named dims list to flatten(names, name) (#61953) Summary: Fixes https://github.com/pytorch/pytorch/issues/61137 by raising an error if an empty tuple is passed in for the names: ``` >>> torch.empty((2, 3), names=['a', 'b']).flatten((), 'abc') RuntimeError: flatten(tensor, dims, out_dim): dims cannot be empty ``` or from the original issue: ``` >>> torch.empty((2, 3)).flatten((), 'abc') RuntimeError: flatten(tensor, dims, out_dim): dims cannot be empty ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/61953 Reviewed By: iramazanli Differential Revision: D30574571 Pulled By: malfet fbshipit-source-id: e606e84458a8dd66e5da6d0eb1a260f37b4ce91b --- aten/src/ATen/native/TensorShape.cpp | 2 ++ test/test_namedtensor.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 1dc2a270c44c2..edbfa2329a02d 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -2042,6 +2042,8 @@ Tensor flatten(const Tensor& self, Dimname start_dim, Dimname end_dim, Dimname o Tensor flatten(const Tensor& self, DimnameList dims, Dimname out_dim) { auto positions = dimnames_to_positions(self, dims); + TORCH_CHECK(positions.size() > 0, + "flatten(tensor, dims, out_dim): dims cannot be empty"); for (const auto i : c10::irange(positions.size() - 1)) { if (positions[i] + 1 == positions[i + 1]) continue; TORCH_CHECK(positions[i] + 1 == positions[i + 1], diff --git a/test/test_namedtensor.py b/test/test_namedtensor.py index b5e7aac402abb..2c6d2d80a2266 100644 --- a/test/test_namedtensor.py +++ b/test/test_namedtensor.py @@ -1072,6 +1072,11 @@ def test_flatten(self): with self.assertRaisesRegex(RuntimeError, "must be consecutive in"): tensor.flatten(['H', 'D', 'W'], 'features') + def test_flatten_nodims(self): + tensor = torch.empty((2, 3)) + with self.assertRaisesRegex(RuntimeError, "cannot be empty"): + tensor.flatten((), 'abcd') + def test_unflatten(self): # test args: tensor, int, namedshape self.assertTrue(torch.equal( From fa5676a41b2c00a4cea37793cf4977cf995ab0c4 Mon Sep 17 00:00:00 2001 From: Pavel Belevich Date: Tue, 31 Aug 2021 20:14:08 -0700 Subject: [PATCH 414/530] Delete some dead code from RRefMessageBase (#64298) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64298 cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23 Test Plan: Imported from OSS Reviewed By: rohan-varma Differential Revision: D30676702 Pulled By: pbelevich fbshipit-source-id: 77dbc0f8064c3518376454ff573d45ed0274956b --- torch/csrc/distributed/rpc/rref_proto.cpp | 16 +--------------- torch/csrc/distributed/rpc/rref_proto.h | 3 --- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/torch/csrc/distributed/rpc/rref_proto.cpp b/torch/csrc/distributed/rpc/rref_proto.cpp index 6f059b1022db0..49e3287f5d778 100644 --- a/torch/csrc/distributed/rpc/rref_proto.cpp +++ b/torch/csrc/distributed/rpc/rref_proto.cpp @@ -46,20 +46,6 @@ const RRefId& RRefMessageBase::rrefId() { return rrefId_; } -c10::intrusive_ptr RRefMessageBase::toMessageImpl() && { - return fromIValues({rrefId_.toIValue()}, type_); -} - -at::IValue RRefMessageBase::fromMessage( - const Message& message, - MessageType type) { - auto values = toIValues(message, type); - - TORCH_INTERNAL_ASSERT( - values.size() == 1, "ScriptUserDelete expects 1 IValue from message."); - return std::move(values.back()); -} - /////////////////////////// ForkMessageBase ////////////////////////////////// const ForkId& ForkMessageBase::forkId() { @@ -76,7 +62,7 @@ std::pair ForkMessageBase::fromMessage( auto ivalues = toIValues(message, type); TORCH_INTERNAL_ASSERT( - ivalues.size() == 2, "ScriptUserDelete expects 2 IValue from message."); + ivalues.size() == 2, "ForkMessageBase expects 2 IValue from message."); return std::make_pair( RRefId::fromIValue(ivalues[0]), ForkId::fromIValue(ivalues[1])); diff --git a/torch/csrc/distributed/rpc/rref_proto.h b/torch/csrc/distributed/rpc/rref_proto.h index d5a82c21f8632..4ce8066dfe1f7 100644 --- a/torch/csrc/distributed/rpc/rref_proto.h +++ b/torch/csrc/distributed/rpc/rref_proto.h @@ -22,9 +22,6 @@ class TORCH_API RRefMessageBase : public RpcCommandBase { const RRefId& rrefId(); - c10::intrusive_ptr toMessageImpl() && override; - static at::IValue fromMessage(const Message& message, MessageType type); - protected: // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) const RRefId rrefId_; From a87808de931a31c242bca0c2305ec4af67f08ef2 Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Tue, 31 Aug 2021 20:19:55 -0700 Subject: [PATCH 415/530] Fix bug in ShardedTensorMetadata serde. (#63902) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63902 The 'memory_format' field was not being serialized correctly and used the same encoding for different fields. ghstack-source-id: 137142406 Test Plan: waitforbuildbot Reviewed By: bowangbj Differential Revision: D30527324 fbshipit-source-id: f0f223e2d660ef6e4abae9649d9992acc36e1278 --- .../_sharded_tensor/test_sharded_tensor.py | 50 +++++++++++++++++++ torch/distributed/_sharded_tensor/api.py | 19 ++++--- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/test/distributed/_sharded_tensor/test_sharded_tensor.py b/test/distributed/_sharded_tensor/test_sharded_tensor.py index 718b594c831ee..77e35b76f3731 100644 --- a/test/distributed/_sharded_tensor/test_sharded_tensor.py +++ b/test/distributed/_sharded_tensor/test_sharded_tensor.py @@ -1,6 +1,8 @@ from functools import wraps import math import io +import itertools +import pickle import sys import torch import torch.distributed as dist @@ -123,6 +125,54 @@ def wrapper(self): self.destroy_comms() return wrapper +class TestShardedTensorMetadata(TestCase): + def test_serialize_and_deserialize(self): + shard_metadatas = [ + ShardMetadata( + shard_offsets=[0, 0], + shard_lengths=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[0, 5], + shard_lengths=[5, 5], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_lengths=[5, 5], + placement="rank:2/cuda:2", + ), + ShardMetadata( + shard_offsets=[5, 5], + shard_lengths=[5, 5], + placement="rank:3/cuda:3", + ) + ] + + dtypes = [ + torch.float, torch.double, torch.cfloat, torch.cdouble, torch.half, + torch.bfloat16, torch.uint8, torch.int8, torch.short, torch.int, + torch.long, torch.bool] + + layouts = [torch.strided, torch.sparse_coo] + requires_grads = [True, False] + memory_formats = [torch.contiguous_format, torch.channels_last, torch.preserve_format] + pin_memories = [True, False] + + for tensor_properties_input in itertools.product(dtypes, layouts, requires_grads, memory_formats, pin_memories): + dtype, layout, requires_grad, memory_format, pin_memory = tensor_properties_input + + expected_st_metadata = _sharded_tensor.ShardedTensorMetadata( + shard_metadatas, + (10, 10), + _sharded_tensor.TensorProperties(dtype, layout, requires_grad, memory_format, pin_memory) + ) + + pickled_obj = pickle.dumps(expected_st_metadata) + st_metadata = pickle.loads(pickled_obj) + self.assertEqual(expected_st_metadata, st_metadata) + class TestCreateTensorFromParams(TestCase): @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed') def test_empty(self): diff --git a/torch/distributed/_sharded_tensor/api.py b/torch/distributed/_sharded_tensor/api.py index 3b7476dc25bcf..d6b7a54732445 100644 --- a/torch/distributed/_sharded_tensor/api.py +++ b/torch/distributed/_sharded_tensor/api.py @@ -70,6 +70,13 @@ class TensorProperties(object): memory_format: torch.memory_format = field(default=torch.contiguous_format) pin_memory: bool = False + +class MEM_FORMAT_ENCODING(Enum): + TORCH_CONTIGUOUS_FORMAT = 0 + TORCH_CHANNELS_LAST = 1 + TORCH_PRESERVE_FORMAT = 2 + + @dataclass class ShardedTensorMetadata(object): """ @@ -93,11 +100,11 @@ def __getstate__(self): # Since torch.memory_format cannot be pickled! memory_format = self.tensor_properties.memory_format if memory_format == torch.contiguous_format: - mem_format_encoding = 0 + mem_format_encoding = MEM_FORMAT_ENCODING.TORCH_CONTIGUOUS_FORMAT elif memory_format == torch.channels_last: - mem_format_encoding = 1 + mem_format_encoding = MEM_FORMAT_ENCODING.TORCH_CHANNELS_LAST elif memory_format == torch.preserve_format: - mem_format_encoding = 1 + mem_format_encoding = MEM_FORMAT_ENCODING.TORCH_PRESERVE_FORMAT else: raise RuntimeError(f'Invalid torch.memory_format: {memory_format}') @@ -118,11 +125,11 @@ def __setstate__( ): (self.shards_metadata, self.size, dtype, layout, requires_grad, mem_format_encoding, pin_memory) = state - if mem_format_encoding == 0: + if mem_format_encoding == MEM_FORMAT_ENCODING.TORCH_CONTIGUOUS_FORMAT: memory_format = torch.contiguous_format - elif mem_format_encoding == 1: + elif mem_format_encoding == MEM_FORMAT_ENCODING.TORCH_CHANNELS_LAST: memory_format = torch.channels_last - elif mem_format_encoding == 2: + elif mem_format_encoding == MEM_FORMAT_ENCODING.TORCH_PRESERVE_FORMAT: memory_format = torch.preserve_format else: raise RuntimeError(f'Invalid torch.memory_format encoding: {mem_format_encoding}') From 8337a3fb3f44a536ba40e895dad7441b2f4a59f2 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Tue, 31 Aug 2021 20:27:44 -0700 Subject: [PATCH 416/530] [TensorExpr] Wrap error messages with buildErrorMessage call. (#64330) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64330 Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D30687226 Pulled By: ZolotukhinM fbshipit-source-id: ade1be2ad6847c6afbba60307ef854696821b4e3 --- test/cpp/tensorexpr/test_loopnest.cpp | 2 +- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 19 +-- torch/csrc/jit/tensorexpr/loopnest.cpp | 116 ++++++++++++++---- .../jit/tensorexpr/mem_dependency_checker.cpp | 19 ++- torch/csrc/jit/tensorexpr/registerizer.cpp | 15 ++- 5 files changed, 129 insertions(+), 42 deletions(-) diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index c2b33e2a184d2..b1d59a1dee066 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -3554,7 +3554,7 @@ TEST(LoopNest, DetectInlineRankMismatch) { LoopNest l({reshape}, {a, reshape}); ASSERT_THROWS_WITH( l.computeInline(l.getLoopBodyFor(a)), - "Placeholder indexed access is inconsistent with its rank"); + "Number of indices doesn't match buf rank in the fuser."); } TEST(LoopNest, CacheReadsSimple) { diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index b9ea70806ee1a..6c212e623df21 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -903,7 +903,9 @@ void LLVMCodeGenImpl::visit(HalfImmPtr v) { } void LLVMCodeGenImpl::visit(BFloat16ImmPtr v) { - TORCH_INTERNAL_ASSERT(false, "llvm codegen does not support bfloat16"); + TORCH_INTERNAL_ASSERT( + false, + buildErrorMessage("Fuser's LLVM codegen does not support bfloat16")); } void LLVMCodeGenImpl::visit(BoolImmPtr v) { @@ -1535,7 +1537,10 @@ void LLVMCodeGenImpl::emitIsNan(IntrinsicsPtr v) { if (!v->param(0)->dtype().is_floating_point()) { value_ = toVec(llvm::ConstantInt::get(dstType, 0), v->dtype().lanes()); } else { - TORCH_INTERNAL_ASSERT(v->dtype().scalar_type() == ScalarType::Int); + TORCH_INTERNAL_ASSERT( + v->dtype().scalar_type() == ScalarType::Int, + buildErrorMessage( + "Unexpected non-Int dtype of Intrinsics' result value in the fuser.")); auto is_nan = irb_.CreateFCmpUNO( value_, llvm::ConstantFP::get(value_->getType(), 0.)); if (v->dtype().lanes() > 1) { @@ -1762,11 +1767,11 @@ void LLVMCodeGenImpl::visit(IntrinsicsPtr v) { } else { TORCH_INTERNAL_ASSERT( false, - v, - "Unimplemented lowering:", - v->op_type(), - " for input of dtype", - v->dtype().scalar_dtype()); + buildErrorMessage( + std::string("Unimplemented lowering for intrinsic '") + + std::to_string(v->op_type()) + "' for input of dtype " + + std::to_string(v->dtype().scalar_dtype()) + + " in LLVM codegen of the fuser.")); } std::vector params; diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 11020cc2eda08..e67d094065d1a 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -536,7 +536,8 @@ class FunctionInliner : public IRMutator { // (since we don't support in-place writes). Resolves issue 52581. TORCH_INTERNAL_ASSERT( *intValue(i) == 0, - "Constant index impression should always be zero"); + buildErrorMessage( + "Unexpected non-zero constant index in inlined buffer in the fuser.")); producer_index_vars_.push_back(nullptr); } else { throw std::logic_error("cannot inline Buf with compound indices"); @@ -547,21 +548,25 @@ class FunctionInliner : public IRMutator { private: ExprPtr mutate_loads(BufPtr buf, std::vector dims) { std::vector index_vars; - TORCH_INTERNAL_ASSERT(buf->ndim() == producer_index_vars_.size()); + TORCH_INTERNAL_ASSERT( + buf->ndim() == producer_index_vars_.size(), + buildErrorMessage( + "Dimensions of producer and consumer expressions do not match in inliner in the fuser.")); for (const auto i : c10::irange(buf->ndim())) { VarPtr func_callee_arg = producer_index_vars_.at(i); ExprPtr func_caller_param = dims.at(i); if (func_callee_arg == nullptr) { TORCH_INTERNAL_ASSERT( intValue(func_caller_param) && *intValue(func_caller_param) == 0, - "We are implicitly assuming that if you have an index of 0, that must also be inlined into an index of 0"); + buildErrorMessage( + "We are implicitly assuming that if you have an index of 0, that must also be inlined into an index of 0")); continue; } if (func_callee_arg == nullptr) continue; auto iter = inline_mapping_.find(func_callee_arg); if (iter != inline_mapping_.end()) { - throw std::runtime_error( + throw std::logic_error( "Duplicated variables: " + func_callee_arg->name_hint()); } // Add a mapping for each function parameter to it's source name. @@ -603,10 +608,10 @@ class FunctionInliner : public IRMutator { return IRMutator::mutate(v); } - if (v->indices().size() != buf->ndim()) { - throw malformed_input( - "Placeholder indexed access is inconsistent with its rank", v); - } + TORCH_INTERNAL_ASSERT( + v->indices().size() == buf->ndim(), + buildErrorMessage( + "Number of indices doesn't match buf rank in the fuser.")); return mutate_loads(buf, v->indices()); } @@ -646,7 +651,10 @@ class FunctionInliner : public IRMutator { if (v == producer_ && !outputs_.count(buf_)) { in_producer_ = true; producer_ = to(IRMutator::mutate(v)); - TORCH_INTERNAL_ASSERT(producer_ != nullptr); + TORCH_INTERNAL_ASSERT( + producer_, + buildErrorMessage( + "Producer statement for output buf should remain non-null in the fuser")); in_producer_ = false; return nullptr; } else { @@ -748,7 +756,10 @@ bool LoopNest::computeInline(BufPtr b) { } } - TORCH_INTERNAL_ASSERT(relevant_store); + TORCH_INTERNAL_ASSERT( + relevant_store, + buildErrorMessage( + "Cannot find a relevant store to inline a buf in the fuser.")); GRAPH_DEBUG("ComputeInline: Def: ", std::to_string(relevant_store)); FunctionInliner inliner(relevant_store, output_bufs_); @@ -772,7 +783,11 @@ void LoopNest::inlineIntermediateBufs(bool allow_duplicated_work) { auto input_bufs = getInputBufs(); for (auto buf : intermediate_bufs) { - TORCH_INTERNAL_ASSERT(buf_load_store_uses.count(buf)); + TORCH_INTERNAL_ASSERT( + buf_load_store_uses.count(buf), + buildErrorMessage( + "Could not find uses of buf '" + buf->name_hint() + + "' in the fuser.")); std::vector& uses = buf_load_store_uses[buf]; auto stores = c10::filter( uses, [](const BufLoadOrStoreUse& use) { return use.isStore; }); @@ -789,7 +804,11 @@ void LoopNest::inlineIntermediateBufs(bool allow_duplicated_work) { } } else { // If S is not a store, it must be an ExternalCall. - TORCH_INTERNAL_ASSERT(to(stores[0].s)); + TORCH_INTERNAL_ASSERT( + to(stores[0].s), + buildErrorMessage( + "Expected stmt: " + std::to_string(stores[0].s) + + "\nto be either a Store or an ExternalCall in the fuser.")); } } @@ -1154,7 +1173,10 @@ bool LoopNest::optimizeConditionals() { ifthenelse_exprs.front(), &cond_var, &comp_values, &sub_exprs)) { continue; } - TORCH_INTERNAL_ASSERT(comp_values.size() >= 1); + TORCH_INTERNAL_ASSERT( + comp_values.size() >= 1, + buildErrorMessage( + "Expected at least one expression in optimizeConditional in the fuser.")); comp_values.insert(comp_values.begin(), immLike(comp_values[0], 0)); auto fors = getLoopStmtsFor(store); @@ -1513,7 +1535,10 @@ void LoopNest::splitWithMask(ForPtr f, int factor, ForPtr* inner) { std::vector LoopNest::distributeLoop( ForPtr loop, const std::unordered_set& pivots) { - TORCH_INTERNAL_ASSERT(loop); + TORCH_INTERNAL_ASSERT( + loop, + buildErrorMessage( + "Expected non-null loop in distributeLoop in the fuser.")); auto root = loop->get_parent(); if (root == nullptr) { throw malformed_input("Loop without parent: ", loop); @@ -1758,7 +1783,10 @@ bool LoopNest::unsafeFuseLoops( break; } } - TORCH_INTERNAL_ASSERT(it != root_block->end()); + TORCH_INTERNAL_ASSERT( + it != root_block->end(), + buildErrorMessage( + "Could not find the given loop in the root stmt in unsafeFuseLoop the fuser.")); for (auto l : loops) { if (*it != l) { return false; @@ -2032,7 +2060,10 @@ std::vector LoopNest::reorder( parent->replace_stmt(loops.front(), empty_block); for (size_t i = 1; i < loops.size(); ++i) { auto block = to(loops[i]->get_parent()); - TORCH_INTERNAL_ASSERT(block); + TORCH_INTERNAL_ASSERT( + block, + buildErrorMessage( + "Expected parent stmt to be a non-null Block in reorder transformation the fuser.")); block->remove_stmt(loops[i]); } @@ -2191,9 +2222,13 @@ std::vector LoopNest::getLoopStmtsInLoopNest(ForPtr f, size_t num) { ForPtr curr_for = f; loops[0] = curr_for; for (size_t i = 1; i < num; ++i) { - TORCH_INTERNAL_ASSERT(curr_for->body()->nstmts() == 1); + TORCH_INTERNAL_ASSERT( + curr_for->body()->nstmts() == 1, + buildErrorMessage("Expected a single stmt in the loop body.")); curr_for = to(curr_for->body()->front()); - TORCH_INTERNAL_ASSERT(curr_for); + TORCH_INTERNAL_ASSERT( + curr_for, + buildErrorMessage("Expected the only child stmt to be a For loop.")); loops[i] = curr_for; } return loops; @@ -2303,7 +2338,10 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) { // Find the parent common to all the buffer accesses. BlockPtr parent = to(writes.front()->get_parent()); - TORCH_INTERNAL_ASSERT(parent); + TORCH_INTERNAL_ASSERT( + parent, + buildErrorMessage( + "Expected parent stmt to be a non-null block in compressBuffer in the fuser.")); for (auto w : writes) { parent = Block::getSharedParent(parent, w); } @@ -2325,7 +2363,10 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) { // Vector to indicate which dimensions could be compressed away. std::vector dims(buf->dims().size(), true); auto check_indices = [&](const std::vector& indices) { - TORCH_INTERNAL_ASSERT(indices.size() == dims.size()); + TORCH_INTERNAL_ASSERT( + indices.size() == dims.size(), + buildErrorMessage( + "Expected ranks to match in compressBuffer in the fuser.")); for (size_t i = 0; i < indices.size(); ++i) { auto index_vars = NodeFinder::find(indices[i]); for (auto iv : index_vars) { @@ -2367,7 +2408,10 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) { // Modify all access to reflect the removed dims. auto get_new_indices = [&](const std::vector& indices) { - TORCH_INTERNAL_ASSERT(indices.size() == dims.size()); + TORCH_INTERNAL_ASSERT( + indices.size() == dims.size(), + buildErrorMessage( + "Expected ranks to match in compressBuffer in the fuser.")); std::vector new_indices(indices); for (size_t i = 0; i < dims.size(); ++i) { if (dims[i]) { @@ -2573,7 +2617,10 @@ class CacheReplacer : public IRMutator { // Map indices to call-parameters. std::vector newIndices; - TORCH_INTERNAL_ASSERT(offsets_.size() == v->indices().size()); + TORCH_INTERNAL_ASSERT( + offsets_.size() == v->indices().size(), + buildErrorMessage( + "Expected ranks to match in CacheReplacer in the fuser.")); for (size_t i = 0; i < v->indices().size(); ++i) { ExprPtr index = v->indices()[i]->accept_mutator(this); ExprPtr offset = offsets_[i]; @@ -2595,7 +2642,10 @@ class CacheReplacer : public IRMutator { // Map indices to call-parameters. std::vector newIndices; - TORCH_INTERNAL_ASSERT(offsets_.size() == v->indices().size()); + TORCH_INTERNAL_ASSERT( + offsets_.size() == v->indices().size(), + buildErrorMessage( + "Expected ranks to match in CacheReplacer in the fuser.")); for (size_t i = 0; i < v->indices().size(); ++i) { ExprPtr index = v->indices()[i]->accept_mutator(this); ExprPtr offset = offsets_[i]; @@ -2643,7 +2693,10 @@ LoopNest::AccessResult LoopNest::cacheAccesses( return {nullptr, nullptr}; } - TORCH_INTERNAL_ASSERT(bounds_it->second.size() == 1); + TORCH_INTERNAL_ASSERT( + bounds_it->second.size() == 1, + buildErrorMessage( + "Unexpected number of bound info entries in cacheAccesses in the fuser.")); TensorAccessBoundsInfo& info = bounds_it->second[0]; bool hasReads = info.kind == kLoad || info.kind == kMutate; bool hasWrites = info.kind == kStore || info.kind == kMutate; @@ -2998,7 +3051,10 @@ class RfactorStoreRewriter : public IRMutator { return IRMutator::mutate(v); } - TORCH_INTERNAL_ASSERT(old_indices_.size() == v->indices().size()); + TORCH_INTERNAL_ASSERT( + old_indices_.size() == v->indices().size(), + buildErrorMessage( + "Expected ranks to match in RfactorStoreRewriter in the fuser.")); bool equal_indices = true; for (size_t i = 0; i < v->indices().size(); ++i) { @@ -3032,7 +3088,10 @@ class RfactorStoreRewriter : public IRMutator { return IRMutator::mutate(v); } - TORCH_INTERNAL_ASSERT(old_indices_.size() == v->indices().size()); + TORCH_INTERNAL_ASSERT( + old_indices_.size() == v->indices().size(), + buildErrorMessage( + "Expected ranks to match in RfactorStoreRewriter in the fuser.")); bool equal_indices = true; for (size_t i = 0; i < v->indices().size(); ++i) { @@ -3141,7 +3200,10 @@ bool LoopNest::rfactor( // X[*indexes] = ReduceOp(X[*indexes] + T[*indexes + {reduction_var}], // reduce_axis={reduction_var}) BlockPtr b = outer_reduction_for->body(); - TORCH_INTERNAL_ASSERT(b->nstmts() == 1); + TORCH_INTERNAL_ASSERT( + b->nstmts() == 1, + buildErrorMessage( + "Expected to have a single stmt in the block in rfactor transformation in the fuser.")); StmtPtr first_reduction_loop = b->stmts().front(); auto rfac_buf_indices = orig_buf_indices; rfac_buf_indices.emplace_back(reduction_var); diff --git a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp index e1688e37cbe7f..3f77041f1a202 100644 --- a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp +++ b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp @@ -76,12 +76,16 @@ std::vector AccessInfo::getIndices() const { void AccessInfo::addDependency(const std::shared_ptr& write) { auto res = dependencies_.emplace(write->id(), write); - TORCH_INTERNAL_ASSERT(res.second); + TORCH_INTERNAL_ASSERT( + res.second, + buildErrorMessage("Duplicate entry in mem dep checker in the fuser.")); } void AccessInfo::addDependent(const std::shared_ptr& read) { auto res = dependents_.emplace(read->id(), read); - TORCH_INTERNAL_ASSERT(res.second); + TORCH_INTERNAL_ASSERT( + res.second, + buildErrorMessage("Duplicate entry in mem dep checker in the fuser.")); } bool AccessInfo::hasDependency(const std::shared_ptr& info) const { @@ -590,7 +594,10 @@ bool executionSafetyCheck( if (aStrides.empty() || oStrides.empty()) { return false; } - TORCH_INTERNAL_ASSERT(info->bounds().size() == other->bounds().size()); + TORCH_INTERNAL_ASSERT( + info->bounds().size() == other->bounds().size(), + buildErrorMessage( + "Dimension mismatch for two accesses in mem dep checker in the fuser.")); for (size_t b = 0; b < info->bounds().size(); ++b) { ExprPtr aIndexStride = aStrides[b]; ExprPtr oIndexStride = oStrides[b]; @@ -1150,7 +1157,11 @@ void MemDependencyChecker::visit(FreePtr v) { VarPtr var = v->buffer_var(); auto it = intermediates_.find(var); - TORCH_INTERNAL_ASSERT(it != intermediates_.end()); + TORCH_INTERNAL_ASSERT( + it != intermediates_.end(), + buildErrorMessage( + "Expected to find '" + var->name_hint() + + "' in intermediate vars in mem dep checker in the fuser.")); IndexBounds bounds = it->second->bounds(); auto info = std::make_shared( diff --git a/torch/csrc/jit/tensorexpr/registerizer.cpp b/torch/csrc/jit/tensorexpr/registerizer.cpp index 8684f2aabc810..c4c495762a79d 100644 --- a/torch/csrc/jit/tensorexpr/registerizer.cpp +++ b/torch/csrc/jit/tensorexpr/registerizer.cpp @@ -43,8 +43,14 @@ void AccessInfo::addLoad( } void AccessInfo::merge(const std::shared_ptr& other) { - TORCH_INTERNAL_ASSERT(hash_ == other->hash()); - TORCH_INTERNAL_ASSERT(indices_.size() == other->indices().size()); + TORCH_INTERNAL_ASSERT( + hash_ == other->hash(), + buildErrorMessage( + "Expected hashes to match in registerizer in the fuser.")); + TORCH_INTERNAL_ASSERT( + indices_.size() == other->indices().size(), + buildErrorMessage( + "Expected ranks to match in registerizer in the fuser.")); last_usage_ = other->last_usage(); for (auto s : other->stores()) { @@ -68,7 +74,10 @@ void AccessInfo::merge(const std::shared_ptr& other) { bool AccessInfo::overlaps(const std::shared_ptr& other) { // All accesses to a buf must have the same dimensionality. - TORCH_INTERNAL_ASSERT(indices_.size() == other->indices().size()); + TORCH_INTERNAL_ASSERT( + indices_.size() == other->indices().size(), + buildErrorMessage( + "Expected ranks to match in registerizer in the fuser.")); auto& other_indices = other->indices(); From 479fc4e41250a1710e34cc1dcaa6272c89832fb4 Mon Sep 17 00:00:00 2001 From: gmagogsfm Date: Tue, 31 Aug 2021 21:27:46 -0700 Subject: [PATCH 417/530] Remove outdated warning about RecursiveScriptModule not being copiable (#64085) Summary: RecursiveScriptModule has its customized `__copy__` and `__deepcopy__` defined. The warning/error that says it is not copiable is outdated Pull Request resolved: https://github.com/pytorch/pytorch/pull/64085 Reviewed By: rohan-varma Differential Revision: D30598623 Pulled By: gmagogsfm fbshipit-source-id: 0701d8617f42d818bc7b88244caee4cd47fbe976 --- test/test_jit.py | 5 ----- torch/distributed/nn/api/remote_module.py | 2 ++ torch/jit/_script.py | 7 ------- .../_internal/distributed/nn/api/remote_module_test.py | 3 +-- 4 files changed, 3 insertions(+), 14 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index d1a170da6f750..e94ed8db922b0 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -391,11 +391,6 @@ def __init__(self, cpu_device_str): self.assertFalse(m2.p0.is_cuda) self.assertFalse(m2.b0.is_cuda) - def test_model_save_error(self): - with TemporaryFileName() as fname: - with self.assertRaisesRegex(pickle.PickleError, "not supported"): - torch.save(FooToPickle(), fname) - @unittest.skipIf(not RUN_CUDA, "restore device requires CUDA") def test_restore_device_cuda(self): class MyModule(torch.jit.ScriptModule): diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py index ef26db64dbed8..fb3b160c8ebcc 100644 --- a/torch/distributed/nn/api/remote_module.py +++ b/torch/distributed/nn/api/remote_module.py @@ -288,11 +288,13 @@ def get_module_rref(self) -> rpc.RRef[nn.Module]: """ return self.module_rref + @torch.jit.export def __getstate__(self): raise RuntimeError( "Cannot pickle RemoteModule in python pickler. RemoteModule can only be pickled when using RPC" ) + @torch.jit.export def __setstate__(self, state): raise RuntimeError( "Cannot unpickle RemoteModule in python pickler. RemoteModule can only be unpickled when using RPC" diff --git a/torch/jit/_script.py b/torch/jit/_script.py index 09801bab938a7..de32e1ab8de37 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -785,13 +785,6 @@ def __setattr__(self, attr, value): # It's fairly trivial to save enough info to warn in this case. return super(RecursiveScriptModule, self).__setattr__(attr, value) - def __getstate__(self): - raise pickle.PickleError( - "ScriptModules cannot be deepcopied using copy.deepcopy or saved using torch.save. " - + "Mixed serialization of script and non-script modules is not supported. " - + "For purely script modules use my_script_module.save() instead." - ) - def __copy__(self): return torch.jit._recursive.wrap_cpp_module(copy.copy(self._c)) diff --git a/torch/testing/_internal/distributed/nn/api/remote_module_test.py b/torch/testing/_internal/distributed/nn/api/remote_module_test.py index fb1d5fbbc4f75..997006353bfbd 100644 --- a/torch/testing/_internal/distributed/nn/api/remote_module_test.py +++ b/torch/testing/_internal/distributed/nn/api/remote_module_test.py @@ -1,6 +1,5 @@ #!/usr/bin/python3 import enum -import pickle from typing import Tuple import torch @@ -467,7 +466,7 @@ def test_remote_module_py_pickle_not_supported_script(self): dst_worker_name, modes=[ModuleCreationMode.MODULE_CTOR_WITH_INTERFACE] ): with TemporaryFileName() as fname: - with self.assertRaises(pickle.PickleError): + with self.assertRaisesRegex(torch.jit.Error, "can only be pickled when using RPC"): torch.save(remote_module, fname) From 24e50b8453d861b10e799a3c02cef9d06cf996c6 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 31 Aug 2021 21:43:25 -0700 Subject: [PATCH 418/530] [CUDA graphs] hotfix for test_graph_ (#64339) Summary: Graphed workloads that try to capture a full backward pass must do warmup on a non-default stream. If warmup happens on the default stream, AccumulateGrad functions might tag themselves to run on the default stream, and therefore won't be capturable. ngimel and I suspect some test_cuda.py tests run with the default stream as the ambient stream, which breaks `test_graph_grad_scaling` because `test_graph_grad_scaling` does warmup on the ambient stream _assuming_ the ambient stream is a non-default stream. This PR explicitly sets a side stream for the warmup in `test_graph_grad_scaling`, which is what I should have done all along because it's what the new documentation recommends. I pushed the PR branch straight to the main pytorch repo because we need to run ci-all on it, and I'm not sure what the requirements are these days. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64339 Reviewed By: mruberry Differential Revision: D30690711 Pulled By: ngimel fbshipit-source-id: 91ad75f46a11f311e25bc468ea184e22acdcc25a --- test/test_cuda.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/test_cuda.py b/test/test_cuda.py index 6f742ec59f931..33dbade7380b8 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -3683,8 +3683,13 @@ def test_graph_grad_scaling(self): static_grad = torch.ones_like(weight) # warmup - loss = (weight.half() * static_input).sum() - scaler.scale(loss).backward() + s = torch.cuda.Stream() + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + loss = (weight.half() * static_input).sum() + scaler.scale(loss).backward() + torch.cuda.current_stream().wait_stream(s) + opt.zero_grad(set_to_none=True) # capture From 05ecaefbbfa157ad2d6353ef9b2a53892f7ed34c Mon Sep 17 00:00:00 2001 From: Yuchen Huang Date: Tue, 31 Aug 2021 22:00:11 -0700 Subject: [PATCH 419/530] [Metal][GPU] Enable metal for simulators and fix test failures if possible (#64322) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64322 As title ghstack-source-id: 137143877 Test Plan: - `aibench-cli mobile` - Select iOS -> `y` -> `1` -> `n` -> "--metal_op_test" - Select all iPhone 6 + iPhone 7 + iPhone 8 and a iPhone X or 11 or 12 ``` Benchmark Submitted. Find more details at: https://our.intern.facebook.com/intern/aibench/details/318120612514604 Benchmark Status: D10 (https://github.com/pytorch/pytorch/commit/b8256280ce45f02a7e105d3b3db4a547990e683d)AP-12.0.1: DONE N71mAP-14.3: DONE DUMMY latency: D10 (https://github.com/pytorch/pytorch/commit/b8256280ce45f02a7e105d3b3db4a547990e683d)AP-12.0.1: 4319.3 N71mAP-14.3: 8868.51 I0831 16:06:27.210558 605277 ClientSingletonManager.cpp:99] Shutting down Manifold ClientSingletonManager ``` Reviewed By: xta0 Differential Revision: D30147163 fbshipit-source-id: 2de6bbd9bd525e32ca92b2845eb435800855edcc --- aten/src/ATen/native/metal/MetalContext.mm | 6 ------ .../ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm | 8 +++++--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/native/metal/MetalContext.mm b/aten/src/ATen/native/metal/MetalContext.mm index 80ee55efa591e..f71d35f97a866 100644 --- a/aten/src/ATen/native/metal/MetalContext.mm +++ b/aten/src/ATen/native/metal/MetalContext.mm @@ -37,9 +37,6 @@ + (instancetype)sharedInstance { - (BOOL)available { #if !defined(__APPLE__) return false; -#elif TARGET_IPHONE_SIMULATOR - // TODO[T90135707]: Enable Metal on iOS Simulators - return false; #elif TARGET_OS_IPHONE if (!MPSSupportsMTLDevice(_device)) { return false; @@ -47,9 +44,6 @@ - (BOOL)available { if ([UIDevice currentDevice].systemVersion.floatValue < 11.0) { return false; } - if (![_device supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily3_v2]) { - return false; - } #elif TARGET_OS_MAC if (!MPSSupportsMTLDevice(_device)) { return false; diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm b/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm index f337e1dfc824e..5e749983c822d 100644 --- a/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm +++ b/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm @@ -76,13 +76,15 @@ - (void)registerTests { REG_TEST("test_hardtanh_", test_hardtanh_); REG_TEST("test_hardtanh", test_hardtanh); REG_TEST("test_reshape", test_reshape); + REG_TEST("test_chunk", test_chunk); + REG_TEST("test_chunk3", test_chunk3); + REG_TEST("test_reflection_pad2d", test_reflection_pad2d); +#if !TARGET_IPHONE_SIMULATOR REG_TEST("test_mean_dim", test_mean_dim); REG_TEST("test_mean_dim2", test_mean_dim2); REG_TEST("test_mean_dim3", test_mean_dim3); - REG_TEST("test_chunk", test_chunk); REG_TEST("test_chunk2", test_chunk2); - REG_TEST("test_chunk3", test_chunk3); - REG_TEST("test_reflection_pad2d", test_reflection_pad2d); +#endif } - (NSDictionary*)tests { From 0c4e4e588e2e3308c659f741e7bc5cabf0975c09 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 31 Aug 2021 22:20:41 -0700 Subject: [PATCH 420/530] [FX] Rename reduce functions back to their old, public names (#64324) Summary: Unfortunately pickle serializes the names of these functions. Also put them under backward-compatibility enforcement. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64324 Test Plan: Local repro https://fb.workplace.com/groups/3440841732711443/permalink/4018921611570116/ Reviewed By: SplitInfinity, TailofJune Differential Revision: D30684185 Pulled By: jamesr66a fbshipit-source-id: 900701220155d15115cd0c07cf7774a2891bd04f --- ...mpat-fx_backcompat_function_signatures.expect | 3 +++ torch/fx/graph_module.py | 16 +++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect index a73fde735bc2d..20d392fa9cbb1 100644 --- a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect +++ b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect @@ -28,6 +28,9 @@ torch.fx.graph_module.GraphModule.add_submodule(self, target: str, m: torch.nn.m torch.fx.graph_module.GraphModule.delete_all_unused_submodules(self) -> None torch.fx.graph_module.GraphModule.delete_submodule(self, target: str) -> bool torch.fx.graph_module.GraphModule.recompile(self) -> torch.fx.graph.PythonCode +torch.fx.graph_module.reduce_deploy_graph_module(importer: Callable, body: Dict[Any, Any], import_block: str) -> torch.nn.modules.module.Module +torch.fx.graph_module.reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.modules.module.Module +torch.fx.graph_module.reduce_package_graph_module(importer: Callable, body: Dict[Any, Any], generated_module_name: str) -> torch.nn.modules.module.Module torch.fx.interpreter.Interpreter.__init__(self, module: torch.fx.graph_module.GraphModule, garbage_collect_values: bool = True) torch.fx.interpreter.Interpreter.call_function(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any torch.fx.interpreter.Interpreter.call_method(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index 89685bf3953a0..ca82d49e07cbe 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -96,7 +96,8 @@ def _format_import_block(globals: Dict[str, Any], importer: Importer): return '\n'.join(import_strs) -def _reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Module: +@compatibility(is_backward_compatible=True) +def reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Module: # BC: attribute name was changed from `code` to `_code` to facilitate # making `code` into a property and adding a docstring to it fn_src = body.get('_code') or body['code'] @@ -104,14 +105,15 @@ def _reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Mo return _deserialize_graph_module(forward, body) -def _reduce_package_graph_module( +@compatibility(is_backward_compatible=True) +def reduce_package_graph_module( importer: PackageImporter, body: Dict[Any, Any], generated_module_name: str ) -> torch.nn.Module: forward = importer.import_module(generated_module_name).forward return _deserialize_graph_module(forward, body) - -def _reduce_deploy_graph_module( +@compatibility(is_backward_compatible=True) +def reduce_deploy_graph_module( importer: PackageImporter, body: Dict[Any, Any], import_block: str ) -> torch.nn.Module: ns = dict() @@ -626,7 +628,7 @@ def __reduce_deploy__(self, importer: Importer): python_code = self.recompile() import_block = _format_import_block(python_code.globals, importer) - return (_reduce_deploy_graph_module, (dict_without_graph, import_block)) + return (reduce_deploy_graph_module, (dict_without_graph, import_block)) def __reduce_package__(self, exporter: PackageExporter): dict_without_graph = self.__dict__.copy() @@ -638,7 +640,7 @@ def __reduce_package__(self, exporter: PackageExporter): import_block = _format_import_block(python_code.globals, exporter.importer) module_code = import_block + self.code exporter.save_source_string(generated_module_name, module_code) - return (_reduce_package_graph_module, (dict_without_graph, generated_module_name)) + return (reduce_package_graph_module, (dict_without_graph, generated_module_name)) def __reduce__(self): """ @@ -652,7 +654,7 @@ def __reduce__(self): python_code = self.recompile() import_block = _format_import_block(python_code.globals, sys_importer) del dict_without_graph['_graph'] - return (_reduce_graph_module, (dict_without_graph, import_block)) + return (reduce_graph_module, (dict_without_graph, import_block)) # because __reduce__ is defined for serialization, # we need to define deepcopy otherwise it will call __reduce__ From 92b31b59aff908fcecb76e5baff5bb52ce62608a Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 1 Sep 2021 07:16:55 -0700 Subject: [PATCH 421/530] Revert D29699456: [pytorch][PR] Enable Half, BFloat16, and Complex dtypes for coo-coo sparse matmul [CUDA] Test Plan: revert-hammer Differential Revision: D29699456 (https://github.com/pytorch/pytorch/commit/ad4848565e1d9f4d408c60614f213acb52035181) Original commit changeset: 407ae53392ac fbshipit-source-id: b6c70ba8bb28c0c38de47857030b69792a8470de --- aten/src/ATen/cuda/CUDADataType.h | 61 ------------------- .../ATen/native/sparse/cuda/SparseMatMul.cu | 54 ++++++---------- test/test_sparse.py | 44 ++++++------- torch/testing/_internal/common_cuda.py | 1 - torch/utils/hipify/cuda_to_hip_mappings.py | 31 +++++----- 5 files changed, 52 insertions(+), 139 deletions(-) delete mode 100644 aten/src/ATen/cuda/CUDADataType.h diff --git a/aten/src/ATen/cuda/CUDADataType.h b/aten/src/ATen/cuda/CUDADataType.h deleted file mode 100644 index 71c9af9af8aac..0000000000000 --- a/aten/src/ATen/cuda/CUDADataType.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once - -#include - -#include -#include - -namespace at { -namespace cuda { - -template -cudaDataType getCudaDataType() { - TORCH_INTERNAL_ASSERT(false, "Cannot convert type ", typeid(scalar_t).name(), " to cudaDataType.") -} - -template<> cudaDataType getCudaDataType() { - return CUDA_R_16F; -} -template<> cudaDataType getCudaDataType() { - return CUDA_R_32F; -} -template<> cudaDataType getCudaDataType() { - return CUDA_R_64F; -} -template<> cudaDataType getCudaDataType>() { - return CUDA_C_16F; -} -template<> cudaDataType getCudaDataType>() { - return CUDA_C_32F; -} -template<> cudaDataType getCudaDataType>() { - return CUDA_C_64F; -} - -// HIP doesn't define integral types -#ifndef __HIP_PLATFORM_HCC__ -template<> cudaDataType getCudaDataType() { - return CUDA_R_8U; -} -template<> cudaDataType getCudaDataType() { - return CUDA_R_8I; -} -template<> cudaDataType getCudaDataType() { - return CUDA_R_32I; -} -#endif - -#if !defined(__HIP_PLATFORM_HCC__) && defined(CUDA_VERSION) && CUDA_VERSION >= 11000 -template<> cudaDataType getCudaDataType() { - return CUDA_R_16I; -} -template<> cudaDataType getCudaDataType() { - return CUDA_R_64I; -} -template<> cudaDataType getCudaDataType() { - return CUDA_R_16BF; -} -#endif - -} // namespace cuda -} // namespace at diff --git a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu index a08c93d1d71bd..d5f31a1980bac 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu @@ -16,7 +16,6 @@ #include #include -#include #include #include #include @@ -119,7 +118,14 @@ struct csrMatrixRef { nnz_{nnz}, size_{size} { #if IS_CUSPARSE11_AVAILABLE() - cudaDataType cuda_data_type = at::cuda::getCudaDataType(); + cudaDataType cuda_data_type; + if ( std::is_same::value ) { + cuda_data_type = CUDA_R_32F; + } else if ( std::is_same::value) { + cuda_data_type = CUDA_R_64F; + } else { + TORCH_CHECK(false, "Tensor types must be either float32 or float64"); + } TORCH_CUDASPARSE_CHECK(cusparseCreateCsr( &description_, this->size(0), @@ -186,14 +192,8 @@ struct CusparseMatrixMultiplyOp { cusparseSpGEMMDescr_t spgemmDesc; CusparseMatrixMultiplyOp() { - static_assert( - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same, scalar_t>::value || - std::is_same, scalar_t>::value, - "cusparseSpGEMM only supports data type of half, bfloat16, float, double and complex float, double."); + static_assert(std::is_same::value || std::is_same::value, + "cusparse csr sparse-sparse MM only supports data type of float and double."); // SpGEMM Computation TORCH_CUDASPARSE_CHECK(cusparseSpGEMM_createDescr(&spgemmDesc)); } @@ -212,6 +212,14 @@ struct CusparseMatrixMultiplyOp { const int B_num_cols = B.size(1); + cudaDataType computeType; + if ( std::is_same::value ) { + computeType = CUDA_R_32F; + } else if ( std::is_same::value) { + computeType = CUDA_R_64F; + } else { + TORCH_CHECK(false, "Tensor types must be either float32 or float64"); + } csrOutput out({A.size(0), B.size(1)}); out.csr_pointers_ = at::empty({out.size(0) + 1}, output_indices.options().dtype(kInt)); @@ -244,16 +252,6 @@ struct CusparseMatrixMultiplyOp { cusparseSpMatDescr_t matC = C.description_; //-------------------------------------------------------------------------- - cudaDataType computeType = at::cuda::getCudaDataType(); - - // If a specific GPU model does not provide native support for a given data type, - // the routine returns CUSPARSE_STATUS_ARCH_MISMATCH error - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - TORCH_CHECK(prop->major >= 5 && !((10*prop->major + prop->minor) < 53 && computeType == CUDA_R_16F), - "sparse_mm: CUDA Float16 requires compute capability >= 53 (current: ", prop->major, prop->minor, ")"); - TORCH_CHECK(!(prop->major < 8 && computeType == CUDA_R_16BF), - "sparse_mm: CUDA BFloat16 requires compute capability >= 80 (current: ", prop->major, prop->minor, ")"); - // ask bufferSize1 bytes for external memory TORCH_CUDASPARSE_CHECK(cusparseSpGEMM_workEstimation( handle, @@ -648,14 +646,8 @@ void sparse_sparse_matmul_cuda_kernel( const Tensor& mat1, const Tensor& mat2) { - static_assert( - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same, scalar_t>::value || - std::is_same, scalar_t>::value, - "sparse_sparse_matmul_cuda_kernel only supports data type of half, bfloat16, float, double and complex float, double."); + static_assert(std::is_same::value || std::is_same::value, + "sparse_sparse_matmul_cuda_kernel only supports float and double value types"); Tensor mat1_indices_ = mat1._indices().contiguous(); Tensor mat1_values = mat1._values().contiguous(); @@ -783,15 +775,9 @@ Tensor sparse_sparse_matmul_cuda(const Tensor& mat1_, const Tensor& mat2_) { auto output = at::native::empty_like(mat1_); output.sparse_resize_and_clear_({mat1_.size(0), mat2_.size(1)}, mat1_.sparse_dim(), 0); -#if IS_CUSPARSE11_AVAILABLE() - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, mat1_.scalar_type(), "sparse_matmul", [&] { - sparse_sparse_matmul_cuda_kernel(output, mat1_.coalesce(), mat2_.coalesce()); - }); -#else AT_DISPATCH_FLOATING_TYPES(mat1_.scalar_type(), "sparse_matmul", [&] { sparse_sparse_matmul_cuda_kernel(output, mat1_.coalesce(), mat2_.coalesce()); }); -#endif return output; } diff --git a/test/test_sparse.py b/test/test_sparse.py index 8fa32edbc5e8a..fb0a660333583 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -12,12 +12,8 @@ from torch.testing._internal.common_cuda import TEST_CUDA, _get_torch_cuda_version from numbers import Number from typing import Dict, Any -from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes -from torch.testing._internal.common_cuda import \ - (SM53OrLater, SM80OrLater, CUDA11OrLater) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, ops, dtypes, dtypesIfCUDA, onlyCPU, onlyCUDA, precisionOverride, - deviceCountAtLeast) + (instantiate_device_type_tests, ops, dtypes, dtypesIfCPU, onlyCPU, onlyCUDA, deviceCountAtLeast) from torch.testing._internal.common_methods_invocations import \ (sparse_unary_ufuncs) @@ -3221,13 +3217,8 @@ def sparse_log(x): # TODO: Check after why ROCm's cusparseXcsrgemm2Nnz function doesn't return the same nnz value as CUDA @skipIfRocm @coalescedonoff - @dtypes(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_half=False, include_bfloat16=False)) - @dtypesIfCUDA(*(get_all_complex_dtypes() if CUDA11OrLater else ()), - *get_all_fp_dtypes( - include_half=(CUDA11OrLater and SM53OrLater), - include_bfloat16=(CUDA11OrLater and SM80OrLater))) - @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2}) + @dtypes(torch.double) + @dtypesIfCPU(torch.double, torch.cdouble) def test_sparse_matmul(self, device, dtype, coalesced): """ This function test `torch.sparse.mm` when both the mat1 and mat2 are sparse tensors. @@ -3337,23 +3328,22 @@ def test_sparse_matmul(sparse_dims, nnz, shape_a, shape_b): r2 = torch.sparse.mm(a, b) self.assertEqual(r1, r2) - if dtype in [torch.double, torch.cdouble]: - a.requires_grad_(True) - b.requires_grad_(True) + a.requires_grad_(True) + b.requires_grad_(True) - # check autograd support on sparse matmul - def fn(D1, D2): - return torch.sparse.mm(D1, D2).to_dense() + # check autograd support on sparse matmul + def fn(D1, D2): + return torch.sparse.mm(D1, D2).to_dense() - if a.is_cuda: - # For cuda, `nondet_tol` is set with `1e-5` - # This is because cuSparse sometimes returns approximate zero values like `~e-323` - # TODO: Check this cuSparse issue. - # This happens when you do chain multiplication `torch.sparse.mm` operations - gradcheck(fn, (a, b), check_sparse_nnz=True, nondet_tol=1e-5) - else: - gradcheck(fn, (a, b), check_sparse_nnz=True) - grad_with_custom_sparsity_pattern_test_helper(sparse_dims, nnz, shape_a, shape_b) + if a.is_cuda: + # For cuda, `nondet_tol` is set with `1e-5` + # This is because cuSparse sometimes returns approximate zero values like `~e-323` + # TODO: Check this cuSparse issue. + # This happens when you do chain multiplication `torch.sparse.mm` operations + gradcheck(fn, (a, b), check_sparse_nnz=True, nondet_tol=1e-5) + else: + gradcheck(fn, (a, b), check_sparse_nnz=True) + grad_with_custom_sparsity_pattern_test_helper(sparse_dims, nnz, shape_a, shape_b) def test_error_cases(): def fn(sparse_dims, nnz, shape_a, shape_b): diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 36e7f8a178577..5d0849bb8407d 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -20,7 +20,6 @@ CUDA9 = torch.version.cuda and torch.version.cuda.startswith('9.') SM53OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (5, 3) SM60OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (6, 0) -SM80OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0) TEST_MAGMA = TEST_CUDA if TEST_CUDA: diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index 6b60516efe322..558acc24ef3c8 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -554,7 +554,6 @@ ), ("device_functions.h", ("hip/device_functions.h", CONV_INCLUDE, API_RUNTIME)), ("driver_types.h", ("hip/driver_types.h", CONV_INCLUDE, API_RUNTIME)), - ("library_types.h", ("hip/library_types.h", CONV_INCLUDE, API_RUNTIME)), ("cuComplex.h", ("hip/hip_complex.h", CONV_INCLUDE, API_RUNTIME)), ("cuda_fp16.h", ("hip/hip_fp16.h", CONV_INCLUDE, API_RUNTIME)), ( @@ -3787,21 +3786,21 @@ ), ), ("cudaDataType_t", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDataType", ("hipDataType", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_16F", ("HIP_R_16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_16F", ("HIP_C_16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_32F", ("HIP_R_32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_32F", ("HIP_C_32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_64F", ("HIP_R_64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_64F", ("HIP_C_64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_8I", ("HIP_R_8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_8I", ("HIP_C_8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_8U", ("HIP_R_8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_8U", ("HIP_C_8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_32I", ("HIP_R_32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_32I", ("HIP_C_32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_32U", ("HIP_R_32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_32U", ("HIP_C_32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("cudaDataType", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_16F", ("hipR16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_16F", ("hipC16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_32F", ("hipR32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_32F", ("hipC32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_64F", ("hipR64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_64F", ("hipC64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_8I", ("hipR8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_8I", ("hipC8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_8U", ("hipR8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_8U", ("hipC8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_32I", ("hipR32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_32I", ("hipC32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_32U", ("hipR32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_32U", ("hipC32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), ( "MAJOR_VERSION", ("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), From d1f3d85fd80de2166114af1b5e16070a6d33a898 Mon Sep 17 00:00:00 2001 From: Nima Elyasi Date: Wed, 1 Sep 2021 08:47:44 -0700 Subject: [PATCH 422/530] fix GradBucket.is_last() logic (#63768) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63768 passed number of buckets to GradBucket constructor, to check if index is equal to num_buckets - 1 in the .is_last() function. Test Plan: buck test mode/dev-nosan //caffe2/test/distributed/algorithms/ddp_comm_hooks:test_ddp_hooks test output: https://www.internalfb.com/intern/testinfra/testconsole/testrun/8162774375985873/ Reviewed By: SciPioneer, mrshenli Differential Revision: D30455913 fbshipit-source-id: 8c67ca69cbf191d6e189e09248407eb167bb24b6 --- .../ddp_comm_hooks/test_ddp_hooks.py | 30 +++++++++++++++++++ torch/csrc/distributed/c10d/comm.hpp | 5 +++- torch/csrc/distributed/c10d/reducer.cpp | 2 ++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py index 7b889fdc3f1bb..3d00712ca5354 100644 --- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py +++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py @@ -177,6 +177,36 @@ def test_ddp_comm_hook_quantize_per_channel_hook(self): np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4) + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_is_last_hook(self): + + store = dist.FileStore(self.file_name, self.world_size) + process_group = dist.ProcessGroupNCCL(store, self.rank, self.world_size) + + def hook(flags, bucket): + flags.append(bucket.is_last()) + fut = torch.futures.Future() + fut.set_result(bucket.buffer()) + return fut + + flags = [] + device_id = gpus_for_rank(self.world_size)[self.rank][0] + model = nn.Sequential( + nn.Linear(2, 4000, bias=False), + *[nn.Linear(4000, 4000, bias=False) for _ in range(10)] + ) + gpu_model = DistributedDataParallel( + model.to(device_id), + device_ids=[device_id], + process_group=process_group, + ) + gpu_model.register_comm_hook(state=flags, hook=hook) + input = torch.randn(10, 2) + gpu_model(input).sum().backward() + self.assertTrue(flags[-1]) + self.assertFalse(any(flags[:-1])) + if __name__ == "__main__": assert ( diff --git a/torch/csrc/distributed/c10d/comm.hpp b/torch/csrc/distributed/c10d/comm.hpp index 9b45795683004..4690c355ce71b 100644 --- a/torch/csrc/distributed/c10d/comm.hpp +++ b/torch/csrc/distributed/c10d/comm.hpp @@ -18,12 +18,14 @@ class TORCH_API GradBucket { public: explicit GradBucket( size_t index, + size_t bucket_count, const at::Tensor& tensor, const std::vector& offsets, const std::vector& lengths, const std::vector& sizes_vec, const std::vector& parameters) : index_(index), + bucket_count_(bucket_count), buffer_(tensor), offsets_(offsets), lengths_(lengths), @@ -63,11 +65,12 @@ class TORCH_API GradBucket { // Returns whther this bucket is the last bucket to allreduce in an iteration. bool isLast() const { - return index_ == 0; + return index_ == bucket_count_ - 1; } private: size_t index_; + size_t bucket_count_; at::Tensor buffer_; // Per-variable info in buffer_. diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index eafc70cc5e30f..91db615181e56 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -472,6 +472,7 @@ std::vector Reducer::get_grad_buckets( auto variables_for_bucket = get_variables_for_bucket(i, bucket); gradBuckets.emplace_back( i, + buckets_.size(), return_zero_tensors ? at::zeros_like(bucket.replicas[0].contents) : bucket.replicas[0].contents, bucket.replicas[0].offsets, @@ -888,6 +889,7 @@ void Reducer::all_reduce_bucket(Bucket& bucket) { auto variables_for_bucket = get_variables_for_bucket(next_bucket_, bucket); GradBucket grad_bucket( next_bucket_, + buckets_.size(), tensors[0], // Since we only support single-process single-device // mode, there is always only one replica in the bucket. From d5bfdd3dac33dfa84e2a511fa79c4ad4e0e6b822 Mon Sep 17 00:00:00 2001 From: Kushashwa Ravi Shrimali Date: Wed, 1 Sep 2021 08:48:25 -0700 Subject: [PATCH 423/530] OpInfo for `nn.functional.layer_norm` (#63276) Summary: Please see https://github.com/facebookresearch/functorch/issues/78 and https://github.com/pytorch/pytorch/issues/54261. Note: * This PR also adds a reference test inspired by existing tests in `test_nn.py`. cc: mruberry zou3519 Pull Request resolved: https://github.com/pytorch/pytorch/pull/63276 Reviewed By: ejguan Differential Revision: D30452483 Pulled By: zou3519 fbshipit-source-id: 2578d01ca34e031668a41bd284db60c31ae1fba8 --- test/test_nn.py | 26 -------- .../_internal/common_methods_invocations.py | 65 +++++++++++++++++++ 2 files changed, 65 insertions(+), 26 deletions(-) diff --git a/test/test_nn.py b/test/test_nn.py index 5008c7256acf7..e60ff698ed19b 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -13281,32 +13281,6 @@ def test_LayerNorm_general(self, device): if self.device_type == 'cuda': self._test_LayerNorm_cuda_half(device) - @onlyOnCPUAndCUDA - def test_LayerNorm_numeric(self, device): - def layer_norm_ref(X, gamma, beta, normalized_shape, eps): - feature_size = np.prod(normalized_shape) - X_view = X.view(-1, feature_size) - mean = X_view.mean(dim=-1, keepdim=True) - var = X_view.var(dim=-1, unbiased=False, keepdim=True) - Y = (X_view - mean) / torch.sqrt(var + eps) - Y = Y * gamma.view(-1) + beta.view(-1) - return Y.view(*X.size()) - - normalized_shape = [256, 256, 144] - layer_norm = nn.LayerNorm(normalized_shape).float().to(device) - X = torch.rand(2, *normalized_shape, dtype=torch.float32, - device=device) - - Y = layer_norm(X) - Y_ref = layer_norm_ref(X, layer_norm.weight.data, layer_norm.bias.data, - normalized_shape, layer_norm.eps) - self.assertEqual(Y, Y_ref, rtol=0, atol=1e-5) - - if self.device_type == 'cuda': - layer_norm.cpu() - Y_cpu = layer_norm(X.cpu()) - self.assertEqual(Y_cpu, Y, rtol=0, atol=1e-5) - @onlyOnCPUAndCUDA def test_GroupNorm_general(self, device): self._test_GroupNorm_general(device) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 3579310dc68c9..fe8e36fbe6758 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -2548,6 +2548,42 @@ def generator(): return list(generator()) +def sample_inputs_layer_norm(opinfo, device, dtype, requires_grad, **kwargs): + make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) + + # Ordered as input shape, normalized_shape and a kwarg dict for eps + cases: Tuple[Tuple[int], Tuple[int], dict] = ( # type: ignore[assignment] + ((1, 2, 3), (1, 2, 3), {'eps': 0.5}), + ((2, 2, 3), (2, 3), {'eps': -0.5}), + ((1,), (1,), {}), + ((1, 2), (2,), {}), + ((0, 1), (1,), {}), + ) + + def generator(): + for input_shape, normalized_shape, kwargs in cases: + # Shape of weight and bias should be the same as normalized_shape + weight = make_arg(normalized_shape) + bias = make_arg(normalized_shape) + yield SampleInput( + make_arg(input_shape), + args=(normalized_shape, weight, bias), + kwargs=kwargs + ) + # Without any optional args + yield SampleInput(make_arg((1, 2)), args=((2,),)) + + # TODO: @krshrimali, once to_numpy method in SampleInput class is modified to take None inputs, + # enable these inputs; see https://github.com/pytorch/pytorch/pull/63276#discussion_r691950400 + + # With weight and a `None` bias + # yield SampleInput(make_arg((1, 2)), args=((2,), make_arg((2,)), None)) + + # With `None` weight and bias (tests failing for this, see the link above) + # yield SampleInput(make_arg((1, 2)), args=((2,), None, make_arg((2,)))) + + return list(generator()) + def sample_inputs_hardswish(self, device, dtype, requires_grad): N = 5 # make sure we are testing -3 -> 3 range. default is -10 -> 10 so maybe unnecessary ? @@ -5595,6 +5631,21 @@ def reference_mse_loss(input, target, reduction="mean"): return se +def reference_layer_norm(inp: np.ndarray, normalized_shape: Tuple[int], weight=None, bias=None, eps=1e-5): + feature_size = np.prod(normalized_shape) + inp_view = inp.reshape(-1, feature_size) # type: ignore[call-overload] + mean = inp_view.mean(axis=-1, keepdims=True) + var = inp_view.var(axis=-1, ddof=0, keepdims=True) + Y = (inp_view - mean) / np.sqrt(var + eps) + if weight is None and bias is not None: + Y = Y + bias.reshape(-1) + elif weight is not None and bias is None: + Y = Y * weight.reshape(-1) + elif weight is not None and bias is not None: + Y = Y * weight.reshape(-1) + bias.reshape(-1) + return Y.reshape(*inp.shape) + + def gradcheck_wrapper_hermitian_input(op, input, *args, **kwargs): """Gradcheck wrapper for functions that take Hermitian matrices as input. @@ -7235,6 +7286,20 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): SkipInfo('TestJit', 'test_variant_consistency_jit'), ), supports_out=False,), + OpInfo('nn.functional.layer_norm', + aten_name='layer_norm', + aliases=('layer_norm',), + ref=reference_layer_norm, + dtypes=floating_types_and(torch.bfloat16), + dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + supports_out=False, + decorators=[ + DecorateInfo( + toleranceOverride({torch.float32: tol(atol=1e-05, rtol=1e-03)}), + 'TestCommon', 'test_reference_testing' + ), + ], + sample_inputs_func=sample_inputs_layer_norm,), OpInfo('nn.functional.pad', variant_test_name='constant', aten_name='constant_pad_nd', From c4f3f6e62d3201852fc9aac8f736d528a5a36bfe Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 1 Sep 2021 10:17:52 -0700 Subject: [PATCH 424/530] Fixes reduction launch config (#64304) Summary: Fixes https://github.com/pytorch/pytorch/issues/48573 See also https://github.com/pytorch/pytorch/pull/64194 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64304 Reviewed By: janeyx99 Differential Revision: D30689600 Pulled By: ngimel fbshipit-source-id: bf2103ca177fd3b6e27bc0324b81925234483a29 --- aten/src/ATen/native/cuda/LinearAlgebra.cu | 1 - aten/src/ATen/native/cuda/Normalization.cu | 1 - aten/src/ATen/native/cuda/Reduce.cuh | 4 ++-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index b7ecf386c6edc..b4936c069b0b1 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -5,7 +5,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index 1d4d1cc4bda4e..44e27a95647b1 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index b4600454f467d..3be7100483b3c 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -989,14 +989,14 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id // Map block.x to the fastest reducing dimension. It implies: // 1. block_x_reduce is required. // 2. block.y now max out to num_outputs. - dim0 = iter.shape()[0]; + dim0 = inputs_per_output; dim1 = num_outputs; fastest_moving_stride = iter.strides(/*arg=*/input_index)[0]; } else { // Map block.x to the fastest non reducing dimension. It implies: // 1. block_x_reduce is turned off. // 2. block.y now max out to inputs_per_output. - dim0 = iter.shape()[iter.num_reduce_dims()]; + dim0 = num_outputs; dim1 = inputs_per_output; fastest_moving_stride = iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]; } From 87d8ab6e50314f72ea6730250904fe7ff073cc21 Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Wed, 1 Sep 2021 10:28:02 -0700 Subject: [PATCH 425/530] [nnc] Updated generic error message with info about turning off the fuser (#64316) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64316 Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D30683942 Pulled By: navahgar fbshipit-source-id: d86607563672213f99a1436dcf4f5dc28053b713 --- torch/csrc/jit/tensorexpr/kernel.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 78cbb822bfbff..f9653aea68840 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -67,10 +67,16 @@ namespace jit { namespace tensorexpr { std::string buildErrorMessage(const std::string& s) { - // TODO: Update this generic error message to include details regarding - // turning off the fuser. - static const std::string generic_error_message = ""; - return s + " " + generic_error_message; + static const std::string generic_error_message = + "This error occured in the fuser. You can turn off the fuser with " + "torch._C._jit_override_can_fuse_on_cpu(False)"; + if (s.empty()) { + return generic_error_message; + } + if (s.back() == '.') { + return s + " " + generic_error_message; + } + return s + ". " + generic_error_message; } static int te_cuda_pointwise_loop_levels = -1; From c6505cc3837eb903f98163e40fad638a1cfeb502 Mon Sep 17 00:00:00 2001 From: Patrick Hu Date: Wed, 1 Sep 2021 10:49:39 -0700 Subject: [PATCH 426/530] [FX] Fix python code generation for wrapped getattr() with default value (#64271) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64271 Closes #60417 Modified emit_node() in fx/graph.py to generate getattr() call with default value when len(node.args) != 2 instead of accessing the attribute. Added test_torch_fx_getattr() in test/test_fx.py. Test Plan: pytest test/test_fx.py Imported from OSS Reviewed By: jamesr66a Differential Revision: D30671265 fbshipit-source-id: f2db9ea47e0cb247547e200684f715aab006c374 --- test/test_fx.py | 10 ++++++++++ torch/fx/graph.py | 6 ++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index f4e4ab203a7bc..5220f67ebf309 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -98,6 +98,8 @@ def a_lifted_leaf2(a, b): wrap('len') +wrap('getattr') + @wrap def wrapped_via_decorator(a): return a + 1 @@ -942,6 +944,14 @@ def forward(self, x): self.assertEqual(traced2(inp), inp + 3.0) self.assertIs(len, builtins.len) + def test_torch_fx_getattr(self): + class FXGetattrTest(torch.nn.Module): + def forward(self, x): + return getattr(x, 'nonexistent_attr', torch.Tensor([2, 3])) + + traced = symbolic_trace(FXGetattrTest()) + self.assertEqual(traced(torch.rand(3, 4)), torch.Tensor([2, 3])) + def test_sqrt(self): class Sqrt1(torch.nn.Module): def forward(self, x): diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 29ffc416715a7..65e93d0ccc7a1 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -949,11 +949,13 @@ def emit_node(node : Node): return qualified_name = _get_qualified_name(node.target) global_name = add_global(qualified_name, node.target) + # special case for getattr: node.args could be 2-argument or 3-argument + # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value if global_name == 'getattr' and \ isinstance(node.args, tuple) and \ isinstance(node.args[1], str) and \ - node.args[1].isidentifier(): - # pretty print attribute access + node.args[1].isidentifier() and \ + len(node.args) == 2: body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}') return body.append(f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})') From 0b48d968952a6183ae122679d624940e5228567f Mon Sep 17 00:00:00 2001 From: Patrick Kan Date: Wed, 1 Sep 2021 12:20:50 -0700 Subject: [PATCH 427/530] [Bootcamp] Include both python unittest and parser parameters in --help and -h flag (#64297) Summary: Fixes https://github.com/pytorch/pytorch/issues/45945 Creates a new thread to run -h or --help with unittest.main if the help flag is present, and keeps the add_help default for parameters. Includes both python unittest and parser parameters in --help and -h flag and will remain up to date since both messages are displayed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64297 Test Plan: Imported from GitHub `python test/test_spectral_ops.py --help` Output: ``` % python test/test_spectral_ops.py --help usage: test_spectral_ops.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b] [-k TESTNAMEPATTERNS] [tests [tests ...]] positional arguments: tests a list of any number of test modules, classes and test methods. optional arguments: -h, --help show this help message and exit -v, --verbose Verbose output -q, --quiet Quiet output --locals Show local variables in tracebacks -f, --failfast Stop on first fail or error -c, --catch Catch Ctrl-C and display results so far -b, --buffer Buffer stdout and stderr during tests -k TESTNAMEPATTERNS Only run tests which match the given substring Examples: test_spectral_ops.py - run default set of tests test_spectral_ops.py MyTestSuite - run suite 'MyTestSuite' test_spectral_ops.py MyTestCase.testSomething - run MyTestCase.testSomething test_spectral_ops.py MyTestCase - run all 'test*' test methods in MyTestCase usage: test_spectral_ops.py [-h] [--subprocess] [--seed SEED] [--accept] [--jit_executor JIT_EXECUTOR] [--repeat REPEAT] [--test_bailouts] [--save-xml [SAVE_XML]] [--discover-tests] [--log-suffix LOG_SUFFIX] [--run-parallel RUN_PARALLEL] [--import-slow-tests [IMPORT_SLOW_TESTS]] [--import-disabled-tests [IMPORT_DISABLED_TESTS]] optional arguments: -h, --help show this help message and exit --subprocess whether to run each test in a subprocess --seed SEED --accept --jit_executor JIT_EXECUTOR --repeat REPEAT --test_bailouts --save-xml [SAVE_XML] --discover-tests --log-suffix LOG_SUFFIX --run-parallel RUN_PARALLEL --import-slow-tests [IMPORT_SLOW_TESTS] --import-disabled-tests [IMPORT_DISABLED_TESTS] ``` Also ran some other tests to make sure tests still worked, and other tests with --help or -h flag Reviewed By: seemethere Differential Revision: D30677776 Pulled By: PatrickKan fbshipit-source-id: eb3d6e3fa677137ec703ec3a23808efb99acc896 --- torch/testing/_internal/common_utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 90f3551caae94..0a265b52401b6 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -23,6 +23,7 @@ import random import contextlib import shutil +import threading from pathlib import Path import socket import subprocess @@ -156,7 +157,7 @@ def _get_test_report_path(): return os.path.join('test-reports', test_source) -parser = argparse.ArgumentParser(add_help=False) +parser = argparse.ArgumentParser() parser.add_argument('--subprocess', action='store_true', help='whether to run each test in a subprocess') parser.add_argument('--seed', type=int, default=1234) @@ -173,6 +174,15 @@ def _get_test_report_path(): parser.add_argument('--import-slow-tests', type=str, nargs='?', const=SLOW_TESTS_FILE) parser.add_argument('--import-disabled-tests', type=str, nargs='?', const=DISABLED_TESTS_FILE) +# Only run when -h or --help flag is active to display both unittest and parser help messages. +def run_unittest_help(argv): + unittest.main(argv=argv) + +if '-h' in sys.argv or '--help' in sys.argv: + help_thread = threading.Thread(target=run_unittest_help, args=(sys.argv,)) + help_thread.start() + help_thread.join() + args, remaining = parser.parse_known_args() if args.jit_executor == 'legacy': GRAPH_EXECUTOR = ProfilingMode.LEGACY From 421d8f86b6def536df18371a5da2f5df4de6e262 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 1 Sep 2021 12:28:23 -0700 Subject: [PATCH 428/530] Add a record scope around autograd::engine::evaluate_function (#63619) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63619 Adds a RECORD_FUNCTION with the function that is being valuate as part of backwards execution. This has been useful in picking up some operations in the backwards pass that otherwise would not show up, for example custom cpp functions that use custom C++ code. ghstack-source-id: 137041723 Test Plan: CI benchmark: buck run mode/opt //scripts/rvarm1/ddp:bench Reviewed By: albanD Differential Revision: D30439492 fbshipit-source-id: 955917770cdf2a2edb0303223ace710b668ba388 --- test/test_autograd.py | 3 +++ torch/csrc/autograd/engine.cpp | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 364d48807b737..8b3c8bd33af66 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -3005,6 +3005,9 @@ def test_profiler_seq_nr(self): found_bwd_add = found_bwd_sum = False found_empty = False for e in p.function_events: + # Ignore record_function user scope. + if "autograd::engine::evaluate_function" in e.name: + continue if e.name == "aten::add": add_seq_nr = e.sequence_nr self.assertFalse(found_add) diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index acd7971aad6a7..4ea002a8312f1 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -419,7 +419,18 @@ auto Engine::thread_main(const std::shared_ptr& graph_task) -> void { // callbacks. GraphTaskGuard guard(local_graph_task); NodeGuard ndguard(task.fn_); - evaluate_function(local_graph_task, task.fn_.get(), task.inputs_, local_graph_task->cpu_ready_queue_); + { + RECORD_FUNCTION( + c10::str( + "autograd::engine::evaluate_function: ", + task.fn_.get()->name()), + std::vector()); + evaluate_function( + local_graph_task, + task.fn_.get(), + task.inputs_, + local_graph_task->cpu_ready_queue_); + } } catch (std::exception& e) { thread_on_exception(local_graph_task, task.fn_, e); } From 468001600cb38423deeec0ba0abc6ca33e3c60e4 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Wed, 1 Sep 2021 12:38:39 -0700 Subject: [PATCH 429/530] Back out "Revert D30327514: [Pytorch lite predictor] Use KinetoEdgeCPUProfiler for operator profiling." (#64307) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64307 Original commit changeset: 0b2aa7c57d08 Restores original changes. This diff changes the way operator profiling is done in lite predictor benchmarking binary. Instead of using custom callbacks it uses KinetoEdgeCPUProfiler to profile events and then generate operator level metric from it. Since KinetoEvents do not contain cpu clock time, now we report only wallclock time. This unifies various profiling effort that we have for benchmarking purpose. In production we will still use observer based mechanism, but the advantage of using kineto profiler is that we get few other things for free, such as: chrome trace generation. operator level memory profiling (to be added) flop counts (to be added) Furthermore possible we can use python post processing script to parse chrome trace and generate output similar to torch.profiler. (To be done) Furthermore removes some tests from test_lite_interpreter.cpp which were testing module hierarchy in debug info. They should be covered by test_mobile_profiler.cpp. Test Plan: aibench run Model without debug info: https://www.internalfb.com/intern/aibench/details/219598441154763 Model with debug info and --print_module_info true (see Operator summary has now module hierarchy information). https://www.internalfb.com/intern/aibench/details/617154236292985 Reviewed By: raziel Differential Revision: D30680354 fbshipit-source-id: b6ba0d59c510c13d13d9935b1d8051cc82ffa4e9 --- test/cpp/jit/test_lite_interpreter.cpp | 319 ------------------------ tools/build_variables.bzl | 4 +- torch/csrc/jit/mobile/debug_info.cpp | 15 +- torch/csrc/jit/mobile/import.cpp | 3 + torch/csrc/jit/mobile/interpreter.cpp | 3 + torch/csrc/jit/mobile/module.cpp | 3 +- torch/csrc/jit/mobile/module.h | 11 +- torch/csrc/jit/mobile/profiler_edge.cpp | 45 +++- torch/csrc/jit/mobile/profiler_edge.h | 5 + 9 files changed, 72 insertions(+), 336 deletions(-) diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index 3bd2becd8779d..26100b3b6f508 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -456,144 +456,6 @@ TEST(LiteInterpreterTest, BuiltinFunction) { } #if !defined FB_XPLAT_BUILD -TEST(LiteInterpreterTest, ModuleInfoBasic) { - Module m("M"); - m.define(R"JIT( - def forward(self, x): - return 2 * x - )JIT"); - - std::stringstream ss; - m._save_for_mobile(ss, {}, true); - mobile::Module bc = _load_for_mobile(ss); - - std::unordered_set module_debug_info_set; - size_t pc = 0; - while (true) { - try { - std::string module_info = bc.get_forward_method_debug_info(pc); - if (!module_info.empty() && - (module_info.find("debug_handle") == std::string::npos)) { - module_debug_info_set.insert(module_info); - } - ++pc; - } catch (const std::exception& e) { - break; - } - } - - AT_ASSERT(module_debug_info_set.count("top(M)::.aten::mul")); -} - -TEST(LiteInterpreterTest, NotSaveModuleInfo) { - Module m("M"); - m.define(R"JIT( - def forward(self, x): - return x + 5 - )JIT"); - - std::stringstream ss; - m._save_for_mobile(ss); - mobile::Module bc = _load_for_mobile(ss); - - size_t pc = 0; - while (true) { - try { - std::string module_info = bc.get_forward_method_debug_info(pc); - AT_ASSERT( - module_info.empty() || - (module_info.find("debug_handle") != std::string::npos)); - ++pc; - } catch (const std::exception& e) { - break; - } - } -} - -TEST(LiteInterpreterTest, OneSubmoduleModuleInfo) { - Module a("A"); - a.define(R"JIT( - def forward(self, x): - return 2 * x + 5 - )JIT"); - Module b("B"); - b.register_module("A0", a); - b.define(R"JIT( - def forward(self, x): - return self.A0.forward(x) + 1 - )JIT"); - - std::stringstream ss; - b._save_for_mobile(ss, {}, true); - mobile::Module bc = _load_for_mobile(ss); - - std::set module_debug_info_set; - size_t pc = 0; - while (true) { - try { - std::string module_info = bc.get_forward_method_debug_info(pc); - if (!module_info.empty() && - (module_info.find("debug_handle") == std::string::npos)) { - module_debug_info_set.insert(module_info); - } - ++pc; - } catch (const std::exception& e) { - break; - } - } - - AT_ASSERT(module_debug_info_set.count("top(B)::.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(B)::.A0(A)::forward.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(B)::.A0(A)::forward.aten::mul")); -} - -TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) { - Module a("A"); - a.define(R"JIT( - def forward(self, x): - return x + 1 - )JIT"); - Module b("B"); - b.define(R"JIT( - def forward(self, x): - return x + 2 - )JIT"); - Module c("C"); - c.register_module("A0", a); - c.register_module("B0", b); - c.define(R"JIT( - def forward(self, x): - return self.A0.forward(x) + self.B0.forward(x) - )JIT"); - - std::stringstream ss; - c._save_for_mobile(ss, {}, true); - mobile::Module bc = _load_for_mobile(ss); - - std::set module_debug_info_set; - size_t pc = 0; - while (true) { - try { - std::string module_info = bc.get_forward_method_debug_info(pc); - if (!module_info.empty() && - (module_info.find("debug_handle") == std::string::npos)) { - module_debug_info_set.insert(module_info); - } - ++pc; - } catch (const std::exception& e) { - break; - } - } - - AT_ASSERT(module_debug_info_set.count("top(C)::.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(C)::.A0(A)::forward.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(C)::.B0(B)::forward.aten::add")); -} - TEST(LiteInterpreterTest, GetRuntimeByteCodeVersion) { auto runtime_bytecode_version = _get_runtime_bytecode_version(); AT_ASSERT( @@ -795,187 +657,6 @@ TEST(LiteInterpreterTest, isCompatibleFail) { AT_ASSERT(result.status = ModelCompatibilityStatus::ERROR); } -#if !defined FB_XPLAT_BUILD -// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -TEST(LiteInterpreterTest, SequentialModuleInfo) { - Module a("A"); - a.define(R"JIT( - def forward(self, x): - return x + 1 - )JIT"); - Module b("B"); - b.define(R"JIT( - def forward(self, x): - return x + 2 - )JIT"); - Module c("C"); - c.register_module("A0", a); - c.register_module("B0", b); - c.define(R"JIT( - def forward(self, x): - return self.A0.forward(self.B0.forward(x)) - )JIT"); - - std::stringstream ss; - c._save_for_mobile(ss, {}, true); - mobile::Module bc = _load_for_mobile(ss); - - std::set module_debug_info_set; - size_t pc = 0; - while (true) { - try { - std::string module_info = bc.get_forward_method_debug_info(pc); - if (!module_info.empty() && - (module_info.find("debug_handle") == std::string::npos)) { - module_debug_info_set.insert(module_info); - } - ++pc; - } catch (const std::exception& e) { - break; - } - } - - // class A(nn.Module): - // def __init__(self): - // super(A, self).__init__() - - // def forward(self, x): - // return x + 1 - - // class B(nn.Module): - // def __init__(self): - // super(B, self).__init__() - - // def forward(self, x): - // return x + 2 - - // class C(nn.Module): - // def __init__(self): - // super(C, self).__init__() - // self.A0 = A() - // self.B0 = B() - - // def forward(self, x): - // return self.A0.forward(self.B0.forward(x)) - - AT_ASSERT(module_debug_info_set.count("top(C)::.prim::Return")); - AT_ASSERT(module_debug_info_set.count( - "top(C)::.A0(A)::forward.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(C)::.B0(B)::forward.aten::add")); -} - -TEST(LiteInterpreterTest, HierarchyModuleInfo) { - Module a("A"); - a.define(R"JIT( - def forward(self, x): - return x + 1 - )JIT"); - Module b("B"); - b.register_module("A0", a); - b.define(R"JIT( - def forward(self, x): - return self.A0.forward(x) + 1 - )JIT"); - Module c("C"); - c.register_module("B0", b); - c.define(R"JIT( - def forward(self, x): - return self.B0.forward(x) + 1 - )JIT"); - - std::stringstream ss; - c._save_for_mobile(ss, {}, true); - mobile::Module bc = _load_for_mobile(ss); - - std::set module_debug_info_set; - size_t pc = 0; - while (true) { - try { - std::string module_info = bc.get_forward_method_debug_info(pc); - if (!module_info.empty() && - (module_info.find("debug_handle") == std::string::npos)) { - module_debug_info_set.insert(module_info); - } - ++pc; - } catch (const std::exception& e) { - break; - } - } - - // There are 3 module information strings here. - // "top(C).forward": for the add operator in top. - // "top(C).B0(B).forward": for the add operator in B0. - // "top(C).B0(B).forward.A0(A).forward": for the add operator in A0. - AT_ASSERT(module_debug_info_set.count("top(C)::.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(C)::.B0(B)::forward.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(C)::.B0(B)::forward.A0(A)::forward.aten::add")); -} - -TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) { - Module a("A"); - a.define(R"JIT( - def forward(self, x): - return x + 5 - )JIT"); - Module b("B"); - b.register_module("A0", a); - b.register_module("A1", a); - b.define(R"JIT( - def forward(self, x): - return self.A0.forward(x) + self.A1.forward(x) - )JIT"); - - std::stringstream ss; - b._save_for_mobile(ss, {}, true); - mobile::Module bc = _load_for_mobile(ss); - - std::set module_debug_info_set; - size_t pc = 0; - while (true) { - try { - std::string module_info = bc.get_forward_method_debug_info(pc); - if (!module_info.empty() && - (module_info.find("debug_handle") == std::string::npos)) { - module_debug_info_set.insert(module_info); - } - ++pc; - } catch (const std::exception& e) { - break; - } - } - - // class A(nn.Module): - // def __init__(self): - // super(A, self).__init__() - - // def forward(self, x): - // return x + 5 - - // class B(nn.Module): - // def __init__(self): - // super(B, self).__init__() - // self.A0 = A() - // self.A1 = A() - - // def forward(self, x): - // return self.A0.forward(x) + self.A1.forward(x) - - // There are 3 module information strings here. - // "top(B).forward": for the add operator in top. - // "top(B).A0(A).forward": for the add operator in A0. - // "top(B).A1(A).forward": for the add operator in A1. - - AT_ASSERT(module_debug_info_set.count("top(B)::.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(B)::.A0(A)::forward.aten::add")); - AT_ASSERT(module_debug_info_set.count( - "top(B)::.A1(A)::forward.aten::add")); -} -#endif // !defined(FB_XPLAT_BUILD) - TEST(LiteInterpreterTest, Eval) { std::vector inputs; diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 34846b5d6c7b3..c4731570e6d77 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -319,7 +319,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/testing/hooks_for_testing.cpp", "torch/csrc/utils/tensor_flatten.cpp", "torch/csrc/utils/variadic.cpp", -] + libtorch_profiler_sources +] core_sources_full_mobile = core_sources_full_mobile_no_backend_interface + [ "torch/csrc/jit/backends/backend_debug_info.cpp", @@ -337,7 +337,7 @@ core_sources_full = core_sources_full_mobile + [ "torch/csrc/jit/tensorexpr/external_functions_codegen.cpp", ] -libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources) +libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources + libtorch_profiler_sources) # These files are the only ones that are supported on Windows. libtorch_distributed_base_sources = [ diff --git a/torch/csrc/jit/mobile/debug_info.cpp b/torch/csrc/jit/mobile/debug_info.cpp index 41ce3c6d46d52..a75ffe16c61f5 100644 --- a/torch/csrc/jit/mobile/debug_info.cpp +++ b/torch/csrc/jit/mobile/debug_info.cpp @@ -13,6 +13,12 @@ namespace jit { namespace { +C10_ALWAYS_INLINE std::string debugHandlesNotFoundMessage( + const std::string& debug_handles_string) { + return "Debug info for handle(s): " + debug_handles_string + + ", was not found."; +} + std::pair, std::string> getStackTraceWithModuleHierarchy( const DebugInfoTuple& source_callstack, const std::string& caller_name) { @@ -152,8 +158,7 @@ std::string MobileDebugTable::getModuleHierarchyInfo( const std::string& top_module_type_name) const { const auto it = callstack_ptr_map_.find(debug_handle); if (it == callstack_ptr_map_.end()) { - return "Module info for handle, " + std::to_string(debug_handle) + - ", not found."; + return debugHandlesNotFoundMessage(std::to_string(debug_handle)); } return (getStackTraceWithModuleHierarchy( {it->second}, "top", top_module_type_name)) @@ -172,8 +177,7 @@ std::string MobileDebugTable::getSourceDebugString( const std::string& top_module_type_name) const { const auto it = callstack_ptr_map_.find(debug_handle); if (it == callstack_ptr_map_.end()) { - return "Debug info for handle, " + std::to_string(debug_handle) + - ", not found."; + return debugHandlesNotFoundMessage(std::to_string(debug_handle)); } return (getStackTraceWithModuleHierarchy( {it->second}, "top", top_module_type_name)) @@ -208,8 +212,7 @@ std::pair MobileDebugTable:: debug_handles_string += std::to_string(debug_handle); } debug_handles_string += "}"; - debug_handles_string = - "Debug info for handles: " + debug_handles_string + ", was not found."; + debug_handles_string = debugHandlesNotFoundMessage(debug_handles_string); return {debug_handles_string, debug_handles_string}; } return (getStackTraceWithModuleHierarchy( diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index 6a548103f6965..99be225255ffb 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -517,12 +517,15 @@ mobile::Module BytecodeDeserializer::deserialize( auto bvals = std::move(*readArchive("bytecode", mcu).toTuple()).elements(); c10::optional> debug_handles; + bool has_debug_handles{false}; if (reader_->hasRecord("mobile_debug_handles.pkl")) { debug_handles = readArchive("mobile_debug_handles", mcu).toTuple()->elements(); + has_debug_handles = true; } parseMethods(bvals, debug_handles, *mcu); auto m = mobile::Module(readArchive("data", mcu).toObject(), mcu); + m.setHasDebugHandles(has_debug_handles); #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE) MobileDebugTable debug_table = MobileDebugTable(reader_, compilation_unit_); m.setDebugTable(std::move(debug_table)); diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp index 02e7c35792693..ab558cd2bf5e0 100644 --- a/torch/csrc/jit/mobile/interpreter.cpp +++ b/torch/csrc/jit/mobile/interpreter.cpp @@ -57,6 +57,9 @@ bool InterpreterState::run(Stack& stack) { auto inst_with_handle = code_->instructions_with_handles_.at(pc); Instruction inst = inst_with_handle.instruction; DebugHandle debug_handle = inst_with_handle.debug_handle; + // If no valid debug handle found then just log pc. + // This is possible when we did not save debug handles + debug_handle = debug_handle == -1 ? pc : debug_handle; // std::cout << "RUNNING " << pc << " " // << code_->instructions_with_handles_[pc].instruction; diff --git a/torch/csrc/jit/mobile/module.cpp b/torch/csrc/jit/mobile/module.cpp index c04d9f74b7378..c74ca138d848a 100644 --- a/torch/csrc/jit/mobile/module.cpp +++ b/torch/csrc/jit/mobile/module.cpp @@ -145,8 +145,7 @@ std::string Module::getCallStack(const int64_t debug_handle) const { // We really need to change this part, so in the next step for profiling support // for delegates, the first thing will be to rewrite how profiling is done // for lite interpreter. -std::string Module::get_forward_method_debug_info(size_t pc) const { - auto debug_handle = find_method("forward")->get_debug_handle(pc); +std::string Module::get_forward_method_debug_info(int64_t debug_handle) const { #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE) return getDebugTable().getModuleHierarchyInfo( debug_handle, getTopModuleTypeName(*this)); diff --git a/torch/csrc/jit/mobile/module.h b/torch/csrc/jit/mobile/module.h index 73637aa4584a0..6102aa517df66 100644 --- a/torch/csrc/jit/mobile/module.h +++ b/torch/csrc/jit/mobile/module.h @@ -78,7 +78,7 @@ class TORCH_API Module { } const std::vector parameters() const; const std::map named_parameters() const; - std::string get_forward_method_debug_info(size_t pc) const; + std::string get_forward_method_debug_info(int64_t debug_handle) const; std::string getModuleHierarchy(const int64_t debug_handle) const; std::string getCallStack(const int64_t debug_handle) const; /// Enables "training" mode. @@ -115,11 +115,20 @@ class TORCH_API Module { return debug_table_; } + void setHasDebugHandles(bool has_debug_handles) { + has_debug_handles_ = has_debug_handles; + } + + bool hasDebugHandles() const { + return has_debug_handles_; + } + private: c10::intrusive_ptr object_; std::unordered_map metadata_; std::shared_ptr cu_; MobileDebugTable debug_table_; + bool has_debug_handles_; }; } // namespace mobile } // namespace jit diff --git a/torch/csrc/jit/mobile/profiler_edge.cpp b/torch/csrc/jit/mobile/profiler_edge.cpp index bcd5a6258ee7c..162e43f0982a6 100644 --- a/torch/csrc/jit/mobile/profiler_edge.cpp +++ b/torch/csrc/jit/mobile/profiler_edge.cpp @@ -2,7 +2,6 @@ #include #include -namespace profiler = torch::autograd::profiler; namespace torch { namespace jit { namespace mobile { @@ -27,17 +26,26 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler( if (with_modules || with_stack) { auto post_processing = [this, with_stack, with_modules]( std::vector& events) { + std::string no_debug_info("Model was not saved with debug information"); for (auto& e : events) { if (with_modules) { // Since KinetoEvents's module hierarchy takes vector of strings we // just construct a temporary vector using one string element - e.moduleHierarchy(std::vector( - {this->m_.getModuleHierarchy(e.debugHandle())})); + if (this->m_.hasDebugHandles()) { + e.moduleHierarchy(std::vector( + {this->m_.getModuleHierarchy(e.debugHandle())})); + } else { + e.moduleHierarchy(std::vector({no_debug_info})); + } } else if (with_stack) { // Since KinetoEvents's stack trace takes vector of strings we just // construct a temporary vector using one string element - e.stack(std::vector( - {this->m_.getCallStack(e.debugHandle())})); + if (this->m_.hasDebugHandles()) { + e.stack(std::vector( + {this->m_.getCallStack(e.debugHandle())})); + } else { + e.stack(std::vector({no_debug_info})); + } } } }; @@ -55,8 +63,33 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler( trace_file_name_ = fname; } +const std::unique_ptr& KinetoEdgeCPUProfiler:: + disableProfiler() { + TORCH_CHECK( + !profiler_result_, + "KinetoEdgeCPUProfiler already disabled. " + "To get list of events use getProfilerResults()"); + profiler_result_ = profiler::disableProfiler(); + return profiler_result_; +} + +const std::unique_ptr& KinetoEdgeCPUProfiler:: + getProfilerResult() { + TORCH_CHECK( + profiler_result_, + "KinetoEdgeCPUProfiler has not been disabled. " + "use disableProfiler() API first, which returns the ProfilerResult."); + return profiler_result_; +} + KinetoEdgeCPUProfiler::~KinetoEdgeCPUProfiler() { - profiler::disableProfiler()->save(trace_file_name_); + if (!trace_file_name_.empty()) { + if (profiler_result_) { + profiler_result_->save(trace_file_name_); + } else { + profiler::disableProfiler()->save(trace_file_name_); + } + } } } // namespace mobile } // namespace jit diff --git a/torch/csrc/jit/mobile/profiler_edge.h b/torch/csrc/jit/mobile/profiler_edge.h index a245034e34f9b..ef37e01ed4c71 100644 --- a/torch/csrc/jit/mobile/profiler_edge.h +++ b/torch/csrc/jit/mobile/profiler_edge.h @@ -2,6 +2,7 @@ #include #include +namespace profiler = torch::autograd::profiler; namespace torch { namespace jit { namespace mobile { @@ -53,6 +54,9 @@ class TORCH_API KinetoEdgeCPUProfiler { const bool with_flops = false, const bool with_modules = false); + const std::unique_ptr& disableProfiler(); + const std::unique_ptr& getProfilerResult(); + ~KinetoEdgeCPUProfiler(); private: @@ -62,6 +66,7 @@ class TORCH_API KinetoEdgeCPUProfiler { */ const mobile::Module& m_; std::string trace_file_name_; + std::unique_ptr profiler_result_; }; } // namespace mobile } // namespace jit From 03a58a2ba0a18ba4e8d41ad1a8cd8431ac1e5a4b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 1 Sep 2021 13:24:11 -0700 Subject: [PATCH 430/530] [Caffe2] Create fewer strings during argument fetching (#64285) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64285 With C++14 heterogeneous ordered container lookup, it is no longer necessary to create a `std::string` in order to look up elements of a `CaffeMap` keyed by std::string. Accordingly, this diff reworks the argument-getting operator functions to avoid that in favor of `c10::string_view`. ghstack-source-id: 137139818 ghstack-source-id: 137139818 Test Plan: buildsizebot iOS apps -- code size win. less strings is probably marginally good for perf but this only happens at setup time anyway. Reviewed By: dzhulgakov Differential Revision: D26826676 fbshipit-source-id: ee653b14dc2c528bae8c90f0fc6a7a419cbca1d6 --- aten/src/ATen/core/function_schema.h | 3 +- caffe2/core/operator.cc | 2 +- caffe2/core/operator.h | 17 ++++---- caffe2/utils/proto_utils.cc | 49 ++++++++++++--------- caffe2/utils/proto_utils.h | 64 +++++++++++++++++----------- 5 files changed, 79 insertions(+), 56 deletions(-) diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index a7b514990185b..f4b11fc4a304a 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -272,7 +273,7 @@ struct FunctionSchema { }); } - c10::optional argumentIndexWithName(const std::string& name) const { + c10::optional argumentIndexWithName(c10::string_view name) const { for(size_t i = 0; i < arguments().size(); ++i) { if(name == arguments()[i].name()) return i; diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index ca66f7846c300..e25c92a6d6075 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -831,7 +831,7 @@ std::function GetOperatorLogger() { } c10::optional OperatorBase::argumentIndexWithName( - const std::string& name) const { + c10::string_view name) const { #if defined(EXPOSE_C2_OPS) || \ !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) return getFunctionSchema().argumentIndexWithName(name); diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index b840254612929..15d1ead352762 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include "caffe2/core/blob.h" @@ -97,7 +98,7 @@ class TORCH_API OperatorBase : public Observable { /** @brief Checks if the operator has an argument of the given name. */ - inline bool HasArgument(const string& name) const { + inline bool HasArgument(c10::string_view name) const { if (isLegacyOperator()) { CAFFE_ENFORCE(operator_def_, "operator_def was null!"); return ArgumentHelper::HasArgument(*operator_def_, name); @@ -108,7 +109,7 @@ class TORCH_API OperatorBase : public Observable { // Functions that deal with arguments. Basically, this allows us to map an // argument name to a specific type of argument that we are trying to access. template - inline T GetSingleArgument(const string& name, const T& default_value) const { + inline T GetSingleArgument(c10::string_view name, const T& default_value) const { if (isLegacyOperator()) { CAFFE_ENFORCE(operator_def_, "operator_def was null!"); return ArgumentHelper::GetSingleArgument( @@ -126,7 +127,7 @@ class TORCH_API OperatorBase : public Observable { } template - inline bool HasSingleArgumentOfType(const string& name) const { + inline bool HasSingleArgumentOfType(c10::string_view name) const { CAFFE_ENFORCE(operator_def_, "operator_def was null!"); return ArgumentHelper::HasSingleArgumentOfType( *operator_def_, name); @@ -141,7 +142,7 @@ class TORCH_API OperatorBase : public Observable { template inline vector GetRepeatedArgument( - const string& name, + c10::string_view name, const vector& default_value = {}) const; // Get the inputs and outputs as specific types. @@ -654,7 +655,7 @@ class TORCH_API OperatorBase : public Observable { } } - c10::optional argumentIndexWithName(const std::string& name) const; + c10::optional argumentIndexWithName(c10::string_view name) const; // An event used by asynchronous execution. std::unique_ptr event_; @@ -664,7 +665,7 @@ class TORCH_API OperatorBase : public Observable { template <> inline NetDef OperatorBase::GetSingleArgument( - const std::string& name, + c10::string_view name, const NetDef& default_value) const { if (isLegacyOperator()) { CAFFE_ENFORCE(operator_def_, "operator_def was null!"); @@ -756,7 +757,7 @@ inline vector OperatorBase::GetVectorFromIValueList( template inline vector OperatorBase::GetRepeatedArgument( - const string& name, + c10::string_view name, const vector& default_value) const { if (isLegacyOperator()) { CAFFE_ENFORCE(operator_def_, "operator_def was null!"); @@ -778,7 +779,7 @@ inline vector OperatorBase::GetRepeatedArgument( // int16_t. We need to load it as List and transform to int16_t. template <> inline vector OperatorBase::GetRepeatedArgument( - const string& name, + c10::string_view name, const vector& default_value) const { if (isLegacyOperator()) { CAFFE_ENFORCE(operator_def_, "operator_def was null!"); diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc index d2aa59e02b63f..db379462e5347 100644 --- a/caffe2/utils/proto_utils.cc +++ b/caffe2/utils/proto_utils.cc @@ -323,8 +323,12 @@ C10_EXPORT ArgumentHelper::ArgumentHelper(const NetDef& netdef) { } } -C10_EXPORT bool ArgumentHelper::HasArgument(const string& name) const { +C10_EXPORT bool ArgumentHelper::HasArgument(c10::string_view name) const { +#ifdef CAFFE2_ENABLE_REDUCED_STRINGS_IN_ARGUMENT_LOOKUP return arg_map_.count(name); +#else + return arg_map_.count(std::string(name)); +#endif } namespace { @@ -364,18 +368,19 @@ std::ostream& operator<<(std::ostream& output, const NetDef& n) { T, fieldname, enforce_lossless_conversion) \ template <> \ C10_EXPORT T ArgumentHelper::GetSingleArgument( \ - const string& name, const T& default_value) const { \ - if (arg_map_.count(name) == 0) { \ + c10::string_view name, const T& default_value) const { \ + auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name); \ + if (it == arg_map_.end()) { \ VLOG(1) << "Using default parameter value " << default_value \ << " for parameter " << name; \ return default_value; \ } \ CAFFE_ENFORCE( \ - arg_map_.at(name).has_##fieldname(), \ + it->second.has_##fieldname(), \ "Argument ", \ name, \ " does not have the right field: expected field " #fieldname); \ - auto value = arg_map_.at(name).fieldname(); \ + auto value = it->second.fieldname(); \ if (enforce_lossless_conversion) { \ auto supportsConversion = \ SupportsLosslessConversion(value); \ @@ -391,11 +396,12 @@ std::ostream& operator<<(std::ostream& output, const NetDef& n) { } \ template <> \ C10_EXPORT bool ArgumentHelper::HasSingleArgumentOfType( \ - const string& name) const { \ - if (arg_map_.count(name) == 0) { \ + c10::string_view name) const { \ + auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name); \ + if (it == arg_map_.end()) { \ return false; \ } \ - return arg_map_.at(name).has_##fieldname(); \ + return it->second.has_##fieldname(); \ } INSTANTIATE_GET_SINGLE_ARGUMENT(float, f, false) @@ -415,13 +421,14 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(NetDef, n, false) #define INSTANTIATE_GET_REPEATED_ARGUMENT( \ T, fieldname, enforce_lossless_conversion) \ template <> \ - C10_EXPORT std::vector ArgumentHelper::GetRepeatedArgument( \ - const string& name, const std::vector& default_value) const { \ - if (arg_map_.count(name) == 0) { \ + C10_EXPORT std::vector ArgumentHelper::GetRepeatedArgument( \ + c10::string_view name, const std::vector& default_value) const { \ + auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name); \ + if (it == arg_map_.end()) { \ return default_value; \ } \ - std::vector values; \ - for (const auto& v : arg_map_.at(name).fieldname()) { \ + std::vector values; \ + for (const auto& v : it->second.fieldname()) { \ if (enforce_lossless_conversion) { \ auto supportsConversion = \ SupportsLosslessConversion(v); \ @@ -531,7 +538,7 @@ C10_EXPORT bool HasInput(const OperatorDef& op, const std::string& input) { // Return the argument index or -1 if it does not exist. C10_EXPORT int GetArgumentIndex( const google::protobuf::RepeatedPtrField& args, - const string& name) { + c10::string_view name) { int index = 0; for (const Argument& arg : args) { if (arg.name() == name) { @@ -544,7 +551,7 @@ C10_EXPORT int GetArgumentIndex( C10_EXPORT const Argument& GetArgument( const OperatorDef& def, - const string& name) { + c10::string_view name) { int index = GetArgumentIndex(def.arg(), name); if (index != -1) { return def.arg(index); @@ -557,7 +564,7 @@ C10_EXPORT const Argument& GetArgument( } } -C10_EXPORT const Argument& GetArgument(const NetDef& def, const string& name) { +C10_EXPORT const Argument& GetArgument(const NetDef& def, c10::string_view name) { int index = GetArgumentIndex(def.arg(), name); if (index != -1) { return def.arg(index); @@ -572,7 +579,7 @@ C10_EXPORT const Argument& GetArgument(const NetDef& def, const string& name) { C10_EXPORT const Argument* GetArgumentPtr( const OperatorDef& def, - const string& name) { + c10::string_view name) { int index = GetArgumentIndex(def.arg(), name); if (index != -1) { return &def.arg(index); @@ -583,7 +590,7 @@ C10_EXPORT const Argument* GetArgumentPtr( C10_EXPORT const Argument* GetArgumentPtr( const NetDef& def, - const string& name) { + c10::string_view name) { int index = GetArgumentIndex(def.arg(), name); if (index != -1) { return &def.arg(index); @@ -594,7 +601,7 @@ C10_EXPORT const Argument* GetArgumentPtr( C10_EXPORT bool GetFlagArgument( const google::protobuf::RepeatedPtrField& args, - const string& name, + c10::string_view name, bool default_value) { int index = GetArgumentIndex(args, name); if (index != -1) { @@ -609,13 +616,13 @@ C10_EXPORT bool GetFlagArgument( C10_EXPORT bool GetFlagArgument( const OperatorDef& def, - const string& name, + c10::string_view name, bool default_value) { return GetFlagArgument(def.arg(), name, default_value); } C10_EXPORT bool -GetFlagArgument(const NetDef& def, const string& name, bool default_value) { +GetFlagArgument(const NetDef& def, c10::string_view name, bool default_value) { return GetFlagArgument(def.arg(), name, default_value); } diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h index 57676982c7851..b5c6b312b3ab3 100644 --- a/caffe2/utils/proto_utils.h +++ b/caffe2/utils/proto_utils.h @@ -8,10 +8,18 @@ #endif // !CAFFE2_USE_LITE_PROTO #include +#include #include "caffe2/utils/proto_wrap.h" #include "caffe2/proto/caffe2_pb.h" +#ifndef C10_ANDROID +#define CAFFE2_ENABLE_REDUCED_STRINGS_IN_ARGUMENT_LOOKUP +#define CAFFE2_ARG_MAP_FIND(map, key) map.find(key) +#else +#define CAFFE2_ARG_MAP_FIND(map, key) map.find(std::string(key)) +#endif + namespace caffe2 { using std::string; @@ -204,40 +212,40 @@ TORCH_API bool HasInput(const OperatorDef& op, const std::string& input); class C10_EXPORT ArgumentHelper { public: template - static bool HasArgument(const Def& def, const string& name) { + static bool HasArgument(const Def& def, c10::string_view name) { return ArgumentHelper(def).HasArgument(name); } template static T GetSingleArgument( const Def& def, - const string& name, + c10::string_view name, const T& default_value) { return ArgumentHelper(def).GetSingleArgument(name, default_value); } template - static bool HasSingleArgumentOfType(const Def& def, const string& name) { + static bool HasSingleArgumentOfType(const Def& def, c10::string_view name) { return ArgumentHelper(def).HasSingleArgumentOfType(name); } template static std::vector GetRepeatedArgument( const Def& def, - const string& name, + c10::string_view name, const std::vector& default_value = std::vector()) { return ArgumentHelper(def).GetRepeatedArgument(name, default_value); } template - static MessageType GetMessageArgument(const Def& def, const string& name) { + static MessageType GetMessageArgument(const Def& def, c10::string_view name) { return ArgumentHelper(def).GetMessageArgument(name); } template static std::vector GetRepeatedMessageArgument( const Def& def, - const string& name) { + c10::string_view name) { return ArgumentHelper(def).GetRepeatedMessageArgument(name); } @@ -255,24 +263,25 @@ class C10_EXPORT ArgumentHelper { explicit ArgumentHelper(const OperatorDef& def); explicit ArgumentHelper(const NetDef& netdef); - bool HasArgument(const string& name) const; + bool HasArgument(c10::string_view name) const; template - T GetSingleArgument(const string& name, const T& default_value) const; + T GetSingleArgument(c10::string_view name, const T& default_value) const; template - bool HasSingleArgumentOfType(const string& name) const; + bool HasSingleArgumentOfType(c10::string_view name) const; template std::vector GetRepeatedArgument( - const string& name, + c10::string_view name, const std::vector& default_value = std::vector()) const; template - MessageType GetMessageArgument(const string& name) const { - CAFFE_ENFORCE(arg_map_.count(name), "Cannot find parameter named ", name); + MessageType GetMessageArgument(c10::string_view name) const { + auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name); + CAFFE_ENFORCE(it != arg_map_.end(), "Cannot find parameter named ", name); MessageType message; - if (arg_map_.at(name).has_s()) { + if (it->second.has_s()) { CAFFE_ENFORCE( - message.ParseFromString(arg_map_.at(name).s()), + message.ParseFromString(it->second.s()), "Failed to parse content from the string"); } else { VLOG(1) << "Return empty message for parameter " << name; @@ -281,42 +290,47 @@ class C10_EXPORT ArgumentHelper { } template - std::vector GetRepeatedMessageArgument(const string& name) const { - CAFFE_ENFORCE(arg_map_.count(name), "Cannot find parameter named ", name); - std::vector messages(arg_map_.at(name).strings_size()); + std::vector GetRepeatedMessageArgument(c10::string_view name) const { + auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name); + CAFFE_ENFORCE(it != arg_map_.end(), "Cannot find parameter named ", name); + std::vector messages(it->second.strings_size()); for (int i = 0; i < messages.size(); ++i) { CAFFE_ENFORCE( - messages[i].ParseFromString(arg_map_.at(name).strings(i)), + messages[i].ParseFromString(it->second.strings(i)), "Failed to parse content from the string"); } return messages; } private: - std::map arg_map_; + std::map +#endif + > arg_map_; }; // **** Arguments Utils ***** // Helper methods to get an argument from OperatorDef or NetDef given argument // name. Throws if argument does not exist. -TORCH_API const Argument& GetArgument(const OperatorDef& def, const string& name); -TORCH_API const Argument& GetArgument(const NetDef& def, const string& name); +TORCH_API const Argument& GetArgument(const OperatorDef& def, c10::string_view name); +TORCH_API const Argument& GetArgument(const NetDef& def, c10::string_view name); // Helper methods to get an argument from OperatorDef or NetDef given argument // name. Returns nullptr if argument does not exist. -TORCH_API const Argument* GetArgumentPtr(const OperatorDef& def, const string& name); -TORCH_API const Argument* GetArgumentPtr(const NetDef& def, const string& name); +TORCH_API const Argument* GetArgumentPtr(const OperatorDef& def, c10::string_view name); +TORCH_API const Argument* GetArgumentPtr(const NetDef& def, c10::string_view name); // Helper methods to query a boolean argument flag from OperatorDef or NetDef // given argument name. If argument does not exist, return default value. // Throws if argument exists but the type is not boolean. TORCH_API bool GetFlagArgument( const OperatorDef& def, - const string& name, + c10::string_view name, bool default_value = false); TORCH_API bool GetFlagArgument( const NetDef& def, - const string& name, + c10::string_view name, bool default_value = false); TORCH_API Argument* GetMutableArgument( From 25e2578967494ee88da66820d861c426eb7a742d Mon Sep 17 00:00:00 2001 From: Tanvir Zaman Date: Wed, 1 Sep 2021 13:31:45 -0700 Subject: [PATCH 431/530] Fix bytes_written and bytes_read (#64244) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64244 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64040 In operator cost inference functions, in many places we are using sizeof(x.data_type()). Since data_type() returns a 32 bit integer from [this enum](https://www.internalfb.com/code/fbsource/[15e7ffe4073cf08c61077c7c24a4839504b964a2]/fbcode/caffe2/caffe2/proto/caffe2.proto?lines=20), we are basically always getting 4 for sizeof(x.data_type()) no matter what actual data type x has. Big thanks to Jack Langman for specifically pointing to this bug. We would instead use the size in bytes based on actual data type. Test Plan: Added unit tests BatchMatMulMemCostTest: buck test //caffe2/caffe2/fb/fbgemm:batch_matmul_op_test -- BatchMatMulMemCostTest Extended existing unit test test_columnwise_concat for different data types: buck test //caffe2/caffe2/python/operator_test:concat_op_cost_test -- test_columnwise_concat Reviewed By: CrazySherman Differential Revision: D30656698 fbshipit-source-id: d42c0c9a0c5b0ddc5dba39e4994f1f85a5e618bf --- caffe2/core/operator_schema.h | 17 ++- caffe2/operators/batch_matmul_op.cc | 113 ++++++++++-------- caffe2/operators/concat_split_op.cc | 15 ++- caffe2/operators/conv_pool_op_base.h | 15 ++- caffe2/operators/distance_op.cc | 28 +++-- caffe2/operators/fc_inference.cc | 22 ++-- caffe2/operators/one_hot_ops.cc | 30 +++-- caffe2/operators/utility_ops.cc | 13 +- .../operator_test/concat_op_cost_test.py | 54 +++++---- caffe2/python/workspace_test.py | 2 +- caffe2/sgd/adagrad_op.cc | 55 ++++++--- 11 files changed, 224 insertions(+), 140 deletions(-) diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index 64f5ef3ed883a..0d048eb8d26e9 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -6,12 +6,13 @@ #include #include #include -#include #include +#include #include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" +#include "caffe2/core/types.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/filler.h" #include "caffe2/utils/proto_utils.h" @@ -273,8 +274,8 @@ class TORCH_API OpSchema { OpSchema& Arg(const char* name, const char* description, bool required = false); -#define DECLARE_STANDARD_ARG(name, str) \ - static const char* Arg_##name; \ +#define DECLARE_STANDARD_ARG(name, str) \ + static const char* Arg_##name; \ OpSchema& Arg##name(const char* description); DECLARE_STANDARD_ARG(IsTest, is_test) @@ -339,7 +340,9 @@ class TORCH_API OpSchema { return inplace_enforced_(x, y); } - TORCH_API friend std::ostream& operator<<(std::ostream& out, const OpSchema& schema); + TORCH_API friend std::ostream& operator<<( + std::ostream& out, + const OpSchema& schema); const std::vector& args() const { return args_; @@ -562,8 +565,10 @@ OpSchema::Cost PointwiseCostInference( } c.flops = nElemX * OpsPerPoint; - c.bytes_read = nElemRead * sizeof(X.data_type()); - c.bytes_written = nElemX * sizeof(X.data_type()); + auto const& X_element_size_byte = + DataTypeToTypeMeta(X.data_type()).itemsize(); + c.bytes_read = nElemRead * X_element_size_byte; + c.bytes_written = nElemX * X_element_size_byte; return c; } diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc index 32799ced10671..205acf74f1572 100644 --- a/caffe2/operators/batch_matmul_op.cc +++ b/caffe2/operators/batch_matmul_op.cc @@ -1,6 +1,7 @@ #include "caffe2/operators/batch_matmul_op.h" #include "caffe2/core/operator_schema.h" +#include "caffe2/core/types.h" namespace caffe2 { @@ -116,9 +117,13 @@ OpSchema::Cost CostInferenceForBatchMatMul( K = in[0].dims(ndims_A - 1); } + auto const& A_element_size_byte = + DataTypeToTypeMeta(A.data_type()).itemsize(); + auto const& Y_element_size_byte = + DataTypeToTypeMeta(Y.data_type()).itemsize(); c.flops = 2 * nElemY * K; - c.bytes_read = (nElemA + nElemB) * sizeof(A.data_type()); - c.bytes_written = nElemY * sizeof(Y.data_type()); + c.bytes_read = (nElemA + nElemB) * A_element_size_byte; + c.bytes_written = nElemY * Y_element_size_byte; c.params_bytes = 0; return c; } @@ -180,72 +185,76 @@ class GetBatchMatMulGradient : public GradientMakerBase { auto no_trans_arg = vector(); auto trans_a_arg = vector{MakeArgument("trans_a", 1)}; auto trans_b_arg = vector{MakeArgument("trans_b", 1)}; - auto trans_both_arg = vector{MakeArgument("trans_a", 1), - MakeArgument("trans_b", 1)}; + auto trans_both_arg = vector{ + MakeArgument("trans_a", 1), MakeArgument("trans_b", 1)}; if (trans_a) { if (trans_b) { // A'B': // dA = B'G', dB = G'A' - return vector{CreateOperatorDef( - "BatchMatMul", - "", - vector{I(1), GO(0)}, - vector{GI(0)}, - trans_both_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(0)}, - vector{GI(1)}, - trans_both_arg)}; + return vector{ + CreateOperatorDef( + "BatchMatMul", + "", + vector{I(1), GO(0)}, + vector{GI(0)}, + trans_both_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(0)}, + vector{GI(1)}, + trans_both_arg)}; } else { // A'B: // dA = BG', dB = AG - return vector{CreateOperatorDef( - "BatchMatMul", - "", - vector{I(1), GO(0)}, - vector{GI(0)}, - trans_b_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{I(0), GO(0)}, - vector{GI(1)}, - no_trans_arg)}; + return vector{ + CreateOperatorDef( + "BatchMatMul", + "", + vector{I(1), GO(0)}, + vector{GI(0)}, + trans_b_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{I(0), GO(0)}, + vector{GI(1)}, + no_trans_arg)}; } } else { if (trans_b) { // AB': // dA = GB, dB = G'A - return vector{CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(1)}, - vector{GI(0)}, - no_trans_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(0)}, - vector{GI(1)}, - trans_a_arg)}; + return vector{ + CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(1)}, + vector{GI(0)}, + no_trans_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(0)}, + vector{GI(1)}, + trans_a_arg)}; } else { // AB: // dA = GB', dB = A'G - return vector{CreateOperatorDef( - "BatchMatMul", - "", - vector{GO(0), I(1)}, - vector{GI(0)}, - trans_b_arg), - CreateOperatorDef( - "BatchMatMul", - "", - vector{I(0), GO(0)}, - vector{GI(1)}, - trans_a_arg)}; + return vector{ + CreateOperatorDef( + "BatchMatMul", + "", + vector{GO(0), I(1)}, + vector{GI(0)}, + trans_b_arg), + CreateOperatorDef( + "BatchMatMul", + "", + vector{I(0), GO(0)}, + vector{GI(1)}, + trans_a_arg)}; } } } diff --git a/caffe2/operators/concat_split_op.cc b/caffe2/operators/concat_split_op.cc index 8eceb5ab4a577..86d6536b8880d 100644 --- a/caffe2/operators/concat_split_op.cc +++ b/caffe2/operators/concat_split_op.cc @@ -101,9 +101,12 @@ OpSchema::Cost CostInferenceForSplit( CAFFE_ENFORCE_GT(in.size(), 0); struct OpSchema::Cost cost; cost.flops = 0; - auto input_bytes_count = nElemFromDim(in[0]) * sizeof(in[0].data_type()); - auto split_bytes_count = - (in.size() == 1) ? 0 : nElemFromDim(in[1]) * sizeof(in[1].data_type()); + auto const& input_0_element_size_byte = + DataTypeToTypeMeta(in[0].data_type()).itemsize(); + auto input_bytes_count = nElemFromDim(in[0]) * input_0_element_size_byte; + auto split_bytes_count = in.size() > 1 + ? nElemFromDim(in[1]) * DataTypeToTypeMeta(in[1].data_type()).itemsize() + : 0; // There can be two input blobs: // (1) actual tensor to be split // (2) lengths of outputs along split axis @@ -329,11 +332,13 @@ OpSchema::Cost CostInferenceForConcat( } auto split_info_bytes_count = in.size() * sizeof(int); + auto const& input_0_element_size_byte = + DataTypeToTypeMeta(in[0].data_type()).itemsize(); struct OpSchema::Cost cost; cost.flops = 0; - cost.bytes_read = nElemRead * sizeof(in[0].data_type()); + cost.bytes_read = nElemRead * input_0_element_size_byte; cost.bytes_written = - size * sizeof(in[0].data_type()) + split_info_bytes_count; + size * input_0_element_size_byte + split_info_bytes_count; cost.params_bytes = 0; return cost; } diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h index 25bd99a92e50f..b356ef952d79c 100644 --- a/caffe2/operators/conv_pool_op_base.h +++ b/caffe2/operators/conv_pool_op_base.h @@ -7,6 +7,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" +#include "caffe2/core/types.h" #include "caffe2/proto/caffe2_legacy.pb.h" #include "caffe2/utils/math.h" @@ -519,14 +520,20 @@ class ConvPoolOpBase : public Operator { uint64_t nElemW = nElemFromDim(W); uint64_t nElemBias = inputs.size() > 2 ? nElemFromDim(inputs[2]) : 0; + auto const& X_elemenet_size_byte = + DataTypeToTypeMeta(X.data_type()).itemsize(); + auto const& Y_element_size_byte = + DataTypeToTypeMeta(Y.data_type()).itemsize(); + auto const& W_element_size_byte = + DataTypeToTypeMeta(W.data_type()).itemsize(); + // grouping is NOT properly handled yet c.flops = N * Y_t * Y_h * Y_w * kernel_t * kernel_w * kernel_h * in_channels * out_channels * 2; - c.bytes_read = (nElemX + nElemW + nElemBias) * sizeof(X.data_type()); - c.bytes_written = - N * out_channels * Y_t * Y_h * Y_w * sizeof(Y.data_type()); + c.bytes_read = (nElemX + nElemW + nElemBias) * X_elemenet_size_byte; + c.bytes_written = N * out_channels * Y_t * Y_h * Y_w * Y_element_size_byte; c.params_bytes = out_channels * in_channels * kernel_t * kernel_h * - kernel_w * sizeof(W.data_type()); + kernel_w * W_element_size_byte; return c; } diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc index 1529534d8fb2e..9ea8eea5a2725 100644 --- a/caffe2/operators/distance_op.cc +++ b/caffe2/operators/distance_op.cc @@ -1,4 +1,5 @@ #include "caffe2/operators/distance_op.h" +#include "caffe2/core/types.h" #include "caffe2/utils/eigen_utils.h" #ifdef CAFFE2_USE_MKLDNN #include @@ -7,7 +8,7 @@ namespace caffe2 { -template<> +template <> bool SquaredL2DistanceOp::RunOnDevice() { auto& X = Input(0); auto& Y = Input(1); @@ -257,7 +258,9 @@ OpSchema::Cost CostInferenceForDotProduct( CAFFE_ENFORCE_EQ(out[0].dims().size(), 1); struct OpSchema::Cost c = PointwiseCostInference<2>(def, in); - c.bytes_written = out[0].dims(0) * sizeof(out[0].data_type()); + auto const& out_0_element_size_byte = + DataTypeToTypeMeta(out[0].data_type()).itemsize(); + c.bytes_written = out[0].dims(0) * out_0_element_size_byte; c.params_bytes = 0; return c; } @@ -379,10 +382,12 @@ bool DotProductWithPaddingOp::RunOnDevice() { } // L2 -REGISTER_CPU_OPERATOR(SquaredL2Distance, - SquaredL2DistanceOp); -REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient, - SquaredL2DistanceGradientOp); +REGISTER_CPU_OPERATOR( + SquaredL2Distance, + SquaredL2DistanceOp); +REGISTER_CPU_OPERATOR( + SquaredL2DistanceGradient, + SquaredL2DistanceGradientOp); OPERATOR_SCHEMA(SquaredL2Distance) .NumInputs(2) @@ -402,7 +407,8 @@ class GetSquaredL2DistanceGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; vector GetGradientDefs() override { return SingleGradientDef( - "SquaredL2DistanceGradient", "", + "SquaredL2DistanceGradient", + "", vector{I(0), I(1), GO(0)}, vector{GI(0), GI(1)}); } @@ -762,9 +768,9 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase { replicate = GetArgument(Def(), "replicate").i(); } - const auto dot_arg = - vector{MakeArgument("pad_value", pad_value), - MakeArgument("replicate", replicate)}; + const auto dot_arg = vector{ + MakeArgument("pad_value", pad_value), + MakeArgument("replicate", replicate)}; return SingleGradientDef( "DotProductWithPaddingGradient", @@ -775,4 +781,4 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase { } }; REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient); -} // namespace caffe2 +} // namespace caffe2 diff --git a/caffe2/operators/fc_inference.cc b/caffe2/operators/fc_inference.cc index a44c230980c7f..ba1b7122cdc9d 100644 --- a/caffe2/operators/fc_inference.cc +++ b/caffe2/operators/fc_inference.cc @@ -1,4 +1,5 @@ #include "caffe2/operators/fc_inference.h" +#include "caffe2/core/types.h" namespace caffe2 { std::vector FCShapeInference( @@ -51,11 +52,12 @@ OpSchema::Cost CostInferenceForFC( ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1])) : size_to_dim_(canonical_axis_w, GetDimsVector(in[1])); - const auto& X = in[0]; + auto const& X_element_size_byte = + DataTypeToTypeMeta(in[0].data_type()).itemsize(); c.flops = M * N * (2 * K + 1); - c.bytes_read = (K * (M + N) + N) * sizeof(X.data_type()); - c.bytes_written = M * N * sizeof(X.data_type()); - c.params_bytes = (K * N + N) * sizeof(X.data_type()); + c.bytes_read = (K * (M + N) + N) * X_element_size_byte; + c.bytes_written = M * N * X_element_size_byte; + c.params_bytes = (K * N + N) * X_element_size_byte; return c; } @@ -94,7 +96,11 @@ OpSchema::Cost CostInferenceForFCGradient( CAFFE_ENFORCE_LT(0, out.size()); const TensorShape dW = out[0]; + auto const& dW_element_size_byte = + DataTypeToTypeMeta(dW.data_type()).itemsize(); const TensorShape db = out[1]; + auto const& db_element_size_byte = + DataTypeToTypeMeta(db.data_type()).itemsize(); auto axis = helper.GetSingleArgument("axis", 1); const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size()); @@ -111,15 +117,17 @@ OpSchema::Cost CostInferenceForFCGradient( uint64_t size_db = nElemFromDim(db); c.flops = M * N * (2 * K + 1); - c.bytes_written = (size_dW + size_db) * sizeof(float); + c.bytes_written = + size_dW * dW_element_size_byte + size_db * db_element_size_byte; c.params_bytes = (K * N + N) * sizeof(float); if (out.size() == 3) { const TensorShape dX = out[2]; uint64_t size_dX = nElemFromDim(dX); - + auto const& dX_element_size_byte = + DataTypeToTypeMeta(dX.data_type()).itemsize(); c.flops += 2 * M * N * K; - c.bytes_written += size_dX * sizeof(float); + c.bytes_written += size_dX * dX_element_size_byte; } return c; } diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc index c3eaf05db0e8f..55c73a5be22c4 100644 --- a/caffe2/operators/one_hot_ops.cc +++ b/caffe2/operators/one_hot_ops.cc @@ -2,6 +2,7 @@ #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" +#include "caffe2/core/types.h" namespace caffe2 { @@ -78,12 +79,21 @@ OpSchema::Cost CostInferenceForBatchOneHot( const auto& length = in[1]; const auto& values = in[2]; - uint64_t nBytesData = nElemFromDim(data) * sizeof(data.data_type()); - uint64_t nBytesLength = nElemFromDim(length) * sizeof(length.data_type()); - uint64_t nBytesValues = nElemFromDim(values) * sizeof(values.data_type()); + auto const& data_element_size_byte = + DataTypeToTypeMeta(data.data_type()).itemsize(); + auto const& length_element_size_byte = + DataTypeToTypeMeta(length.data_type()).itemsize(); + auto const& values_element_size_byte = + DataTypeToTypeMeta(values.data_type()).itemsize(); + auto const& output_element_size_byte = + DataTypeToTypeMeta(output.data_type()).itemsize(); + + uint64_t nBytesData = nElemFromDim(data) * data_element_size_byte; + uint64_t nBytesLength = nElemFromDim(length) * length_element_size_byte; + uint64_t nBytesValues = nElemFromDim(values) * values_element_size_byte; c.flops = 0; c.bytes_read = nBytesData + nBytesLength + nBytesValues; - c.bytes_written = nElemFromDim(output) * sizeof(output.data_type()); + c.bytes_written = nElemFromDim(output) * output_element_size_byte; c.params_bytes = 0; return c; } @@ -145,15 +155,15 @@ bool BatchBucketOneHotOp::RunOnDevice() { for (int64_t j = 0; j < D; j++) { // here we assume the boundary values for each feature are sorted int64_t lower_bucket_idx = std::lower_bound( - boundaries_offset, - boundaries_offset + lens_data[j], - input_data[pos]) - + boundaries_offset, + boundaries_offset + lens_data[j], + input_data[pos]) - boundaries_offset; int64_t upper_bucket_idx = std::upper_bound( - boundaries_offset, - boundaries_offset + lens_data[j], - input_data[pos]) - + boundaries_offset, + boundaries_offset + lens_data[j], + input_data[pos]) - boundaries_offset; int64_t bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2; diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index 8b5e116024b81..561da9189b388 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -1,6 +1,7 @@ #include "caffe2/operators/utility_ops.h" #include #include +#include "caffe2/core/types.h" #include "caffe2/utils/eigen_utils.h" namespace caffe2 { @@ -34,9 +35,11 @@ OpSchema::Cost CostInferenceForWeightedSum( const auto& nElem = nElemFromDim(X0); const auto& nInputs = in.size(); c.flops = (nInputs - 1) * nElem; - c.bytes_read = (nInputs / 2) * (nElem + 1) * sizeof(X0.data_type()); - c.bytes_written = nElem * sizeof(X0.data_type()); - c.params_bytes = (nInputs / 2) * sizeof(X0.data_type()); + auto const& X0_element_size_byte = + DataTypeToTypeMeta(X0.data_type()).itemsize(); + c.bytes_read = (nInputs / 2) * (nElem + 1) * X0_element_size_byte; + c.bytes_written = nElem * X0_element_size_byte; + c.params_bytes = (nInputs / 2) * X0_element_size_byte; return c; } @@ -48,9 +51,7 @@ REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp); REGISTER_CPU_OPERATOR(SumInt, SumOp); REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp); REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp); -REGISTER_CPU_OPERATOR( - ScatterWeightedSum, - ScatterWeightedSumOp); +REGISTER_CPU_OPERATOR(ScatterWeightedSum, ScatterWeightedSumOp); REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp); REGISTER_CPU_OPERATOR(Scatter, ScatterOp); diff --git a/caffe2/python/operator_test/concat_op_cost_test.py b/caffe2/python/operator_test/concat_op_cost_test.py index 996b330be4947..7dab4d6bd5d1f 100644 --- a/caffe2/python/operator_test/concat_op_cost_test.py +++ b/caffe2/python/operator_test/concat_op_cost_test.py @@ -7,33 +7,39 @@ class TestConcatOpCost(TestCase): def test_columnwise_concat(self): - workspace.ResetWorkspace() - workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)) - workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=np.int32)) - concat_op = core.CreateOperator( - "Concat", - ["input_1", "input_2"], - ["output", "split_info"], - ) - workspace.RunOperatorOnce(concat_op) + def _test_columnwise_concat_for_type(dtype): + workspace.ResetWorkspace() + workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)) + workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=dtype)) + concat_op = core.CreateOperator( + "Concat", + ["input_1", "input_2"], + ["output", "split_info"], + ) + workspace.RunOperatorOnce(concat_op) - output = workspace.FetchBlob("output") - self.assertTupleEqual(output.shape, (2, 4)) - np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]]) + output = workspace.FetchBlob("output") + self.assertTupleEqual(output.shape, (2, 4)) + np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]]) - flops, bytes_written, bytes_read = workspace.GetOperatorCost( - concat_op, concat_op.input - ) + flops, bytes_written, bytes_read = workspace.GetOperatorCost( + concat_op, concat_op.input + ) - self.assertEqual(flops, 0) - self.assertEqual( - bytes_read, - sum(workspace.FetchBlob(b).nbytes for b in concat_op.input), - ) - self.assertEqual( - bytes_written, - sum(workspace.FetchBlob(b).nbytes for b in concat_op.output), - ) + self.assertEqual(flops, 0) + self.assertEqual( + bytes_read, + sum(workspace.FetchBlob(b).nbytes for b in concat_op.input), + ) + self.assertEqual( + bytes_written, + sum(workspace.FetchBlob(b).nbytes for b in concat_op.output), + ) + + [ + _test_columnwise_concat_for_type(t) + for t in [np.int64, np.float, np.half, np.int8] + ] def test_split_then_concat(self): workspace.ResetWorkspace() diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index afb2065027075..1bf7b607e1b7e 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -60,7 +60,7 @@ def testGetOperatorCost(self): self.assertTupleEqual( op_cost, namedtuple("Cost", ["flops", "bytes_written", "bytes_read"])( - 1152, 256, 2084 + 1152, 256, 4168 ), ) diff --git a/caffe2/sgd/adagrad_op.cc b/caffe2/sgd/adagrad_op.cc index 0de50f03e62d5..0b6f604b48cdb 100644 --- a/caffe2/sgd/adagrad_op.cc +++ b/caffe2/sgd/adagrad_op.cc @@ -1,4 +1,5 @@ #include "adagrad_op.h" +#include "caffe2/core/types.h" namespace caffe2 { @@ -23,22 +24,30 @@ static OpSchema::Cost CostInferenceForAdagrad( // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = grad_size * 10; + auto const& moment_element_size_byte = + DataTypeToTypeMeta(moment.data_type()).itemsize(); + auto const& param_element_size_byte = + DataTypeToTypeMeta(param.data_type()).itemsize(); + auto const& grad_element_size_byte = + DataTypeToTypeMeta(grad.data_type()).itemsize(); + auto const& lr_element_size_byte = + DataTypeToTypeMeta(lr.data_type()).itemsize(); uint64_t bytes_written = - grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type())); + grad_size * param_element_size_byte + moment_element_size_byte; if (output_size == 3) { // also need to output effective learning rate in this case // assume it's the same data type as lr - bytes_written += grad_size * sizeof(lr.data_type()); + bytes_written += grad_size * lr_element_size_byte; } else if (output_size == 4) { // also need to output effective learning rate and updates in this case // assume update is the same data type as param bytes_written += - grad_size * (sizeof(lr.data_type()) + sizeof(param.data_type())); + grad_size * (lr_element_size_byte + param_element_size_byte); } c.bytes_written = bytes_written; c.bytes_read = c.bytes_written + - grad_size * (sizeof(grad.data_type()) + sizeof(lr.data_type())); + grad_size * (grad_element_size_byte + lr_element_size_byte); return c; } @@ -102,10 +111,18 @@ static OpSchema::Cost CostInferenceForSparseAdagrad( // (optimistically count sqrt as one flop). // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = grad_size * 7; + auto const& param_element_size_byte = + DataTypeToTypeMeta(param.data_type()).itemsize(); + auto const& moment_element_size_byte = + DataTypeToTypeMeta(moment.data_type()).itemsize(); c.bytes_written = - grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type())); - c.bytes_read = c.bytes_written + grad_size * sizeof(grad.data_type()) + - n * sizeof(indices.data_type()); + grad_size * (param_element_size_byte + moment_element_size_byte); + auto const& grad_element_size_byte = + DataTypeToTypeMeta(grad.data_type()).itemsize(); + auto const& indices_element_size_byte = + DataTypeToTypeMeta(indices.data_type()).itemsize(); + c.bytes_read = c.bytes_written + grad_size * grad_element_size_byte + + n * indices_element_size_byte; return c; } @@ -153,6 +170,16 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad( OpSchema::Cost c; if (n > 0) { + auto const& param_element_size_byte = + DataTypeToTypeMeta(param.data_type()).itemsize(); + auto const& moment_element_size_byte = + DataTypeToTypeMeta(moment.data_type()).itemsize(); + auto const& grad_element_size_byte = + DataTypeToTypeMeta(grad.data_type()).itemsize(); + auto const& indices_element_size_byte = + DataTypeToTypeMeta(indices.data_type()).itemsize(); + auto const& lr_element_size_byte = + DataTypeToTypeMeta(lr.data_type()).itemsize(); auto block_size = grad_size / n; if (block_size == 1) { // +2: applying weight decay and add to grads @@ -161,22 +188,22 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad( // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = n * 9; c.bytes_written = - n * (sizeof(param.data_type()) + sizeof(moment.data_type())); + n * (param_element_size_byte + moment_element_size_byte); c.bytes_read = c.bytes_written + n * - (sizeof(grad.data_type()) + sizeof(indices.data_type()) + - sizeof(lr.data_type())); + (grad_element_size_byte + indices_element_size_byte + + lr_element_size_byte); } else { // 5 per block (not counting index transforms) // 8 for each value of a block // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) c.flops = n * (5 + (block_size * 8)); - c.bytes_written = - n * sizeof(moment.data_type()) + n * block_size * (param.data_type()); + c.bytes_written = n * moment_element_size_byte + + n * block_size * param_element_size_byte; - c.bytes_read = c.bytes_written + n * (sizeof(lr.data_type())) + + c.bytes_read = c.bytes_written + n * lr_element_size_byte + 2 * n * block_size * - (sizeof(grad.data_type()) + sizeof(param.data_type())); + (grad_element_size_byte + param_element_size_byte); } } return c; From e322547fe6dd4f0ca9261a1ac2ae7095800b98a1 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Wed, 1 Sep 2021 13:34:48 -0700 Subject: [PATCH 432/530] Add forward AD support for custom Functions (#64061) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64061 Test Plan: Imported from OSS Reviewed By: soulitzer Differential Revision: D30640868 Pulled By: albanD fbshipit-source-id: b0e6610430a879074d6d5306443772fc154b431f --- test/test_autograd.py | 114 +++++++++++++++ torch/autograd/function.py | 24 +++ torch/csrc/autograd/custom_function.cpp | 187 +++++++++++++++++++++++- torch/csrc/autograd/custom_function.h | 12 +- torch/csrc/autograd/python_function.cpp | 59 +++++++- 5 files changed, 385 insertions(+), 11 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 8b3c8bd33af66..ebe3aa5d29e18 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -5494,6 +5494,11 @@ def backward(ctx, foo): def vjp(ctx, foo): return foo + class BadJvp(Function): + @staticmethod + def forward(ctx, foo): + return foo.clone() + inp = torch.rand(1, requires_grad=True) with self.assertRaisesRegex(NotImplementedError, "must implement the forward"): BadFw.apply(inp) @@ -5504,6 +5509,115 @@ def vjp(ctx, foo): with self.assertRaisesRegex(RuntimeError, "Implementing both 'backward' and 'vjp'"): BadBw2.apply(inp).sum().backward() + with self.assertRaisesRegex(RuntimeError, "must implement the jvp function"): + with fwAD.dual_level(): + d = fwAD.make_dual(inp, torch.rand_like(inp)) + res = BadJvp.apply(d) + + def test_custom_function_forward_mode_view_checks(self): + flag_to_error = { + "ok": None, + "not_a_view": "jvp is not returning a view", + "not_a_view_of_inp": "jvp is not returning a view of the given", + "not_a_view_of_inp_base": "jvp is not returning a view of the same base", + } + + class ViewFn(Function): + @staticmethod + def forward(ctx, foo, flag): + ctx.flag = flag + ctx.size = foo.size() + return foo.narrow(0, 0, 2) + + @staticmethod + def vjp(ctx, gO): + gI = gO.new_zeros(ctx.size) + gI.narrow(0, 0, 2).copy_(gO) + return gI, None + + @staticmethod + def jvp(ctx, gI, _): + res = gI.narrow(0, 0, 2) + if ctx.flag != "ok": + # Break the view in the gradients! + res = res.clone() + if ctx.flag in ["not_a_view_of_inp", "not_a_view_of_inp_base"]: + # Result should be a view, just of the wrong thing + res = res.view_as(res) + return res + + inp = torch.rand(4, 4, dtype=torch.double, requires_grad=True) + + for flag, msg in flag_to_error.items(): + def test_fn(inp): + if flag == "not_a_view_of_inp_base": + inp = inp.view_as(inp) + return ViewFn.apply(inp, flag) + + if msg is None: + gradcheck(test_fn, inp, check_forward_ad=True) + else: + with self.assertRaisesRegex(RuntimeError, msg): + gradcheck(test_fn, inp, check_forward_ad=True) + + def test_custom_function_forward_mode_inplace_checks(self): + class InplaceFn(Function): + @staticmethod + def forward(ctx, foo, flag): + ctx.mark_dirty(foo) + ctx.flag = flag + foo.mul_(2) + return foo + + @staticmethod + def vjp(ctx, gO): + return 2 * gO, None + + @staticmethod + def jvp(ctx, gI, _): + if ctx.flag: + # Don't do the change inplace + return 2 * gI + else: + gI.mul_(2) + return gI + + inp = torch.rand(4, 4, dtype=torch.double, requires_grad=True) + + def test_fn(inp, flag): + inp = inp.clone() + return InplaceFn.apply(inp, flag) + + gradcheck(test_fn, (inp, False), check_forward_ad=True) + + with self.assertRaisesRegex(RuntimeError, "inplace custom Function is not modifying the forward mode gradients inplace"): + gradcheck(test_fn, (inp, True), check_forward_ad=True) + + def test_custom_function_forward_mode_wrong_formula(self): + class UserFn(Function): + @staticmethod + def forward(ctx, foo, should_fail): + ctx.should_fail = should_fail + return foo * 2 + + @staticmethod + def vjp(ctx, gO): + return 2 * gO, None + + @staticmethod + def jvp(ctx, gI, _): + if ctx.should_fail: + # Wrong gradient formula + return 3 * gI + else: + return 2 * gI + + inp = torch.rand(10, dtype=torch.double, requires_grad=True) + gradcheck(UserFn.apply, (inp, False), check_forward_ad=True) + + with self.assertRaisesRegex(RuntimeError, "Jacobian computed with forward mode mismatch for output 0"): + gradcheck(UserFn.apply, (inp, True), check_forward_ad=True) + def test_custom_function_local_inplace(self): class MyFn(torch.autograd.Function): @staticmethod diff --git a/torch/autograd/function.py b/torch/autograd/function.py index 90aeea5f1dfea..909e71959320b 100644 --- a/torch/autograd/function.py +++ b/torch/autograd/function.py @@ -198,6 +198,10 @@ def apply(self, *args): user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn return user_fn(self, *args) + def apply_jvp(self, *args): + # _forward_cls is defined by derived class + return self._forward_cls.jvp(self, *args) # type: ignore[attr-defined] + class FunctionMeta(type): """Function metaclass. @@ -307,6 +311,26 @@ def backward(ctx: Any, *grad_outputs: Any) -> Any: # vjp and backward are alias of each other vjp = backward + @staticmethod + def jvp(ctx: Any, *grad_inputs: Any) -> Any: + r"""Defines a formula for differentiating the operation with forward mode + automatic differentiation. + This function is to be overridden by all subclasses. + It must accept a context :attr:`ctx` as the first argument, followed by + as many inputs as the :func:`forward` got (None will be passed in + for non tensor inputs of the forward function), + and it should return as many tensors as there were outputs to + :func:`forward`. Each argument is the gradient w.r.t the given input, + and each returned value should be the gradient w.r.t. the + corresponding output. If an output is not a Tensor or the function is not + differentiable with respect to that output, you can just pass None as a + gradient for that input. + + You can use the :attr:`ctx` object to pass any value from the forward to this + functions. + """ + raise NotImplementedError("You must implement the jvp function for custom " + "autograd.Function to use it with forward mode AD.") def once_differentiable(fn): diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp index fdcf9971a0606..1bb4cb836f1e8 100644 --- a/torch/csrc/autograd/custom_function.cpp +++ b/torch/csrc/autograd/custom_function.cpp @@ -26,8 +26,175 @@ Variable VariableInfo::zeros(at::OptionalDeviceGuard& device_guard) const { } } +// This function has two main goals: +// 1) Use the user-provided jvp function to populate the the outputs' forward gradient +// 2) Perform error checking to ensure that view and inplace ops are properly handled +// +// For 1) we have to: +// - Create a variable_list of grad_inputs based on the function inputs +// - Call the user jvp function with these to get the grad_outputs +// - Set the forward grad field on each output based on these grad_outputs +// +// For 2) we want to check the following: +// - If an output is a view, then the generated forward grad must be a view as well and +// the output's base's forward grad must be the output's forward grad's base. +// - If an input was modified inplace (it must be an output as well) we make sure that its +// forward grad was also modified inplace and already present on the corresponding output. +void _process_forward_mode_AD(const variable_list &inputs, + std::unordered_map inputs_mapping, + const at::ArrayRef> raw_outputs, + const optional_variable_list &outputs, + const std::unordered_set &non_differentiable, + const std::unordered_set &dirty_inputs, + _jvp_fn_t jvp_user_function) { + + // TODO handle multiple levels here + uint64_t level = 0; + + const auto num_inputs = inputs.size(); + const auto num_outputs = outputs.size(); + + // The tracking info below are used to perform the view and inplace checks. + // They are lazily initialized to reduce the cost of this function in the common + // case where the user is not using forward mode AD. + variable_list input_grads; + std::vector grad_versions; + std::vector grad_impls; + std::unordered_map inputs_bases; + + auto init_tracked_info = [&] () { + input_grads.resize(num_inputs); + grad_versions.resize(num_inputs); + grad_impls.resize(num_inputs); + + for (const auto i: c10::irange(num_inputs)) { + const auto& inp = inputs[i]; + if (inp.is_view() && impl::get_view_autograd_meta(inp)->has_fw_view()) { + inputs_bases.emplace(impl::get_view_autograd_meta(inp)->get_forward_view().base_.unsafeGetTensorImpl(), i); + } else { + inputs_bases.emplace(inp.unsafeGetTensorImpl(), i); + } + + } + }; + + bool any_input_has_grad = false; + // Extract the input's forward gradients and record any info we will need later + for (const auto i : c10::irange(num_inputs)) { + const auto& inp = inputs[i]; + if (!inp.defined()) { + continue; + } + const auto& fw_grad = inp._fw_grad(level); + if (fw_grad.defined()) { + if (!any_input_has_grad) { + any_input_has_grad = true; + init_tracked_info(); + } + input_grads[i] = fw_grad; + grad_versions[i] = fw_grad._version(); + grad_impls[i] = fw_grad.unsafeGetTensorImpl(); + } + } + + // If no input has forward grad, nothing to do here + if (!any_input_has_grad) { + return; + } + + + auto forward_grads = jvp_user_function(inputs, input_grads); + + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + const auto num_forward_grads = forward_grads.size(); + // contrary to backward mode, we don't allow returning too many gradients + TORCH_CHECK(num_forward_grads == num_outputs, "Function's jvp returned " + "an invalid number of of forward gradients (expected ", num_outputs, + " but got ", num_forward_grads, ")"); + + for (const auto i : c10::irange(num_outputs)) { + const auto& out = outputs[i].has_value()? outputs[i].value() : at::Tensor(); + const auto& out_grad = forward_grads[i]; + if (!out.defined()) { + TORCH_CHECK(!out_grad.defined(), "Function's jvp returned a gradient at position ", i, ", but " + " the corresponding forward output is not a differentiable Tensor"); + continue; + } + + TORCH_INTERNAL_ASSERT(raw_outputs[i].has_value()); + auto out_tensor_impl = raw_outputs[i].value().unsafeGetTensorImpl(); + bool is_input = inputs_mapping.count(out_tensor_impl) > 0; + bool is_modified = dirty_inputs.count(out_tensor_impl) > 0; + + if (is_modified) { + TORCH_CHECK(is_input, "Only input Tensors should be given to ctx.mark_dirty(). If a Tensor is not an input, there" + " is no need to pass it to mark_dirty()."); + auto inp_idx = inputs_mapping[out_tensor_impl]; + if (grad_impls[inp_idx]) { + // If there was already a forward grad for that input + // Just make sure that it is modified inplace and returned as-is + TORCH_CHECK(out_grad._version() != grad_versions[inp_idx], "An inplace custom Function is not modifying the " + "forward mode gradients inplace. If the forward is modifying an input inplace, then the jvp " + "function must modify the corresponding gradient inplace.") + TORCH_CHECK(out_grad.unsafeGetTensorImpl() == grad_impls[inp_idx], "An inplace custom Function is not returning the " + "forward mode gradients as-is. If the forward is modifying an input inplace, then the jvp " + "function must modify the gradient inplace and return it as-is.") + } else { + // If that Tensor didn't had gradients already, set the newly returned one + // We could also use inputs[inp_idx] here as it is the same as out + out._set_fw_grad(out_grad, level, /* is_inplace_op */ true); + } + } else { + // At this point, outputs[i] cannot be one of the input (raw_outputs[i] might be but was changed by the backward code) + TORCH_INTERNAL_ASSERT(!is_input); + + if (out.is_view() && impl::get_view_autograd_meta(out)->has_fw_view()) { + // If the output is a view + const auto& out_view_info = impl::get_view_autograd_meta(out)->get_forward_view(); + if (inputs_bases.count(out_view_info.base_.unsafeGetTensorImpl())) { + // And it is a view of an input (either that input is its base or they have a common base) + const auto matching_input_idx = inputs_bases[out_view_info.base_.unsafeGetTensorImpl()]; + const auto& matching_input = inputs[matching_input_idx]; + + const auto& matching_input_grad = matching_input._fw_grad(level); + + // If the matching input has a forward grad, the user should have returned a view of that Tensor + if (matching_input_grad.defined()) { + TORCH_CHECK(out_grad.is_view() && impl::get_view_autograd_meta(out_grad)->has_fw_view(), + "A custom Function's forward is returning a view but the jvp is not returning a view."); + + const auto& out_grad_base = impl::get_view_autograd_meta(out_grad)->get_forward_view().base_; + if (matching_input_grad.is_view() && impl::get_view_autograd_meta(matching_input_grad)->has_fw_view()) { + // If the matching input's grad is a view, ensure that the out_grad is a view of the same base + const auto& matching_input_grad_base = impl::get_view_autograd_meta(matching_input_grad)->get_forward_view().base_; + TORCH_CHECK(matching_input_grad_base.unsafeGetTensorImpl() == out_grad_base.unsafeGetTensorImpl(), + "A custom Function is returning a view but the jvp is not returning a view of the same base as " + "the given grad input."); + } else { + // If the matching input's grad is not a view, then it must be the output gradient's base + TORCH_CHECK(matching_input_grad.unsafeGetTensorImpl() == out_grad_base.unsafeGetTensorImpl(), + "A custom Function is returning a view but the jvp is not returning a view of the given grad input."); + } + } else { + // We have a view op where the input didn't have a forward grad but the user returned one for the output + // To ensure that we maintain the view/inplace constraints, we consider this as an inplace op + // This case CANNOT happen in codegen as all view ops are mapping from one Tensor to one Tensor and so the output + // of the view cannot have a forward grad if the base does not. + out._set_fw_grad(out_grad, level, /* is_inplace_op */ true); + return; + } + + } + } + + out._set_fw_grad(out_grad, level, /* is_inplace_op */ false); + } + } +} + optional_variable_list _process_backward_mode_ad( - const std::unordered_set &inputs_set, + const std::unordered_map &inputs_mapping, const std::unordered_set &non_differentiable, const std::unordered_set &dirty_inputs, const at::ArrayRef> raw_outputs, @@ -121,7 +288,7 @@ optional_variable_list _process_backward_mode_ad( Variable var = raw_outputs[i].value(); auto out_tensor_impl = var.unsafeGetTensorImpl(); - bool is_input = inputs_set.count(out_tensor_impl) > 0; + bool is_input = inputs_mapping.count(out_tensor_impl) > 0; bool is_modified = dirty_inputs.count(out_tensor_impl) > 0; bool is_differentiable = cdata && non_differentiable.count(out_tensor_impl) == 0 && isDifferentiableType(var.scalar_type()); @@ -179,16 +346,20 @@ optional_variable_list _wrap_outputs(const variable_list &input_vars, const std::unordered_set &non_differentiable, const std::unordered_set &dirty_inputs, const at::ArrayRef> raw_outputs, - const std::shared_ptr &cdata) { + const std::shared_ptr &cdata, + _jvp_fn_t jvp_user_function) { - std::unordered_set inputs_set; - inputs_set.reserve(input_vars.size()); - for (auto& var : input_vars) { - inputs_set.emplace(var.unsafeGetTensorImpl()); + std::unordered_map inputs_mapping; + inputs_mapping.reserve(input_vars.size()); + for (const auto i: c10::irange(input_vars.size())) { + inputs_mapping.emplace(input_vars[i].unsafeGetTensorImpl(), i); } - auto outputs = _process_backward_mode_ad(inputs_set, non_differentiable, dirty_inputs, raw_outputs, cdata); + auto outputs = _process_backward_mode_ad(inputs_mapping, non_differentiable, dirty_inputs, raw_outputs, cdata); + // This must happen after the backward processing as we expect the computations happening here to track + // backward mode gradients. + _process_forward_mode_AD(input_vars, inputs_mapping, raw_outputs, outputs, non_differentiable, dirty_inputs, jvp_user_function); return outputs; } diff --git a/torch/csrc/autograd/custom_function.h b/torch/csrc/autograd/custom_function.h index 376cab693e453..94e62bf7b63c7 100644 --- a/torch/csrc/autograd/custom_function.h +++ b/torch/csrc/autograd/custom_function.h @@ -10,13 +10,15 @@ namespace torch { namespace autograd { using optional_variable_list = std::vector>; +using _jvp_fn_t = std::function; TORCH_API std::vector> _wrap_outputs( const variable_list &input_vars, const std::unordered_set &non_differentiable, const std::unordered_set &dirty_inputs, const at::ArrayRef> raw_outputs, - const std::shared_ptr &cdata); + const std::shared_ptr &cdata, + _jvp_fn_t jvp_user_function); TORCH_API void check_variable_result(const Variable& original, const Variable& result, std::string hook_name); @@ -265,12 +267,18 @@ auto Function::apply(Args&&... args) -> std::enable_if_t::v outputs = T::forward(&node->ctx_, std::forward(args)...); } + _jvp_fn_t jvp_fn = [](variable_list inputs, variable_list gI) -> variable_list { + TORCH_CHECK(false, "jvp is not implemented for the c++ API of custom Function yet.", + "Please open a feature request on Github if you need this."); + }; + auto wrapped_outputs = _wrap_outputs( input_vars, node->ctx_.get_non_differentiable(), node->ctx_.get_and_bump_dirty(), to_optional(outputs), - is_executable ? node : nullptr); + is_executable ? node : nullptr, + jvp_fn); node->output_info_.reserve(wrapped_outputs.size()); for (auto& output : wrapped_outputs) { diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 14874186d6f22..eee56f71ed7d8 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -340,8 +340,61 @@ static void _wrap_outputs(const std::shared_ptr& cdata, THPFunction *sel } } + _jvp_fn_t jvp_user_function = [self](variable_list inputs, variable_list grad_inputs) { + pybind11::gil_scoped_acquire gil; + + // Massage a C++ variable_list into a Python arguments tuple + // Making sure to introduce the proper None for non-Tensor inputs + auto num_inputs = self->is_variable_input.size(); + THPObjectPtr pyInputs(PyTuple_New(num_inputs)); + if (!pyInputs) throw_python_error(); + auto var_input_idx = 0; + for (const auto i : c10::irange(num_inputs)) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + PyObject* input; + if (self->is_variable_input[i]) { + if (grad_inputs[i].defined() || !self->materialize_grads) { + input = THPVariable_Wrap(grad_inputs[i]); + } else { + input = THPVariable_Wrap(at::zeros_like(inputs[i])); + } + if (!input) throw_python_error(); + } else { + Py_INCREF(Py_None); + input = Py_None; + } + PyTuple_SET_ITEM(pyInputs.get(), i, input); + } + + THPObjectPtr apply_jvp_fn(PyObject_GetAttrString((PyObject*)self, "apply_jvp")); + if (!apply_jvp_fn) throw_python_error(); + THPObjectPtr r(PyObject_CallObject(apply_jvp_fn, pyInputs.get())); + if (!r) throw_python_error(); + ensure_tuple(r); + + // Massage the Python results tuple back into a C++ variable_list + // Don't do any check on the number of results here as + // it is handled by the caller + const int num_outputs = PyTuple_GET_SIZE(r.get()); + variable_list results; + results.reserve(num_outputs); + for (int i = 0; i != num_outputs; ++i) { + PyObject* output = PyTuple_GET_ITEM(r.get(), i); + if (output == Py_None) { + results.emplace_back(); + } else { + TORCH_CHECK(THPVariable_Check(output), "expected Variable or None (got ", + THPUtils_typename(output), ") for grad output ", i, ".") + results.emplace_back(THPVariable_Unpack(output)); + } + } + + return results; + }; + // Wrap only the tensor outputs. - auto wrapped_outputs = _wrap_outputs(input_vars, non_differentiable, dirty_inputs, raw_output_vars, cdata_if_executable); + auto wrapped_outputs = _wrap_outputs(input_vars, non_differentiable, dirty_inputs, + raw_output_vars, cdata_if_executable, jvp_user_function); for(const auto i : c10::irange(num_outputs)) { PyObject* obj = PyTuple_GetItem(raw_output, i); @@ -571,6 +624,9 @@ PyObject* process_outputs(PyObject *op_obj, const std::shared_ptr& cdata bool is_inplace = static_cast(grad_fn->dirty_tensors); _wrap_outputs(cdata, grad_fn, unpacked.input_vars, raw_output, outputs, is_executable); _trace_post_record(node, op_obj, unpacked.input_vars, outputs, is_inplace, unpack_output); + + // It is important that creating the SavedVariables happen after the output wrapping as the + // outputs must have their grad_fn/fw_grad properly set before we save them. if (is_executable) { _save_variables(cdata, grad_fn); } else { @@ -651,6 +707,7 @@ PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs) THPObjectPtr tensor_outputs; { AutoGradMode grad_mode(false); + at::AutoFwGradMode fw_grad_mode(false); THPObjectPtr forward_fn(PyObject_GetAttrString(cls, "forward")); if (!forward_fn) return nullptr; tensor_outputs = PyObject_CallObject(forward_fn, ctx_input_tuple); From 15ff25d1fc212c36cf472f988cf0e709420cd248 Mon Sep 17 00:00:00 2001 From: David Reiss Date: Wed, 1 Sep 2021 13:41:37 -0700 Subject: [PATCH 433/530] Break up "@generated" string so Phabricator shows changes Summary: Created from CodeHub with https://fburl.com/edit-in-codehub Test Plan: CI Sandcastle run Reviewed By: larryliu0820 Differential Revision: D30701781 fbshipit-source-id: 3acab8b65a327c4ec7da90bc855ecf02f801c40a --- tools/autograd/gen_variable_type.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 8591a6800605c..e3f4d5553c34f 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -374,7 +374,7 @@ def gen_variable_type( """ fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) fm.write('VariableType.h', lambda: { - 'generated_comment': f'@generated from {template_path}/VariableType.h' + 'generated_comment': "@" f'generated from {template_path}/VariableType.h' }) # NOTE: see Note [Sharded File] at the top of the VariableType.cpp @@ -385,7 +385,7 @@ def gen_variable_type( key_fn=lambda fn: cpp.name(fn.func.func), base_env={ 'generated_comment': - f'@generated from {template_path}/VariableType.cpp', + "@" f'generated from {template_path}/VariableType.cpp', }, env_callable=gen_variable_type_func, num_shards=5, From 86c96542914bf9b3dfda0c7f6373fd13b48c6b97 Mon Sep 17 00:00:00 2001 From: Salil Desai Date: Wed, 1 Sep 2021 14:08:02 -0700 Subject: [PATCH 434/530] Update optimize_for_mobile to preserve node's debug information (#63106) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63106 Propagate debug info to the re-written nodes in the graph. Test Plan: - Clone open source repo and build - ``` python3 test/test_jit.py TestOptimizeForMobilePreserveDebugInfo ``` - Tests pass Reviewed By: kimishpatel Differential Revision: D28654659 fbshipit-source-id: 2d7c87f2fb95a3be53246375f35639bbd97c237e --- ...optimize_for_mobile_preserve_debug_info.py | 261 ++++++++++++++++++ test/test_jit.py | 1 + torch/csrc/jit/passes/xnnpack_rewrite.cpp | 165 ++++++++--- 3 files changed, 388 insertions(+), 39 deletions(-) create mode 100644 test/jit/test_optimize_for_mobile_preserve_debug_info.py diff --git a/test/jit/test_optimize_for_mobile_preserve_debug_info.py b/test/jit/test_optimize_for_mobile_preserve_debug_info.py new file mode 100644 index 0000000000000..c08f3b5838fae --- /dev/null +++ b/test/jit/test_optimize_for_mobile_preserve_debug_info.py @@ -0,0 +1,261 @@ +import torch +import torch._C +import torch.backends.xnnpack +import torch.nn.functional as F +from torch.testing._internal.jit_utils import JitTestCase + +class TestOptimizeForMobilePreserveDebugInfo(JitTestCase): + def check_replacement( + self, + model, + replacements, + jit_pass, + ): + """ + model: Model which optimization is performed on + replacements: Dict mapping from nodes' kinds in the optimized model + to the kinds of nodes they replaced in the original model + jit_pass: Function to perform optimization + """ + + original_kinds = set(replacements.values()) + original_source_ranges = { + node.kind(): node.sourceRange() + for node in model.graph.nodes() + if node.kind() in original_kinds + } + + jit_pass(model._c) + + for node in model.graph.nodes(): + if node.kind() in replacements: + self.assertEqual( + node.sourceRange(), + original_source_ranges[replacements[node.kind()]], + ) + + def test_replace_conv1d_with_conv2d(self): + class TestConv1d(torch.nn.Module): + def __init__(self, weight, bias): + super(TestConv1d, self).__init__() + self.weight = weight + self.bias = bias + + def forward(self, x): + return F.conv1d(x, self.weight, self.bias) + + self.check_replacement( + model=torch.jit.script( + TestConv1d( + weight=torch.rand(3, 3, 3), + bias=torch.rand(3), + ), + ), + replacements={ + "prim::ListUnpack": "aten::conv1d", + "prim::ListConstruct": "aten::conv1d", + "aten::unsqueeze": "aten::conv1d", + "aten::conv2d": "aten::conv1d", + "aten::squeeze": "aten::conv1d", + }, + jit_pass=torch._C._jit_pass_transform_conv1d_to_conv2d, + ) + + def test_insert_pre_packed_linear_before_inline_and_conv_2d_op(self): + class TestPrepackedLinearBeforeInlineAndConv2dOp(torch.nn.Module): + def __init__( + self, + linear_weight, + linear_bias, + conv2d_weight, + conv2d_bias, + conv_transpose2d_weight, + conv_transpose2d_bias, + ): + super( + TestPrepackedLinearBeforeInlineAndConv2dOp, + self, + ).__init__() + self.linear_weight = linear_weight.float() + self.linear_bias = linear_bias.float() + self.conv2d_weight = conv2d_weight.float() + self.conv2d_bias = conv2d_bias.float() + self.conv_transpose2d_weight = conv_transpose2d_weight.float() + self.conv_transpose2d_bias = conv_transpose2d_bias.float() + + def forward(self, x): + linear_res = F.linear( + x.float(), + self.linear_weight, + self.linear_bias, + ) + conv2d_res = F.conv2d( + input=linear_res.unsqueeze(dim=0).float(), + weight=self.conv2d_weight, + bias=self.conv2d_bias, + ) + return F.conv_transpose2d( + input=conv2d_res, + weight=self.conv_transpose2d_weight, + bias=self.conv_transpose2d_bias, + ) + + minibatch = 1 + in_channels = 6 + iH = 4 + iW = 5 + out_channels = 6 + kH = 2 + kW = 3 + + self.check_replacement( + model=torch.jit.script( + TestPrepackedLinearBeforeInlineAndConv2dOp( + linear_weight=torch.rand(iW, 3), + linear_bias=torch.rand(iW), + conv2d_weight=torch.rand(out_channels, in_channels, kH, kW), + conv2d_bias=torch.rand(out_channels), + conv_transpose2d_weight=torch.rand( + out_channels, + in_channels, + kH, + kW, + ), + conv_transpose2d_bias=torch.rand(out_channels), + ), + ), + replacements={ + "prepacked::linear_clamp_prepack": "prim::CallFunction", + "prepacked::linear_clamp_run": "prim::CallFunction", + "prepacked::conv2d_clamp_prepack": "aten::conv2d", + "prepacked::conv2d_clamp_run": "aten::conv2d", + "prepacked::conv2d_transpose_clamp_prepack": + "aten::conv_transpose2d", + "prepacked::conv2d_transpose_clamp_run": + "aten::conv_transpose2d", + }, + jit_pass=torch._C._jit_pass_insert_prepacked_ops, + ) + + def test_insert_pre_packed_linear_op(self): + self.check_replacement( + model=torch.jit.trace(torch.nn.Linear(5, 4), torch.rand(3, 2, 5)), + replacements={ + "prepacked::linear_clamp_prepack": "aten::linear", + "prepacked::linear_clamp_run": "aten::linear" + }, + jit_pass=torch._C._jit_pass_insert_prepacked_ops, + ) + + def run_test_fuse_activation_with_pack_ops_linear_conv2d( + self, + linear_activation, + linear_activation_kind, + conv2d_activation, + conv2d_activation_kind, + ): + class TestFuseActivationLinearConv2d(torch.nn.Module): + def __init__( + self, + linear_weight, + linear_bias, + conv2d_weight, + conv2d_bias, + ): + super(TestFuseActivationLinearConv2d, self).__init__() + self.linear_weight = linear_weight + self.linear_bias = linear_bias + self.conv2d_weight = conv2d_weight + self.conv2d_bias = conv2d_bias + + def forward(self, x): + x = F.linear( + input=x, + weight=self.linear_weight, + bias=self.linear_bias, + ) + x = linear_activation(x) + x = F.conv2d( + input=x.unsqueeze(dim=0), + weight=self.conv2d_weight, + bias=self.conv2d_bias, + ) + return conv2d_activation(x) + + linear_in_features = 5 + linear_out_features = 4 + conv2d_in_channels = 3 + conv2d_out_channels = 4 + conv2d_kernel = 2 + x_shape = (3, 2, 5) + + model = torch.jit.trace( + TestFuseActivationLinearConv2d( + linear_weight=torch.nn.Parameter( + data=torch.rand( + linear_out_features, + linear_in_features, + ), + requires_grad=False, + ), + linear_bias=torch.nn.Parameter( + data=torch.rand(linear_out_features), + requires_grad=False, + ), + conv2d_weight=torch.rand( + conv2d_out_channels, + conv2d_in_channels, + conv2d_kernel, + conv2d_kernel, + ), + conv2d_bias=torch.rand(conv2d_out_channels), + ), + torch.rand(x_shape), + ) + + torch._C._jit_pass_insert_prepacked_ops(model._c) + + self.check_replacement( + model=model, + replacements={ + "prepacked::linear_clamp_prepack": + "prepacked::linear_clamp_prepack", + "prepacked::linear_clamp_run": linear_activation_kind, + "prepacked::conv2d_clamp_prepack": + "prepacked::conv2d_clamp_prepack", + "prepacked::conv2d_clamp_run": conv2d_activation_kind, + }, + jit_pass=torch._C._jit_pass_fuse_clamp_w_prepacked_linear_conv, + ) + + def test_fuse_activation_with_pack_ops_linear_conv2d_1(self): + self.run_test_fuse_activation_with_pack_ops_linear_conv2d( + linear_activation=F.hardtanh, + linear_activation_kind="aten::hardtanh", + conv2d_activation=F.hardtanh_, + conv2d_activation_kind="aten::hardtanh_" + ) + + def test_fuse_activation_with_pack_ops_linear_conv2d_2(self): + self.run_test_fuse_activation_with_pack_ops_linear_conv2d( + linear_activation=F.hardtanh_, + linear_activation_kind="aten::hardtanh_", + conv2d_activation=F.hardtanh, + conv2d_activation_kind="aten::hardtanh" + ) + + def test_fuse_activation_with_pack_ops_linear_conv2d_3(self): + self.run_test_fuse_activation_with_pack_ops_linear_conv2d( + linear_activation=F.relu, + linear_activation_kind="aten::relu", + conv2d_activation=F.relu_, + conv2d_activation_kind="aten::relu_" + ) + + def test_fuse_activation_with_pack_ops_linear_conv2d_4(self): + self.run_test_fuse_activation_with_pack_ops_linear_conv2d( + linear_activation=F.relu_, + linear_activation_kind="aten::relu_", + conv2d_activation=F.relu, + conv2d_activation_kind="aten::relu" + ) diff --git a/test/test_jit.py b/test/test_jit.py index e94ed8db922b0..8d1981d772763 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -61,6 +61,7 @@ from jit.test_parametrization import TestParametrization # noqa: F401 from jit.test_attr import TestGetDefaultAttr # noqa: F401 from jit.test_aten_pow import TestAtenPow # noqa: F401 +from jit.test_optimize_for_mobile_preserve_debug_info import TestOptimizeForMobilePreserveDebugInfo # noqa: F401 # Torch from torch import Tensor diff --git a/torch/csrc/jit/passes/xnnpack_rewrite.cpp b/torch/csrc/jit/passes/xnnpack_rewrite.cpp index 11210a4ea05b9..9b2cac6e25f9e 100644 --- a/torch/csrc/jit/passes/xnnpack_rewrite.cpp +++ b/torch/csrc/jit/passes/xnnpack_rewrite.cpp @@ -26,8 +26,8 @@ namespace { void replaceConv1dWithConv2d(std::shared_ptr& graph) { std::string conv_1d_pattern = R"( graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %groups:int): - %r = aten::conv1d(%input, %weight, %bias, %stride, %padding, %dilation, %groups) - return (%r) )"; + %res = aten::conv1d(%input, %weight, %bias, %stride, %padding, %dilation, %groups) + return (%res) )"; std::string conv_2d_pattern = R"( graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %groups:int): @@ -47,8 +47,24 @@ void replaceConv1dWithConv2d(std::shared_ptr& graph) { %output : Tensor = aten::squeeze(%output_2d, %two) return (%output) )"; + std::vector> value_mappings( + {{"zero", "res"}, + {"one", "res"}, + {"stride_w", "res"}, + {"stride_2d", "res"}, + {"padding_w", "res"}, + {"padding_2d", "res"}, + {"dilation_w", "res"}, + {"dilation_2d", "res"}, + {"two", "res"}, + {"input_2d", "res"}, + {"weight_2d", "res"}, + {"output_2d", "res"}, + {"output", "res"}}); + SubgraphRewriter rewriter; - rewriter.RegisterRewritePattern(conv_1d_pattern, conv_2d_pattern); + rewriter.RegisterRewritePattern( + conv_1d_pattern, conv_2d_pattern, value_mappings); rewriter.runOnGraph(graph); } @@ -80,8 +96,8 @@ void insertPrePackedLinearOp(std::shared_ptr& graph) { std::string linear_before_inline = R"( graph(%linear, %input, %weight, %bias): - %r = prim::CallFunction(%linear, %input, %weight, %bias) - return (%r))"; + %res = prim::CallFunction(%linear, %input, %weight, %bias) + return (%res))"; std::string prepacked_ops_pattern_before_inline = R"( graph(%linear, %input, %weight, %bias): %output_min_max : None = prim::Constant() @@ -91,8 +107,8 @@ void insertPrePackedLinearOp(std::shared_ptr& graph) { return (%res))"; std::string linear_pattern = R"( graph(%input, %weight, %bias): - %r = aten::linear(%input, %weight, %bias) - return (%r))"; + %res = aten::linear(%input, %weight, %bias) + return (%res))"; std::string prepacked_ops_pattern = R"( graph(%input, %weight, %bias): %output_min_max : None = prim::Constant() @@ -112,13 +128,24 @@ void insertPrePackedLinearOp(std::shared_ptr& graph) { return false; }; + std::vector> value_mappings( + {{"output_min_max", "res"}, + {"packed_weight_bias", "res"}, + {"res", "res"}}); + SubgraphRewriter linear_call_fn_rewriter; linear_call_fn_rewriter.RegisterRewritePattern( - linear_before_inline, prepacked_ops_pattern_before_inline); + linear_before_inline, + prepacked_ops_pattern_before_inline, + value_mappings); linear_call_fn_rewriter.runOnGraph(graph, filter); + value_mappings = { + {"output_min_max", "res"}, {"packed_weight_bias", "res"}, {"res", "res"}}; + SubgraphRewriter linear_rewriter; - linear_rewriter.RegisterRewritePattern(linear_pattern, prepacked_ops_pattern); + linear_rewriter.RegisterRewritePattern( + linear_pattern, prepacked_ops_pattern, value_mappings); linear_rewriter.runOnGraph(graph); } @@ -128,8 +155,8 @@ void insertPrePackedConv2dOp(std::shared_ptr& graph) { std::string conv_2d_pattern = R"( graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %groups:int): - %r = aten::conv2d(%input, %weight, %bias, %stride, %padding, %dilation, %groups) - return (%r) )"; + %res = aten::conv2d(%input, %weight, %bias, %stride, %padding, %dilation, %groups) + return (%res) )"; std::string prepacked_ops_conv2d_pattern = R"( graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %groups:int): @@ -137,19 +164,24 @@ void insertPrePackedConv2dOp(std::shared_ptr& graph) { %packed_weight_bias = prepacked::conv2d_clamp_prepack( %weight, %bias, %stride, %padding, %dilation, %groups, %output_min_max, %output_min_max) - %r = prepacked::conv2d_clamp_run(%input, %packed_weight_bias) - return (%r) )"; + %res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias) + return (%res) )"; + + std::vector> value_mappings( + {{"output_min_max", "res"}, + {"packed_weight_bias", "res"}, + {"res", "res"}}); SubgraphRewriter rewriter; rewriter.RegisterRewritePattern( - conv_2d_pattern, prepacked_ops_conv2d_pattern); + conv_2d_pattern, prepacked_ops_conv2d_pattern, value_mappings); rewriter.runOnGraph(graph); std::string conv_2d_transpose_pattern = R"( graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %output_padding:int[], %groups:int): - %r = aten::conv_transpose2d(%input, %weight, %bias, %stride, %padding, %output_padding, %groups, %dilation) - return (%r) )"; + %res = aten::conv_transpose2d(%input, %weight, %bias, %stride, %padding, %output_padding, %groups, %dilation) + return (%res) )"; std::string prepacked_ops_conv2d_transpose_pattern = R"( graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %output_padding:int[], %groups:int): @@ -157,12 +189,17 @@ void insertPrePackedConv2dOp(std::shared_ptr& graph) { %packed_weight_bias = prepacked::conv2d_transpose_clamp_prepack( %weight, %bias, %stride, %padding, %output_padding, %dilation, %groups, %output_min_max, %output_min_max) - %r = prepacked::conv2d_transpose_clamp_run(%input, %packed_weight_bias) - return (%r) )"; + %res = prepacked::conv2d_transpose_clamp_run(%input, %packed_weight_bias) + return (%res) )"; + + value_mappings = { + {"output_min_max", "res"}, {"packed_weight_bias", "res"}, {"res", "res"}}; SubgraphRewriter transpose_rewriter; transpose_rewriter.RegisterRewritePattern( - conv_2d_transpose_pattern, prepacked_ops_conv2d_transpose_pattern); + conv_2d_transpose_pattern, + prepacked_ops_conv2d_transpose_pattern, + value_mappings); transpose_rewriter.runOnGraph(graph); } @@ -182,8 +219,8 @@ void fuseHardtanhWithPackedOps(std::shared_ptr& graph) { %packed_weight_bias : __torch__.torch.classes.xnnpack.Conv2dOpContext = prepacked::conv2d_clamp_prepack( %weight, %bias, %stride, %padding, %dilation, %groups, %output_min, %output_max) - %r = prepacked::conv2d_clamp_run(%input, %packed_weight_bias) - return (%r) )"; + %res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias) + return (%res) )"; std::string linear_prepack_run_hardtanh = R"( graph(%input, %weight, %bias, %output_min, %output_max, %dummy_min_max): @@ -193,8 +230,13 @@ void fuseHardtanhWithPackedOps(std::shared_ptr& graph) { %res = aten::hardtanh(%linear_res, %output_min, %output_max) return (%res))"; + std::vector> value_mappings( + {{"packed_weight_bias", "packed_weight_bias"}, {"res", "res"}}); + rewriter.RegisterRewritePattern( - linear_prepack_run_hardtanh, linear_prepack_run_hardtanh_fused); + linear_prepack_run_hardtanh, + linear_prepack_run_hardtanh_fused, + value_mappings); std::string conv2d_prepack_run_hardtanh = R"( graph(%input, %weight, %bias, %stride:int[], %padding:int[], @@ -203,11 +245,16 @@ void fuseHardtanhWithPackedOps(std::shared_ptr& graph) { %weight, %bias, %stride, %padding, %dilation, %groups, %dummy_min_max, %dummy_min_max) %conv2d_res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias) - %r = aten::hardtanh(%conv2d_res, %output_min, %output_max) - return (%r) )"; + %res = aten::hardtanh(%conv2d_res, %output_min, %output_max) + return (%res) )"; + + value_mappings = { + {"packed_weight_bias", "packed_weight_bias"}, {"res", "res"}}; rewriter.RegisterRewritePattern( - conv2d_prepack_run_hardtanh, conv2d_prepack_run_hardtanh_fused); + conv2d_prepack_run_hardtanh, + conv2d_prepack_run_hardtanh_fused, + value_mappings); std::string linear_prepack_run_hardtanh_inplace = R"( graph(%input, %weight, %bias, %output_min, %output_max, %dummy_min_max): @@ -224,13 +271,24 @@ void fuseHardtanhWithPackedOps(std::shared_ptr& graph) { %weight, %bias, %stride, %padding, %dilation, %groups, %dummy_min_max, %dummy_min_max) %conv2d_res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias) - %r = aten::hardtanh_(%conv2d_res, %output_min, %output_max) - return (%r) )"; + %res = aten::hardtanh_(%conv2d_res, %output_min, %output_max) + return (%res) )"; + + value_mappings = { + {"packed_weight_bias", "packed_weight_bias"}, {"res", "res"}}; rewriter.RegisterRewritePattern( - linear_prepack_run_hardtanh_inplace, linear_prepack_run_hardtanh_fused); + linear_prepack_run_hardtanh_inplace, + linear_prepack_run_hardtanh_fused, + value_mappings); + + value_mappings = { + {"packed_weight_bias", "packed_weight_bias"}, {"res", "res"}}; + rewriter.RegisterRewritePattern( - conv2d_prepack_run_hardtanh_inplace, conv2d_prepack_run_hardtanh_fused); + conv2d_prepack_run_hardtanh_inplace, + conv2d_prepack_run_hardtanh_fused, + value_mappings); rewriter.runOnGraph(graph, torch::jit::graph_rewrite_helper::isClampFusable); } @@ -255,8 +313,8 @@ void fuseReluWithPackedOps(std::shared_ptr& graph) { %packed_weight_bias : __torch__.torch.classes.xnnpack.Conv2dOpContext = prepacked::conv2d_clamp_prepack( %weight, %bias, %stride, %padding, %dilation, %groups, %output_min, %output_max) - %r = prepacked::conv2d_clamp_run(%input, %packed_weight_bias) - return (%r) )"; + %res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias) + return (%res) )"; std::string linear_prepack_run_relu = R"( graph(%input, %weight, %bias, %dummy_min_max): @@ -266,8 +324,14 @@ void fuseReluWithPackedOps(std::shared_ptr& graph) { %res = aten::relu(%linear_res) return (%res))"; + std::vector> value_mappings( + {{"output_min", "packed_weight_bias"}, + {"output_max", "packed_weight_bias"}, + {"packed_weight_bias", "packed_weight_bias"}, + {"res", "res"}}); + rewriter.RegisterRewritePattern( - linear_prepack_run_relu, linear_prepack_run_relu_fused); + linear_prepack_run_relu, linear_prepack_run_relu_fused, value_mappings); std::string conv2d_prepack_run_relu = R"( graph(%input, %weight, %bias, %stride:int[], %padding:int[], @@ -276,11 +340,17 @@ void fuseReluWithPackedOps(std::shared_ptr& graph) { %weight, %bias, %stride, %padding, %dilation, %groups, %dummy_min_max, %dummy_min_max) %conv2d_res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias) - %r = aten::relu(%conv2d_res) - return (%r) )"; + %res = aten::relu(%conv2d_res) + return (%res) )"; + + value_mappings = { + {"output_min", "packed_weight_bias"}, + {"output_max", "packed_weight_bias"}, + {"packed_weight_bias", "packed_weight_bias"}, + {"res", "res"}}; rewriter.RegisterRewritePattern( - conv2d_prepack_run_relu, conv2d_prepack_run_relu_fused); + conv2d_prepack_run_relu, conv2d_prepack_run_relu_fused, value_mappings); std::string linear_prepack_run_relu_inplace = R"( graph(%input, %weight, %bias, %dummy_min_max): @@ -297,13 +367,30 @@ void fuseReluWithPackedOps(std::shared_ptr& graph) { %weight, %bias, %stride, %padding, %dilation, %groups, %dummy_min_max, %dummy_min_max) %conv2d_res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias) - %r = aten::relu_(%conv2d_res) - return (%r) )"; + %res = aten::relu_(%conv2d_res) + return (%res) )"; + + value_mappings = { + {"output_min", "packed_weight_bias"}, + {"output_max", "packed_weight_bias"}, + {"packed_weight_bias", "packed_weight_bias"}, + {"res", "res"}}; rewriter.RegisterRewritePattern( - linear_prepack_run_relu_inplace, linear_prepack_run_relu_fused); + linear_prepack_run_relu_inplace, + linear_prepack_run_relu_fused, + value_mappings); + + value_mappings = { + {"output_min", "packed_weight_bias"}, + {"output_max", "packed_weight_bias"}, + {"packed_weight_bias", "packed_weight_bias"}, + {"res", "res"}}; + rewriter.RegisterRewritePattern( - conv2d_prepack_run_relu_inplace, conv2d_prepack_run_relu_fused); + conv2d_prepack_run_relu_inplace, + conv2d_prepack_run_relu_fused, + value_mappings); rewriter.runOnGraph(graph, torch::jit::graph_rewrite_helper::isClampFusable); } From 4aad366111f88010c9e4027d054da2fe357e1bfe Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Wed, 1 Sep 2021 14:19:21 -0700 Subject: [PATCH 435/530] [Static Runtime] Make per-op latency readable by FAI-PEP (#64315) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64315 Add a new flag `generate_ai_pep_output` to `StaticRuntime::benchmark`. If set, produces per-op-kind average total latency in milliseconds in a JSON format recognized by [Facebook AI performance evaluation platform (FAI-PEP)](https://github.com/facebook/FAI-PEP). This is useful for observing the impact of changes that make a big difference for a specific op, but do not affect the overall SR latency by more than a few percent. Reviewed By: hlu1 Differential Revision: D30679352 fbshipit-source-id: c847fa6ea20774aaf1e7949b11db4421d1f70b7e --- torch/csrc/jit/runtime/static/impl.cpp | 29 +++++++++++++++++++++++++- torch/csrc/jit/runtime/static/impl.h | 3 ++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index e22447819ea67..7697613e79573 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -19,6 +19,11 @@ #include #include +#ifdef FBCODE_CAFFE2 +#include +#include +#endif + namespace torch { namespace jit { @@ -873,12 +878,30 @@ c10::IValue StaticRuntime::operator()( return std::move(*outputs_[0]); } +namespace { + +std::string generate_node_time_json(const std::string& kind, float millis) { +#ifdef FBCODE_CAFFE2 + folly::dynamic json = folly::dynamic::object(); + json["type"] = kind; + json["metric"] = "latency"; + json["unit"] = "ms"; + json["value"] = millis; + return folly::toJson(json); +#else + return ""; +#endif +} + +} // namespace + void StaticRuntime::benchmark( const std::vector& args, const std::unordered_map& kwargs, const int warmup_runs, const int main_runs, - bool print_per_node_time) { + bool print_per_node_time, + bool generate_ai_pep_output) { float time_per_iter = benchmark_model(args, kwargs, warmup_runs, main_runs); std::cout << "Static runtime ms per iter: " << time_per_iter << ". Iters per second: " << 1000.0 / time_per_iter << std::endl; @@ -916,6 +939,10 @@ void StaticRuntime::benchmark( } else { std::cout << ")" << std::endl; } + + if (generate_ai_pep_output) { + LOG(INFO) << "PyTorchObserver " << generate_node_time_json(kind, ms); + } } std::cout << std::setw(15) << results.total_time << " ms. in Total" << std::endl; diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index d8a99f78cad2d..0d2378760f270 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -234,7 +234,8 @@ class TORCH_API StaticRuntime { const std::unordered_map& kwargs, const int warmup_runs, const int main_runs, - bool print_per_node_time = false); + bool print_per_node_time = false, + bool generate_ai_pep_output = false); float benchmark_model( const std::vector& args, From 968d7ee46a66316557a04a64333d2810b544ed9b Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Wed, 1 Sep 2021 14:24:54 -0700 Subject: [PATCH 436/530] [structured] Preserve computed elements from meta func to impl (#61746) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61746 **Summary** This commit introduces a new feature for structured kernels that allows kernels to declare quantities as "precomputed" in `native_functions.yaml`, compute them once in the `meta` function and reuse them again in the `impl`. The names and types of these quantities are used to generate code for a struct containing them that the `meta` function must return. In the case of a handful of surveyed kernels (`all,`, `any`, `avg_pool2d`), these quantities that are used both in the `meta` and `impl` have the same meaning as certain kernel arguments and in fact supersede them. Accordingly, the correspondence between a kernel argument and the precomputed elements that supersede it is also captured in `native_functions.yaml`. This information is used to unpack the struct returned by `meta` and pass its contents correctly to the `impl` function. The primary goal is to avoid recompute and enhance developer experience (e.g. sometimes people can forget to compute these elements while porting a kernel). Test Plan: Imported from OSS Reviewed By: tugsbayasgalan Differential Revision: D30407831 Pulled By: SplitInfinity fbshipit-source-id: 00975525ea373721fe52d06f75cd4ac91f3dc556 --- aten/src/ATen/TensorMeta.h | 10 ++ aten/src/ATen/native/AveragePool2d.cpp | 120 ++++++++++++-------- aten/src/ATen/native/Pool.h | 7 +- aten/src/ATen/native/ReduceOps.cpp | 13 ++- aten/src/ATen/native/cpu/AvgPoolKernel.cpp | 18 +-- aten/src/ATen/native/cuda/AveragePool2d.cu | 109 ++++++++++-------- aten/src/ATen/native/native_functions.yaml | 8 ++ tools/codegen/api/structured.py | 22 +++- tools/codegen/dest/register_dispatch_key.py | 24 +++- tools/codegen/gen.py | 91 ++++++++++++++- tools/codegen/model.py | 52 +++++++++ 11 files changed, 361 insertions(+), 113 deletions(-) diff --git a/aten/src/ATen/TensorMeta.h b/aten/src/ATen/TensorMeta.h index ac295ec9bde79..6a5491ab3d50b 100644 --- a/aten/src/ATen/TensorMeta.h +++ b/aten/src/ATen/TensorMeta.h @@ -26,6 +26,16 @@ namespace impl { #define TORCH_META_FUNC(name) void structured_##name::meta #define TORCH_META_FUNC2(name, overload) void structured_##name##_##overload::meta +// These are versions of TORCH_META_FUNC(2) that include a precompute_out struct as a return value. +// They should be used when the kernel in question has precomputed values declared in native_functions.yaml and +// the corresponding implementation should return an instance of the aforementioned struct. +#define TORCH_PRECOMPUTE_META_FUNC(name) structured_##name::meta_return_ty structured_##name::meta +#define TORCH_PRECOMPUTE_META_FUNC2(name, overload) structured_##name##_##overload::meta_return_ty structured_##name##_##overload::meta + +// Use this to create a precompute struct in a meta function. +#define TORCH_PRECOMPUTE_STRUCT(name) structured_##name::precompute_out<> +#define TORCH_PRECOMPUTE_STRUCT2(name, overload) structured_##name##_##overload::precompute_out<> + // Use this to define the prototype for an implementation. This takes only // one argument, which is the name of the dispatch key entry you're // implementing. diff --git a/aten/src/ATen/native/AveragePool2d.cpp b/aten/src/ATen/native/AveragePool2d.cpp index 2693cc6ba49c5..8f264c007c6be 100644 --- a/aten/src/ATen/native/AveragePool2d.cpp +++ b/aten/src/ATen/native/AveragePool2d.cpp @@ -8,59 +8,81 @@ namespace at { namespace meta{ using namespace native; -TORCH_META_FUNC(avg_pool2d) ( - const Tensor& input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override -) { +TORCH_PRECOMPUTE_META_FUNC(avg_pool2d) +(const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + c10::optional divisor_override) { // #20866, #22032: Guarantee this for the official C++ API? TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints"); - const int kH = safe_downcast(kernel_size[0]); - const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + const int64_t kH = kernel_size[0]; + const int64_t kW = kernel_size.size() == 1 ? kH : kernel_size[1]; TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2, "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints"); - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() ? kW : - stride.size() == 1 ? dH : safe_downcast(stride[1]); + const int64_t dH = stride.empty() ? kH : stride[0]; + const int64_t dW = stride.empty() ? kW : stride.size() == 1 ? dH : stride[1]; TORCH_CHECK(padding.size() == 1 || padding.size() == 2, "avg_pool2d: padding must either be a single int, or a tuple of two ints"); - const int padH = safe_downcast(padding[0]); - const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + const int64_t padH = padding[0]; + const int64_t padW = padding.size() == 1 ? padH : padding[1]; TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0, "divisor must be not zero"); - /* sizes */ const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1; const int64_t nInputPlane = input.size(-3); const int64_t inputHeight = input.size(-2); const int64_t inputWidth = input.size(-1); - const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); - const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); + const int64_t outputHeight = pooling_output_shape( + inputHeight, kH, padH, dH, 1, ceil_mode); + const int64_t outputWidth = + pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); auto memory_format = input.suggest_memory_format(); pool2d_shape_check( - input, - kH, kW, dH, dW, padH, padW, 1, 1, - nInputPlane, - inputHeight, inputWidth, - outputHeight, outputWidth, memory_format); + input, + kH, + kW, + dH, + dW, + padH, + padW, + 1, + 1, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + memory_format); /* resize output */ if (input.ndimension() == 3) { - set_output(0, {nInputPlane, outputHeight, outputWidth}, input.options()); + set_output( + 0, + {nInputPlane, + outputHeight, + outputWidth}, + input.options()); } else { - set_output(0, {nbatch, nInputPlane, outputHeight, outputWidth}, input.options().memory_format(memory_format)); + set_output( + 0, + {nbatch, + nInputPlane, + outputHeight, + outputWidth}, + input.options().memory_format(memory_format)); } + + return TORCH_PRECOMPUTE_STRUCT(avg_pool2d)().set_kH(kH).set_kW(kW).set_dH(dH).set_dW(dW).set_padH(padH).set_padW(padW); } TORCH_META_FUNC(avg_pool2d_backward) ( @@ -119,30 +141,30 @@ TORCH_META_FUNC(avg_pool2d_backward) ( namespace native { -TORCH_IMPL_FUNC(avg_pool2d_out_cpu) ( - const Tensor &input, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override, - const Tensor &output -) { - const int kH = safe_downcast(kernel_size[0]); - const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); - - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() ? kW : - stride.size() == 1 ? dH : safe_downcast(stride[1]); - - const int padH = safe_downcast(padding[0]); - const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); - +TORCH_IMPL_FUNC(avg_pool2d_out_cpu) +(const Tensor& input, + int64_t kH, + int64_t kW, + int64_t dH, + int64_t dW, + int64_t padH, + int64_t padW, + bool ceil_mode, + bool count_include_pad, + c10::optional divisor_override, + const Tensor& output) { avg_pool2d_kernel( - kCPU, output, input, - kW, kH, dW, dH, padW, padH, - count_include_pad, divisor_override); + kCPU, + output, + input, + kW, + kH, + dW, + dH, + padW, + padH, + count_include_pad, + divisor_override); } TORCH_IMPL_FUNC(avg_pool2d_backward_out_cpu) ( diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 5fe979df2c953..da774911b5737 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -16,10 +16,13 @@ DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel); DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel); // averge pooling has same signature for forward and backward -using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH, +using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH, + int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, c10::optional divisor_override); +using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH, int dW, int dH, int padW, int padH, bool count_include_pad, c10::optional divisor_override); + DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel); -DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_backward_kernel); +DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel); namespace { diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 6e5a1532bd8d1..620908b5b79bf 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -109,16 +109,18 @@ void check_all_any(const char* name, const Tensor& self, const Tensor& result) { } } -TORCH_META_FUNC2(all, dim)(const Tensor& self, int64_t dim, bool keepdim) { +TORCH_PRECOMPUTE_META_FUNC2(all, dim)(const Tensor& self, int64_t dim, bool keepdim) { check_all_any("all", self, maybe_get_output()); auto out_dtype = get_result_or_bytebool_dtype(self, maybe_get_output()); resize_reduction(*this, self, dim, keepdim, out_dtype); + return TORCH_PRECOMPUTE_STRUCT2(all, dim)().set_dim(maybe_wrap_dim(dim, self.dim())); } -TORCH_META_FUNC2(any, dim)(const Tensor& self, int64_t dim, bool keepdim) { +TORCH_PRECOMPUTE_META_FUNC2(any, dim)(const Tensor& self, int64_t dim, bool keepdim) { check_all_any("any", self, maybe_get_output()); auto out_dtype = get_result_or_bytebool_dtype(self, maybe_get_output()); resize_reduction(*this, self, dim, keepdim, out_dtype); + return TORCH_PRECOMPUTE_STRUCT2(any, dim)().set_dim(maybe_wrap_dim(dim, self.dim())); } void check_argmax_argmin( @@ -1338,7 +1340,6 @@ Tensor all(const Tensor& self) { TORCH_IMPL_FUNC(all_out) (const Tensor& self, int64_t dim, bool keepdim, const Tensor& result) { - dim = maybe_wrap_dim(dim, self.dim()); auto iter = get_allany_iter(self, result, dim, keepdim); auto mut_result = const_cast(result); if (!_dimreduce_return_trivial(mut_result, self, 1, dim, keepdim)) { @@ -1370,8 +1371,10 @@ Tensor any(const Tensor& self) { } TORCH_IMPL_FUNC(any_out) -(const Tensor& self, int64_t dim, bool keepdim, const Tensor& result) { - dim = maybe_wrap_dim(dim, self.dim()); +(const Tensor& self, + int64_t dim, + bool keepdim, + const Tensor& result) { auto iter = get_allany_iter(self, result, dim, keepdim); auto mut_result = const_cast(result); if (!_dimreduce_return_trivial(mut_result, self, 0, dim, keepdim)) { diff --git a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp index 2aa075f5933bd..2bee0206ff6b5 100644 --- a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp @@ -14,9 +14,9 @@ template void cpu_avg_pool( const Tensor& output_, const Tensor& input_, - int kW, int kH, - int dW, int dH, - int padW, int padH, + int64_t kW, int64_t kH, + int64_t dW, int64_t dH, + int64_t padW, int64_t padH, bool count_include_pad, c10::optional divisor_override) { auto input = input_.contiguous(); @@ -98,9 +98,9 @@ template void cpu_avg_pool_channels_last( const Tensor& output_, const Tensor& input_, - int kW, int kH, - int dW, int dH, - int padW, int padH, + int64_t kW, int64_t kH, + int64_t dW, int64_t dH, + int64_t padW, int64_t padH, bool count_include_pad, c10::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 4, @@ -359,9 +359,9 @@ void cpu_avg_pool_backward_channels_last( void avg_pool2d_kernel_impl( const Tensor& output, const Tensor& input, - int kW, int kH, - int dW, int dH, - int padW, int padH, + int64_t kW, int64_t kH, + int64_t dW, int64_t dH, + int64_t padW, int64_t padH, bool count_include_pad, c10::optional divisor_override) { switch (input.suggest_memory_format()) { diff --git a/aten/src/ATen/native/cuda/AveragePool2d.cu b/aten/src/ATen/native/cuda/AveragePool2d.cu index 5de3adc08bee8..df9fcfef64167 100644 --- a/aten/src/ATen/native/cuda/AveragePool2d.cu +++ b/aten/src/ATen/native/cuda/AveragePool2d.cu @@ -232,30 +232,31 @@ __global__ void avg_pool2d_backward_out_cuda_frame_nhwc(const int nthreads, } // anonymous namespace -TORCH_IMPL_FUNC(avg_pool2d_out_cuda) ( - const Tensor& input_, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override, - const Tensor& output -) { +TORCH_IMPL_FUNC(avg_pool2d_out_cuda) +(const Tensor& input_, + int64_t kH_, + int64_t kW_, + int64_t dH_, + int64_t dW_, + int64_t padH_, + int64_t padW_, + bool ceil_mode, + bool count_include_pad, + c10::optional divisor_override, + const Tensor& output) { TensorArg output_arg{ output, "output", 1 }; TensorArg input_arg{ input_, "input_", 2 }; checkAllSameGPU("avg_pool2d_out_cuda", {output_arg, input_arg}); - const int kH = safe_downcast(kernel_size[0]); - const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + const int kH = safe_downcast(kH_); + const int kW = safe_downcast(kW_); - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() ? kW : - stride.size() == 1 ? dH : safe_downcast(stride[1]); + const int dH = safe_downcast(dH_); + const int dW = safe_downcast(dW_); - const int padH = safe_downcast(padding[0]); - const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + const int padH = safe_downcast(padH_); + const int padW = safe_downcast(padW_); /* sizes */ const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1; @@ -263,8 +264,8 @@ TORCH_IMPL_FUNC(avg_pool2d_out_cuda) ( const int64_t inputHeight = input_.size(-2); const int64_t inputWidth = input_.size(-1); - const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); - const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); + int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); + int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); const auto memory_format = input_.suggest_memory_format(); Tensor input = input_.contiguous(memory_format); @@ -289,37 +290,55 @@ TORCH_IMPL_FUNC(avg_pool2d_out_cuda) ( case MemoryFormat::ChannelsLast: { output.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::ChannelsLast); avg_pool2d_out_cuda_frame_nhwc - <<>>( - count, - input_data, - nbatch, - nInputPlane, - inputHeight, inputWidth, - outputHeight, outputWidth, - kH, kW, - dH, dW, - padH, padW, - output_data, - divisor_override_value, - count_include_pad, use_divisor); + <<>>( + count, + input_data, + nbatch, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + kH, + kW, + dH, + dW, + padH, + padW, + output_data, + divisor_override_value, + count_include_pad, + use_divisor); C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } case MemoryFormat::Contiguous: { avg_pool2d_out_cuda_frame - <<>>( - count, - input_data, - nbatch, - nInputPlane, - inputHeight, inputWidth, - outputHeight, outputWidth, - kH, kW, - dH, dW, - padH, padW, - output_data, - divisor_override_value, - count_include_pad, use_divisor); + <<>>( + count, + input_data, + nbatch, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + kH, + kW, + dH, + dW, + padH, + padW, + output_data, + divisor_override_value, + count_include_pad, + use_divisor); C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 688763ea39c13..fae433cd6aae6 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -487,6 +487,8 @@ - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True + precomputed: + - dim -> int dim dispatch: CPU, CUDA: all_out @@ -508,6 +510,8 @@ - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True + precomputed: + - dim -> int dim dispatch: CPU, CUDA: any_out @@ -8816,6 +8820,10 @@ - func: avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True + precomputed: + - kernel_size -> int kH, int kW + - stride -> int dH, int dW + - padding -> int padH, int padW dispatch: CPU: avg_pool2d_out_cpu CUDA: avg_pool2d_out_cuda diff --git a/tools/codegen/api/structured.py b/tools/codegen/api/structured.py index 4f1437fb6f3ff..6aab794413c64 100644 --- a/tools/codegen/api/structured.py +++ b/tools/codegen/api/structured.py @@ -84,7 +84,27 @@ def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> List[B def impl_arguments(g: NativeFunctionsGroup) -> List[Binding]: args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] - args.extend(g.out.func.arguments.non_out) + + if g.out.precomputed: + # A list of parameters for the impl function with + # certain parameters replaced with precomputed counterparts + # as specified in native_functions.yaml. + non_out_args_replaced: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] + + for a in g.out.func.arguments.non_out: + if isinstance(a, Argument) and a.name in g.out.precomputed.replace: + # If a is in precompute.replace, append the parameters + # that should replace it onto non_out_args_replaced. + for replacement in g.out.precomputed.replace[a.name]: + non_out_args_replaced.append(replacement) + else: + # If not, push a as it is. + non_out_args_replaced.append(a) + + args.extend(non_out_args_replaced) + else: + args.extend(g.out.func.arguments.non_out) + args.extend(g.out.func.arguments.out) return [r for arg in args for r in argument(arg)] diff --git a/tools/codegen/dest/register_dispatch_key.py b/tools/codegen/dest/register_dispatch_key.py index 784ee56e765fb..ec3a2e6afc0b1 100644 --- a/tools/codegen/dest/register_dispatch_key.py +++ b/tools/codegen/dest/register_dispatch_key.py @@ -584,7 +584,29 @@ def generate_defn(cpp_sig: CppSignature) -> str: method=False ) ) - sig_body.append(f"op.meta({meta_exprs});") + + if self.g.out.precomputed: + # If this function group has precomputed elements, the meta function + # returns a struct containing them which must be saved so that it + # can be unpacked when generating code to call the impl. + sig_body.append(f"auto precompute = op.meta({meta_exprs});") + + # Put all of the contents of the precompute struct into the context + # so that translate will be able to return the correct args for the + # call to the impl. + for precomputed_elems in self.g.out.precomputed.replace.values(): + for arg in precomputed_elems: + context.append(Expr( + expr=f"precompute.{arg.name}", + type=structured.argument_type(arg, binds=arg.name), + )) + + # Add a use of the precompute struct so FB internal compilers don't + # complain that there is an unused variable. + sig_body.append("(void)precompute;") + else: + sig_body.append(f"op.meta({meta_exprs});") + # After running meta, op.outputs_ is guaranteed to be valid; # add it to the context diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 203b5a99c356c..c986f8311604d 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -456,9 +456,98 @@ def compute_meta_function_declaration(g: NativeFunctionsGroup) -> Optional[str]: parent_class = g.out.structured_inherits if parent_class is None: parent_class = "at::impl::MetaBase" + meta_return = "void" + precomputed = g.out.precomputed if g.structured else None + + if precomputed: + # Generate the template declaration with one bool parameter for each + # precomputed element. Each parameter is true if the corresponding (in + # terms of position) precomputed element has been set. + precomputed_elements = [elem for replace_list in precomputed.replace.values() for elem in replace_list] + precomputed_template_parameters = [elem.name.upper() for elem in precomputed_elements] + precomputed_template_params_str = ", ".join(f"bool {param} = false" for param in precomputed_template_parameters) + precompute_template_decl = f"template <{precomputed_template_params_str}>" + + # Generate a string containing declarations of all precomputed elements. + precomputed_elements_with_cpp_types = [ + structured.argument_type(elem, binds=elem.name) + for elem in precomputed_elements + ] + + precomputed_elements_decl = ";\n".join( + f"{elem.cpp_type(strip_ref=True)} {elem.name}" for elem in precomputed_elements_with_cpp_types + ) + + # Generate "setter" methods for each precomputed element. Each method will return + # a new instance of precompute_out with the template parameter that corresponds to + # the member set by the method to true (to indicate that it has been set). + setter_methods = [] + for i, elem in enumerate(precomputed_elements): + # Generate the signature. The return type will be the same + # as the type of `this` but with the template parameter + # corresponding to the element set by this method set to true. + # The assert generated below will ensure that this template + # parameter is false on the type of `this`. + return_ty_templates = ", ".join( + precomputed_template_parameters[:i] + ["true"] + precomputed_template_parameters[i + 1:] + ) + return_ty = f"precompute_out<{return_ty_templates}>" + elem_cpp_ty = precomputed_elements_with_cpp_types[i].cpp_type(strip_ref=True) + signature = f"{return_ty} set_{elem.name}({elem_cpp_ty} value)" + + # Generate an assert which checks that the + # template parameter corresponding to the precomputed + # element that is set by this method is false on the + # class corresponding to the object that `this` points to. + # This ensures that each element can be set only once. + assert_msg = f"\"{precomputed_elements[i].name} already set\"" + assert_stmt = f"static_assert({precomputed_template_parameters[i]} == false, {assert_msg});" + + # Generate the new object construction block. All state + # except the element that this method sets is copied from the + # object that `this` points to. The value for the element that + # the method sets is taken from a method parameter. + construction_stmts = [] + construction_stmts.append(f"{return_ty} ret;") + + for j, elem in enumerate(precomputed_elements): + if i == j: + construction_stmts.append(f"ret.{elem.name} = value;") + else: + construction_stmts.append(f"ret.{elem.name} = this->{elem.name};") + + construction_stmts.append("return ret;") + construction_block = "\n".join(construction_stmts) + + setter_methods.append(f""" + {signature} {{ + {assert_stmt} + {construction_block} + }} + """) + setter_methods_decl = "\n".join(setter_methods) + + # Meta should return an instance of the struct containing the precomputed elements. + meta_return_template_params = ", ".join(["true"] * len(precomputed_template_parameters)) + # This typedef (actually a using statement) is needed so that TORCH_META_FUNC can reuse the return + # type (which has a variable number of template parameters). + meta_return_typedef = f"using meta_return_ty = precompute_out <{meta_return_template_params}>;" + meta_return = "meta_return_ty" + precomputed_decl = f""" + {precompute_template_decl} + struct TORCH_API precompute_out {{ + {setter_methods_decl} + {precomputed_elements_decl}; + }};""" + else: + meta_return_typedef = "" + precomputed_decl = "" + return f"""\ struct TORCH_API structured_{name} : public {parent_class} {{ - void meta({args_str}); + {precomputed_decl} + {meta_return_typedef} + {meta_return} meta({args_str}); }}; """ diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 4f82b70ee31f2..e604e72d3a1ad 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -229,6 +229,14 @@ class NativeFunction: # changes the semantics of set_output to call the parent class. structured_inherits: Optional[str] + # Structured kernels can declare elements as "precomputed". These elements + # are returned by the meta function in one struct and passed to the impl + # function in lieu of certain kernel arguments that these precomputed + # elements supersede. Information about the names and types of these + # precomputed elements and how they correspond to kernel arguments is stored + # in this member, if applicable. + precomputed: Optional['Precompute'] + # Argument names whose default should be excluded from the C++ interface. # Intended for resolving overload ambiguities between signatures. cpp_no_default_args: Set[str] @@ -320,6 +328,10 @@ def from_yaml( category_override = e.pop('category_override', None) assert category_override is None or isinstance(category_override, str), f'not a str: {category_override}' + precomputed_dict = e.pop('precomputed', None) + assert precomputed_dict is None or structured is True + precomputed = Precompute.parse(precomputed_dict) if precomputed_dict else None + from tools.codegen.api import cpp raw_dispatch = e.pop('dispatch', None) @@ -389,6 +401,7 @@ def from_yaml( structured=structured, structured_delegate=structured_delegate, structured_inherits=structured_inherits, + precomputed=precomputed, manual_kernel_registration=manual_kernel_registration, manual_cpp_binding=manual_cpp_binding, python_module=python_module, @@ -1496,3 +1509,42 @@ def parse_returns(return_decl: str) -> Tuple[Return, ...]: if return_decl[0] == '(' and return_decl[-1] == ')': return_decl = return_decl[1:-1] return tuple(Return.parse(arg) for arg in return_decl.split(', ')) + + +# A Precompute instance consists of a map from kernel argument name +# to the list of Argument instances that should replace that +# kernel argument in the impl function. +@dataclass(frozen=True) +class Precompute: + # A map from kernel argument name -> a list of precomputed + # elements that replaces/supersedes it. + replace: Dict[str, List[Argument]] + + @staticmethod + def parse(src: object) -> 'Precompute': + assert isinstance(src, list) + + # src is a list of strings of the format: + # {kernel param name} -> {replacement decl}[, {replacement decl}, ...] + # Parse this list to get the names of which precomputed elements + # should replace which kernel arguments. + replace = {} + for raw_replace_item in src: + assert isinstance(raw_replace_item, str) + + arg, with_list_raw = raw_replace_item.split(' -> ') + with_list = with_list_raw.split(',') + with_list_args = [Argument.parse(name.strip()) for name in with_list] + replace[arg] = with_list_args + + r = Precompute(replace=replace) + assert r.to_list() == src, 'r.to_list() != src' + return r + + def to_list(self) -> List[str]: + replace_list = [] + for kernel_param, replacement_params in self.replace.items(): + replacements = ', '.join(str(param) for param in replacement_params) + replace_list.append(f'{kernel_param} -> {replacements}') + + return replace_list From 9495674905053b67247e8e809d5c088f4fa62abc Mon Sep 17 00:00:00 2001 From: Yuchen Huang Date: Wed, 1 Sep 2021 14:48:00 -0700 Subject: [PATCH 437/530] [xplat][metal] Add getters and setters for ivars in Conv2dOpContext (#57395) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/57395 As title ghstack-source-id: 137223806 (Note: this ignores all push blocking failures!) Test Plan: ### Lib Build - `buck build caffe2:aten_metal_prepack` ### Integration Test - `arc focus2 pp-ops -a ModelRunner` - Click "Test Person/Hair Segmentation Model" {F612831435} - Image Classification Demo {F614144868} Reviewed By: xta0 Differential Revision: D28132020 fbshipit-source-id: 73560263a9d14e9ecfa39c69deb158a2ed8cb179 --- .../ATen/native/metal/MetalPrepackOpContext.h | 107 +++++++++++++----- .../ATen/native/metal/ops/MetalConvolution.mm | 28 ++--- 2 files changed, 92 insertions(+), 43 deletions(-) diff --git a/aten/src/ATen/native/metal/MetalPrepackOpContext.h b/aten/src/ATen/native/metal/MetalPrepackOpContext.h index e6b3f0b78a518..5976d7af23e53 100644 --- a/aten/src/ATen/native/metal/MetalPrepackOpContext.h +++ b/aten/src/ATen/native/metal/MetalPrepackOpContext.h @@ -21,14 +21,14 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { public: SerializationTypeConv2dPrePack pack() { return std::make_tuple( - weight, - bias, - stride, - padding, - dilation, - groups, - output_min, - output_max); + weight_, + bias_, + stride_, + padding_, + dilation_, + groups_, + output_min_, + output_max_); } Conv2dOpContext() = delete; Conv2dOpContext( @@ -40,32 +40,81 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { int64_t groups, const c10::optional& output_min, const c10::optional& output_max) - : weight(std::move(weight)), - bias(std::move(bias)), - stride(stride), - padding(padding), - dilation(dilation), - groups(groups), - output_min(output_min), - output_max(output_max) {} + : weight_(std::move(weight)), + bias_(std::move(bias)), + stride_(stride), + padding_(padding), + dilation_(dilation), + groups_(groups), + output_min_(output_min), + output_max_(output_max) {} void release_resources() override { - if (releaseCallback) { - releaseCallback(conv2dOp); - conv2dOp = nullptr; + if (releaseCallback_) { + releaseCallback_(conv2dOp_); + conv2dOp_ = nullptr; } } - Tensor weight; - c10::optional bias; - std::vector stride; - std::vector padding; - std::vector dilation; - int64_t groups; - c10::optional output_min; - c10::optional output_max; - void* conv2dOp = nullptr; // reserved to hold MPSCNNConv2dOp objects - std::function releaseCallback = nullptr; + const Tensor& get_weight() const { + return weight_; + } + + const c10::optional& get_bias() const { + return bias_; + } + + const std::vector& get_stride() const { + return stride_; + } + + const std::vector& get_padding() const { + return padding_; + } + + const std::vector& get_dilation() const { + return dilation_; + } + + int64_t get_groups() const { + return groups_; + } + + const c10::optional& get_output_min() const { + return output_min_; + } + + const c10::optional& get_output_max() const { + return output_max_; + } + + void set_conv2dOpPtr(void* ptr) { + conv2dOp_ = ptr; + } + + void* get_conv2dOpPtr() const { + return conv2dOp_; + } + + void set_releaseCallback(const std::function& func) { + releaseCallback_ = func; + } + + std::function& get_releaseCallback() { + return releaseCallback_; + } + + private: + Tensor weight_; + c10::optional bias_; + std::vector stride_; + std::vector padding_; + std::vector dilation_; + int64_t groups_; + c10::optional output_min_; + c10::optional output_max_; + std::function releaseCallback_ = nullptr; + void* conv2dOp_ = nullptr; // reserved to hold MPSCNNConv2dOp objects }; using SerializationTypeLinearPrePack = std::tuple< diff --git a/aten/src/ATen/native/metal/ops/MetalConvolution.mm b/aten/src/ATen/native/metal/ops/MetalConvolution.mm index c726382dde45f..4f07f5f77161d 100644 --- a/aten/src/ATen/native/metal/ops/MetalConvolution.mm +++ b/aten/src/ATen/native/metal/ops/MetalConvolution.mm @@ -55,28 +55,28 @@ Tensor conv2d( Tensor conv2d(const Tensor& input, Conv2dOpContext& context) { MPSImage* X = imageFromTensor(input); Conv2DParams params{input.sizes(), - context.weight.sizes(), - context.padding, - context.stride, - context.dilation, - context.groups}; + context.get_weight().sizes(), + context.get_padding(), + context.get_stride(), + context.get_dilation(), + context.get_groups()}; auto outputSize = params.output_sizes(); if(c10::multiply_integers(outputSize) == 0){ return makeTensor({outputSize}, input.options()); } - MPSCNNConvOp* op = (__bridge MPSCNNConvOp*)(context.conv2dOp); - NeuronType nt = neuronType(context.output_min, context.output_max); + MPSCNNConvOp* op = (__bridge MPSCNNConvOp*)(context.get_conv2dOpPtr()); + NeuronType nt = neuronType(context.get_output_min(), context.get_output_max()); if (!op) { - float* w = context.weight.data_ptr(); - float* b = context.bias.has_value() ? ((*context.bias).data_ptr()) + float* w = context.get_weight().data_ptr(); + float* b = context.get_bias().has_value() ? ((*context.get_bias()).data_ptr()) : nullptr; op = [MPSCNNConvOp conv2d:params weights:w bias:b neuronFilter:nt]; - context.conv2dOp = (void*)CFBridgingRetain(op); - context.releaseCallback = ^(void* res) { + context.set_conv2dOpPtr((void*)CFBridgingRetain(op)); + context.set_releaseCallback(^(void* res) { if (res) { CFBridgingRelease(res); } - }; + }); } MetalTensorImplStorage mt{outputSize}; MetalCommandBuffer* commandBuffer = getCommandBuffer(input); @@ -86,8 +86,8 @@ Tensor conv2d(const Tensor& input, Conv2dOpContext& context) { // fuse hardtanh with convolution if (nt == NeuronType::Clamp) { MPSImage* Y2 = createTemporaryImage(commandBuffer, [Y1 sizes]); - float min = context.output_min.value().toFloat(); - float max = context.output_max.value().toFloat(); + float min = context.get_output_min().value().toFloat(); + float max = context.get_output_max().value().toFloat(); MPSCNNClampOp* clampOp = [MPSCNNClampOp newWithTextures:@[ Y1, Y2 ] Args:@[ @(min), @(max) ]]; [clampOp encode:commandBuffer.buffer]; From 93bc03622eebb0bab4a79511c61e73b7315bf961 Mon Sep 17 00:00:00 2001 From: zhouzhuojie Date: Wed, 1 Sep 2021 14:53:25 -0700 Subject: [PATCH 438/530] Silent rm error for sccache log file (#64388) Summary: Sample reporting from dr.ci ![image](https://user-images.githubusercontent.com/658840/131724645-75afa04f-7554-4674-8e7c-cf139c84d994.png) The `rm` command is not actually running into problems, just need to silent the console output. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64388 Reviewed By: walterddr, malfet, seemethere Differential Revision: D30704439 Pulled By: zhouzhuojie fbshipit-source-id: ecd35531decf05b75cef30d08d46635f81112f67 --- .jenkins/caffe2/common.sh | 2 +- .jenkins/pytorch/common.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.jenkins/caffe2/common.sh b/.jenkins/caffe2/common.sh index 026cb8349d3d9..168e823ba2cc4 100644 --- a/.jenkins/caffe2/common.sh +++ b/.jenkins/caffe2/common.sh @@ -18,7 +18,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then if which sccache > /dev/null; then # Save sccache logs to file sccache --stop-server || true - rm ~/sccache_error.log || true + rm -f ~/sccache_error.log || true SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server # Report sccache stats for easier debugging diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh index 52b91510c4029..09e814b07d62d 100644 --- a/.jenkins/pytorch/common.sh +++ b/.jenkins/pytorch/common.sh @@ -74,7 +74,7 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then if which sccache > /dev/null; then # Save sccache logs to file sccache --stop-server > /dev/null 2>&1 || true - rm ~/sccache_error.log || true + rm -f ~/sccache_error.log || true if [[ -n "${SKIP_SCCACHE_INITIALIZATION:-}" ]]; then # sccache --start-server seems to hang forever on self hosted runners for GHA # so let's just go ahead and skip the --start-server altogether since it seems From 7ffcf1550374af29f5b3ce316d4e576423945be0 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 1 Sep 2021 14:56:14 -0700 Subject: [PATCH 439/530] [quant][graphmode][api] Add backend_config_dict to prepare_fx api (#64135) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64135 We want to start aligning the api with the design in https://github.com/pytorch/pytorch/wiki/Extending-PyTorch-Quantization-to-Custom-Backends We plan to gradually move things from `prepare_custom_config_dict` and `convert_custom_config_dict` to `backend_config_dict` and allow custom backend developer to define their own way of quantizing operators. Test Plan: python test/test_quantization.py TestQuantizeFx python test/test_quantization.py TestQuantizeFxOps Imported from OSS Reviewed By: zou3519 Differential Revision: D30699456 fbshipit-source-id: e3c068da8d3da2270f57719f7159cc71cafa8598 --- torch/quantization/fx/prepare.py | 1 + torch/quantization/quantize_fx.py | 28 +++++++++++++++++++--------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py index a6fd660e5e84c..fb526d09279dc 100644 --- a/torch/quantization/fx/prepare.py +++ b/torch/quantization/fx/prepare.py @@ -1114,6 +1114,7 @@ def prepare( node_name_to_scope: Dict[str, Tuple[str, type]], prepare_custom_config_dict: Optional[Dict[str, Any]] = None, equalization_qconfig_dict: Optional[Dict[str, Any]] = None, + backend_config_dict: Optional[Dict[str, Any]] = None, is_standalone_module: bool = False) -> ObservedGraphModule: """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index aa8edbba64e49..2dd98ea6ffe4c 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -140,8 +140,9 @@ def create_node(self, kind : str, target : Target, return node def _prepare_fx(model: torch.nn.Module, qconfig_dict: Any, - prepare_custom_config_dict: Dict[str, Any] = None, - equalization_qconfig_dict: Dict[str, Any] = None, + prepare_custom_config_dict: Optional[Dict[str, Any]] = None, + equalization_qconfig_dict: Optional[Dict[str, Any]] = None, + backend_config_dict: Optional[Dict[str, Any]] = None, is_standalone_module: bool = False) -> ObservedGraphModule: r""" Internal helper function for prepare_fx Args: @@ -203,7 +204,8 @@ def _prepare_fx(model: torch.nn.Module, qconfig_dict: Any, def _prepare_standalone_module_fx( model: torch.nn.Module, qconfig_dict: Any, - prepare_custom_config_dict: Dict[str, Any] = None) -> GraphModule: + prepare_custom_config_dict: Dict[str, Any] = None, + backend_config_dict: Dict[str, Any] = None) -> GraphModule: r""" [Internal use only] Prepare a standalone module, so that it can be used when quantizing the parent module. standalone_module means it a submodule that is not inlined in parent module, @@ -224,7 +226,7 @@ def _prepare_standalone_module_fx( same as input_quantized_idxs configuration provided for the standalone module """ - return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module=True) + return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, backend_config_dict, is_standalone_module=True) def fuse_fx(model: torch.nn.Module, fuse_custom_config_dict: Dict[str, Any] = None) -> GraphModule: @@ -265,8 +267,9 @@ def fuse_fx(model: torch.nn.Module, def prepare_fx( model: torch.nn.Module, qconfig_dict: Any, - prepare_custom_config_dict: Dict[str, Any] = None, - equalization_qconfig_dict: Dict[str, Any] = None) -> ObservedGraphModule: + prepare_custom_config_dict: Optional[Dict[str, Any]] = None, + equalization_qconfig_dict: Optional[Dict[str, Any]] = None, + backend_config_dict: Optional[Dict[str, Any]] = None) -> ObservedGraphModule: r""" Prepare a model for post training static quantization Args: @@ -392,6 +395,11 @@ def prepare_fx( with a similar structure as qconfig_dict except it will contain configurations specific to equalization techniques such as input-weight equalization. + `backend_config_dict`: a dictionary that specifies how operators are quantized + in a backend, this includes how the operaetors are observed, + supported fusion patterns, how quantize/dequantize ops are + inserted, supported dtypes etc. The structure of the dictionary is still WIP + and will change in the future, please don't use right now. Return: @@ -420,16 +428,18 @@ def calibrate(model, data_loader): torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_fx") assert not model.training, 'prepare_fx only works for models in ' + \ 'eval mode' - return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, equalization_qconfig_dict) + return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, equalization_qconfig_dict, backend_config_dict) def prepare_qat_fx( model: torch.nn.Module, qconfig_dict: Any, - prepare_custom_config_dict: Dict[str, Any] = None) -> ObservedGraphModule: + prepare_custom_config_dict: Optional[Dict[str, Any]] = None, + backend_config_dict: Optional[Dict[str, Any]] = None) -> ObservedGraphModule: r""" Prepare a model for quantization aware training Args: `model`: torch.nn.Module model, must be in train mode `qconfig_dict`: see :func:`~torch.quantization.prepare_fx` `prepare_custom_config_dict`: see :func:`~torch.quantization.prepare_fx` + `backend_config_dict`: see :func:`~torch.quantization.prepare_fx` Return: A GraphModule with fake quant modules (configured by qconfig_dict), ready for @@ -457,7 +467,7 @@ def train_loop(model, train_data): torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_qat_fx") assert model.training, 'prepare_qat_fx only works for models in ' + \ 'train mode' - return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict) + return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, backend_config_dict) def _convert_fx( graph_module: GraphModule, is_reference: bool, From 535526b95cb26be10c0942129911db7d281d5bd9 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 1 Sep 2021 15:12:05 -0700 Subject: [PATCH 440/530] Restore LayerNorm numerics test (#64385) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64385 It was deleted in https://github.com/pytorch/pytorch/pull/63276. The numerics test was meant to check LayerNorm behavior on large inputs, but we deleted it without realizing that. Test Plan: - wait for tests. Reviewed By: ngimel Differential Revision: D30702950 Pulled By: zou3519 fbshipit-source-id: a480e26c45ec38fb628938b70416cdb22d976a46 --- test/test_nn.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/test/test_nn.py b/test/test_nn.py index e60ff698ed19b..5008c7256acf7 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -13281,6 +13281,32 @@ def test_LayerNorm_general(self, device): if self.device_type == 'cuda': self._test_LayerNorm_cuda_half(device) + @onlyOnCPUAndCUDA + def test_LayerNorm_numeric(self, device): + def layer_norm_ref(X, gamma, beta, normalized_shape, eps): + feature_size = np.prod(normalized_shape) + X_view = X.view(-1, feature_size) + mean = X_view.mean(dim=-1, keepdim=True) + var = X_view.var(dim=-1, unbiased=False, keepdim=True) + Y = (X_view - mean) / torch.sqrt(var + eps) + Y = Y * gamma.view(-1) + beta.view(-1) + return Y.view(*X.size()) + + normalized_shape = [256, 256, 144] + layer_norm = nn.LayerNorm(normalized_shape).float().to(device) + X = torch.rand(2, *normalized_shape, dtype=torch.float32, + device=device) + + Y = layer_norm(X) + Y_ref = layer_norm_ref(X, layer_norm.weight.data, layer_norm.bias.data, + normalized_shape, layer_norm.eps) + self.assertEqual(Y, Y_ref, rtol=0, atol=1e-5) + + if self.device_type == 'cuda': + layer_norm.cpu() + Y_cpu = layer_norm(X.cpu()) + self.assertEqual(Y_cpu, Y, rtol=0, atol=1e-5) + @onlyOnCPUAndCUDA def test_GroupNorm_general(self, device): self._test_GroupNorm_general(device) From 69f4401b7b6ea6a51a090ca8c958968a80a529e2 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Wed, 1 Sep 2021 15:18:14 -0700 Subject: [PATCH 441/530] Make datasets in `ConcatDataset` not need to be sized (#64114) Summary: `datasets` needs to be iterable, but also sized because the length is checked. But immediately after it's converted to a list. By changing the order of these 2 lines, it doesn't need to be sized anymore. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64114 Reviewed By: H-Huang Differential Revision: D30641480 Pulled By: ejguan fbshipit-source-id: 7e16548c2123afa65b83845f9929271fa07fe1e8 --- torch/utils/data/dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py index 609e1a1eb6e2d..50488d13ae5d3 100644 --- a/torch/utils/data/dataset.py +++ b/torch/utils/data/dataset.py @@ -271,9 +271,8 @@ def cumsum(sequence): def __init__(self, datasets: Iterable[Dataset]) -> None: super(ConcatDataset, self).__init__() - # Cannot verify that datasets is Sized - assert len(datasets) > 0, 'datasets should not be an empty iterable' # type: ignore[arg-type] self.datasets = list(datasets) + assert len(self.datasets) > 0, 'datasets should not be an empty iterable' # type: ignore[arg-type] for d in self.datasets: assert not isinstance(d, IterableDataset), "ConcatDataset does not support IterableDataset" self.cumulative_sizes = self.cumsum(self.datasets) From ed89937d2cbda8f4c5b67439b8b7b138cff42552 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 1 Sep 2021 15:48:54 -0700 Subject: [PATCH 442/530] [quant][graphmode][fx] Add fbgemm backend_config_dict (#64288) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64288 This is just to setup the file structure and unblock experimentation. The format for backend_config_dict will change in the future Test Plan: python test/test_quantization.py TestQuantizeFx python test/test_quantization.py TestQuantizeFxOps Imported from OSS Reviewed By: zou3519 Differential Revision: D30699457 fbshipit-source-id: 28211a4def05d34757850c045a36e311f54760fe --- torch/quantization/fx/backend_config_dict/__init__.py | 4 ++++ torch/quantization/fx/backend_config_dict/fbgemm.py | 11 +++++++++++ torch/quantization/fx/prepare.py | 11 +++++++++-- 3 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 torch/quantization/fx/backend_config_dict/__init__.py create mode 100644 torch/quantization/fx/backend_config_dict/fbgemm.py diff --git a/torch/quantization/fx/backend_config_dict/__init__.py b/torch/quantization/fx/backend_config_dict/__init__.py new file mode 100644 index 0000000000000..edb2b956851b7 --- /dev/null +++ b/torch/quantization/fx/backend_config_dict/__init__.py @@ -0,0 +1,4 @@ +from .fbgemm import get_fbgemm_backend_config_dict + +def validate_backend_config_dict(backend_config_dict): + return "quant_patterns" in backend_config_dict diff --git a/torch/quantization/fx/backend_config_dict/fbgemm.py b/torch/quantization/fx/backend_config_dict/fbgemm.py new file mode 100644 index 0000000000000..4f40b100f0b78 --- /dev/null +++ b/torch/quantization/fx/backend_config_dict/fbgemm.py @@ -0,0 +1,11 @@ +from ..pattern_utils import get_default_quant_patterns + +def get_fbgemm_backend_config_dict(): + """ Get the backend config dictionary for fbgemm backend + NOTE: Current api will change in the future, it's just to unblock experimentation for + new backends, please don't use it right now. + """ + # TODO: add output_activation_post_process_map + return { + "quant_patterns": get_default_quant_patterns() + } diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py index fb526d09279dc..0b65e339ce0a3 100644 --- a/torch/quantization/fx/prepare.py +++ b/torch/quantization/fx/prepare.py @@ -42,7 +42,6 @@ from .pattern_utils import ( MatchResult, - get_default_quant_patterns, get_default_output_activation_post_process_map, ) @@ -84,6 +83,9 @@ weight_dtype, ) +from .backend_config_dict import get_fbgemm_backend_config_dict +from .backend_config_dict import validate_backend_config_dict + from typing import Any, Callable, Dict, List, Optional, Tuple, Union def is_activation_post_process_node(node: Node, modules: Dict[str, torch.nn.Module]) -> bool: @@ -1140,6 +1142,10 @@ def prepare( prepare_custom_config_dict = {} if equalization_qconfig_dict is None: equalization_qconfig_dict = {} + if backend_config_dict is None: + backend_config_dict = get_fbgemm_backend_config_dict() + + validate_backend_config_dict(backend_config_dict) additional_quant_patterns = \ prepare_custom_config_dict.get("additional_quant_pattern", {}) @@ -1153,8 +1159,9 @@ def prepare( # ((, ): # ), # } + quant_patterns = backend_config_dict["quant_patterns"] patterns: Dict[Pattern, QuantizeHandler] = get_combined_dict( - get_default_quant_patterns(), additional_quant_patterns) + quant_patterns, additional_quant_patterns) convert_dict_to_ordered_dict(qconfig_dict) convert_dict_to_ordered_dict(equalization_qconfig_dict) From a8f9aab84074ceab684da166199cc598afc62d54 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 1 Sep 2021 16:09:46 -0700 Subject: [PATCH 443/530] [DDP Comm Hook] Add bf16 gradient compression to ddp_comm_hooks.rst (#64346) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64346 as title ghstack-source-id: 137170288 Test Plan: N/A Reviewed By: rohan-varma Differential Revision: D30693513 fbshipit-source-id: 8c64b8404ff3b0322e1bbbd93f6ef051ea91307d --- docs/source/ddp_comm_hooks.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/ddp_comm_hooks.rst b/docs/source/ddp_comm_hooks.rst index aed70c0752825..5bd0378e7c7b7 100644 --- a/docs/source/ddp_comm_hooks.rst +++ b/docs/source/ddp_comm_hooks.rst @@ -44,11 +44,13 @@ The input ``bucket`` is a :class:`torch.distributed.GradBucket` object. .. currentmodule:: torch.distributed.algorithms.ddp_comm_hooks.default_hooks .. autofunction:: allreduce_hook .. autofunction:: fp16_compress_hook +.. autofunction:: bf16_compress_hook -Additionally, a communication hook wraper is provided to support :meth:`~fp16_compress_hook` as a wrapper, +Additionally, a communication hook wraper is provided to support :meth:`~fp16_compress_hook` or :meth:`~bf16_compress_hook` as a wrapper, which can be combined with other communication hooks. .. autofunction:: fp16_compress_wrapper +.. autofunction:: bf16_compress_wrapper PowerSGD Communication Hook --------------------------- From 5d80a48cef373e22393af1b1f4f4e3f2ad948a76 Mon Sep 17 00:00:00 2001 From: anjali411 Date: Wed, 1 Sep 2021 16:11:38 -0700 Subject: [PATCH 444/530] Add fast path for addmm when the inputs are conjugate (#59380) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/59380 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D28898374 Pulled By: anjali411 fbshipit-source-id: eab0e64d37bb57c18b54cabb8e5c00666338ba04 --- aten/src/ATen/ConjugateFallback.cpp | 11 ++ aten/src/ATen/cuda/CUDABlas.cpp | 4 +- aten/src/ATen/native/CPUBlas.cpp | 4 +- aten/src/ATen/native/CPUBlas.h | 2 +- aten/src/ATen/native/LinearAlgebra.cpp | 35 ++++-- aten/src/ATen/native/NegateFallback.cpp | 1 + aten/src/ATen/native/TensorFactories.cpp | 13 ++- aten/src/ATen/native/cuda/Blas.cpp | 65 ++++++++--- test/test_linalg.py | 32 ++++++ test/test_torch.py | 11 +- .../_internal/common_methods_invocations.py | 104 ++++++++++++++---- 11 files changed, 223 insertions(+), 59 deletions(-) diff --git a/aten/src/ATen/ConjugateFallback.cpp b/aten/src/ATen/ConjugateFallback.cpp index a64ef4950940b..2cf9538c9bb32 100644 --- a/aten/src/ATen/ConjugateFallback.cpp +++ b/aten/src/ATen/ConjugateFallback.cpp @@ -60,6 +60,17 @@ TORCH_LIBRARY_IMPL(aten, Conjugate, m) { m.impl("vdot", torch::CppFunction::makeFallthrough()); m.impl("dot.out", torch::CppFunction::makeFallthrough()); m.impl("vdot.out", torch::CppFunction::makeFallthrough()); + m.impl("alias", torch::CppFunction::makeFallthrough()); + m.impl("mm", torch::CppFunction::makeFallthrough()); + m.impl("mm.out", torch::CppFunction::makeFallthrough()); + m.impl("addmm", torch::CppFunction::makeFallthrough()); + m.impl("addmm_", torch::CppFunction::makeFallthrough()); + m.impl("addmm.out", torch::CppFunction::makeFallthrough()); + m.impl("bmm", torch::CppFunction::makeFallthrough()); + m.impl("bmm.out", torch::CppFunction::makeFallthrough()); + m.impl("baddbmm", torch::CppFunction::makeFallthrough()); + m.impl("baddbmm_", torch::CppFunction::makeFallthrough()); + m.impl("baddbmm.out", torch::CppFunction::makeFallthrough()); } } // namespace at diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 75e59d0ecc100..70c3dda6f3401 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -64,8 +64,8 @@ static void _cublasAdjustLdLevel3( int64_t* lda, int64_t* ldb, int64_t* ldc) { - bool transa_ = ((transa == 't') || (transa == 'T')); - bool transb_ = ((transb == 't') || (transb == 'T')); + bool transa_ = ((transa != 'n') && (transa != 'N')); + bool transb_ = ((transb != 'n') && (transb != 'N')); // Note: leading dimensions generally are checked that they are > 0 // and at least as big the result requires (even if the value won't diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index 1a1f6737f23f1..f14e4dce68b5a 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -78,7 +78,7 @@ char to_blas(TransposeType trans) { switch (trans) { case Transpose: return 't'; case NoTranspose: return 'n'; - // case ConjTranspose: return 'c'; + case ConjTranspose: return 'c'; } TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); } @@ -89,7 +89,7 @@ fbgemm::matrix_op_t to_fbgemm(TransposeType trans) { switch (trans) { case Transpose: return fbgemm::matrix_op_t::Transpose; case NoTranspose: return fbgemm::matrix_op_t::NoTranspose; - // case ConjTranspose: return fbgemm::matrix_op_t::Transpose; + case ConjTranspose: TORCH_INTERNAL_ASSERT(false, "ConjTranspose type is not supported in fbgemm"); } TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); } diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h index e61207f7c76b8..3a483e4361bd2 100644 --- a/aten/src/ATen/native/CPUBlas.h +++ b/aten/src/ATen/native/CPUBlas.h @@ -12,7 +12,7 @@ namespace cpublas { enum TransposeType { Transpose, NoTranspose, - // ConjTranspose, -- Not implemented + ConjTranspose, }; namespace internal { diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 10576a0c63a49..2ae6202ce87e2 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -959,7 +959,6 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { static void addmm_impl_cpu_( Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) { TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2); - // Array access is faster than .size(n) and .stride(n) const auto self_sizes = self.sizes(); auto m1_strides = m1.strides(); @@ -992,18 +991,18 @@ static void addmm_impl_cpu_( if (result_strides[0] == 1 && (result_sizes[1] == 1 || result_strides[1] >= std::max(int64_t{1}, result_sizes[0]))) { transpose_c = false; - c = result; + c = result.resolve_conj(); } else if (result_strides[1] == 1 && (result_sizes[0] == 1 || result_strides[0] >= std::max(int64_t{1}, result_sizes[1]))) { std::swap(m1, m2); std::swap(m1_sizes, m2_sizes); std::swap(m1_strides, m2_strides); transpose_c = true; - c = result; + c = result.resolve_conj(); } else { transpose_c = false; // make c FORTRAN contiguous - c = result.transpose(0, 1).contiguous().transpose_(0, 1); + c = result.resolve_conj().transpose(0, 1).contiguous().transpose_(0, 1); } const int64_t m = result_sizes[transpose_c ? 1 : 0]; @@ -1017,7 +1016,7 @@ static void addmm_impl_cpu_( if (m1_strides[transpose_c ? 1 : 0] == 1 && m1_strides[transpose_c ? 0 : 1] >= std::max(int64_t{1}, m)) { transpose_a = false; - a = m1; + a = m1.resolve_conj(); } else if (m1_strides[transpose_c ? 0 : 1] == 1 && m1_strides[transpose_c ? 1 : 0] >= std::max(int64_t{1}, k)) { transpose_a = true; @@ -1034,7 +1033,7 @@ static void addmm_impl_cpu_( if (m2_strides[transpose_c ? 1 : 0] == 1 && m2_strides[transpose_c ? 0 : 1] >= std::max(int64_t{1}, k)) { transpose_b = false; - b = m2; + b = m2.resolve_conj(); } else if (m2_strides[transpose_c ? 0 : 1] == 1 && m2_strides[transpose_c ? 1 : 0] >= std::max(int64_t{1}, n)) { transpose_b = true; @@ -1048,13 +1047,16 @@ static void addmm_impl_cpu_( const int64_t ldb = b.strides()[(transpose_b == transpose_c) ? 1 : 0]; const int64_t ldc = c.strides()[transpose_c ? 0 : 1]; + // Always ensure the conjugation for c is resolved since there's no way to specify c's conjugation in the gemm call + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c.is_conj()); + // Apply BLAS routine AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, result.scalar_type(), "addmm_impl_cpu_", [&]{ at::native::cpublas::gemm( - transpose_a ? cpublas::Transpose : cpublas::NoTranspose, - transpose_b ? cpublas::Transpose : cpublas::NoTranspose, + transpose_a ? a.is_conj() ? cpublas::ConjTranspose : cpublas::Transpose : cpublas::NoTranspose, + transpose_b ? b.is_conj() ? cpublas::ConjTranspose : cpublas::Transpose : cpublas::NoTranspose, m, n, k, alpha.to(), a.data_ptr(), lda, @@ -1349,8 +1351,18 @@ Tensor& baddbmm_out_cpu(const Tensor& self_, const Tensor& batch1, const Tensor& return at::native::baddbmm__cpu(result, batch1, batch2, beta, alpha); } +Tensor& conjugate_mutable_input_if_needed(Tensor& self, bool conjugate) { + if (conjugate) { + self.conj_physical_(); + } + return self; +} + Tensor& baddbmm__cpu(Tensor& self, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha) { - return bmm_out_or_baddbmm_(self, batch1, batch2, beta, alpha, false); + bool self_is_conj = self.is_conj(); + conjugate_mutable_input_if_needed(self, self_is_conj); + bmm_out_or_baddbmm_(self, batch1.resolve_conj(), batch2.resolve_conj(), beta, alpha, false); + return conjugate_mutable_input_if_needed(self, self_is_conj); } Tensor bmm_cpu(const Tensor& self, const Tensor& mat2) { @@ -1363,7 +1375,10 @@ Tensor& bmm_out_cpu(const Tensor& batch1, const Tensor& batch2, Tensor &result) Scalar alpha(1.0); { NoNamesGuard guard; - bmm_out_or_baddbmm_(result, batch1, batch2, beta, alpha, true); + bool result_is_conj = result.is_conj(); + conjugate_mutable_input_if_needed(result, result_is_conj); + bmm_out_or_baddbmm_(result, batch1.resolve_conj(), batch2.resolve_conj(), beta, alpha, true); + conjugate_mutable_input_if_needed(result, result_is_conj); } namedinference::propagate_names_if_nonempty( result, diff --git a/aten/src/ATen/native/NegateFallback.cpp b/aten/src/ATen/native/NegateFallback.cpp index 86dbe05ff904f..d8381f58d036b 100644 --- a/aten/src/ATen/native/NegateFallback.cpp +++ b/aten/src/ATen/native/NegateFallback.cpp @@ -55,6 +55,7 @@ TORCH_LIBRARY_IMPL(aten, Negative, m) { m.impl("view", torch::CppFunction::makeFallthrough()); m.impl("_unsafe_view", torch::CppFunction::makeFallthrough()); m.impl("reshape", torch::CppFunction::makeFallthrough()); + m.impl("alias", torch::CppFunction::makeFallthrough()); } } // namespace at diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 3ee909be029ff..4712c3d99b6d8 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -1411,17 +1411,18 @@ Tensor from_file(c10::string_view filename, c10::optional shared, c10::opt Tensor clone(const Tensor& src, c10::optional optional_memory_format) { auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve); + Tensor self; if (memory_format == MemoryFormat::Preserve) { if (src.is_non_overlapping_and_dense()) { - // Copy all strides - auto self = at::empty_strided(src.sizes(), src.strides(), src.options()); - self.copy_(src); - return self; + // Copy all strides, this is marginally faster than calling empty_like + self = at::empty_strided(src.sizes(), src.strides(), src.options()); } else { - memory_format = src.suggest_memory_format(); + self = at::empty_like(src); } + } else { + self = at::empty_like(src, src.options(), memory_format); } - auto self = at::empty_like(src, src.options(), memory_format); + self.copy_(src); return self; } diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index b4479101c59c9..269307d605aec 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -4,24 +4,51 @@ #include #include - namespace at { namespace native { namespace { +// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492 +c10::MaybeOwned inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) { + if (resolve_conj && tensor.is_conj()) { + return c10::MaybeOwned::owned(tensor.resolve_conj()); + } else { + return c10::MaybeOwned::borrowed(tensor); + } +} + +c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) { + if (tensor.is_non_overlapping_and_dense()) { // common case + transpose_tensor = tensor.is_contiguous(); + return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor); + } + IntArrayRef tensor_strides = tensor.strides(); + IntArrayRef tensor_sizes = tensor.sizes(); + if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max(1, tensor_sizes[0]))) { + transpose_tensor = false; + return resolve_conj_if_indicated(tensor, !transpose_result); + } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max(1, tensor_sizes[1]))) { + transpose_tensor = true; + return resolve_conj_if_indicated(tensor, transpose_result); + } else { + transpose_tensor = true; + return c10::MaybeOwned::owned(tensor.clone(at::MemoryFormat::Contiguous)); + } +} + c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) { if (tensor.is_non_overlapping_and_dense()) { // common case transpose_tensor = tensor.is_contiguous(); - return c10::MaybeOwned::borrowed(tensor); + return resolve_conj_if_indicated(tensor, true); } IntArrayRef tensor_strides = tensor.strides(); IntArrayRef tensor_sizes = tensor.sizes(); if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max(1, tensor_sizes[0]))) { transpose_tensor = false; - return c10::MaybeOwned::borrowed(tensor); + return resolve_conj_if_indicated(tensor, true); } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max(1, tensor_sizes[1]))) { transpose_tensor = true; - return c10::MaybeOwned::borrowed(tensor); + return resolve_conj_if_indicated(tensor, true); } else { transpose_tensor = true; return c10::MaybeOwned::owned(tensor.clone(at::MemoryFormat::Contiguous)); @@ -39,19 +66,19 @@ c10::MaybeOwned prepare_batch_matrix_for_cublas(const Tensor& tensor, bo if (tensor_strides[fast_dim] == 1 && (tensor_strides[leading_dim] >= std::max(1, m))) { transpose_tensor = false; - tensor_ = c10::MaybeOwned::borrowed(tensor); - ld_tensor = tensor_strides[leading_dim]; + tensor_ = resolve_conj_if_indicated(tensor, true); + ld_tensor = tensor_->strides()[leading_dim]; } else if ((tensor_strides[leading_dim] == 1) && (tensor_strides[fast_dim] >= std::max(1, n))) { transpose_tensor = true; - tensor_ = c10::MaybeOwned::borrowed(tensor); - ld_tensor = tensor_strides[fast_dim]; + tensor_ = resolve_conj_if_indicated(tensor, false); + ld_tensor = tensor_->strides()[fast_dim]; } else { transpose_tensor = !transpose_result; // gemm call requires leading dimension and stride parameters to be non-zero bool is_stride_non_zero = tensor.strides()[1] != 0 && tensor.strides()[2] != 0; if (tensor.is_contiguous() && is_stride_non_zero) { - tensor_ = c10::MaybeOwned::borrowed(tensor); + tensor_ = resolve_conj_if_indicated(tensor, transpose_result); } else { tensor_ = c10::MaybeOwned::owned(tensor.clone(at::MemoryFormat::Contiguous)); } @@ -104,8 +131,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma c10::MaybeOwned result_ = prepare_matrix_for_cublas(result, transpose_result); bool transpose_mat1; bool transpose_mat2; - c10::MaybeOwned mat1_ = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_mat1); - c10::MaybeOwned mat2_ = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_mat2); + auto mat1_ = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_mat1, transpose_result); + auto mat2_ = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_mat2, transpose_result); if (transpose_result) { transpose_mat1 = !transpose_mat1; @@ -141,6 +168,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma c10::nullopt /* pin_memory */)); } + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result_->is_conj()); + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] { scalar_t alpha_val = alpha.to(); scalar_t beta_val = beta.to(); @@ -148,8 +177,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma scalar_t* mat2_ptr = mat2_->data_ptr(); scalar_t* result_ptr = result_->data_ptr(); at::cuda::blas::gemm( - transpose_mat1 ? 't' : 'n', - transpose_mat2 ? 't' : 'n', + transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n', + transpose_mat2 ? mat2_->is_conj() ? 'c' : 't' : 'n', m, n, k, alpha_val, mat1_ptr, mat1_ld, @@ -207,11 +236,11 @@ Tensor& baddbmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& if ((result_strides[1] == 1) && ((result_sizes[2] == 1) || (result_strides[2] >= std::max(1, result_sizes[1])))) { - result_ = c10::MaybeOwned::borrowed(result); + result_ = resolve_conj_if_indicated(result, true); } else if ((result_strides[2] == 1) && (result_sizes[1] == 1 || (result_strides[1] >= std::max(1, result_sizes[2])))) { transpose_result = true; - result_ = c10::MaybeOwned::borrowed(result); + result_ = resolve_conj_if_indicated(result, true); } else { result_ = c10::MaybeOwned::owned(result.transpose(1, 2).clone(at::MemoryFormat::Contiguous).transpose(1, 2)); } @@ -230,6 +259,8 @@ Tensor& baddbmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ldc = result_->strides()[leading_dim]; int64_t num_batches = result_->sizes()[0]; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result_->is_conj()); + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "baddbmm_cuda", [&] { scalar_t alpha_val = alpha.to(); scalar_t beta_val = beta.to(); @@ -237,8 +268,8 @@ Tensor& baddbmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& scalar_t* batch2_ptr = batch2_->data_ptr(); scalar_t* result_ptr = result_->data_ptr(); at::cuda::blas::bgemm( - transpose_batch1 ? 't' : 'n', - transpose_batch2 ? 't' : 'n', + transpose_batch1 ? batch1_->is_conj() ? 'c' : 't' : 'n', + transpose_batch2 ? batch2_->is_conj() ? 'c' : 't' : 'n', m, n, k, alpha_val, batch1_ptr, lda, batch1_->strides()[0], diff --git a/test/test_linalg.py b/test/test_linalg.py index f7ce39272bf86..fbd219b3c5981 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -6165,6 +6165,38 @@ def genf_float(x, y): _test_mm(n, m, p, dtype, genf) + @onlyOnCPUAndCUDA + def test_mm_bmm_non_memory_dense(self, device): + def _slice(tensor, fn): + return fn(tensor)[..., ::2] + A = torch.randn(3, 6, dtype=torch.cfloat, device=device) + B = torch.randn(3, 3, dtype=torch.cfloat, device=device) + out = torch.empty(3, 3, device=device, dtype=torch.complex64).t() + out1 = torch.empty(3, 3, device=device, dtype=torch.complex64).t() + A_conj = _slice(A, torch.conj) + A_conj_physical = _slice(A, torch.conj_physical) + + self.assertEqual(torch.mm(A_conj, B, out=out), torch.mm(A_conj_physical, B, out=out)) + self.assertEqual(torch.mm(A_conj.t(), B, out=out), torch.mm(A_conj_physical.t(), B, out=out)) + + Ab = torch.randn(2, 3, 6, dtype=torch.cfloat, device=device) + Bb = torch.randn(2, 3, 3, dtype=torch.cfloat, device=device) + Bb_ = torch.randn(1, 3, 3, dtype=torch.cfloat, device=device).expand(2, 3, 3) + out_b = torch.empty(2, 3, 3, device=device, dtype=torch.complex64).transpose(-1, -2) + + Ab_conj = _slice(Ab, torch.conj) + Ab_conj_physical = _slice(Ab, torch.conj_physical) + + def t_b(tensor): + return tensor.transpose(-1, -2) + + self.assertEqual(torch.bmm(Ab_conj, Bb, out=out_b), torch.bmm(Ab_conj_physical, Bb, out=out_b)) + self.assertEqual(torch.bmm(t_b(Ab_conj), Bb, out=out_b), torch.bmm(t_b(Ab_conj_physical), Bb, out=out_b)) + + # test broadcasting + self.assertEqual(torch.bmm(Ab_conj, Bb_, out=out_b), torch.bmm(Ab_conj_physical, Bb_, out=out_b)) + self.assertEqual(torch.bmm(t_b(Ab_conj), Bb_, out=out_b), torch.bmm(t_b(Ab_conj_physical), Bb_, out=out_b)) + @onlyOnCPUAndCUDA @dtypes(torch.float32, torch.float64) def test_strided_mm_bmm(self, device, dtype): diff --git a/test/test_torch.py b/test/test_torch.py index b267b9cd6b610..a790839bbd50e 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -5328,6 +5328,13 @@ def test_clone_zero_stride_dim(self, device): y = x.as_strided([2, 1, 5], [1, 0, 2]) self.assertEqual(y, y.clone()) + def test_clone_not_memory_dense(self): + # github issue: https://github.com/pytorch/pytorch/issues/64176 + x = torch.randn(10, 8).t()[::2, ::2] + y = x.clone() + # should retain permutation after densification + self.assertTrue(y.stride() == (1, 4)) + @dtypesIfCUDA(*set(torch.testing.get_all_math_dtypes('cuda'))) @dtypes(*set(torch.testing.get_all_math_dtypes('cpu'))) def test_addcmul(self, device, dtype): @@ -6013,9 +6020,9 @@ def test_masked_select_discontiguous(self, device): out_dc = torch.empty(size * size, device=device)[::2] for v, m in product(vals_list, mask_list): if m.is_contiguous(): - expected = v[:, ::2].clone().view(-1) + expected = v[:, ::2].clone().reshape((-1, )) else: - expected = v[::2].clone().view(-1) + expected = v[::2].clone().reshape((-1, )) out = torch.masked_select(v, m) self.assertEqual(out, expected, atol=0, rtol=0) torch.masked_select(v, m, out=out_dc) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index fe8e36fbe6758..10aae4146bae8 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1606,15 +1606,29 @@ def sample_inputs_t(op_info, device, dtype, requires_grad, **kwargs): def sample_inputs_mm(op_info, device, dtype, requires_grad, **kwargs): - args_list = ( - ((S, M), (M, S)), - ) - inputs = tuple(SampleInput(make_tensor(first_shape, device, dtype, - requires_grad=requires_grad), - args=(make_tensor(second_shape, device, dtype, - requires_grad=requires_grad),)) - for first_shape, second_shape in args_list) - return inputs + first_shape, second_shape = (S, M), (M, S) + sample_inputs = [] + sample_inputs.append( + SampleInput(make_tensor(first_shape, device, dtype, + requires_grad=requires_grad), + args=(make_tensor(second_shape, device, dtype, + requires_grad=requires_grad),))) + + if dtype.is_complex: + sample_inputs.append( + SampleInput(make_tensor(first_shape, device, dtype, + requires_grad=requires_grad), + args=( + make_tensor(second_shape, device, dtype, + requires_grad=requires_grad).conj(),))) + + sample_inputs.append( + SampleInput(make_tensor(first_shape, device, dtype, + requires_grad=requires_grad).transpose(0, 1), + args=( + make_tensor(second_shape, device, dtype, + requires_grad=requires_grad).transpose(0, 1).conj(),))) + return sample_inputs def sample_inputs_addmm(op_info, device, dtype, requires_grad, **kwargs): alpha_val = kwargs.get('alpha', 2 + 3j if dtype.is_complex else 0.6) @@ -1627,15 +1641,40 @@ def sample_inputs_addmm(op_info, device, dtype, requires_grad, **kwargs): ((), (2, 2), (2, 3), True) ] test_cases = tests_list + tests_with_lhs_broadcasting # type: ignore[operator] - inputs = tuple(SampleInput(make_tensor(shape_a, device, dtype, requires_grad=requires_grad), - args=(make_tensor(shape_b, device, dtype, - requires_grad=requires_grad), - make_tensor(shape_c, device, dtype, - requires_grad=requires_grad)), - kwargs={'alpha': alpha_val, 'beta': beta_val}, - broadcasts_input=broadcasts_input) - for shape_a, shape_b, shape_c, broadcasts_input in test_cases) - return inputs + + sample_inputs = [] + + for shape_a, shape_b, shape_c, broadcasts_input in test_cases: + sample_inputs.append( + SampleInput( + make_tensor(shape_a, device, dtype, requires_grad=requires_grad), + args=( + make_tensor(shape_b, device, dtype, + requires_grad=requires_grad), + make_tensor(shape_c, device, dtype, + requires_grad=requires_grad)), + kwargs={'alpha': alpha_val, 'beta': beta_val}, + broadcasts_input=broadcasts_input)) + + if dtype.is_complex: + shape = (3, 3) + sample_inputs.append( + SampleInput(make_tensor(shape, device, dtype, requires_grad=requires_grad), + args=( + make_tensor(shape, device, dtype, + requires_grad=requires_grad).t().conj(), + make_tensor(shape, device, dtype, + requires_grad=requires_grad)), + kwargs={'alpha': alpha_val, 'beta': beta_val},)) + sample_inputs.append( + SampleInput(make_tensor(shape, device, dtype, requires_grad=requires_grad), + args=( + make_tensor(shape, device, dtype, + requires_grad=requires_grad), + make_tensor(shape, device, dtype, + requires_grad=requires_grad).t().conj()), + kwargs={'alpha': alpha_val, 'beta': beta_val},)) + return sample_inputs def sample_inputs_mv(self, device, dtype, requires_grad, **kwargs): return ( @@ -1767,6 +1806,23 @@ def sample_inputs_baddbmm(op_info, device, dtype, requires_grad, **kwargs): sample_inputs.append(SampleInput(args[0], args=(args[1], args[2]), kwargs=dict(beta=beta * (1 + 2j), alpha=alpha * (2 + 3j)), broadcasts_input=broadcasts_input)) + + if dtype.is_complex: + shapes = [(S, S, S), (S, M, S), (S, S, M)] + args = (make_tensor(shapes[0], device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor(shapes[1], device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor(shapes[2], device, dtype, + low=None, high=None, + requires_grad=requires_grad)) + sample_inputs.append( + SampleInput( + args[0].transpose(-1, 1), args=(args[1].transpose(-1, 1).conj(), args[2].transpose(-1, 1).conj()), + kwargs=dict(beta=beta * (1 + 2j), alpha=alpha * (2 + 3j)),)) + return tuple(sample_inputs) def sample_inputs_addr(op_info, device, dtype, requires_grad, **kwargs): @@ -5847,6 +5903,13 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): *[torch.bfloat16] if SM53OrLater else [], torch.complex64, torch.complex128), supports_forward_ad=True, + decorators=[ + DecorateInfo( + toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}), + 'TestCommon', 'test_variant_consistency_eager', device_type='cuda'), + DecorateInfo( + toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}), + 'TestMathBits', 'test_conj_view', device_type='cuda')], skips=( # FIXME: bfloat16 backward support likely depends on CUDA11+ # and SM53+ @@ -7045,7 +7108,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): skips=( # matmul does not correctly warn when resizing out= inputs SkipInfo('TestCommon', 'test_out'), - SkipInfo('TestCommon', 'test_conj_view', device_type='cpu'), )), OpInfo('max', op=torch.max, @@ -7835,6 +7897,10 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): assert_autodiffed=True, sample_inputs_func=sample_inputs_matmul, supports_out=False, + decorators=[ + DecorateInfo( + toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}), + 'TestMathBits', 'test_conj_view')], skips=( SkipInfo('TestJit', 'test_variant_consistency_jit',), )), From 9b8f9d5a25ca7a9ebd54a07ab88b3a540111e5b3 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 1 Sep 2021 16:21:31 -0700 Subject: [PATCH 445/530] [c10d] Prefer use of torch_check (#63928) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63928 throw std::invalid_argument results in not getting stacktraces with TORCH_SHOW_CPP_STACKTRACES=1, so instead prefer torch_check here. ghstack-source-id: 137135328 Test Plan: CI Reviewed By: mrshenli Differential Revision: D30533955 fbshipit-source-id: 33e5bf4f449e3043dec68da93f8022f6624d9675 --- test/distributed/test_c10d_gloo.py | 114 +++++++++--------- .../distributed/c10d/ProcessGroupGloo.cpp | 18 +-- 2 files changed, 66 insertions(+), 66 deletions(-) diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py index 55b2948b93b71..789d76e9d115a 100644 --- a/test/distributed/test_c10d_gloo.py +++ b/test/distributed/test_c10d_gloo.py @@ -259,43 +259,43 @@ def test_broadcast_checks(self): t2 = torch.zeros([1], dtype=torch.float64) t3 = torch.zeros([2], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.BroadcastOptions() opts.rootRank = -1 opts.rootTensor = 0 pg.broadcast([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.BroadcastOptions() opts.rootRank = self.world_size opts.rootTensor = 0 pg.broadcast([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root tensor"): + with self.assertRaisesRegex(RuntimeError, "invalid root tensor"): opts = c10d.BroadcastOptions() opts.rootRank = self.rank opts.rootTensor = -1 pg.broadcast([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root tensor"): + with self.assertRaisesRegex(RuntimeError, "invalid root tensor"): opts = c10d.BroadcastOptions() opts.rootRank = self.rank opts.rootTensor = 1 pg.broadcast([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root tensor"): + with self.assertRaisesRegex(RuntimeError, "invalid root tensor"): opts = c10d.BroadcastOptions() opts.rootRank = self.rank opts.rootTensor = 0 pg.broadcast([], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): opts = c10d.BroadcastOptions() opts.rootRank = self.rank opts.rootTensor = 0 pg.broadcast([t1, t2], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): opts = c10d.BroadcastOptions() opts.rootRank = self.rank opts.rootTensor = 0 @@ -394,15 +394,15 @@ def test_allreduce_checks(self): t2 = torch.zeros([1], dtype=torch.float64) t3 = torch.zeros([2], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): + with self.assertRaisesRegex(RuntimeError, "requires non-empty tensor list"): opts = c10d.AllreduceOptions() pg.allreduce([], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): opts = c10d.AllreduceOptions() pg.allreduce([t1, t2], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): opts = c10d.AllreduceOptions() pg.allreduce([t1, t3], opts) @@ -553,19 +553,19 @@ def test_allreduce_coalesced_checks(self): t2 = torch.zeros(1, dtype=torch.float64) t3 = torch.sparse_coo_tensor([[0]], [1], size=(1,)) - with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): + with self.assertRaisesRegex(RuntimeError, "requires non-empty tensor list"): opts = c10d.AllreduceCoalescedOptions() pg.allreduce_coalesced([], opts) - with self.assertRaisesRegex(ValueError, "tensors must all have the same type"): + with self.assertRaisesRegex(RuntimeError, "tensors must all have the same type"): opts = c10d.AllreduceCoalescedOptions() pg.allreduce_coalesced([t1, t2], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor layout at index"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor layout at index"): opts = c10d.AllreduceCoalescedOptions() pg.allreduce_coalesced([t1, t3], opts) - with self.assertRaisesRegex(ValueError, "unsupported layout"): + with self.assertRaisesRegex(RuntimeError, "unsupported layout"): opts = c10d.AllreduceCoalescedOptions() pg.allreduce_coalesced([t3, t3.clone()], opts) @@ -579,7 +579,7 @@ def test_allreduce_coalesced_checks_cuda(self): t1 = torch.zeros(1, dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "unsupported device type"): + with self.assertRaisesRegex(RuntimeError, "unsupported device type"): opts = c10d.AllreduceCoalescedOptions() pg.allreduce_coalesced([t1.cuda(), t1.cuda()], opts) @@ -647,21 +647,21 @@ def test_sparse_allreduce_checks(self): t2 = torch.sparse_coo_tensor([[0]], [1], size=(2,)) t3 = torch.sparse_coo_tensor([[0]], [1], size=(4,)) - with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): + with self.assertRaisesRegex(RuntimeError, "requires non-empty tensor list"): opts = c10d.AllreduceOptions() pg.allreduce([], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor layout"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor layout"): opts = c10d.AllreduceOptions() pg.allreduce([t1, t2], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): opts = c10d.AllreduceOptions() pg.allreduce([t2, t3], opts) # Sparse allreduce only works with c10d.ReduceOp.SUM. for op in [c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX]: - with self.assertRaisesRegex(ValueError, "unsupported reduction operation"): + with self.assertRaisesRegex(RuntimeError, "unsupported reduction operation"): opts = c10d.AllreduceOptions() opts.reduceOp = op pg.allreduce([t3], opts) @@ -705,36 +705,36 @@ def test_scatter_checks(self): t2 = torch.zeros([1], dtype=torch.float64) t3 = torch.zeros([2], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.ScatterOptions() opts.rootRank = -1 pg.scatter([t1], [], opts) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.ScatterOptions() opts.rootRank = self.world_size pg.scatter([t1], [], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element output tensor list" + RuntimeError, "requires a single-element output tensor list" ): opts = c10d.ScatterOptions() opts.rootRank = 0 pg.scatter([], [], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element output tensor list" + RuntimeError, "requires a single-element output tensor list" ): opts = c10d.ScatterOptions() opts.rootRank = 0 pg.scatter([t1, t1], [], opts) - with self.assertRaisesRegex(ValueError, "requires a single-element input list"): + with self.assertRaisesRegex(RuntimeError, "requires a single-element input list"): opts = c10d.ScatterOptions() opts.rootRank = self.rank pg.scatter([t1], [], opts) - with self.assertRaisesRegex(ValueError, "requires a single-element input list"): + with self.assertRaisesRegex(RuntimeError, "requires a single-element input list"): opts = c10d.ScatterOptions() opts.rootRank = self.rank pg.scatter([t1], [[t1] * self.world_size, [t1] * self.world_size], opts) @@ -743,7 +743,7 @@ def test_scatter_checks(self): incorrect_list_size = self.world_size - 1 err_str = "Incorrect input list size {}. Input list size should be {}" with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) + RuntimeError, err_str.format(incorrect_list_size, desired_list_size) ): opts = c10d.ScatterOptions() opts.rootRank = self.rank @@ -751,23 +751,23 @@ def test_scatter_checks(self): incorrect_list_size = self.world_size + 1 with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) + RuntimeError, err_str.format(incorrect_list_size, desired_list_size) ): opts = c10d.ScatterOptions() opts.rootRank = self.rank pg.scatter([t1], [[t1] * incorrect_list_size], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): opts = c10d.ScatterOptions() opts.rootRank = self.rank pg.scatter([t1], [[t2] * self.world_size], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): opts = c10d.ScatterOptions() opts.rootRank = self.rank pg.scatter([t1], [[t3] * self.world_size], opts) - with self.assertRaisesRegex(ValueError, "requires empty input on non-root"): + with self.assertRaisesRegex(RuntimeError, "requires empty input on non-root"): opts = c10d.ScatterOptions() opts.rootRank = (self.rank + 1) % self.world_size pg.scatter([t1], [[t1] * self.world_size], opts) @@ -872,39 +872,39 @@ def test_gather_checks(self): t2 = torch.zeros([1], dtype=torch.float64) t3 = torch.zeros([2], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.GatherOptions() opts.rootRank = -1 pg.gather([], [t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.GatherOptions() opts.rootRank = self.world_size pg.gather([], [t1], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element input tensor list" + RuntimeError, "requires a single-element input tensor list" ): opts = c10d.GatherOptions() opts.rootRank = 0 pg.gather([], [], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element input tensor list" + RuntimeError, "requires a single-element input tensor list" ): opts = c10d.GatherOptions() opts.rootRank = 0 pg.gather([], [t1, t1], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element output list" + RuntimeError, "requires a single-element output list" ): opts = c10d.GatherOptions() opts.rootRank = self.rank pg.gather([], [t1], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element output list" + RuntimeError, "requires a single-element output list" ): opts = c10d.GatherOptions() opts.rootRank = self.rank @@ -914,7 +914,7 @@ def test_gather_checks(self): incorrect_list_size = self.world_size - 1 err_str = "Incorrect output list size {}. Output list size should be {}" with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) + RuntimeError, err_str.format(incorrect_list_size, desired_list_size) ): opts = c10d.GatherOptions() opts.rootRank = self.rank @@ -922,23 +922,23 @@ def test_gather_checks(self): incorrect_list_size = self.world_size + 1 with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) + RuntimeError, err_str.format(incorrect_list_size, desired_list_size) ): opts = c10d.GatherOptions() opts.rootRank = self.rank pg.gather([[t1] * incorrect_list_size], [t1], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): opts = c10d.GatherOptions() opts.rootRank = self.rank pg.gather([[t2] * self.world_size], [t1], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): opts = c10d.GatherOptions() opts.rootRank = self.rank pg.gather([[t3] * self.world_size], [t1], opts) - with self.assertRaisesRegex(ValueError, "requires empty output on non-root"): + with self.assertRaisesRegex(RuntimeError, "requires empty output on non-root"): opts = c10d.GatherOptions() opts.rootRank = (self.rank + 1) % self.world_size pg.gather([[t1] * self.world_size], [t1], opts) @@ -1039,39 +1039,39 @@ def test_allgather_checks(self): t2 = torch.zeros([1], dtype=torch.float64) t3 = torch.zeros([2], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "requires non-empty input tensor list"): + with self.assertRaisesRegex(RuntimeError, "requires non-empty input tensor list"): pg.allgather([], []) with self.assertRaisesRegex( - ValueError, "requires input/output tensor lists to have the same length" + RuntimeError, "requires input/output tensor lists to have the same length" ): pg.allgather([], [t1]) with self.assertRaisesRegex( - ValueError, "requires input/output tensor lists to have the same length" + RuntimeError, "requires input/output tensor lists to have the same length" ): pg.allgather([[t1] * self.world_size, [t1] * self.world_size], [t1]) - with self.assertRaisesRegex(ValueError, "invalid output tensor list"): + with self.assertRaisesRegex(RuntimeError, "invalid output tensor list"): pg.allgather([[t1] * (self.world_size - 1)], [t1]) - with self.assertRaisesRegex(ValueError, "invalid output tensor list"): + with self.assertRaisesRegex(RuntimeError, "invalid output tensor list"): pg.allgather([[t1] * (self.world_size + 1)], [t1]) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): pg.allgather( [[t1, t1] * (self.world_size), [t1, t1] * (self.world_size)], [t1, t2] ) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): pg.allgather( [[t1, t1] * (self.world_size), [t1, t1] * (self.world_size)], [t1, t3] ) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): pg.allgather([([t1, t2] * (self.world_size))[: self.world_size]], [t1]) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): pg.allgather([([t1, t3] * (self.world_size))[: self.world_size]], [t1]) def _test_allgather_basics(self, fn): @@ -1160,13 +1160,13 @@ def test_allgather_coalesced_checks(self): # One of output tensors does not match input list. dummy_output_lists[0] = [torch.zeros([0], dtype=torch.float32)] with self.assertRaisesRegex( - ValueError, "invalid size of output tensor at index 0" + RuntimeError, "invalid size of output tensor at index 0" ): c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) # One of output tensors does not match input list. dummy_output_lists[0] = [torch.zeros([1], dtype=torch.float64)] - with self.assertRaisesRegex(ValueError, "invalid tensor type at index 0"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type at index 0"): c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) # Output lists have too many elements @@ -1174,7 +1174,7 @@ def test_allgather_coalesced_checks(self): [torch.zeros([1], dtype=torch.float32)] for _ in range(self.world_size + 1) ] with self.assertRaisesRegex( - ValueError, "output lists should be equal to world size" + RuntimeError, "output lists should be equal to world size" ): c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) @@ -1194,26 +1194,26 @@ def test_reduce_checks(self): t1 = torch.zeros([1], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.ReduceOptions() opts.rootRank = -1 opts.rootTensor = 0 pg.reduce([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.ReduceOptions() opts.rootRank = self.world_size opts.rootTensor = 0 pg.reduce([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root tensor"): + with self.assertRaisesRegex(RuntimeError, "invalid root tensor"): opts = c10d.ReduceOptions() opts.rootRank = self.rank opts.rootTensor = 1 pg.reduce([t1], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element tensor list" + RuntimeError, "requires a single-element tensor list" ): opts = c10d.ReduceOptions() opts.rootRank = self.rank diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp index ba26409c9b990..b8f5aa3989ce4 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp @@ -923,7 +923,7 @@ c10::intrusive_ptr ProcessGroupGloo::broadcast( std::vector& inputs, const BroadcastOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::broadcast: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::broadcast: " + msg); }; assertRootRank(invalidArgument, opts.rootRank, size_); @@ -1414,7 +1414,7 @@ c10::intrusive_ptr ProcessGroupGloo::allreduce( std::vector& inputs, const AllreduceOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::allreduce: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::allreduce: " + msg); }; assertNonEmpty(invalidArgument, inputs); @@ -1475,7 +1475,7 @@ c10::intrusive_ptr ProcessGroupGloo::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument( + TORCH_CHECK(false, "ProcessGroupGloo::allreduce_coalesced: " + msg); }; assertNonEmpty(invalidArgument, tensors); @@ -1644,7 +1644,7 @@ c10::intrusive_ptr ProcessGroupGloo::reduce( std::vector& inputs, const ReduceOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::reduce: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::reduce: " + msg); }; assertRootRank(invalidArgument, opts.rootRank, size_); @@ -1821,7 +1821,7 @@ c10::intrusive_ptr ProcessGroupGloo::allgather( std::vector& inputs, const AllgatherOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::allgather: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::allgather: " + msg); }; if (inputs.size() == 0) { @@ -1955,7 +1955,7 @@ c10::intrusive_ptr ProcessGroupGloo::allgather_coalesced( std::vector& input_list, const AllgatherOptions& /* unused */) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument( + TORCH_CHECK(false, "ProcessGroupGloo::allgather_coalesced: " + msg); }; @@ -2152,7 +2152,7 @@ c10::intrusive_ptr ProcessGroupGloo::gather( std::vector& inputs, const GatherOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::gather: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::gather: " + msg); }; assertRootRank(invalidArgument, opts.rootRank, size_); @@ -2336,7 +2336,7 @@ c10::intrusive_ptr ProcessGroupGloo::scatter( std::vector>& inputs, const ScatterOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::scatter: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::scatter: " + msg); }; assertRootRank(invalidArgument, opts.rootRank, size_); @@ -2530,7 +2530,7 @@ c10::intrusive_ptr ProcessGroupGloo::alltoall_base( std::vector& inputCounts, const AllToAllOptions& /* unused */) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::alltoall_base: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::alltoall_base: " + msg); }; TORCH_CHECK( From 59fcbd172b5dfdeb12e2f1b7a78c2ce95c1eb680 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 1 Sep 2021 16:25:00 -0700 Subject: [PATCH 446/530] Fix incorrect DDP test (#64074) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64074 Previous PR https://github.com/pytorch/pytorch/pull/63831 did not actually test the error in https://github.com/pytorch/pytorch/issues/63812. Introduce a test directly from the repro that simulates it. ghstack-source-id: 137171460 Test Plan: CI Reviewed By: SciPioneer Differential Revision: D30569719 fbshipit-source-id: fd61250ef6d291c093607663d91d6d2cb5574eb7 --- .../_internal/distributed/distributed_test.py | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 333458c5f8308..f17842ee02a22 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -3761,25 +3761,28 @@ def test_DistributedDataParallel_requires_grad(self): self._barrier() @sandcastle_skip_if( - BACKEND != "nccl" and BACKEND != "gloo", - "Only NCCL and GLOO backend support DistributedDataParallel", + BACKEND == "nccl", + "Gloo-only test" ) - @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"])) def test_ddp_create_graph(self): - rank = self.rank - torch.cuda.set_device(rank) - net = torch.nn.parallel.DistributedDataParallel( - torch.nn.Linear(1, 1, bias=False).cuda(rank), - device_ids=[rank] - ) - inp = torch.randn((2, 1), device=rank) + class Model(nn.Module): + def __init__(self): + super().__init__() + self.p = nn.Parameter(torch.tensor(1.)) + + def forward(self): + return self.p.pow(2) + + model = Model() + ddp_model = torch.nn.parallel.DistributedDataParallel(model) for _ in range(6): - loss = net(inp).sum() - # Verify DDP works with create_graph=True - loss.backward(create_graph=True) + # Verify DDP doesn't throw when ran with create_graph=True. + # Although we do warn about potential issues, please see + # https://github.com/pytorch/pytorch/issues/63929 for details. + ddp_model().backward(create_graph=True) # grad tensors should require grad. self.assertTrue( - all([param.requires_grad for param in net.parameters()]) + all([param.requires_grad for param in ddp_model.parameters()]) ) @sandcastle_skip_if( From baceea442621346cc42f86c28d9d239531dfa006 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 1 Sep 2021 17:04:37 -0700 Subject: [PATCH 447/530] [DDP] Add more logging iterations (#64071) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64071 Adding more logging iterations to get additional data. ghstack-source-id: 137119476 Test Plan: CI Reviewed By: mrshenli Differential Revision: D30579367 fbshipit-source-id: 57195266ada5e5926f0d8eaf4fb4e01dc98924d7 --- torch/csrc/distributed/c10d/logger.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp index 9fa7289c16568..0bb960a639907 100644 --- a/torch/csrc/distributed/c10d/logger.cpp +++ b/torch/csrc/distributed/c10d/logger.cpp @@ -8,7 +8,7 @@ namespace c10d { // When training runs at these iterations, log the runtime // stats. -const int LoggingIterations[] = {10, 20, 100, 1000}; +const int LoggingIterations[] = {10, 20, 100, 1000, 5000, 10000, 20000}; // NOLINT std::ostream& operator<<(std::ostream& output, const Logger& logger) { auto& ddp_logging_data = (*logger.ddp_logging_data_); From bf9d66586c388c0aa223644b1d224227443ae34b Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 1 Sep 2021 17:32:39 -0700 Subject: [PATCH 448/530] [DDP Comm Hook] Create a noop hook for performance debugging (#64344) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64344 As title. Additionally, avoid using numpy array in test_ddp_hooks.py. ghstack-source-id: 137170449 Test Plan: buck test mode/dev-nosan caffe2/test/distributed/algorithms/ddp_comm_hooks:test_ddp_hooks -- test_ddp_comm_hook_noop_hook Reviewed By: rohan-varma Differential Revision: D30693220 fbshipit-source-id: e17f0d1c6198863cf20a53566f586a6bff602522 --- .../ddp_comm_hooks/test_ddp_hooks.py | 34 +++++++++++++++---- .../algorithms/ddp_comm_hooks/__init__.py | 4 +++ .../ddp_comm_hooks/debugging_hooks.py | 26 ++++++++++++++ 3 files changed, 58 insertions(+), 6 deletions(-) create mode 100644 torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py index 3d00712ca5354..67175b2d22495 100644 --- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py +++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py @@ -2,7 +2,6 @@ import os import sys -import numpy as np import torch from torch import nn import torch.distributed as dist @@ -105,7 +104,9 @@ def _run_and_get_grads(self, model): # Run backward output.mean().backward() - return [p.grad.data.cpu().numpy() for p in model.parameters()] + # The only layer + param = next(model.parameters()) + return param.grad @requires_nccl() @skip_if_lt_x_gpu(2) @@ -122,7 +123,7 @@ def test_ddp_comm_hook_allreduce_hook(self): # Register hook case, get the hook grads. hook_grads = self._get_grads(process_group, DDPCommHookType.ALLREDUCE) - np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0) + torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0) @requires_nccl() @skip_if_lt_x_gpu(2) @@ -139,7 +140,7 @@ def test_ddp_comm_hook_fp16compress_hook(self): # Register hook case, get the hook grads. hook_grads = self._get_grads(process_group, DDPCommHookType.FP16_COMPRESS) - np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4) + torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4) @requires_nccl() @skip_if_lt_x_gpu(2) @@ -156,7 +157,7 @@ def test_ddp_comm_hook_quantize_per_tensor_hook(self): # Register hook case, get the hook grads. hook_grads = self._get_grads(process_group, DDPCommHookType.QUANTIZE_PER_TENSOR) - np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4) + torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4) @requires_nccl() @skip_if_lt_x_gpu(2) @@ -175,7 +176,28 @@ def test_ddp_comm_hook_quantize_per_channel_hook(self): process_group, DDPCommHookType.QUANTIZE_PER_CHANNEL ) - np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4) + torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4) + + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_noop_hook(self): + """ + This unit test verifies the ``noop`` hook registered case and a subsequent allreduce + gives same result with no hook registered case. + """ + store = dist.FileStore(self.file_name, self.world_size) + process_group = dist.ProcessGroupNCCL(store, self.rank, self.world_size) + + # No hook registered case, get the reference grads. + reference_grads = self._get_grads(process_group, None) + # Register hook case, get the hook grads. + hook_grads = self._get_grads(process_group, DDPCommHookType.NOOP) + # Apply a subsequent allreduce to average grads. + hook_grads.div_(self.world_size) + dist.all_reduce(hook_grads, group=process_group) + + torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0) @requires_nccl() @skip_if_lt_x_gpu(2) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py index c3f3b066ee478..ff22a818f925d 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py @@ -5,6 +5,7 @@ from torch.nn.parallel import DistributedDataParallel from . import ( + debugging_hooks as debugging, default_hooks as default, powerSGD_hook as powerSGD, quantization_hooks as quantization, @@ -78,6 +79,9 @@ class DDPCommHookType(Enum): comm_hook=powerSGD.batched_powerSGD_hook, matrix_approximation_rank=2, ) + NOOP = partial( + _ddp_comm_hook_wrapper, comm_hook=debugging.noop_hook, + ) def register_ddp_comm_hook( diff --git a/torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py b/torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py new file mode 100644 index 0000000000000..0c60762caf2ed --- /dev/null +++ b/torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py @@ -0,0 +1,26 @@ +from typing import Any + +import torch +import torch.distributed as dist + + +def noop_hook(_: Any, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]: + """ + This DDP communication hook returns the a future that wraps the input, + so it is a noop that does not incur any communication overheads. + + This hook should **only** be used for headroom analysis of allreduce optimization, + instead of the normal gradient synchronization. + For example, if only less than 10% speedup of training time can be observed after this hook is registered, + it usually implies that allreduce is not a performance bottleneck for this case. + Such instrumentation can be particularly useful + if GPU traces cannot be easily retrieved or the trace analysis is complicated + some factors such as the overlap between allreduce and computation or the desynchronization across ranks. + + Example:: + >>> ddp_model.register_comm_hook(None, noop_hook) + """ + fut: torch.futures.Future[torch.Tensor] = torch.futures.Future() + fut.set_result(bucket.buffer()) + + return fut From 778af565048e6160ce59fb5eedd8455e629f7942 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 1 Sep 2021 17:32:39 -0700 Subject: [PATCH 449/530] [DDP Comm Hook] Add debugging communication hooks to ddp_comm_hooks.rst (#64352) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64352 as title ghstack-source-id: 137246253 Test Plan: N/A Reviewed By: rohan-varma Differential Revision: D30694089 fbshipit-source-id: a78110b11d59bb0718f43c99ede23f2fd8ab21d0 --- docs/source/ddp_comm_hooks.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/source/ddp_comm_hooks.rst b/docs/source/ddp_comm_hooks.rst index 5bd0378e7c7b7..d0f11fe0b0412 100644 --- a/docs/source/ddp_comm_hooks.rst +++ b/docs/source/ddp_comm_hooks.rst @@ -84,6 +84,18 @@ PowerSGD Hooks .. autofunction:: powerSGD_hook .. autofunction:: batched_powerSGD_hook +Debugging Communication Hooks +----------------------------- + +As the name implies, debugging communication hooks are **only** used for debugging and performance optimization purpose. + +.. currentmodule:: torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks + +.. warning :: + Debugging communication hooks do not necessarily output the correct results. + +.. autofunction:: noop_hook + Acknowledgements ---------------- From 59c6ceb6a8338c5de3f3aee7b7790b1d0daefb0a Mon Sep 17 00:00:00 2001 From: Zeina Migeed Date: Wed, 1 Sep 2021 18:04:19 -0700 Subject: [PATCH 450/530] add documentation to shape inference algorithm (#64312) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64312 Test Plan: Imported from OSS Reviewed By: zou3519 Differential Revision: D30709254 Pulled By: migeed-z fbshipit-source-id: 3297d26fe6727c5b9ca176625b1683d787f59659 --- .../experimental/graph_gradual_typechecker.py | 152 +++++++++++++----- 1 file changed, 114 insertions(+), 38 deletions(-) diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py index a54e52151f858..6094952f1695e 100644 --- a/torch/fx/experimental/graph_gradual_typechecker.py +++ b/torch/fx/experimental/graph_gradual_typechecker.py @@ -28,7 +28,7 @@ def expand_to_tensor_dim(t, n): Expand a type to the desired tensor dimension if possible Raise an error otherwise. - t is the given type - - n is a number to expand to + - n is a number of dimensions to expand to """ if t == Dyn: dims = [Dyn] * n @@ -42,6 +42,13 @@ def expand_to_tensor_dim(t, n): def broadcast_types(t1, t2): + """ + Applies broadcasting to both given types such that they + become consistent with eachother and returns two new + resulting types + """ + + # if either type is Dyn, do nothing since the types are already consistent if t1 == Dyn or t2 == Dyn or isinstance(t1, Var) or isinstance(t2, Var): return t1, t2 @@ -52,7 +59,8 @@ def broadcast_types(t1, t2): new_t1 = list(t1.__args__) new_t2 = list(t2.__args__) - # here, we make our tensors the same length + # We make the types the same length which is the first requirement + # for consistency if s1 > s2: for i in range(s1 - s2): new_t2.insert(0, 1) @@ -61,15 +69,18 @@ def broadcast_types(t1, t2): for i in range(s2 - s1): new_t1.insert(0, 1) + # we replace occurrences of "1" with each tensor with + # the corresponding type from the other tensor for i, (x, y) in enumerate(zip(new_t1, new_t2)): if x == 1: new_t1[i] = y elif y == 1: new_t2[i] = x + # at this point our tensors should be consistent + # and we can apply the element-wise operation and find the right dimension + # for the output of the operation (t1, t2) = TensorType(tuple(new_t1)), TensorType(tuple(new_t2)) - - return (t1, t2) else: raise TypeError(f'Cannot broadcast types {t1} and {t2}') @@ -77,7 +88,7 @@ def broadcast_types(t1, t2): def register_inference_rule(call_target): def register(fn): if call_target in _INFERENCE_RULES: - raise RuntimeError('Inference rule already registered for {call_target}!') + raise RuntimeError(f'Inference rule already registered for {call_target}!') _INFERENCE_RULES[call_target] = fn return fn return register @@ -85,7 +96,7 @@ def register(fn): def register_refinement_rule(call_target): def register(fn): if call_target in _REFINEMENT_RULES: - raise RuntimeError('Refinement rule already registered for {call_target}!') + raise RuntimeError(f'Refinement rule already registered for {call_target}!') _REFINEMENT_RULES[call_target] = fn return fn return register @@ -93,7 +104,7 @@ def register(fn): def register_algebraic_expressions_inference_rule(call_target): def register(fn): if call_target in _RULES: - raise RuntimeError('Rule already registered for {call_target}!') + raise RuntimeError(f'Rule already registered for {call_target}!') _RULES[call_target] = fn return fn return register @@ -101,6 +112,17 @@ def register(fn): @register_inference_rule(torch.add) @register_inference_rule(operator.add) def add_inference_rule(n: Node): + """ + Apply the addition inference rule. This includes: + - scalar addition + - broadcasting semantics + + Note that we always return the least precise type between + the operands (after applying broadcasting) to be the final type of the operation + + Note that we do not modify the operand types themselves after applying broadcasting + to them. We only use them to calculate the final type + """ assert isinstance(n.args[0], Node) assert isinstance(n.args[1], Node) t1 = n.args[0].type @@ -111,10 +133,15 @@ def add_inference_rule(n: Node): n.type = t2 return n.type + # handle scalar addition elif t2 == int and isinstance(t1, TensorType): n.type = t1 return n.type + # we bring the new types to the point where + # we can check for consistency + # any inconsistency would not have been caused + # by broadcasting at this point (new_t1, new_t2) = broadcast_types(t1, t2) if new_t1 != t1 or new_t2 != t2: @@ -122,13 +149,13 @@ def add_inference_rule(n: Node): n.meta[str(n.args[0])] = new_t1 n.meta[str(n.args[1])] = new_t2 - # Todo: maybe figure out that broadcasting definitely did not happen? else: n.meta['broadcast'] = False new_t1 = t1 if not n.meta['broadcast'] else new_t1 new_t2 = t2 if not n.meta['broadcast'] else new_t2 + # we check for consistency between the new types if is_consistent(new_t1, new_t2): # we return the less precise type because # broadcasting may have happened @@ -145,6 +172,12 @@ def add_inference_rule(n: Node): @register_inference_rule(getattr) def get_attr_inference_rule(n: Node, traced): + """ + The current getattr rule only handles the shape attribute + Can be extended to other attributes + The most representitive type we have is "Dyn" but the system + can be extended with more types, such as a type to represent shapes + """ attr_node = n.args[0] attr_name = n.args[1] @@ -158,6 +191,10 @@ def get_attr_inference_rule(n: Node, traced): @register_inference_rule(torch.transpose) def transpose_inference_rule(n: Node): + """ + We check that dimentions for the transpose operations + are within range of the tensor type of the node + """ if n.target == torch.transpose: assert isinstance(n.args[0], Node) t = n.args[0].type @@ -171,12 +208,11 @@ def transpose_inference_rule(n: Node): return n.type elif isinstance(t, TensorType): - if 0 <= dim1 < len(t.__args__) and 0 <= dim2 < len(t.__args__): new_type = list(t.__args__) new_type[dim1], new_type[dim2] = new_type[dim2], new_type[dim1] final = TensorType(new_type) - n.type = final + n.type = get_greatest_upper_bound(n.type, final) return n.type else: raise TypeError(f'Cannot transpose {dim1} and {dim2} in type {t} for node {n}') @@ -186,6 +222,15 @@ def transpose_inference_rule(n: Node): @register_inference_rule(torch.reshape) def reshape_inference_rule(n: Node): + """ + Without dynamism, the rule checks that the + product of the elements of the argument tensor + type is equal to the product of the elements + of the required shape. We gradualize this rule + by adding a case to handle fully dynamic input + as well as input where some of the tensor dimensions + are unknown. In this case we check for divisibility + """ assert isinstance(n.args[0], Node) t1 = n.args[0].type @@ -201,7 +246,7 @@ def reshape_inference_rule(n: Node): # if any of the dimensions are unknown, # we check for divisibility - elif isinstance(t1, TensorType) and Dyn in t1.__args__ or -1 in t2: + elif isinstance(t1, TensorType): assert isinstance(t1, TensorType) a = [e if e != Dyn else 1 for e in t1.__args__] p1 = reduce(lambda x, y: x * y, a) @@ -211,17 +256,6 @@ def reshape_inference_rule(n: Node): return t2_type else: raise TypeError(f'Cannot reshape in node {n} from {t1} to {t2_type}') - - # if all dimensions are known we check the products - elif isinstance(t1, TensorType): - p1 = reduce(lambda x, y: x * y, t1.__args__) - p2 = reduce(lambda x, y: x * y, t2) - if p1 == p2: - n.type = t2_type - return t2_type - else: - raise TypeError(f'Cannot reshape in node {n} from {t1} to {t2_type}') - else: raise TypeError(f'Cannot reshape in node {n} from {t1} to {t2_type}') @@ -260,7 +294,7 @@ def bn2d_inference_rule(n: Node, module_instance): def calculate_out_dimension(d_in, module_instance, index): """ - For calculating h_in and w_out. + For calculating h_in and w_out according to the conv2D documentation """ padding = (module_instance.padding, module_instance.padding) \ if isinstance(module_instance.padding, int) else module_instance.padding @@ -346,6 +380,10 @@ def relu_inference_rule(n: Node, module_instance): def maxpool2d_check(typ, module_instance): + """ + Applies the maxpool2d shape information to the input + this affects the last two dimensions + """ new_type_list = list(typ.__args__) if len(new_type_list) == 4 or len(new_type_list) == 3: w_in = new_type_list[-1] @@ -391,7 +429,6 @@ def linear_check(tensor_type, module_instance): """ if len(tensor_type.__args__) >= 2: if is_consistent(module_instance.in_features, tensor_type.__args__[-1]): - # Todo backwards propagation new_type_args = list(tensor_type.__args__) new_type_args[-1] = module_instance.out_features return TensorType(tuple(new_type_args)) @@ -403,6 +440,10 @@ def linear_check(tensor_type, module_instance): @register_inference_rule(torch.nn.Linear) def linear_inference_rule(n: Node, module_instance): + """ + Applies the shape information to the input then gets the greatest upper bound + of the resulting type and the existing type + """ assert isinstance(n.args[0], Node) if n.args[0].type == Dyn and isinstance(n.type, TensorType): n.args[0].type = expand_to_tensor_dim(n.args[0].type, len(n.type.__args__)) @@ -470,6 +511,10 @@ def flatten_check(tensor_type, start_dim, end_dim): @register_inference_rule(torch.flatten) def flatten_inference_rule(n: Node): + """ + Applies the flatten shape information to the input then gets the + greatest upper bound of the resulting type and the existing type + """ assert isinstance(n.args[0], Node) # set the default start and end dims @@ -568,6 +613,10 @@ def get_node_type(a): @register_refinement_rule(Conv2d) def conv_refinement_rule(n: Node): + """ + The equality constraints are between the first dimension of + the input and output + """ res = [] assert isinstance(n.args[0], Node) arg_type = n.args[0].type @@ -578,6 +627,10 @@ def conv_refinement_rule(n: Node): @register_refinement_rule(torch.nn.Linear) def linear_refinement_rule(n: Node): + """ + The equality constraints are between the first dimension of + the input and output + """ res = [] assert isinstance(n.args[0], Node) arg_type = n.args[0].type @@ -585,10 +638,12 @@ def linear_refinement_rule(n: Node): res = [Equality(arg_type.__args__[0], n.type.__args__[0])] return res -# todo needs review for addition. Is this constraint correct? @register_refinement_rule(BatchNorm2d) @register_refinement_rule(torch.nn.ReLU) def all_eq(n: Node): + """ + For operations where the input shape is equal to the output shape + """ res = [] assert isinstance(n.args[0], Node) arg_type = n.args[0].type @@ -600,7 +655,12 @@ def all_eq(n: Node): @register_refinement_rule(torch.nn.AdaptiveAvgPool2d) -def first_two__eq(n: Node): +@register_refinement_rule(torch.nn.MaxPool2d) +def first_two_eq(n: Node): + """ + For operations where the first two dimensions of the input and output shape + are equal + """ res = [] assert isinstance(n.args[0], Node) arg_type = n.args[0].type @@ -610,19 +670,37 @@ def first_two__eq(n: Node): res = [Equality(args1[0], args2[0]), Equality(args1[1], args2[1])] return res + @register_refinement_rule(torch.add) @register_refinement_rule(operator.add) -def add_eq(n: Node): +def element_wise_eq(n: Node): + """ + For element-wise operations and handles broadcasting. + Note that after applying broadcasting to the arguments + we are able to determine if certain dimensions have not been broadcast + if they are symbolicallu equal. + + in this case, we can establish equality between those dimensions and the + corresponding output dimensions. + + Note that it takes two iterations for this result. One iteration to establish + equality between certain dimensions of the operands (requiring the whole solver + including unification) and another iteration to establish equality between the operands + and the resulting type, requiring another round of constraint generation and unificaiton. + """ res = [] if isinstance(n.args[0], Node) and isinstance(n.args[1], Node): arg_type1 = n.args[0].type arg_type2 = n.args[1].type if isinstance(arg_type1, TensorType) and isinstance(arg_type2, TensorType) and isinstance(n.type, TensorType): args1, args2 = broadcast_types(arg_type1, arg_type2) - # by this point, we know for sure that args1 and args2 are the same size. + # by this point, we know that args1 and args2 are the same size. a1 = args1.__args__ a2 = args2.__args__ a3 = n.type.__args__ + + # we would be here in the second iteration where we establish equality + # between operand type dimensions and the resulting type dimensions r = [] for x, y, z in zip(a1, a2, a3): if x == y: @@ -630,19 +708,13 @@ def add_eq(n: Node): res = r return res -@register_refinement_rule(torch.nn.MaxPool2d) -def first_two(n: Node): - res = [] - assert isinstance(n.args[0], Node) - arg_type = n.args[0].type - if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType): - args1 = arg_type.__args__ - args2 = n.type.__args__ - res = [Equality(args1[0], args2[0]), Equality(args1[1], args2[1])] - return res @register_refinement_rule(torch.flatten) def flatten_refinement_rule(n: Node): + """ + Generates equality constraints between the dimensions of the input and output + that will not be involved in the flatten operation + """ assert isinstance(n.args[0], Node) eq_const = [] @@ -674,6 +746,10 @@ def flatten_refinement_rule(n: Node): @register_algebraic_expressions_inference_rule(Conv2d) def conv_rule(n: Node, module_instance): + """ + Represents the outout in terms of an algrbraic expression w.r.t + the input when possible + """ assert isinstance(n.args[0], Node) arg_type = n.args[0].type if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType): From 4d6314a16e78027832186f5442df888dbabbc159 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 1 Sep 2021 18:12:02 -0700 Subject: [PATCH 451/530] [DDP] Log num threads (#64072) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64072 Log gloo threads to DDP logging. ghstack-source-id: 137119480 Test Plan: CI Reviewed By: mrshenli Differential Revision: D30596083 fbshipit-source-id: 2b4f6e762cb5d850be6056bcc5922029a1af3c91 --- torch/csrc/distributed/c10d/ProcessGroupGloo.hpp | 4 ++++ torch/csrc/distributed/c10d/logger.cpp | 11 +++++++++++ .../testing/_internal/distributed/distributed_test.py | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp index 036ce91b85faf..5c0c76afa2453 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp @@ -318,6 +318,10 @@ class TORCH_API ProcessGroupGloo : public ProcessGroup { // may indicate that there is some sort of collective desynchronization. uint64_t getSequenceNumberForGroup() override; + int getNumThreads() { + return options_->threads; + } + protected: std::unique_ptr<::gloo::rendezvous::Store> store_; const c10::intrusive_ptr options_; diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp index 0bb960a639907..b1efd0b238378 100644 --- a/torch/csrc/distributed/c10d/logger.cpp +++ b/torch/csrc/distributed/c10d/logger.cpp @@ -4,6 +4,10 @@ #include #include +#ifdef USE_C10D_GLOO +#include +#endif + namespace c10d { // When training runs at these iterations, log the runtime @@ -68,6 +72,13 @@ void Logger::set_env_variables() { parse_env("GLOO_SOCKET_IFNAME"); ddp_logging_data_->strs_map["gloo_device_transport"] = parse_env("GLOO_DEVICE_TRANSPORT"); + + #ifdef USE_C10D_GLOO + auto gloo_pg = + static_cast(reducer_->process_group_.get()); + auto n_threads = gloo_pg->getNumThreads(); + ddp_logging_data_->ints_map["gloo_num_threads"] = n_threads; + #endif } } diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index f17842ee02a22..613e23ede8f84 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -5074,6 +5074,12 @@ def parse_env(var): ddp_logging_data.get("gloo_device_transport"), parse_env("GLOO_DEVICE_TRANSPORT"), ) + default_gloo_threads = 2 + self.assertEqual( + ddp_logging_data.get("gloo_num_threads"), + default_gloo_threads, + ) + self.assertEqual(ddp_logging_data.get("nccl_socket_ifname"), None) self.assertEqual(ddp_logging_data.get("nccl_blocking_wait"), None) self.assertEqual(ddp_logging_data.get("nccl_async_error_handling"), None) From d067f156220f987e73a524ad414cfa082ae39aac Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 1 Sep 2021 21:07:01 -0700 Subject: [PATCH 452/530] [Dist CI] Move rest of distributed tests to their own CI job (#64253) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64253 Follow up to D30496178 (https://github.com/pytorch/pytorch/commit/f4aff3a346a0525e37d6071f318f7a4c54d5e1fb) to move the rest of distributed tests to their own jobs for Linux GHA. ghstack-source-id: 137233785 Test Plan: CI Reviewed By: walterddr Differential Revision: D30662999 fbshipit-source-id: f7cfbc0d1223aca52120f17f9da987d70fda8de6 --- test/run_test.py | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/test/run_test.py b/test/run_test.py index 5953919b16323..5d3856ba3e144 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -286,7 +286,56 @@ def skip_test_p(name: str) -> bool: ] DISTRIBUTED_TESTS = [ + "distributed/test_data_parallel", + "distributed/test_launcher", + "distributed/nn/jit/test_instantiator", + "distributed/rpc/test_faulty_agent", + "distributed/rpc/test_tensorpipe_agent", + "distributed/rpc/cuda/test_tensorpipe_agent", + "distributed/test_c10d_common", + "distributed/test_c10d_gloo", + "distributed/test_c10d_nccl", + "distributed/test_jit_c10d", + "distributed/test_c10d_spawn_gloo", + "distributed/test_c10d_spawn_nccl", + "distributed/test_store", + "distributed/test_pg_wrapper", + "distributed/algorithms/test_join", "distributed/test_distributed_spawn", + "distributed/pipeline/sync/skip/test_api", + "distributed/pipeline/sync/skip/test_gpipe", + "distributed/pipeline/sync/skip/test_inspect_skip_layout", + "distributed/pipeline/sync/skip/test_leak", + "distributed/pipeline/sync/skip/test_portal", + "distributed/pipeline/sync/skip/test_stash_pop", + "distributed/pipeline/sync/skip/test_tracker", + "distributed/pipeline/sync/skip/test_verify_skippables", + "distributed/pipeline/sync/test_balance", + "distributed/pipeline/sync/test_bugs", + "distributed/pipeline/sync/test_checkpoint", + "distributed/pipeline/sync/test_copy", + "distributed/pipeline/sync/test_deferred_batch_norm", + "distributed/pipeline/sync/test_dependency", + "distributed/pipeline/sync/test_inplace", + "distributed/pipeline/sync/test_microbatch", + "distributed/pipeline/sync/test_phony", + "distributed/pipeline/sync/test_pipe", + "distributed/pipeline/sync/test_pipeline", + "distributed/pipeline/sync/test_stream", + "distributed/pipeline/sync/test_transparency", + "distributed/pipeline/sync/test_worker", + "distributed/optim/test_zero_redundancy_optimizer", + "distributed/elastic/timer/api_test", + "distributed/elastic/timer/local_timer_example", + "distributed/elastic/timer/local_timer_test", + "distributed/elastic/events/lib_test", + "distributed/elastic/metrics/api_test", + "distributed/elastic/utils/logging_test", + "distributed/elastic/utils/util_test", + "distributed/elastic/utils/distributed_test", + "distributed/elastic/multiprocessing/api_test", + "distributed/_sharding_spec/test_sharding_spec", + "distributed/_sharded_tensor/test_sharded_tensor", ] # Dictionary matching test modules (in TESTS) to lists of test cases (within that test_module) that would be run when From 71e149834b786f9e451788c16096c470191c9f04 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 1 Sep 2021 21:48:36 -0700 Subject: [PATCH 453/530] Add a warning about DataLoader num_workers > 0 "memory leak" (#64337) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64337 See https://github.com/pytorch/pytorch/issues/13246 Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: H-Huang Differential Revision: D30690320 Pulled By: ezyang fbshipit-source-id: 2751aca05a94e63d25162599f458855988516fad --- docs/source/data.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/source/data.rst b/docs/source/data.rst index 9135c87d09262..b03fcb5858531 100644 --- a/docs/source/data.rst +++ b/docs/source/data.rst @@ -264,6 +264,21 @@ Setting the argument :attr:`num_workers` as a positive integer will turn on multi-process data loading with the specified number of loader worker processes. +.. warning:: + After several iterations, the loader worker processes will consume + the same amount of CPU memory as the parent process for all Python + objects in the parent process which are accessed from the worker + processes. This can be problematic if the Dataset contains a lot of + data (e.g., you are loading a very large list of filenames at Dataset + construction time) and/or you are using a lot of workers (overall + memory usage is ``number of workers * size of parent process``). The + simplest workaround is to replace Python objects with non-refcounted + representations such as Pandas, Numpy or PyArrow objects. Check out + `issue #13246 + `_ + for more details on why this occurs and example code for how to + workaround these problems. + In this mode, each time an iterator of a :class:`~torch.utils.data.DataLoader` is created (e.g., when you call ``enumerate(dataloader)``), :attr:`num_workers` worker processes are created. At this point, the :attr:`dataset`, From 69e1207084e6b8932b870ee2a315eb539859a67f Mon Sep 17 00:00:00 2001 From: Shirong Wu Date: Wed, 1 Sep 2021 22:09:42 -0700 Subject: [PATCH 454/530] Move graph util to fx2trt (#64064) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64064 Move original util in torch2trt to fx2trt dir since torch2trt is gonne be deprecated. This is a follow up diff for D30379124 Test Plan: manual Reviewed By: yinghai, mikekgfb Differential Revision: D30591687 fbshipit-source-id: ae0e59dfbc2d2e2aa4f3ccea7cff2291c7deb388 --- .../experimental/fx2trt/tools/graph_util.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 torch/fx/experimental/fx2trt/tools/graph_util.py diff --git a/torch/fx/experimental/fx2trt/tools/graph_util.py b/torch/fx/experimental/fx2trt/tools/graph_util.py new file mode 100644 index 0000000000000..96c8b12915da4 --- /dev/null +++ b/torch/fx/experimental/fx2trt/tools/graph_util.py @@ -0,0 +1,64 @@ +import graphviz # type: ignore[import] + +def get_layer_name_type(layer): + return "\n".join(f"{i}" for i in [layer.name, layer.type]) + +def trt_network_to_dot_graph(network): + dot = graphviz.Digraph(comment="Network") + + # add nodes (layers) + for i in range(network.num_layers): + layer = network.get_layer(i) + dot.node(get_layer_name_type(layer)) + + # add nodes (inputs) + for i in range(network.num_inputs): + dot.node(network.get_input(i).name) + + # add nodes (outputs) + for i in range(network.num_outputs): + dot.node(network.get_output(i).name) + + # add layer->layer edges + for a in range(network.num_layers): + layer_a = network.get_layer(a) + + for b in range(network.num_layers): + layer_b = network.get_layer(b) + + for i in range(layer_a.num_outputs): + output_i = layer_a.get_output(i) + + for j in range(layer_b.num_inputs): + input_j = layer_b.get_input(j) + + if output_i == input_j: + dot.edge(get_layer_name_type(layer_a), get_layer_name_type(layer_b), label=str(input_j.shape)) + + # add input->layer edges + for i in range(network.num_inputs): + input_i = network.get_input(i) + + for b in range(network.num_layers): + layer_b = network.get_layer(b) + + for j in range(layer_b.num_inputs): + input_j = layer_b.get_input(j) + + if input_i == input_j: + dot.edge(input_i.name, get_layer_name_type(layer_b), label=str(input_j.shape)) + + # add layer->output edges + for i in range(network.num_outputs): + input_i = network.get_output(i) + + for b in range(network.num_layers): + layer_b = network.get_layer(b) + + for j in range(layer_b.num_outputs): + input_j = layer_b.get_output(j) + + if input_i == input_j: + dot.edge(get_layer_name_type(layer_b), input_i.name, label=str(input_j.shape)) + + return dot From 0addd75be9a87d03830790a5c6a9b2e201a09c13 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 2 Sep 2021 00:48:03 -0700 Subject: [PATCH 455/530] Remove unnecessary resize_output (#64272) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64272 Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: H-Huang, bdhirsh Differential Revision: D30686941 Pulled By: ezyang fbshipit-source-id: de60e6f1115648f8cf7daaa1e652594fe8b06742 --- aten/src/ATen/native/UnaryOps.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index d5052a77f5b62..b7e596392c716 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -191,7 +191,6 @@ TORCH_IMPL_FUNC(polygamma_out) } TORCH_IMPL_FUNC(signbit_out) (const Tensor& self, const Tensor& result) { - at::native::resize_output(result, self.sizes()); if (self.dtype() == at::kBool) { result.fill_(false); } else { From 8d5b95019d69d43963b33a1b188ad1fec8079664 Mon Sep 17 00:00:00 2001 From: Chen Lai Date: Thu, 2 Sep 2021 00:50:40 -0700 Subject: [PATCH 456/530] [PyTorch Edge] Support default args with out arg, flag off (#63540) Summary: 1. Allow consuming operators with defaults arguments and out arguments. Flag is off to keep the same behavior as v6, in pr 63651, turn on the flag. 2. Add two unittests to cover this type of operators. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63540 ghstack-source-id: 137211562 Test Plan: ``` caffe2/test/cpp/jit:jit - LiteInterpreterTest.DefaultArgsWithOutArg caffe2/test/cpp/jit:jit - LiteInterpreterTest.DefaultArgsPinvWithOutArg ``` Reviewed By: raziel, iseeyuan, tugsbayasgalan Differential Revision: D30414156 fbshipit-source-id: 0f3a219a22aee10ac53184cbd95940726c459d1f --- caffe2/serialize/versions.h | 2 +- test/cpp/jit/test_lite_interpreter.cpp | 62 +++++++++++++++++++ torch/csrc/jit/mobile/function.cpp | 38 ++++++++---- torch/csrc/jit/runtime/interpreter.cpp | 2 + torch/csrc/jit/runtime/interpreter.h | 1 + .../csrc/jit/runtime/interpreter/code_impl.h | 37 ++++++----- 6 files changed, 115 insertions(+), 27 deletions(-) diff --git a/caffe2/serialize/versions.h b/caffe2/serialize/versions.h index 61c8c46666e67..ed5795841d1f9 100644 --- a/caffe2/serialize/versions.h +++ b/caffe2/serialize/versions.h @@ -85,7 +85,7 @@ static_assert(kProducedBytecodeVersion >= kProducedFileFormatVersion, // we should support this model_version. For example, we provide a wrapper to // handle an updated operator. constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L; -constexpr uint64_t kMaxSupportedBytecodeVersion = 0x6L; +constexpr uint64_t kMaxSupportedBytecodeVersion = 0x7L; } // namespace serialize } // namespace caffe2 diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index 26100b3b6f508..b362c8a6ddb06 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -1035,6 +1035,68 @@ TEST(LiteInterpreterTest, DefaultArgsPinvSpecifyDefault) { testLiteModuleCompareResultTensors(m, inputs); } +void testDefaultArgsPinvWithOutArg(int num_args) { + Module m("m"); + if (num_args == 1) { + m.define(R"( + def forward(self, input): + return torch.linalg_pinv(input, out=input) + )"); + } else if (num_args == 2) { + m.define(R"( + def forward(self, input): + return torch.linalg_pinv(input, 1e-5, out=input) + )"); + } else if (num_args == 3) { + m.define(R"( + def forward(self, input): + return torch.linalg_pinv(input, 1e-5, True, out=input) + )"); + } + + const int N = 28; + auto input = torch::range(1, N * N, 1); + input[0] = 10000; // a more stable matrix + input = input.view({N, N}); + auto ref = m.run_method("forward", input); + TORCH_CHECK(!input.equal(torch::range(1, N * N, 1))); + TORCH_CHECK(input.equal(ref.toTensor())); +} + +TEST(LiteInterpreterTest, DefaultArgsPinvWithOutArg) { + // Test with different number of specified arguments + out arg. + // Arguments not specified take default value. + for (int num_args = 1; num_args <= 3; ++num_args) { + testDefaultArgsPinvWithOutArg(num_args); + } +} + +TEST(LiteInterpreterTest, DefaultArgsWithOutArg) { + Module m("m"); + m.define(R"( + def forward(self, x, h): + torch.add(x, h, out=x) + )"); + + std::vector inputs; + auto input_x = 2 * torch::ones({}); + auto input_h = torch::ones({}); + auto ref = m.run_method("forward", input_x, input_h); + + std::stringstream ss; + + m._save_for_mobile(ss, {}, true); + mobile::Module bc = _load_for_mobile(ss); + bc.run_method("forward", input_x, input_h); + AT_ASSERT(input_x.equal(4 * torch::ones({}))); + + auto ops = _get_model_ops_and_info(ss); + auto op = ops.find("aten::add.out"); + TORCH_CHECK( + op != ops.end() && op->second.num_schema_args.has_value() && + op->second.num_schema_args.value() == 4); +} + TEST(LiteInterpreterTest, TestExceptionStackWithTwoLevelModuleHierarchy) { Module a("A"); a.define(R"( diff --git a/torch/csrc/jit/mobile/function.cpp b/torch/csrc/jit/mobile/function.cpp index 127bd5f9418d4..fad8c39bd1f4d 100644 --- a/torch/csrc/jit/mobile/function.cpp +++ b/torch/csrc/jit/mobile/function.cpp @@ -99,21 +99,35 @@ bool Function::append_operator( // from model. We can use it to handle backward compatibility. if (num_specified_args && num_specified_args.value() < static_cast(args.size())) { - // Sanity check at load time, to save perf at runtime - for (size_t i = num_specified_args.value(); i < args.size(); ++i) { - auto default_val = args[i].default_value(); - TORCH_CHECK( - default_val.has_value(), - "Error happened at preparing for default values for the argument. The ", - i, - "th arguement of operator", - opname, - " does not have a specified value or default value. "); - } fn = [fn, num_specified_args, args](Stack& stack) { - for (size_t i = num_specified_args.value(); i < args.size(); ++i) { + std::vector out_args; + // The following logic pops and temporarily stores all out arguments + // from the stack (which can be 0 or more, and always appended to the + // schema), in order to push the necessary default values. Finally, the + // out arguments are pushed back into the stack. + for (size_t i = args.size() - 1; i > 0 && args.at(i).is_out(); i--) { + out_args.push_back(stack.back()); + stack.pop_back(); + } + size_t start_index = num_specified_args.value() - out_args.size(); + TORCH_CHECK( + start_index >= 0, + "The number of output arguments is: ", + out_args.size(), + ", which is more then the number of specified arguments: ", + num_specified_args.value()); + for (size_t i = start_index; i < (args.size() - out_args.size()); ++i) { + TORCH_CHECK( + args[i].default_value().has_value(), + "Error happened at preparing for default values for the argument. The ", + i, + "th argument ", + args[i].name(), + " does not have a specified value or default value. "); + stack.push_back(args[i].default_value()); } + stack.insert(stack.end(), out_args.rbegin(), out_args.rend()); fn(stack); }; } diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 70c9c6c653326..b34827176b2f3 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -978,11 +978,13 @@ MobileCode::MobileCode( const std::shared_ptr& graph, std::string function_name, bool emit_default_input_instructions, + bool support_default_args_before_out, size_t remaining_bailout_depth) : Code(new interpreter::MobileCodeImpl( graph, std::move(function_name), emit_default_input_instructions, + support_default_args_before_out, remaining_bailout_depth)) {} MobileCode::~MobileCode() = default; diff --git a/torch/csrc/jit/runtime/interpreter.h b/torch/csrc/jit/runtime/interpreter.h index 80720ea2ca42f..3471e558e5a41 100644 --- a/torch/csrc/jit/runtime/interpreter.h +++ b/torch/csrc/jit/runtime/interpreter.h @@ -82,6 +82,7 @@ struct TORCH_API MobileCode : Code { const std::shared_ptr& graph, std::string function_name, bool emit_default_input_instructions = true, + bool support_default_args_before_out = false, size_t remaining_bailout_depth = 0); ~MobileCode(); }; diff --git a/torch/csrc/jit/runtime/interpreter/code_impl.h b/torch/csrc/jit/runtime/interpreter/code_impl.h index 682c695138674..15ba0cec04d33 100644 --- a/torch/csrc/jit/runtime/interpreter/code_impl.h +++ b/torch/csrc/jit/runtime/interpreter/code_impl.h @@ -721,9 +721,11 @@ struct MobileCodeImpl : CodeImpl { const std::shared_ptr& graph, std::string function_name, bool emit_default_input_instructions, + bool support_default_args_before_out, size_t remaining_bailout_depth) : CodeImpl(graph, function_name, remaining_bailout_depth, false), - emit_default_input_instructions_(emit_default_input_instructions) { + emit_default_input_instructions_(emit_default_input_instructions), + support_default_args_before_out_(support_default_args_before_out) { // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall) run(); } @@ -746,11 +748,12 @@ struct MobileCodeImpl : CodeImpl { // skip if schema has vararg if (!op_schema.is_vararg()) { auto specifiedArgs = CalculateNecessaryArgs( - op_schema.arguments(), node->inputs(), false); - // preserving the old behavior - auto numInclude = specifiedArgs.first; - // TODO uncomment this - // auto numInclude = specifiedArgs.first + specifiedArgs.second; + op_schema.arguments(), + node->inputs(), + support_default_args_before_out_); + + size_t numInclude = specifiedArgs.first + + (support_default_args_before_out_ ? specifiedArgs.second : 0); auto unique_name = op_schema.overload_name() != "" ? op_schema.name() + "." + op_schema.overload_name() : op_schema.name(); @@ -782,21 +785,27 @@ struct MobileCodeImpl : CodeImpl { if (it != op_to_num_specified_args_.end()) { num_include = it->second; } - emitLoadInputs(node->inputs(), num_include); - // TODO: uncomment this - // auto num_out = op_to_num_out_args_.find(unique_op_name)->second; - // auto num_specified_before_out = num_include - num_out; - // emitLoadInputs(node->inputs(), 0, num_specified_before_out); - // emitLoadInputs(node->inputs(), node->inputs().size() - num_out, - // node->inputs().size()); - + if (support_default_args_before_out_) { + auto num_out = op_to_num_out_args_.find(unique_op_name)->second; + auto num_specified_before_out = num_include - num_out; + emitLoadInputs(node->inputs(), 0, num_specified_before_out); + emitLoadInputs( + node->inputs(), + node->inputs().size() - num_out, + node->inputs().size()); + } else { + emitLoadInputs(node->inputs(), num_include); + } insertInstruction(OP, operator_table_.size()); } operator_table_.emplace_back(op.getOperation(node)); } } + // To support forward compatibility for bytecode version bump from v5 to v6 bool emit_default_input_instructions_; + // To support forward compatibility for bytecode version bump from v6 to v7 + bool support_default_args_before_out_; }; } // namespace interpreter From ee8a6c1d141ae49e23323bdd485fb4b390541f69 Mon Sep 17 00:00:00 2001 From: Pavel Belevich Date: Thu, 2 Sep 2021 00:57:39 -0700 Subject: [PATCH 457/530] Replace std::unordered_map with DeviceMap (#64393) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64393 cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23 Test Plan: Imported from OSS Reviewed By: rohan-varma Differential Revision: D30708384 Pulled By: pbelevich fbshipit-source-id: 1c565727e4f09cd9e560874dd90aa403470b4a97 --- .../distributed/autograd/functions/recvrpc_backward.cpp | 2 +- .../distributed/autograd/functions/recvrpc_backward.h | 4 ++-- .../autograd/rpc_messages/rpc_with_autograd.cpp | 8 ++++---- .../distributed/autograd/rpc_messages/rpc_with_autograd.h | 8 ++++---- torch/csrc/distributed/autograd/utils.cpp | 4 ++-- torch/csrc/distributed/autograd/utils.h | 4 ++-- torch/csrc/distributed/rpc/request_callback_no_python.cpp | 2 +- torch/csrc/distributed/rpc/rpc_agent.h | 2 +- .../distributed/rpc/testing/faulty_tensorpipe_agent.cpp | 2 +- .../distributed/rpc/testing/faulty_tensorpipe_agent.h | 2 +- torch/csrc/distributed/rpc/utils.cpp | 2 +- 11 files changed, 20 insertions(+), 20 deletions(-) diff --git a/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp b/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp index 0d82c07835f55..a492d9847fb37 100644 --- a/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp +++ b/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp @@ -15,7 +15,7 @@ RecvRpcBackward::RecvRpcBackward( const AutogradMetadata& autogradMetadata, ContextPtr autogradContext, rpc::worker_id_t fromWorkerId, - std::unordered_map deviceMap) + rpc::DeviceMap deviceMap) : autogradMetadata_(autogradMetadata), // NOLINTNEXTLINE(performance-move-const-arg) autogradContext_(std::move(autogradContext)), diff --git a/torch/csrc/distributed/autograd/functions/recvrpc_backward.h b/torch/csrc/distributed/autograd/functions/recvrpc_backward.h index 46bdb297cdf46..6e6678b128985 100644 --- a/torch/csrc/distributed/autograd/functions/recvrpc_backward.h +++ b/torch/csrc/distributed/autograd/functions/recvrpc_backward.h @@ -23,7 +23,7 @@ class TORCH_API RecvRpcBackward : public torch::autograd::Node { const AutogradMetadata& autogradMetadata, std::shared_ptr autogradContext, rpc::worker_id_t fromWorkerId, - std::unordered_map deviceMap); + rpc::DeviceMap deviceMap); torch::autograd::variable_list apply( torch::autograd::variable_list&& grads) override; @@ -41,7 +41,7 @@ class TORCH_API RecvRpcBackward : public torch::autograd::Node { rpc::worker_id_t fromWorkerId_; // Device mapping for tensors sent over RPC. - const std::unordered_map deviceMap_; + const rpc::DeviceMap deviceMap_; }; } // namespace autograd diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp index 4d84e99753961..b8d28f7be7c2d 100644 --- a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp +++ b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp @@ -19,7 +19,7 @@ RpcWithAutograd::RpcWithAutograd( MessageType messageType, const AutogradMetadata& autogradMetadata, c10::intrusive_ptr wrappedMessage, - std::unordered_map deviceMap) + rpc::DeviceMap deviceMap) : fromWorkerId_(fromWorkerId), messageType_(messageType), autogradMetadata_(autogradMetadata), @@ -39,7 +39,7 @@ RpcWithAutograd::RpcWithAutograd( std::unique_ptr wrappedRpc, MessageType wrappedMessageType, std::vector tensors, - std::unordered_map deviceMap) + rpc::DeviceMap deviceMap) : fromWorkerId_(fromWorkerId), messageType_(messageType), autogradMetadata_(autogradMetadata), @@ -112,7 +112,7 @@ std::unique_ptr RpcWithAutograd::fromMessage( auto c10DeviceMap = tupleElements[4].to>(); // Convert to regular map. - std::unordered_map deviceMap; + rpc::DeviceMap deviceMap; for (const auto& mapEntry : c10DeviceMap) { deviceMap.insert({mapEntry.key(), mapEntry.value()}); } @@ -169,7 +169,7 @@ rpc::worker_id_t RpcWithAutograd::fromWorkerId() const { return fromWorkerId_; } -const std::unordered_map& RpcWithAutograd:: +const rpc::DeviceMap& RpcWithAutograd:: deviceMap() { return deviceMap_; } diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h index 1884cc9742939..6d0b6111cc88c 100644 --- a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h +++ b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h @@ -19,7 +19,7 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { rpc::MessageType messageType, const AutogradMetadata& autogradMetadata, c10::intrusive_ptr wrappedMessage, - std::unordered_map deviceMap = {}); + rpc::DeviceMap deviceMap = {}); // Used when receiving an RPC over the wire. RpcWithAutograd( @@ -29,7 +29,7 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { std::unique_ptr wrappedRpc, rpc::MessageType wrappedMessageType, std::vector tensors, - std::unordered_map deviceMap = {}); + rpc::DeviceMap deviceMap = {}); c10::intrusive_ptr toMessageImpl() && override; @@ -55,7 +55,7 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { rpc::worker_id_t fromWorkerId() const; // Retrieve the device map. - const std::unordered_map& deviceMap(); + const rpc::DeviceMap& deviceMap(); private: // WorkerId from which this RPC originated. This is necessary for knowing @@ -90,7 +90,7 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { std::vector tensors_; // Device mapping for tensors that are sent across an RPC to another node. - std::unordered_map deviceMap_; + rpc::DeviceMap deviceMap_; }; } // namespace autograd diff --git a/torch/csrc/distributed/autograd/utils.cpp b/torch/csrc/distributed/autograd/utils.cpp index 4e29bfcc1ffe9..9db40766c598a 100644 --- a/torch/csrc/distributed/autograd/utils.cpp +++ b/torch/csrc/distributed/autograd/utils.cpp @@ -53,7 +53,7 @@ ContextPtr addRecvRpcBackward( const AutogradMetadata& autogradMetadata, std::vector& tensors, rpc::worker_id_t fromWorkerId, - const std::unordered_map& deviceMap) { + const rpc::DeviceMap& deviceMap) { // Initialize autograd context if necessary. auto& autogradContainer = DistAutogradContainer::getInstance(); auto autogradContext = @@ -105,7 +105,7 @@ c10::intrusive_ptr getMessageWithAutograd( c10::intrusive_ptr wrappedRpcMsg, MessageType msgType, bool forceGradRecording, - const std::unordered_map& deviceMap) { + const rpc::DeviceMap& deviceMap) { auto& autogradContainer = DistAutogradContainer::getInstance(); // If there is no valid context and no tensor requires grads, send original diff --git a/torch/csrc/distributed/autograd/utils.h b/torch/csrc/distributed/autograd/utils.h index fae675d3b81c6..94883ce605269 100644 --- a/torch/csrc/distributed/autograd/utils.h +++ b/torch/csrc/distributed/autograd/utils.h @@ -31,7 +31,7 @@ TORCH_API ContextPtr addRecvRpcBackward( const AutogradMetadata& autogradMetadata, std::vector& tensors, rpc::worker_id_t fromWorkerId, - const std::unordered_map& deviceMap); + const rpc::DeviceMap& deviceMap); // This method is a wrapper utility used internally to wrap autograd info // and attach autograd function for each type of rpc call if it has valid @@ -44,7 +44,7 @@ TORCH_API c10::intrusive_ptr getMessageWithAutograd( c10::intrusive_ptr wrappedRpcMsg, rpc::MessageType msgType, bool forceGradRecording = false, - const std::unordered_map& deviceMap = + const rpc::DeviceMap& deviceMap = {}); // Send message after autograd checking diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp index 5eada8d573f2f..9e16061e0ad42 100644 --- a/torch/csrc/distributed/rpc/request_callback_no_python.cpp +++ b/torch/csrc/distributed/rpc/request_callback_no_python.cpp @@ -290,7 +290,7 @@ c10::intrusive_ptr RequestCallbackNoPython:: // Need to reverse the device map for the backward pass of distributed // autograd. - std::unordered_map reverseDeviceMap; + DeviceMap reverseDeviceMap; for (const auto& mapEntry : rpcWithAutograd.deviceMap()) { reverseDeviceMap.insert({mapEntry.second, mapEntry.first}); } diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index a83e77bfe56f9..7cd228e57da8e 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -164,7 +164,7 @@ class TORCH_API RpcAgent { const WorkerInfo& to, c10::intrusive_ptr message, const float rpcTimeoutSeconds = kUnsetRpcTimeout, - const std::unordered_map& deviceMap = {}) = 0; + const DeviceMap& deviceMap = {}) = 0; // Retries sending the message up to maxRetries times until an ACK is // receieved. The duration between consecutive sends is increased over diff --git a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.cpp index 72d4d5dfec82e..a2e052535efac 100644 --- a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.cpp @@ -67,7 +67,7 @@ c10::intrusive_ptr FaultyTensorPipeAgent::send( const WorkerInfo& to, c10::intrusive_ptr message, const float rpcTimeoutSeconds, - const std::unordered_map& /* unused */) { + const DeviceMap& /* unused */) { // We only fail control messages that have been specified by the test case. // For all other messages, we just send them without any failures. if (!shouldFailMessage(message->type())) { diff --git a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h index 5d6059747c219..e69a76cddc8ed 100644 --- a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h +++ b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h @@ -53,7 +53,7 @@ class TORCH_API FaultyTensorPipeAgent : public TensorPipeAgent { const WorkerInfo& to, c10::intrusive_ptr message, const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout, - const std::unordered_map& deviceMap = {}) + const DeviceMap& deviceMap = {}) override; // Add delay to writes diff --git a/torch/csrc/distributed/rpc/utils.cpp b/torch/csrc/distributed/rpc/utils.cpp index 615abbf300666..820ec31691a0a 100644 --- a/torch/csrc/distributed/rpc/utils.cpp +++ b/torch/csrc/distributed/rpc/utils.cpp @@ -177,7 +177,7 @@ std::unique_ptr deserializeResponse( // Need to reverse the device map for the backward pass of distributed // autograd. - std::unordered_map reverseDeviceMap; + DeviceMap reverseDeviceMap; for (const auto& mapEntry : rpcWithAutograd.deviceMap()) { reverseDeviceMap.insert({mapEntry.second, mapEntry.first}); } From 76e187aa08556ce90e84b17e836784ffbb6905e0 Mon Sep 17 00:00:00 2001 From: Kushashwa Ravi Shrimali Date: Thu, 2 Sep 2021 01:08:53 -0700 Subject: [PATCH 458/530] Port `gather` to structured kernel (#63312) Summary: Will add a description once this is ready for review. cc: ysiraichi ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/63312 Reviewed By: iramazanli Differential Revision: D30597447 Pulled By: ezyang fbshipit-source-id: d36e59835c2f4b38e286032dd2a1111a7e16b7e5 --- aten/src/ATen/native/ScatterGatherChecks.h | 32 +++------- .../ATen/native/TensorAdvancedIndexing.cpp | 51 ++++++++++----- aten/src/ATen/native/TensorAdvancedIndexing.h | 12 ++-- .../ATen/native/cpu/ScatterGatherKernel.cpp | 34 +++------- .../ATen/native/cuda/ScatterGatherKernel.cu | 62 +++---------------- aten/src/ATen/native/native_functions.yaml | 7 +-- test/test_torch.py | 7 ++- 7 files changed, 76 insertions(+), 129 deletions(-) diff --git a/aten/src/ATen/native/ScatterGatherChecks.h b/aten/src/ATen/native/ScatterGatherChecks.h index ad3b3fca097ca..0fc38d5bd7418 100644 --- a/aten/src/ATen/native/ScatterGatherChecks.h +++ b/aten/src/ATen/native/ScatterGatherChecks.h @@ -9,7 +9,7 @@ namespace at { namespace native { namespace { // checks whether index.dtype == int64 -// and self.dtyp == src.dtype if src is a Tensor +// and self.dtype == src.dtype if src is a Tensor static void scatter_gather_dtype_check( const std::string& method_name, const Tensor& self, @@ -31,42 +31,31 @@ static void scatter_gather_dtype_check( } // Used for `gather`-like methods +// Note: self means the input tensor here // Test: -// 1. index.size(d) == self.size(d) for all d != dim -// 2. index.size(d) <= src.size(d) for all d != dim -// 3. index.dim() == self.dim() == src.dim() +// 1. index.size(d) <= self.size(d) for all d != dim +// 2. index.dim() == self.dim() static C10_UNUSED void gather_shape_check(const Tensor& self, int64_t dim, - const Tensor& index, const Tensor& src + const Tensor& index ) { auto self_dims = ensure_nonempty_dim(self.dim()); TORCH_CHECK(self_dims == ensure_nonempty_dim(index.dim()), - "Index tensor must have the same number of dimensions as out tensor" - ); - - auto src_dims = ensure_nonempty_dim(src.dim()); - TORCH_CHECK(src_dims == ensure_nonempty_dim(index.dim()), "Index tensor must have the same number of dimensions as input tensor" ); for (int64_t i = 0; i < self_dims; ++i) { if (i != dim) { TORCH_CHECK( - ensure_nonempty_size(index, i) == ensure_nonempty_size(self, i), - "Size does not match at dimension ", i, - " get ", ensure_nonempty_size(self, i), - " vs ", ensure_nonempty_size(index, i) - ); - - TORCH_CHECK( - ensure_nonempty_size(index, i) <= ensure_nonempty_size(src, i), + ensure_nonempty_size(index, i) <= ensure_nonempty_size(self, i), "Size does not match at dimension ", i, " expected index ", index.sizes(), - " to be smaller than src ", src.sizes(), + " to be smaller than self ", self.sizes(), " apart from dimension ", dim ); } } } + // Used for `scatter` and `scatter_add` // Tests: // 1. index.size(d) <= self.size(d) for all d != dim @@ -76,10 +65,7 @@ static C10_UNUSED void scatter_shape_check( const Tensor& self, int64_t dim, const Tensor& index, const c10::optional& src_opt = c10::nullopt ) { - if (index.numel() == 0) { - return; - } - + if (index.numel() == 0) return; TORCH_CHECK( ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()), "Index tensor must have the same number of dimensions as self tensor" diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 43cebba51b9e7..3fb38cc8832ec 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -83,6 +83,31 @@ native::SCATTER_GATHER_OP get_operator_enum(const c10::string_view reduce) { } } +TORCH_META_FUNC(gather) +(const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad) { + const Tensor& result = maybe_get_output(0); + int64_t wrapped_dim = at::maybe_wrap_dim(dim, self.dim()); + + // Memory overlap checks need to be done after resizing (if required) is done. + // But it only makes sense to do these checks when result was defined, hence + // the boolean variable `check_result` here. + // For more details, see: https://github.com/pytorch/pytorch/pull/63312#discussion_r694794832 + // and https://github.com/pytorch/pytorch/issues/63837 + bool check_result = result.defined(); + set_output(index.sizes(), self.options()); + if (check_result) { + at::assert_no_internal_overlap(result); + at::assert_no_overlap(result, self); + at::assert_no_partial_overlap(result, index); + } + + TORCH_CHECK( + index.scalar_type() == at::ScalarType::Long, + "gather", "(): Expected dtype int64 for index" + ); + at::native::gather_shape_check(self, wrapped_dim, index); +} + template void scatter_meta_impl( Meta& meta, @@ -1112,23 +1137,12 @@ Tensor index_fill(const Tensor & self, int64_t dim, const Tensor & index, const return self.clone(at::MemoryFormat::Preserve).index_fill_(dim, index, source); } -Tensor& gather_out_cpu_cuda( - const Tensor& self, - int64_t dim, - const Tensor& index, - bool sparse_grad, - Tensor& result) { - at::native::resize_output(result, index.sizes()); - at::assert_no_internal_overlap(result); - at::assert_no_overlap(result, self); - at::assert_no_partial_overlap(result, index); +// gather_out_cpu_cuda +TORCH_IMPL_FUNC(gather_out) +(const Tensor& self, int64_t dim, const Tensor& index, bool sparse_grad, const Tensor& result) { + if (index.numel() == 0) return; + dim = at::maybe_wrap_dim(dim, self.dim()); gather_stub(result.device().type(), result, self, dim, index); - return result; -} - -Tensor gather(const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad) { - Tensor result = at::empty({0}, self.options()); - return at::native::gather_out_cpu_cuda(self, dim, index, sparse_grad, result); } Tensor gather_backward(const Tensor& grad, const Tensor& self, int64_t dim, const Tensor& index, bool sparse_grad) { @@ -1148,6 +1162,8 @@ void scatter_impl( ReduceStub& reduce_stub, FillStub& fill_stub, const c10::optional reduce = nullopt) { + if (index.numel() == 0) return; + dim = at::maybe_wrap_dim(dim, self.dim()); auto mut_out = const_cast(out); if (!self.is_same(mut_out)) { @@ -1217,11 +1233,14 @@ TORCH_IMPL_FUNC(scatter_add) const Tensor& src, const Tensor& out) { auto mut_out = const_cast(out); + dim = maybe_wrap_dim(dim, self.dim()); if (!self.is_same(mut_out)) { mut_out.copy_(self); } + if (index.numel() == 0) return; + if (globalContext().deterministicAlgorithms() && self.device().type() == DeviceType::CUDA && self.dim() == 1) { TORCH_CHECK(index.dim() == 1 && src.dim() == 1, "index and src should be 1D tensors when self is a 1D tensor, " "but their dims are ", index.dim(), " and ", src.dim(), ", respectively"); diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.h b/aten/src/ATen/native/TensorAdvancedIndexing.h index cd2835aa8139b..d8271a8355ded 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.h +++ b/aten/src/ATen/native/TensorAdvancedIndexing.h @@ -24,13 +24,13 @@ using take_fn = void(*)(TensorIterator & iter, const Tensor& input); using masked_select_fn = void(*)(TensorIterator &, int64_t orig_stride); using masked_scatter_fn = void(*)(TensorIterator &, const Tensor &); -using gather_fn = void (*)(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index); -using scatter_fn = void(*)(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); -using scatter_fill_fn = void(*)(Tensor& self, int64_t dim, const Tensor& index, const Scalar& src); -using scatter_add_fn = void(*)(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); -using scatter_reduce_fn = void(*)(Tensor& self, const int64_t dim, const Tensor& index, +using gather_fn = void (*)(const Tensor & result, const Tensor & self, int64_t dim, const Tensor & index); +using scatter_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); +using scatter_fill_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src); +using scatter_add_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); +using scatter_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index, const Tensor& src, const SCATTER_GATHER_OP& reduce); -using scatter_scalar_reduce_fn = void(*)(Tensor& self, const int64_t dim, const Tensor& index, +using scatter_scalar_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index, const Scalar& value, const SCATTER_GATHER_OP& reduce); DECLARE_DISPATCH(index_fn, index_stub); diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp index c32efeb276bd7..2ab92fbdb2bb2 100644 --- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp +++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp @@ -100,15 +100,9 @@ struct _cpu_scatter_gather_dim_loop { template struct cpu_scatter_gather_base_kernel { template - void operator()(Tensor& self, int64_t dim, + void operator()(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& value, const std::string& method_name, func_t& kernel_func) { - // no-op if index is empty - if (index.numel() == 0) { - return; - } - - dim = maybe_wrap_dim(dim, self.dim()); auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); auto index_strides = ensure_nonempty_vec(index.strides().vec()); @@ -193,22 +187,10 @@ struct cpu_scatter_gather_base_kernel { } template - void operator()(Tensor& self, int64_t dim, + void operator()(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src, const std::string& method_name, func_t& kernel_func) { - // no-op if index is empty - if (index.numel() == 0) { - return; - } - - dim = maybe_wrap_dim(dim, self.dim()); - - scatter_gather_dtype_check(method_name, self, index, src); - if (!is_scatter_like) { - gather_shape_check(self, dim, index, src); - } - auto iter = TensorIteratorConfig() .check_all_same_dtype(false) .resize_outputs(false) @@ -292,30 +274,30 @@ struct cpu_scatter_gather_base_kernel { } }; -void gather_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim, const Tensor& index) { +void gather_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim, const Tensor& index) { cpu_scatter_gather_base_kernel()( result, dim, index, self, "gather_out_cpu", tensor_assign); } -void scatter_cpu_kernel(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) { +void scatter_cpu_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) { cpu_scatter_gather_base_kernel<>()( self, dim, index, src, "scatter_cpu_", tensor_assign); } -void scatter_fill_cpu_kernel(Tensor& self, int64_t dim, const Tensor& index, const Scalar& value) { +void scatter_fill_cpu_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& value) { cpu_scatter_gather_base_kernel<>()( self, dim, index, value, "scatter_fill_cpu_", tensor_assign); } -void scatter_add_cpu_kernel(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) { +void scatter_add_cpu_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) { cpu_scatter_gather_base_kernel<>()( self, dim, index, src, "scatter_add_", reduce_add); } -void scatter_reduce_cpu_kernel(Tensor& self, const int64_t dim, const Tensor& index, +void scatter_reduce_cpu_kernel(const Tensor& self, const int64_t dim, const Tensor& index, const Tensor& src, const SCATTER_GATHER_OP& reduce) { switch (reduce) { case SCATTER_GATHER_OP::REDUCE_ADD : @@ -329,7 +311,7 @@ void scatter_reduce_cpu_kernel(Tensor& self, const int64_t dim, const Tensor& in } } -void scatter_scalar_reduce_cpu_kernel(Tensor& self, const int64_t dim, const Tensor& index, +void scatter_scalar_reduce_cpu_kernel(const Tensor& self, const int64_t dim, const Tensor& index, const Scalar& value, const SCATTER_GATHER_OP& reduce) { switch (reduce) { case SCATTER_GATHER_OP::REDUCE_ADD : diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu index b95570109de91..5f03cc450f206 100644 --- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu +++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu @@ -89,10 +89,6 @@ struct _cuda_scatter_gather_internal_kernel { int64_t index_stride, const func_t& f ) { - if (iter.numel() == 0) { - return; - } - if (!iter.can_use_32bit_indexing()) { for (auto& sub_iter : iter.with_32bit_indexing()) { _cuda_scatter_gather_internal_kernel()( @@ -132,24 +128,13 @@ template struct cuda_scatter_gather_base_kernel { template void operator()( - Tensor& self, int64_t dim, + const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src, const std::string& method_name, const func_t& f ) { - // no-op if index is empty - if (index.numel() == 0) { - return; - } at::assert_no_internal_overlap(self); - dim = maybe_wrap_dim(dim, self.dim()); - - scatter_gather_dtype_check(method_name, self, index, src); - if (!is_scatter_like) { - gather_shape_check(self, dim, index, src); - } - auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); auto self_strides = ensure_nonempty_vec(self.strides().vec()); auto src_strides = ensure_nonempty_vec(src.strides().vec()); @@ -201,24 +186,13 @@ struct cuda_scatter_gather_base_kernel { } void operator()( - Tensor& self, int64_t dim, + const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src, const std::string& method_name, const ReduceMultiply& f ) { - // no-op if index is empty - if (index.numel() == 0) { - return; - } at::assert_no_internal_overlap(self); - dim = maybe_wrap_dim(dim, self.dim()); - - scatter_gather_dtype_check(method_name, self, index, src); - if (!is_scatter_like) { - gather_shape_check(self, dim, index, src); - } - auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); auto self_strides = ensure_nonempty_vec(self.strides().vec()); auto src_strides = ensure_nonempty_vec(src.strides().vec()); @@ -280,10 +254,6 @@ struct _cuda_scatter_fill_internal_kernel { int64_t index_stride, const func_t& f ) { - if (iter.numel() == 0) { - return; - } - if (!iter.can_use_32bit_indexing()) { for (auto& sub_iter : iter.with_32bit_indexing()) { _cuda_scatter_fill_internal_kernel()( @@ -322,19 +292,13 @@ template struct cuda_scatter_fill_base_kernel { template void operator()( - Tensor& self, int64_t dim, + const Tensor& self, int64_t dim, const Tensor& index, Scalar src, const std::string& method_name, const func_t& f ) { - // no-op if index is empty - if (index.numel() == 0) { - return; - } at::assert_no_internal_overlap(self); - dim = maybe_wrap_dim(dim, self.dim()); - auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); // restride self such that @@ -371,19 +335,13 @@ struct cuda_scatter_fill_base_kernel { } void operator()( - Tensor& self, int64_t dim, + const Tensor& self, int64_t dim, const Tensor& index, Scalar src, const std::string& method_name, const ReduceMultiply& f ) { - // no-op if index is empty - if (index.numel() == 0) { - return; - } at::assert_no_internal_overlap(self); - dim = maybe_wrap_dim(dim, self.dim()); - auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); // restride self such that @@ -420,25 +378,25 @@ struct cuda_scatter_fill_base_kernel { } }; // struct cuda_scatter_fill_base_kernel -void gather_cuda_kernel(Tensor& result, const Tensor& self, int64_t dim, const Tensor& index) { +void gather_cuda_kernel(const Tensor& result, const Tensor& self, int64_t dim, const Tensor& index) { cuda_scatter_gather_base_kernel()( result, dim, index, self, "gather_out_cuda", tensor_assign); } -void scatter_cuda_kernel(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) { +void scatter_cuda_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) { cuda_scatter_gather_base_kernel<>()( self, dim, index, src, "scatter_cuda_", tensor_assign); } -void scatter_fill_cuda_kernel(Tensor& self, int64_t dim, const Tensor& index, const Scalar& src) { +void scatter_fill_cuda_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src) { cuda_scatter_fill_base_kernel<>()( self, dim, index, src, "scatter_fill_cuda_", tensor_assign); } -void scatter_add_cuda_kernel(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) { +void scatter_add_cuda_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("scatter_add_cuda_kernel"); @@ -447,7 +405,7 @@ void scatter_add_cuda_kernel(Tensor& self, int64_t dim, const Tensor& index, con "scatter_add_cuda_", reduce_add); } -void scatter_reduce_cuda_kernel(Tensor& self, const int64_t dim, const Tensor& index, +void scatter_reduce_cuda_kernel(const Tensor& self, const int64_t dim, const Tensor& index, const Tensor& src, const SCATTER_GATHER_OP& reduce) { switch (reduce) { case SCATTER_GATHER_OP::REDUCE_ADD : @@ -461,7 +419,7 @@ void scatter_reduce_cuda_kernel(Tensor& self, const int64_t dim, const Tensor& i } } -void scatter_scalar_reduce_cuda_kernel(Tensor& self, const int64_t dim, const Tensor& index, +void scatter_scalar_reduce_cuda_kernel(const Tensor& self, const int64_t dim, const Tensor& index, const Scalar& value, const SCATTER_GATHER_OP& reduce) { switch (reduce) { case SCATTER_GATHER_OP::REDUCE_ADD : diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index fae433cd6aae6..ca13e058411a4 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6601,14 +6601,13 @@ variants: method, function - func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: - CPU: gather_out_cpu_cuda - CUDA: gather_out_cpu_cuda + CPU, CUDA: gather_out - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor variants: method, function - dispatch: - CPU, CUDA: gather + structured_delegate: gather.out - func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor variants: function diff --git a/test/test_torch.py b/test/test_torch.py index a790839bbd50e..2899f2ef4c3b2 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1057,8 +1057,11 @@ def _test_gather(self, cast, test_bounds=True): torch.gather(src, dim, idx.to(torch.int)) # should throw an error when out.dtype != src.dtype. - with self.assertRaisesRegex(RuntimeError, 'Expected self.dtype to be equal to src.dtype'): - torch.gather(src, dim, idx, out=expected.to(torch.int)) + # Note that on Windows, the out tensor's dtype is returned as: struct c10::complex in the error + # message, hence the use of .* in regex here + with self.assertRaisesRegex(RuntimeError, + 'Expected out tensor to have dtype .*c10::complex, but got int instead'): + torch.gather(src.to(torch.complex128), dim, idx, out=expected.to(torch.int)) # checks for the same dimensionality with self.assertRaisesRegex(RuntimeError, 'Index tensor must have the same number of dimensions as input tensor'): From 6db8f7a70920f91418078fe09477eed0b0adefdb Mon Sep 17 00:00:00 2001 From: Kefei Lu Date: Thu, 2 Sep 2021 01:17:56 -0700 Subject: [PATCH 459/530] Fix TRTModule not adding outputs in order (#64418) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64418 In T99368564, we found that when running TRT lowered module, the output tensors are out-of-order, as compared to the output from the original, non-lowered module. It turns out that in `TRTModule.forward()`, we cannot rely on `ICudaEngine` bindings natural order indices to create the output tensors, but rather, we should explicitly construct the output tensor from the bindings' names, in an ordered that we supply. Test Plan: * Arc lint * Run CI/sandcastle tests * Run GPU lowering using commands and code changes in D30171741 and ensure we don't observe out-of-order outputs Reviewed By: yinghai Differential Revision: D30693545 fbshipit-source-id: 32a894ceeb148fcf4e8d279be3835c7d1f1aa2ba --- torch/fx/experimental/fx2trt/fx2trt.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py index f1d17e701790d..4c0b44c83085f 100644 --- a/torch/fx/experimental/fx2trt/fx2trt.py +++ b/torch/fx/experimental/fx2trt/fx2trt.py @@ -1,5 +1,5 @@ import warnings -from typing import List, NamedTuple, Iterable, Any, Optional, Tuple +from typing import List, NamedTuple, Iterable, Any, Optional, Tuple, Sequence import tensorrt as trt import torch @@ -53,6 +53,12 @@ def __init__( # Indicate output is in fp16 self.fp16_output = fp16_output + # Indices of outputs into the CUDA engine bindings, in the order as they are + # in the fx graph's `output` node. + self.output_indices_in_order: Sequence[int] = [ + self.engine.get_binding_index(name) for name in self.output_names + ] + def _on_state_dict(self, state_dict, prefix, local_metadata): state_dict[prefix + "engine"] = bytearray(self.engine.serialize()) state_dict[prefix + "input_names"] = self.input_names @@ -96,7 +102,7 @@ def forward(self, *inputs): # create output tensors outputs: List[torch.Tensor] = [] - for idx in range(len(inputs), len(inputs) + len(self.output_names)): + for idx in self.output_indices_in_order: dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx)) if self.engine.has_implicit_batch_dimension: From 1c735768ede21a60ad4f6ed9565a21d9b4f5bc92 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 2 Sep 2021 03:45:06 -0700 Subject: [PATCH 460/530] Update hub.load() signature to avoid polluting kwargs param (#63755) Summary: This PR addresses an old comment about Python2 EOL, directly putting some parameters in the function signature instead of in a `**kargs` dict. I believe the changes are fully backward compatible. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63755 Reviewed By: zou3519 Differential Revision: D30695634 Pulled By: NicolasHug fbshipit-source-id: 398f347c5a04bfb58e77e46773a869cb9d0eb225 --- torch/hub.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/torch/hub.py b/torch/hub.py index 499640b8bc6ee..7cf752f89166c 100644 --- a/torch/hub.py +++ b/torch/hub.py @@ -316,11 +316,8 @@ def help(github, model, force_reload=False, skip_validation=False): return entry.__doc__ -# Ideally this should be `def load(github, model, *args, forece_reload=False, **kwargs):`, -# but Python2 complains syntax error for it. We have to skip force_reload in function -# signature here but detect it in kwargs instead. -# TODO: fix it after Python2 EOL -def load(repo_or_dir, model, *args, **kwargs): +def load(repo_or_dir, model, *args, source='github', force_reload=False, verbose=True, skip_validation=False, + **kwargs): r""" Load a model from a github repo or a local directory. @@ -329,7 +326,7 @@ def load(repo_or_dir, model, *args, **kwargs): If :attr:`source` is ``'github'``, :attr:`repo_or_dir` is expected to be of the form ``repo_owner/repo_name[:tag_name]`` with an optional - tag/branch. + tag/branch. The default branch is ``master`` if not specified. If :attr:`source` is ``'local'``, :attr:`repo_or_dir` is expected to be a path to a local directory. @@ -367,10 +364,7 @@ def load(repo_or_dir, model, *args, **kwargs): >>> path = '/some/local/path/pytorch/vision' >>> model = torch.hub.load(path, 'resnet50', pretrained=True) """ - source = kwargs.pop('source', 'github').lower() - force_reload = kwargs.pop('force_reload', False) - verbose = kwargs.pop('verbose', True) - skip_validation = kwargs.pop('skip_validation', False) + source = source.lower() if source not in ('github', 'local'): raise ValueError( From 030154e24119cdd16819ed4459d60379cf44c51f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 2 Sep 2021 03:46:59 -0700 Subject: [PATCH 461/530] Remove outdated comment in hub.py (#63757) Summary: This PR removes an outdated comment about Python2 that was orginally introduced in https://github.com/pytorch/pytorch/pull/25083/files. The code has changed since then, but the comment wasn't removed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63757 Reviewed By: zou3519 Differential Revision: D30695656 Pulled By: NicolasHug fbshipit-source-id: 431cf414588b9e5a1ad6acdae724ff5af1b16971 --- torch/hub.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/torch/hub.py b/torch/hub.py index 7cf752f89166c..4cfbc83421bfe 100644 --- a/torch/hub.py +++ b/torch/hub.py @@ -425,8 +425,6 @@ def download_url_to_file(url, dst, hash_prefix=None, progress=True): """ file_size = None - # We use a different API for python2 since urllib(2) doesn't recognize the CA - # certificates in older Python req = Request(url, headers={"User-Agent": "torch.hub"}) u = urlopen(req) meta = u.info() From aedd70fcfe62a8bfb726b609f16edb8448a74299 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 2 Sep 2021 03:48:44 -0700 Subject: [PATCH 462/530] Fix list() and help() torchhub functions for Windows (#63773) Summary: This PR Fixes the help() and list() torchhub functions which were probably failing for Windows since the `/` OS separator was hardcoded. Before merging this I need to double check whether the CI actually runs the corresponding tests on Windows or not Pull Request resolved: https://github.com/pytorch/pytorch/pull/63773 Reviewed By: zou3519 Differential Revision: D30695664 Pulled By: NicolasHug fbshipit-source-id: fac328163fd05db804a8186ae28f22b3cc3a6404 --- torch/hub.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/torch/hub.py b/torch/hub.py index 4cfbc83421bfe..bcd53f79d9c77 100644 --- a/torch/hub.py +++ b/torch/hub.py @@ -277,7 +277,8 @@ def list(github, force_reload=False, skip_validation=False): sys.path.insert(0, repo_dir) - hub_module = import_module(MODULE_HUBCONF, repo_dir + '/' + MODULE_HUBCONF) + hubconf_path = os.path.join(repo_dir, MODULE_HUBCONF) + hub_module = import_module(MODULE_HUBCONF, hubconf_path) sys.path.remove(repo_dir) @@ -307,7 +308,8 @@ def help(github, model, force_reload=False, skip_validation=False): sys.path.insert(0, repo_dir) - hub_module = import_module(MODULE_HUBCONF, repo_dir + '/' + MODULE_HUBCONF) + hubconf_path = os.path.join(repo_dir, MODULE_HUBCONF) + hub_module = import_module(MODULE_HUBCONF, hubconf_path) sys.path.remove(repo_dir) From be5b05c1dc7f8281311842a3953288b8f158a07a Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Thu, 2 Sep 2021 04:04:59 -0700 Subject: [PATCH 463/530] require that `TARGET_DET_LIST` is sorted (and sort it here) (#64102) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64102 We sort this list so that we may add comments to indicate the absence of a file right where that file would need to be put. This makes it difficult to wrongly add such a file. The sorting itself was done programmatically to ensure that no entries were inadvertently removed. I printed the sorted list with: ``` for p in sorted(TARGET_DET_LIST): print(f' "{p}",') ``` Then copied it back into the file. Test Plan: Imported from OSS Reviewed By: driazati Differential Revision: D30625076 Pulled By: dagitses fbshipit-source-id: cf36fcb3e53e274b76d1f4aae83da1f53c03f9ed --- test/test_determination.py | 7 ++ tools/testing/modulefinder_determinator.py | 87 +++++++++++----------- 2 files changed, 51 insertions(+), 43 deletions(-) diff --git a/test/test_determination.py b/test/test_determination.py index 277bbd2bc166c..ca00835429c4c 100644 --- a/test/test_determination.py +++ b/test/test_determination.py @@ -33,6 +33,13 @@ def determined_tests(cls, changed_files): if run_test.should_run_test(run_test.TARGET_DET_LIST, test, changed_files, DummyOptions()) ] + def test_target_det_list_is_sorted(self): + # We keep TARGET_DET_LIST sorted to minimize merge conflicts + # but most importantly to allow us to comment on the absence + # of a test. It would be very difficult to add a file right + # next to a comment that says to keep it out of the list. + self.assertListEqual(run_test.TARGET_DET_LIST, sorted(run_test.TARGET_DET_LIST)) + def test_config_change_only(self): """CI configs trigger all tests""" self.assertEqual( diff --git a/tools/testing/modulefinder_determinator.py b/tools/testing/modulefinder_determinator.py index 8acd0ed9cc2f0..b6c94e7a2d48f 100644 --- a/tools/testing/modulefinder_determinator.py +++ b/tools/testing/modulefinder_determinator.py @@ -12,50 +12,8 @@ # run with --determine-from, we use another generated list based on this one and the # previous test stats. TARGET_DET_LIST = [ - "distributions/test_distributions", - "test_nn", - "test_autograd", - "test_cpp_extensions_jit", - "test_jit_legacy", - "test_dataloader", - "test_overrides", - "test_linalg", - "test_jit", - "test_jit_profiling", - "test_torch", - "test_binary_ufuncs", - "test_numpy_interop", - "test_reductions", - "test_shape_ops", - "test_sort_and_select", - "test_testing", - "test_view_ops", - "distributed/nn/jit/test_instantiator", - "distributed/rpc/test_tensorpipe_agent", - "distributed/rpc/cuda/test_tensorpipe_agent", "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks", - "distributed/test_distributed_spawn", - "test_cuda", - "test_cuda_primary_ctx", - "test_cpp_extensions_aot_ninja", - "test_cpp_extensions_aot_no_ninja", - "test_serialization", - "test_optim", - "test_utils", - "test_multiprocessing", - "test_tensorboard", - "distributed/test_c10d_common", - "distributed/test_c10d_gloo", - "distributed/test_c10d_nccl", - "distributed/test_jit_c10d", - "distributed/test_c10d_spawn_gloo", - "distributed/test_c10d_spawn_nccl", - "distributed/test_store", - "distributed/test_pg_wrapper", - "test_quantization", - "test_pruning_op", - "test_determination", - "test_futures", + "distributed/nn/jit/test_instantiator", "distributed/pipeline/sync/skip/test_api", "distributed/pipeline/sync/skip/test_gpipe", "distributed/pipeline/sync/skip/test_inspect_skip_layout", @@ -78,8 +36,51 @@ "distributed/pipeline/sync/test_stream", "distributed/pipeline/sync/test_transparency", "distributed/pipeline/sync/test_worker", + "distributed/rpc/cuda/test_tensorpipe_agent", + "distributed/rpc/test_tensorpipe_agent", + "distributed/test_c10d_common", + "distributed/test_c10d_gloo", + "distributed/test_c10d_nccl", + "distributed/test_c10d_spawn_gloo", + "distributed/test_c10d_spawn_nccl", + "distributed/test_distributed_spawn", + "distributed/test_jit_c10d", + "distributed/test_pg_wrapper", + "distributed/test_store", + "distributions/test_distributions", + "test_autograd", + "test_binary_ufuncs", + "test_cpp_extensions_aot_ninja", + "test_cpp_extensions_aot_no_ninja", + "test_cpp_extensions_jit", + "test_cuda", + "test_cuda_primary_ctx", + "test_dataloader", + "test_determination", + "test_futures", + "test_jit", + "test_jit_legacy", + "test_jit_profiling", + "test_linalg", + "test_multiprocessing", + "test_nn", + "test_numpy_interop", + "test_optim", + "test_overrides", + "test_pruning_op", + "test_quantization", + "test_reductions", + "test_serialization", + "test_shape_ops", + "test_sort_and_select", + "test_tensorboard", + "test_testing", + "test_torch", + "test_utils", + "test_view_ops", ] + _DEP_MODULES_CACHE: Dict[str, Set[str]] = {} From cdb46f4c6e836ffe559781a40846c2f3b50b9e9c Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Thu, 2 Sep 2021 04:04:59 -0700 Subject: [PATCH 464/530] extract TestAutogradComplex into its own test file (#63400) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63400 This is the first step to break up test_autograd.py for #63205. Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D30541499 Pulled By: dagitses fbshipit-source-id: 8d9d32007938b9eade0e88f95a6a3190e7e2ef01 --- test/autograd/test_complex.py | 103 ++++++++++++++++++++ test/test_autograd.py | 106 +-------------------- tools/testing/modulefinder_determinator.py | 5 +- 3 files changed, 112 insertions(+), 102 deletions(-) create mode 100644 test/autograd/test_complex.py diff --git a/test/autograd/test_complex.py b/test/autograd/test_complex.py new file mode 100644 index 0000000000000..74fcfdafbce2a --- /dev/null +++ b/test/autograd/test_complex.py @@ -0,0 +1,103 @@ +import torch + +from torch.testing._internal.common_utils import TestCase, run_tests, gradcheck + + +class TestAutogradComplex(TestCase): + def test_view_func_for_complex_views(self): + # case 1: both parent and child have view_func + x = torch.randn(2, 2, 2, dtype=torch.double, requires_grad=True) + y = x.detach().requires_grad_(True) + + x0 = x.clone() + x1 = torch.view_as_complex(x0) + x2 = torch.view_as_real(x1) + x2.mul_(2) + x2.sum().backward() + + y0 = y.clone() + y0.mul_(2) + y0.sum().backward() + + self.assertEqual(x.grad, y.grad) + + # case 2: parent has view_func but child does not + x = torch.randn(2, 2, 2, dtype=torch.double, requires_grad=True) + y = x.detach().requires_grad_(True) + + def fn(a): + b = a.clone() + b1 = torch.view_as_complex(b) + b2 = b1.reshape(b1.numel()) + return b2 + + x0 = fn(x) + x0.mul_(2) + x0.sum().backward() + + y0 = fn(y) + y1 = y0.mul(2) + y1.sum().backward() + + self.assertEqual(x.grad, y.grad) + + # case 3: parent does not have a view_func but child does + x = torch.randn(10, dtype=torch.cdouble, requires_grad=True) + y = x.detach().requires_grad_(True) + + def fn(a, dim0_size=5): + b = a.clone() + b1 = b.reshape(dim0_size, 2) + b2 = torch.view_as_real(b1) + return b2 + + x0 = fn(x) + x0.mul_(2) + x0.sum().backward() + + y0 = fn(y) + y1 = y0.mul(2) + y1.sum().backward() + + self.assertEqual(x.grad, y.grad) + + def test_view_with_multi_output(self): + x = torch.randn(2, 2, 2, dtype=torch.double) + + x1 = torch.view_as_complex(x) + # Taking an invalid view should always be allowed as long as it is not + # modified inplace + res = x1.unbind(0) + + with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"): + res[0] += torch.rand(2, requires_grad=True) + + x.requires_grad_(True) + x1 = torch.view_as_complex(x) + # Taking an invalid view should always be allowed as long as it is not + # modified inplace + res = x1.unbind(0) + + with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"): + res[0] += torch.rand(2, requires_grad=True) + + def as_identity(self): + # view_as_real and view_as_complex behavior should be like an identity + def func(z): + z_ = torch.view_as_complex(z) + z_select = torch.select(z_, z_.dim() - 1, 0) + z_select_real = torch.view_as_real(z_select) + return z_select_real.sum() + + z = torch.randn(10, 2, 2, dtype=torch.double, requires_grad=True) + gradcheck(func, [z]) + func(z).backward() + + z1 = z.clone().detach().requires_grad_(True) + torch.select(z1, z1.dim() - 2, 0).sum().backward() + + self.assertEqual(z.grad, z1.grad) + + +if __name__ == '__main__': + run_tests() diff --git a/test/test_autograd.py b/test/test_autograd.py index ebe3aa5d29e18..fde64b0b062d3 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -28,7 +28,6 @@ from torch.testing._internal.common_cuda import TEST_CUDA from torch.testing._internal.common_utils import (TestCase, run_tests, skipIfNoLapack, suppress_warnings, slowTest, - load_tests, IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck, TEST_WITH_ROCM, disable_gc, gradcheck, gradgradcheck) @@ -44,11 +43,6 @@ deviceCountAtLeast, skipCUDAIfCudnnVersionLessThan, skipCUDAIf, skipMeta) - -# load_tests from common_utils is used to automatically filter tests for -# sharding on sandcastle. This line silences flake warnings -load_tests = load_tests - import pickle PRECISION = 1e-4 @@ -6173,101 +6167,6 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, test_case.assertEqual(self_variable.size(), self_variable.grad.size()) -class TestAutogradComplex(TestCase): - def test_view_func_for_complex_views(self): - # case 1: both parent and child have view_func - x = torch.randn(2, 2, 2, dtype=torch.double, requires_grad=True) - y = x.detach().requires_grad_(True) - - x0 = x.clone() - x1 = torch.view_as_complex(x0) - x2 = torch.view_as_real(x1) - x2.mul_(2) - x2.sum().backward() - - y0 = y.clone() - y0.mul_(2) - y0.sum().backward() - - self.assertEqual(x.grad, y.grad) - - # case 2: parent has view_func but child does not - x = torch.randn(2, 2, 2, dtype=torch.double, requires_grad=True) - y = x.detach().requires_grad_(True) - - def fn(a): - b = a.clone() - b1 = torch.view_as_complex(b) - b2 = b1.reshape(b1.numel()) - return b2 - - x0 = fn(x) - x0.mul_(2) - x0.sum().backward() - - y0 = fn(y) - y1 = y0.mul(2) - y1.sum().backward() - - self.assertEqual(x.grad, y.grad) - - # case 3: parent does not have a view_func but child does - x = torch.randn(10, dtype=torch.cdouble, requires_grad=True) - y = x.detach().requires_grad_(True) - - def fn(a, dim0_size=5): - b = a.clone() - b1 = b.reshape(dim0_size, 2) - b2 = torch.view_as_real(b1) - return b2 - - x0 = fn(x) - x0.mul_(2) - x0.sum().backward() - - y0 = fn(y) - y1 = y0.mul(2) - y1.sum().backward() - - self.assertEqual(x.grad, y.grad) - - def test_view_with_multi_output(self): - x = torch.randn(2, 2, 2, dtype=torch.double) - - x1 = torch.view_as_complex(x) - # Taking an invalid view should always be allowed as long as it is not - # modified inplace - res = x1.unbind(0) - - with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"): - res[0] += torch.rand(2, requires_grad=True) - - x.requires_grad_(True) - x1 = torch.view_as_complex(x) - # Taking an invalid view should always be allowed as long as it is not - # modified inplace - res = x1.unbind(0) - - with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"): - res[0] += torch.rand(2, requires_grad=True) - - def as_identity(self): - # view_as_real and view_as_complex behavior should be like an identity - def func(z): - z_ = torch.view_as_complex(z) - z_select = torch.select(z_, z_.dim() - 1, 0) - z_select_real = torch.view_as_real(z_select) - return z_select_real.sum() - - z = torch.randn(10, 2, 2, dtype=torch.double, requires_grad=True) - gradcheck(func, [z]) - func(z).backward() - - z1 = z.clone().detach().requires_grad_(True) - torch.select(z1, z1.dim() - 2, 0).sum().backward() - - self.assertEqual(z.grad, z1.grad) - class TestAutogradFunctional(TestCase): def _assert_same_struct(self, res, base): # base and res should be Tensors or tuple of Tensors with the same size @@ -9640,6 +9539,11 @@ def fn(x1, x2): torch.autograd.gradcheck(fn, [inp_r, inp_c], check_forward_ad=True) torch.autograd.gradcheck(fn, [inp_c, inp_r], check_forward_ad=True) +# Import test cases from below autograd/ here. These are found +# implicitly by the loader, so Flake8 thinks they are unused, hence +# the suppressions. + +from autograd.test_complex import TestAutogradComplex # noqa: F401 # e.g., TestAutogradDeviceTypeCPU and TestAutogradDeviceTypeCUDA instantiate_device_type_tests( diff --git a/tools/testing/modulefinder_determinator.py b/tools/testing/modulefinder_determinator.py index b6c94e7a2d48f..32dc1031b5616 100644 --- a/tools/testing/modulefinder_determinator.py +++ b/tools/testing/modulefinder_determinator.py @@ -48,7 +48,10 @@ "distributed/test_pg_wrapper", "distributed/test_store", "distributions/test_distributions", - "test_autograd", + # test_autograd.py is not slow, so it does not belong here. But + # note that if you try to add it back it will run into + # https://bugs.python.org/issue40350 because it imports files + # under test/autograd/. "test_binary_ufuncs", "test_cpp_extensions_aot_ninja", "test_cpp_extensions_aot_no_ninja", From 99b064fac4b24a5a76808b52107b88425a402c60 Mon Sep 17 00:00:00 2001 From: Ivan Kobzarev Date: Thu, 2 Sep 2021 05:27:59 -0700 Subject: [PATCH 465/530] [jit] shape propagation for prepack (#63585) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63585 Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D30428905 Pulled By: IvanKobzarev fbshipit-source-id: c18f6605a69b2e000bdf14a23e637c5a1c2ec64c --- aten/src/ATen/native/xnnpack/Convolution.cpp | 15 + aten/src/ATen/native/xnnpack/Convolution.h | 3 + aten/src/ATen/native/xnnpack/Linear.cpp | 10 + aten/src/ATen/native/xnnpack/Linear.h | 3 + .../native/xnnpack/RegisterOpContextClass.cpp | 3 + .../jit/runtime/symbolic_shape_registry.cpp | 39 +- .../csrc/jit/serialization/import_source.cpp | 1128 ++++++++--------- torch/csrc/jit/serialization/import_source.h | 61 +- 8 files changed, 684 insertions(+), 578 deletions(-) diff --git a/aten/src/ATen/native/xnnpack/Convolution.cpp b/aten/src/ATen/native/xnnpack/Convolution.cpp index 8c5d99a242196..f46052d9c5ef6 100644 --- a/aten/src/ATen/native/xnnpack/Convolution.cpp +++ b/aten/src/ATen/native/xnnpack/Convolution.cpp @@ -425,6 +425,21 @@ Tensor conv2d_clamp_run( return op_context->run(input); } +// Op is registered to have Any argument as we plan to reuse it for prepacked conv2d of other backends +std::tuple, IntArrayRef, IntArrayRef, IntArrayRef, int64_t> +unpack_prepacked_sizes_conv2d(const IValue& ivalue) { + auto op_context = ivalue.toCustomClass(); + const auto tuple = op_context->unpack(); + const auto& bias = std::get<1>(tuple); + return std::make_tuple( + std::get<0>(tuple).sizes(), + (bias && bias->defined()) ? c10::optional(bias->sizes()) : c10::nullopt, + std::get<2>(tuple), + std::get<3>(tuple), + std::get<4>(tuple), + std::get<5>(tuple)); +} + Tensor conv2d_transpose_clamp_run( const Tensor& input, const c10::intrusive_ptr& op_context) { diff --git a/aten/src/ATen/native/xnnpack/Convolution.h b/aten/src/ATen/native/xnnpack/Convolution.h index 403f26cdec70e..b89059de2c615 100644 --- a/aten/src/ATen/native/xnnpack/Convolution.h +++ b/aten/src/ATen/native/xnnpack/Convolution.h @@ -39,6 +39,9 @@ Tensor conv2d_clamp_run( const Tensor& input, const c10::intrusive_ptr& op_context); +std::tuple, IntArrayRef, IntArrayRef, IntArrayRef, int64_t> +unpack_prepacked_sizes_conv2d(const IValue& ivalue); + Tensor conv2d_transpose_clamp_run( const Tensor& input, const c10::intrusive_ptr& op_context); diff --git a/aten/src/ATen/native/xnnpack/Linear.cpp b/aten/src/ATen/native/xnnpack/Linear.cpp index 9a459b660d6fb..19c474f34cef9 100644 --- a/aten/src/ATen/native/xnnpack/Linear.cpp +++ b/aten/src/ATen/native/xnnpack/Linear.cpp @@ -180,6 +180,16 @@ Tensor linear_clamp_run( return op_context->run(input); } +std::tuple> +unpack_prepacked_sizes_linear(const IValue& ivalue) { + auto op_context = ivalue.toCustomClass(); + const auto tuple = op_context->unpack(); + const auto& bias = std::get<1>(tuple); + return std::make_tuple( + std::get<0>(tuple).sizes(), + (bias && bias->defined()) ? c10::optional(bias->sizes()) : c10::nullopt); +} + } // namespace linear } // namespace internal diff --git a/aten/src/ATen/native/xnnpack/Linear.h b/aten/src/ATen/native/xnnpack/Linear.h index 3e4df0466d261..d25f63bafa739 100644 --- a/aten/src/ATen/native/xnnpack/Linear.h +++ b/aten/src/ATen/native/xnnpack/Linear.h @@ -20,6 +20,9 @@ c10::intrusive_ptr createLinearClampPrePackOpContext( Tensor linear_clamp_run(const Tensor& input, const c10::intrusive_ptr& op_context); +std::tuple> +unpack_prepacked_sizes_linear(const IValue& ivalue); + ContextLinear create( const Tensor& weight, const c10::optional& bias, diff --git a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp index 03ac612aa12d0..f09c2dc22a39c 100644 --- a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp +++ b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp @@ -80,7 +80,10 @@ TORCH_LIBRARY(xnnpack, m) { } +// Registration using the TORCH_LIBRARY def gives dispatching errors when there is no tensor input TORCH_LIBRARY(prepacked, m) { + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::unpack_prepacked_sizes_conv2d(Any W_prepack) -> (int[], int[]?, int[], int[], int[], int)"), [](const IValue& inp) { return internal::convolution2d::unpack_prepacked_sizes_conv2d(inp);}); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::unpack_prepacked_sizes_linear(Any W_prepack) -> (int[], int[]?)"), [](const IValue& inp) { return internal::linear::unpack_prepacked_sizes_linear(inp);}); m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_prepack(Tensor W, Tensor? B=None, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.LinearOpContext")); m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y")); m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.Conv2dOpContext")); diff --git a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp index d4471998d11e8..871b65d75f6b7 100644 --- a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp +++ b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace torch { @@ -292,7 +293,21 @@ const std::string shape_compute_functions = for i in range(end_dim + 1, len(input)): shape.append(input[i]) return shape - )"; + )" +#ifdef USE_XNNPACK + R"( + def prepacked_conv2d_clamp_run(input: List[int], conv2dOpContext: Any): + assert isinstance(conv2dOpContext, __torch__.torch.classes.xnnpack.Conv2dOpContext) + (weight, bias, stride, padding, dilation, groups) = ops.prepacked.unpack_prepacked_sizes_conv2d(conv2dOpContext) + return conv2d(input, weight, bias, stride, padding, dilation, groups) + + def prepacked_linear_clamp_run(input: List[int], linearOpContext: Any): + assert isinstance(linearOpContext, __torch__.torch.classes.xnnpack.LinearOpContext) + (weight, bias) = ops.prepacked.unpack_prepacked_sizes_linear(linearOpContext) + return linear(input, weight, bias) + )" +#endif + ; // mapping function schema to shape compute graphs allows multiple functions to // share the same shape compute graph, which is memory efficient and also will @@ -317,8 +332,11 @@ static const OperatorMap& get_schema_to_function_graph() { {"aten::div.Scalar(Tensor self, Scalar other) -> Tensor", "unary_one_unused_input"}, {"aten::gt.Tensor(Tensor self, Tensor other) -> Tensor", "broadcast"}, {"aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor", "broadcast_one_unused_input"}, + {"aten::add_.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor", "broadcast_one_unused_input"}, {"aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor", "unary_two_unused_inputs"}, {"aten::hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor", "unary_two_unused_inputs"}, + {"aten::hardswish_(Tensor self) -> Tensor", "unary"}, + {"aten::hardsigmoid_(Tensor self) -> Tensor", "unary"}, {"aten::adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor", "adaptive_avg_pool2d"}, {"aten::mm(Tensor self, Tensor mat2) -> Tensor", "mm"}, {"aten::dot(Tensor self, Tensor tensor) -> Tensor", "dot"}, @@ -335,6 +353,10 @@ static const OperatorMap& get_schema_to_function_graph() { {"aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)", "view"}, {"aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "mean_dim"}, {"aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor", "addmm"}, +#ifdef USE_XNNPACK + {"prepacked::conv2d_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.Conv2dOpContext W_prepack) -> Tensor Y", "prepacked_conv2d_clamp_run"}, + {"prepacked::linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y", "prepacked_linear_clamp_run"}, +#endif }; // clang-format on return schema_to_function_graph; @@ -344,7 +366,7 @@ std::unordered_map> cached_schema_to_graph; // CompilationUnit that holds all these Functions and keeps them alive. -CompilationUnit compilation_unit; +auto compilation_unit = std::make_shared(); void loadModule(const CompilationUnit& module) { std::unordered_map> reused_functions; @@ -371,9 +393,16 @@ void loadModule(const CompilationUnit& module) { } void loadFunctions() { - compilation_unit.define( - c10::nullopt, shape_compute_functions, nativeResolver(), nullptr); - loadModule(compilation_unit); + auto src = std::make_shared(shape_compute_functions); + std::vector constantTable; + auto resolver = std::make_shared( + compilation_unit, + &constantTable, + [&](const std::string& name) -> std::shared_ptr { return src; }, + 1); + compilation_unit->define( + c10::nullopt, shape_compute_functions, resolver, nullptr); + loadModule(*compilation_unit); } } // anonymous namespace diff --git a/torch/csrc/jit/serialization/import_source.cpp b/torch/csrc/jit/serialization/import_source.cpp index fb1de17a54eea..e7d9da26df41d 100644 --- a/torch/csrc/jit/serialization/import_source.cpp +++ b/torch/csrc/jit/serialization/import_source.cpp @@ -91,629 +91,615 @@ struct ConstantTableValue : public SugaredValue { const std::vector* constants_; }; -struct SourceImporterImpl : public Resolver, - std::enable_shared_from_this { - SourceImporterImpl( - std::shared_ptr cu, - const std::vector* constant_table, - SourceLoader source_loader, - size_t version) - : cu_(std::move(cu)), source_loader_(std::move(source_loader)) { - env_ = { - {"torch", std::make_shared("aten", version)}, - {"ops", std::make_shared(version)}, - // Constants present in the model. Used to resolve "CONSTANTS.n" to the - // actual value - {"CONSTANTS", std::make_shared(constant_table)}, - {"fork", SpecialFormValue::create(prim::fork)}, - {"annotate", SpecialFormValue::create(prim::annotate)}, - {"unchecked_cast", SpecialFormValue::create(prim::unchecked_cast)}, - {"uninitialized", SpecialFormValue::create(prim::Uninitialized)}, - }; - } - - TypePtr findNamedType(const QualifiedName& name) { - if (auto custom_class = getCustomClass(name.qualifiedName())) { - return custom_class; - } - parseSourceIfNeeded(name.prefix()); - auto it = to_be_defined_.find(name); - if (it != to_be_defined_.end() && it->second->kind() == TK_CLASS_DEF) { - ClassDef cd(it->second); - to_be_defined_.erase(it); - importNamedType(name.prefix(), cd); - } - return cu_->get_type(name); +SourceImporterImpl::SourceImporterImpl( + std::shared_ptr cu, + const std::vector* constant_table, + SourceLoader source_loader, + size_t version) + : cu_(std::move(cu)), source_loader_(std::move(source_loader)) { + env_ = { + {"torch", std::make_shared("aten", version)}, + {"ops", std::make_shared(version)}, + // Constants present in the model. Used to resolve "CONSTANTS.n" to the + // actual value + {"CONSTANTS", std::make_shared(constant_table)}, + {"fork", SpecialFormValue::create(prim::fork)}, + {"annotate", SpecialFormValue::create(prim::annotate)}, + {"unchecked_cast", SpecialFormValue::create(prim::unchecked_cast)}, + {"uninitialized", SpecialFormValue::create(prim::Uninitialized)}, + }; +} + +TypePtr SourceImporterImpl::findNamedType(const QualifiedName& name) { + if (auto custom_class = getCustomClass(name.qualifiedName())) { + return custom_class; + } + parseSourceIfNeeded(name.prefix()); + auto it = to_be_defined_.find(name); + if (it != to_be_defined_.end() && it->second->kind() == TK_CLASS_DEF) { + ClassDef cd(it->second); + to_be_defined_.erase(it); + importNamedType(name.prefix(), cd); } + return cu_->get_type(name); +} - Function* findFunction(const QualifiedName& name) { - parseSourceIfNeeded(name.prefix()); - auto it = to_be_defined_.find(name); - if (it != to_be_defined_.end() && it->second->kind() == TK_DEF) { - Def d(it->second); - to_be_defined_.erase(it); - importFunction(name.prefix(), d); - } - return cu_->find_function(name); +Function* SourceImporterImpl::findFunction(const QualifiedName& name) { + parseSourceIfNeeded(name.prefix()); + auto it = to_be_defined_.find(name); + if (it != to_be_defined_.end() && it->second->kind() == TK_DEF) { + Def d(it->second); + to_be_defined_.erase(it); + importFunction(name.prefix(), d); } + return cu_->find_function(name); +} - void parseSourceIfNeeded(const std::string& qualifier) { - // qualifier may be blank, for instance checking if __torch__ is a class. - if (qualifier == "" || loaded_sources_.count(qualifier)) { - return; - } - loaded_sources_.insert(qualifier); - std::shared_ptr src = source_loader_(qualifier); - - // The importer, when looking for classes/functions doesn't know if 'foo' - // contains definitions or if it is a prefix of 'foo.bar', we only figure it - // out by testing if `foo.py` exists in the source loader. If it doesn't - // then there is nothing to load here - if (!src) { - return; - } - Parser p(src); - parsePossibleVersionNumber(p.lexer()); - - auto& L = p.lexer(); - - while (L.cur().kind != TK_EOF) { - parseImports(L); - auto tk = L.cur(); - auto kind = tk.kind; - switch (kind) { - case TK_CLASS_DEF: { - auto parsed_treeref = ClassDef(p.parseClass()); - to_be_defined_[QualifiedName( - qualifier, parsed_treeref.name().name())] = parsed_treeref; - } break; - case TK_DEF: { - auto parsed_treeref = Def(p.parseFunction(/*is_method=*/false)); - to_be_defined_[QualifiedName( - qualifier, parsed_treeref.name().name())] = parsed_treeref; - } break; - default: - throw ErrorReport(L.cur().range) - << "Unexpected token in code import: " << kindToString(kind); - } +void SourceImporterImpl::parseSourceIfNeeded(const std::string& qualifier) { + // qualifier may be blank, for instance checking if __torch__ is a class. + if (qualifier == "" || loaded_sources_.count(qualifier)) { + return; + } + loaded_sources_.insert(qualifier); + std::shared_ptr src = source_loader_(qualifier); + + // The importer, when looking for classes/functions doesn't know if 'foo' + // contains definitions or if it is a prefix of 'foo.bar', we only figure it + // out by testing if `foo.py` exists in the source loader. If it doesn't + // then there is nothing to load here + if (!src) { + return; + } + Parser p(src); + parsePossibleVersionNumber(p.lexer()); + + auto& L = p.lexer(); + + while (L.cur().kind != TK_EOF) { + parseImports(L); + auto tk = L.cur(); + auto kind = tk.kind; + switch (kind) { + case TK_CLASS_DEF: { + auto parsed_treeref = ClassDef(p.parseClass()); + to_be_defined_[QualifiedName(qualifier, parsed_treeref.name().name())] = + parsed_treeref; + } break; + case TK_DEF: { + auto parsed_treeref = Def(p.parseFunction(/*is_method=*/false)); + to_be_defined_[QualifiedName(qualifier, parsed_treeref.name().name())] = + parsed_treeref; + } break; + default: + throw ErrorReport(L.cur().range) + << "Unexpected token in code import: " << kindToString(kind); } } +} - void LEGACY_import_methods( - const Module& mod, - const std::shared_ptr& src) { - auto self = SimpleSelf(mod.type()); - c10::QualifiedName prefix = *mod.type()->name(); - Parser p(src); +void SourceImporterImpl::LEGACY_import_methods( + const Module& mod, + const std::shared_ptr& src) { + auto self = SimpleSelf(mod.type()); + c10::QualifiedName prefix = *mod.type()->name(); + Parser p(src); - parsePossibleVersionNumber(p.lexer()); + parsePossibleVersionNumber(p.lexer()); - parseImports(p.lexer()); + parseImports(p.lexer()); - std::vector definitions; - std::vector resolvers; - while (p.lexer().cur().kind != TK_EOF) { - auto def = Def(p.parseFunction(/*is_method=*/true)); - definitions.emplace_back(def); - resolvers.emplace_back(shared_from_this()); - } - cu_->define( - prefix, - /*properties=*/{}, - /*propResolvers=*/{}, - definitions, - resolvers, - &self); + std::vector definitions; + std::vector resolvers; + while (p.lexer().cur().kind != TK_EOF) { + auto def = Def(p.parseFunction(/*is_method=*/true)); + definitions.emplace_back(def); + resolvers.emplace_back(shared_from_this()); } + cu_->define( + prefix, + /*properties=*/{}, + /*propResolvers=*/{}, + definitions, + resolvers, + &self); +} - std::shared_ptr resolveValue( - const std::string& name, - Function& m, - const SourceRange& loc) override { - auto it = env_.find(name); - if (it != env_.end()) { - return it->second; - } - auto graph = m.graph(); - if (name == "inf") { - return std::make_shared( - graph->insertConstant(std::numeric_limits::infinity(), loc)); - } - if (name == "nan") { - return std::make_shared( - graph->insertConstant(std::numeric_limits::quiet_NaN(), loc)); - } - if (name == "infj") { - return std::make_shared(graph->insertConstant( - c10::complex(0, std::numeric_limits::infinity()), - loc)); - } - if (name == "nanj") { - return std::make_shared(graph->insertConstant( - c10::complex(0, std::numeric_limits::quiet_NaN()), - loc)); - } - if (name == "__torch__") { - return std::make_shared( - c10::QualifiedName(name), shared_from_this()); - } - return nullptr; +std::shared_ptr SourceImporterImpl::resolveValue( + const std::string& name, + Function& m, + const SourceRange& loc) { + auto it = env_.find(name); + if (it != env_.end()) { + return it->second; + } + auto graph = m.graph(); + if (name == "inf") { + return std::make_shared( + graph->insertConstant(std::numeric_limits::infinity(), loc)); + } + if (name == "nan") { + return std::make_shared( + graph->insertConstant(std::numeric_limits::quiet_NaN(), loc)); } + if (name == "infj") { + return std::make_shared(graph->insertConstant( + c10::complex(0, std::numeric_limits::infinity()), loc)); + } + if (name == "nanj") { + return std::make_shared(graph->insertConstant( + c10::complex(0, std::numeric_limits::quiet_NaN()), + loc)); + } + if (name == "__torch__") { + return std::make_shared( + c10::QualifiedName(name), shared_from_this()); + } + return nullptr; +} + +TypePtr SourceImporterImpl::resolveType( + const std::string& name, + const SourceRange& loc) { + return findNamedType(QualifiedName(name)); +} - TypePtr resolveType(const std::string& name, const SourceRange& loc) - override { - return findNamedType(QualifiedName(name)); +void SourceImporterImpl::importFunction( + const std::string& qualifier, + const Def& def) { + std::vector definitions{def}; + std::vector resolvers{shared_from_this()}; + cu_->define( + qualifier, + /*properties=*/{}, + /*propResolvers=*/{}, + definitions, + resolvers, + nullptr); +} + +void SourceImporterImpl::importNamedType( + const std::string& qualifier, + const ClassDef& class_def) { + const auto qualified_name = + QualifiedName(QualifiedName(qualifier), class_def.name().name()); + if (!class_def.superclass().present()) { + return importClass(qualified_name, class_def, /*is_module=*/false); + } + const auto& superclass_name = Var(class_def.superclass().get()).name().name(); + if (superclass_name == "Module") { + importClass(qualified_name, class_def, /*is_module=*/true); + } else if (superclass_name == "NamedTuple") { + // NamedTuples have special rules (since they are TupleTypes and not + // ClassTypes) + return importNamedTuple(qualified_name, class_def); + } else if (superclass_name == "Interface") { + cu_->define_interface( + qualified_name, class_def, shared_from_this(), /*is_module=*/false); + } else if (superclass_name == "ModuleInterface") { + cu_->define_interface( + qualified_name, class_def, shared_from_this(), /*is_module=*/true); + } else if (superclass_name == "Enum") { + importEnum(qualified_name, class_def); + } else { + throw ErrorReport(class_def.range()) + << "Torchscript does not support class inheritance."; } +} - private: - void importFunction(const std::string& qualifier, const Def& def) { - std::vector definitions{def}; - std::vector resolvers{shared_from_this()}; - cu_->define( - qualifier, - /*properties=*/{}, - /*propResolvers=*/{}, - definitions, - resolvers, - nullptr); - } - - void importNamedType( - const std::string& qualifier, - const ClassDef& class_def) { - const auto qualified_name = - QualifiedName(QualifiedName(qualifier), class_def.name().name()); - if (!class_def.superclass().present()) { - return importClass(qualified_name, class_def, /*is_module=*/false); +c10::optional SourceImporterImpl:: + attributeAssignmentSpecialHandlingHack( + const QualifiedName& qualified_classname, + const Assign& assign) { + struct AttrTypeReplacementDescr { + std::string attr_name; + std::string expected_type; + std::string replacement_type; + }; + + // module demangled qualname -> ReplacementDescr + static std::unordered_map replacements{ + {"__torch__.torch.nn.quantized.modules.linear.LinearPackedParams", + {"_packed_params", + "Tensor", + "__torch__.torch.classes.quantized.LinearPackedParamsBase"}}, + {"__torch__.torch.nn.quantized.modules.linear.Linear", + {"_packed_params", + "Tensor", + "__torch__.torch.classes.quantized.LinearPackedParamsBase"}}, + {"__torch__.torch.nn.quantized.dynamic.modules.linear.Linear", + {"_packed_params", + "Tensor", + "__torch__.torch.classes.quantized.LinearPackedParamsBase"}}, + {"__torch__.torch.nn.quantized.modules.conv.Conv2d", + {"_packed_params", + "Tensor", + "__torch__.torch.classes.quantized.Conv2dPackedParamsBase"}}, + {"__torch__.torch.nn.intrinsic.quantized.modules.conv_relu.ConvReLU2d", + {"_packed_params", + "Tensor", + "__torch__.torch.classes.quantized.Conv2dPackedParamsBase"}}, + {"__torch__.torch.nn.quantized.modules.conv.Conv3d", + {"_packed_params", + "Tensor", + "__torch__.torch.classes.quantized.Conv3dPackedParamsBase"}}, + {"__torch__.torch.nn.intrinsic.quantized.modules.conv_relu.ConvReLU3d", + {"_packed_params", + "Tensor", + "__torch__.torch.classes.quantized.Conv3dPackedParamsBase"}}}; + // @lint-ignore-every CLANGTIDY facebook-hte-StdRegexIsAwful + static std::regex mangle_re("\\.___torch_mangle_\\d+"); + auto demangled_classname = + std::regex_replace(qualified_classname.qualifiedName(), mangle_re, ""); + if (replacements.count(demangled_classname)) { + auto lhs = Var(assign.lhs()); + if (!assign.type().present() || assign.type().get().kind() != TK_VAR) { + return c10::nullopt; } - const auto& superclass_name = - Var(class_def.superclass().get()).name().name(); - if (superclass_name == "Module") { - importClass(qualified_name, class_def, /*is_module=*/true); - } else if (superclass_name == "NamedTuple") { - // NamedTuples have special rules (since they are TupleTypes and not - // ClassTypes) - return importNamedTuple(qualified_name, class_def); - } else if (superclass_name == "Interface") { - cu_->define_interface( - qualified_name, class_def, shared_from_this(), /*is_module=*/false); - } else if (superclass_name == "ModuleInterface") { - cu_->define_interface( - qualified_name, class_def, shared_from_this(), /*is_module=*/true); - } else if (superclass_name == "Enum") { - importEnum(qualified_name, class_def); - } else { - throw ErrorReport(class_def.range()) - << "Torchscript does not support class inheritance."; + auto type = Var(assign.type().get()); + + auto& attr_name = replacements.at(demangled_classname).attr_name; + auto& expected_type = replacements.at(demangled_classname).expected_type; + auto& replacement_type = + replacements.at(demangled_classname).replacement_type; + if (lhs.name().name() == attr_name && type.name().name() == expected_type) { + Parser p(std::make_shared(replacement_type)); + auto typename_expr = p.parseExp(); + auto maybe_typename = + Maybe::create(typename_expr.range(), typename_expr); + return Assign::create( + assign.range(), assign.lhs_list(), assign.rhs(), maybe_typename); } } + return c10::nullopt; +} - c10::optional attributeAssignmentSpecialHandlingHack( - const QualifiedName& qualified_classname, - const Assign& assign) { - struct AttrTypeReplacementDescr { - std::string attr_name; - std::string expected_type; - std::string replacement_type; - }; - - // module demangled qualname -> ReplacementDescr - static std::unordered_map replacements{ - {"__torch__.torch.nn.quantized.modules.linear.LinearPackedParams", - {"_packed_params", - "Tensor", - "__torch__.torch.classes.quantized.LinearPackedParamsBase"}}, - {"__torch__.torch.nn.quantized.modules.linear.Linear", - {"_packed_params", - "Tensor", - "__torch__.torch.classes.quantized.LinearPackedParamsBase"}}, - {"__torch__.torch.nn.quantized.dynamic.modules.linear.Linear", - {"_packed_params", - "Tensor", - "__torch__.torch.classes.quantized.LinearPackedParamsBase"}}, - {"__torch__.torch.nn.quantized.modules.conv.Conv2d", - {"_packed_params", - "Tensor", - "__torch__.torch.classes.quantized.Conv2dPackedParamsBase"}}, - {"__torch__.torch.nn.intrinsic.quantized.modules.conv_relu.ConvReLU2d", - {"_packed_params", - "Tensor", - "__torch__.torch.classes.quantized.Conv2dPackedParamsBase"}}, - {"__torch__.torch.nn.quantized.modules.conv.Conv3d", - {"_packed_params", - "Tensor", - "__torch__.torch.classes.quantized.Conv3dPackedParamsBase"}}, - {"__torch__.torch.nn.intrinsic.quantized.modules.conv_relu.ConvReLU3d", - {"_packed_params", - "Tensor", - "__torch__.torch.classes.quantized.Conv3dPackedParamsBase"}}}; - static std::regex mangle_re("\\.___torch_mangle_\\d+"); - auto demangled_classname = - std::regex_replace(qualified_classname.qualifiedName(), mangle_re, ""); - if (replacements.count(demangled_classname)) { - auto lhs = Var(assign.lhs()); - if (!assign.type().present() || assign.type().get().kind() != TK_VAR) { - return c10::nullopt; - } - auto type = Var(assign.type().get()); - - auto& attr_name = replacements.at(demangled_classname).attr_name; - auto& expected_type = replacements.at(demangled_classname).expected_type; - auto& replacement_type = - replacements.at(demangled_classname).replacement_type; - if (lhs.name().name() == attr_name && - type.name().name() == expected_type) { - Parser p(std::make_shared(replacement_type)); - auto typename_expr = p.parseExp(); - auto maybe_typename = - Maybe::create(typename_expr.range(), typename_expr); - return Assign::create( - assign.range(), assign.lhs_list(), assign.rhs(), maybe_typename); - } - } - return c10::nullopt; - } - - void importClass( - const QualifiedName& qualified_classname, - const ClassDef& class_def, - bool is_module) { - // BC for TorchBind classes - // - // Previously we would serialize TorchBind classes as actual - // classes with methods that delegate to things in the - // torch.ops.* namespace. We've switched away from this and - // now just rely on those classes being present in the binary - // and emit code for them based on the ClassType in memory. - // - // TODO: remove this once we no longer have old TorchBind code - // in production models - { - static QualifiedName torch_classes_qualname("__torch__.torch.classes"); - if (torch_classes_qualname.isPrefixOf(qualified_classname)) { - return; - } +void SourceImporterImpl::importClass( + const QualifiedName& qualified_classname, + const ClassDef& class_def, + bool is_module) { + // BC for TorchBind classes + // + // Previously we would serialize TorchBind classes as actual + // classes with methods that delegate to things in the + // torch.ops.* namespace. We've switched away from this and + // now just rely on those classes being present in the binary + // and emit code for them based on the ClassType in memory. + // + // TODO: remove this once we no longer have old TorchBind code + // in production models + { + static QualifiedName torch_classes_qualname("__torch__.torch.classes"); + if (torch_classes_qualname.isPrefixOf(qualified_classname)) { + return; } - auto class_type = ClassType::create( - c10::QualifiedName(qualified_classname), cu_, is_module); - - std::vector methods; - std::vector method_resolvers; - std::map pre_hook_def_map; - std::map hook_def_map; - std::map pre_hook_resolver_map; - std::map hook_resolver_map; - std::vector attributes; - std::vector constants; - - // Module-specific: which attrs are parameters? - std::unordered_set parameter_names; - std::unordered_set buffer_names; - std::unordered_set pre_hook_names; - std::unordered_set hook_names; - // used to keep track of original ordering of hooks and prehooks - // in case any are called more than once - std::vector pre_hooks_order; - std::vector hooks_order; - // Process statements, splitting things into attribute and method - // definitions. - for (const auto& statement : class_def.body()) { - switch (statement.kind()) { - case TK_ASSIGN: { - const auto assign = Assign(statement); - switch (assign.lhs().kind()) { - case TK_VAR: { - const auto name = Var(assign.lhs()).name().name(); - if (name == "__parameters__") { - // Populate the module parameter list. This is a field that - // looks like: - // __parameters__ = ["foo", "bar", "baz"] - // which tells us which attributes are module parameters. - TORCH_INTERNAL_ASSERT( - is_module, - "Assignments in class body only " - "supported on modules right now"); - const auto param_list = - ListLiteral(assign.rhs().get()).inputs(); - for (const auto& param : param_list) { - parameter_names.insert(StringLiteral(param).text()); - } - } else if (name == "__annotations__") { - // This is to initialize the annotations dict, just ignore. - continue; - } else if (name == "__buffers__") { - TORCH_INTERNAL_ASSERT( - is_module, "Buffers only exist on modules at the moment"); - const auto buffer_list = - ListLiteral(assign.rhs().get()).inputs(); - for (const auto& buffer : buffer_list) { - buffer_names.insert(StringLiteral(buffer).text()); - } - } else if (name == "__forward_pre_hooks__") { - TORCH_INTERNAL_ASSERT( - is_module, - "Forward pre hooks only exist on modules at the moment"); - const auto pre_hook_list = - ListLiteral(assign.rhs().get()).inputs(); - for (const auto& pre_hook : pre_hook_list) { - std::string pre_hook_name = StringLiteral(pre_hook).text(); - pre_hook_names.insert(pre_hook_name); - pre_hooks_order.emplace_back(pre_hook_name); - } - } else if (name == "__forward_hooks__") { - TORCH_INTERNAL_ASSERT( - is_module, - "Forward hooks only exist on modules at the moment"); - const auto hook_list = ListLiteral(assign.rhs().get()).inputs(); - for (const auto& hook : hook_list) { - std::string hook_name = StringLiteral(hook).text(); - hook_names.insert(hook_name); - hooks_order.emplace_back(hook_name); - } - } else { - if (auto fixed_up = attributeAssignmentSpecialHandlingHack( - qualified_classname, assign)) { - attributes.push_back(std::move(*fixed_up)); - } else if (assign.rhs().present()) { - // This is a constant assignment, of the form: - // foo : Final[int] = 3 - constants.push_back(assign); - } else { - // This is a regular attribute assignment, of the form: - // foo : Tensor - attributes.push_back(assign); - } + } + auto class_type = ClassType::create( + c10::QualifiedName(qualified_classname), cu_, is_module); + + std::vector methods; + std::vector method_resolvers; + std::map pre_hook_def_map; + std::map hook_def_map; + std::map pre_hook_resolver_map; + std::map hook_resolver_map; + std::vector attributes; + std::vector constants; + + // Module-specific: which attrs are parameters? + std::unordered_set parameter_names; + std::unordered_set buffer_names; + std::unordered_set pre_hook_names; + std::unordered_set hook_names; + // used to keep track of original ordering of hooks and prehooks + // in case any are called more than once + std::vector pre_hooks_order; + std::vector hooks_order; + // Process statements, splitting things into attribute and method + // definitions. + for (const auto& statement : class_def.body()) { + switch (statement.kind()) { + case TK_ASSIGN: { + const auto assign = Assign(statement); + switch (assign.lhs().kind()) { + case TK_VAR: { + const auto name = Var(assign.lhs()).name().name(); + if (name == "__parameters__") { + // Populate the module parameter list. This is a field that + // looks like: + // __parameters__ = ["foo", "bar", "baz"] + // which tells us which attributes are module parameters. + TORCH_INTERNAL_ASSERT( + is_module, + "Assignments in class body only " + "supported on modules right now"); + const auto param_list = ListLiteral(assign.rhs().get()).inputs(); + for (const auto& param : param_list) { + parameter_names.insert(StringLiteral(param).text()); } - } break; - case TK_SUBSCRIPT: { - // This is a special attribute assignment where the attribute - // is not a valid python, identifier. Looks like: - // __annotations__["0"] = Tensor - const auto lhs = Subscript(assign.lhs()); + } else if (name == "__annotations__") { + // This is to initialize the annotations dict, just ignore. + continue; + } else if (name == "__buffers__") { TORCH_INTERNAL_ASSERT( - Var(lhs.value()).name().name() == "__annotations__"); - TORCH_INTERNAL_ASSERT(lhs.subscript_exprs().size() == 1); - attributes.push_back(assign); - } break; - default: { + is_module, "Buffers only exist on modules at the moment"); + const auto buffer_list = ListLiteral(assign.rhs().get()).inputs(); + for (const auto& buffer : buffer_list) { + buffer_names.insert(StringLiteral(buffer).text()); + } + } else if (name == "__forward_pre_hooks__") { TORCH_INTERNAL_ASSERT( - false, - "Unexpected statement kind in module metadata: ", - kindToString(statement.kind())); + is_module, + "Forward pre hooks only exist on modules at the moment"); + const auto pre_hook_list = + ListLiteral(assign.rhs().get()).inputs(); + for (const auto& pre_hook : pre_hook_list) { + std::string pre_hook_name = StringLiteral(pre_hook).text(); + pre_hook_names.insert(pre_hook_name); + pre_hooks_order.emplace_back(pre_hook_name); + } + } else if (name == "__forward_hooks__") { + TORCH_INTERNAL_ASSERT( + is_module, + "Forward hooks only exist on modules at the moment"); + const auto hook_list = ListLiteral(assign.rhs().get()).inputs(); + for (const auto& hook : hook_list) { + std::string hook_name = StringLiteral(hook).text(); + hook_names.insert(hook_name); + hooks_order.emplace_back(hook_name); + } + } else { + if (auto fixed_up = attributeAssignmentSpecialHandlingHack( + qualified_classname, assign)) { + attributes.push_back(std::move(*fixed_up)); + } else if (assign.rhs().present()) { + // This is a constant assignment, of the form: + // foo : Final[int] = 3 + constants.push_back(assign); + } else { + // This is a regular attribute assignment, of the form: + // foo : Tensor + attributes.push_back(assign); + } } + } break; + case TK_SUBSCRIPT: { + // This is a special attribute assignment where the attribute + // is not a valid python, identifier. Looks like: + // __annotations__["0"] = Tensor + const auto lhs = Subscript(assign.lhs()); + TORCH_INTERNAL_ASSERT( + Var(lhs.value()).name().name() == "__annotations__"); + TORCH_INTERNAL_ASSERT(lhs.subscript_exprs().size() == 1); + attributes.push_back(assign); + } break; + default: { + TORCH_INTERNAL_ASSERT( + false, + "Unexpected statement kind in module metadata: ", + kindToString(statement.kind())); } - } break; - case TK_DEF: { - Def def = Def(statement); - if (pre_hook_names.find(def.name().name()) != pre_hook_names.end()) { - pre_hook_def_map.emplace(def.name().name(), def); - pre_hook_resolver_map.emplace( - def.name().name(), shared_from_this()); - } else if (hook_names.find(def.name().name()) != hook_names.end()) { - hook_def_map.emplace(def.name().name(), def); - hook_resolver_map.emplace(def.name().name(), shared_from_this()); - } else { - methods.emplace_back(def); - method_resolvers.push_back(shared_from_this()); - } - } break; - default: { - TORCH_INTERNAL_ASSERT( - false, - "Unexpected statement kind in class body: ", - kindToString(statement.kind())); } - } - } - - // Populate class attributes - ScriptTypeParser type_parser(shared_from_this()); - for (const auto& assign : attributes) { - switch (assign.lhs().kind()) { - case TK_VAR: { - const auto name = Var(assign.lhs()).name().name(); - TORCH_INTERNAL_ASSERT(name != "__parameters__"); - const auto type = type_parser.parseTypeFromExpr(assign.type().get()); - const bool is_parameter = parameter_names.count(name); - const bool is_buffer = buffer_names.count(name); - class_type->addAttribute(name, type, is_parameter, is_buffer); - } break; - case TK_SUBSCRIPT: { - const auto name = - StringLiteral(Subscript(assign.lhs()).subscript_exprs()[0]) - .text(); - const auto type = type_parser.parseTypeFromExpr(assign.rhs().get()); - const bool is_parameter = parameter_names.count(name); - const bool is_buffer = buffer_names.count(name); - class_type->addAttribute(name, type, is_parameter, is_buffer); + } break; + case TK_DEF: { + Def def = Def(statement); + if (pre_hook_names.find(def.name().name()) != pre_hook_names.end()) { + pre_hook_def_map.emplace(def.name().name(), def); + pre_hook_resolver_map.emplace(def.name().name(), shared_from_this()); + } else if (hook_names.find(def.name().name()) != hook_names.end()) { + hook_def_map.emplace(def.name().name(), def); + hook_resolver_map.emplace(def.name().name(), shared_from_this()); + } else { + methods.emplace_back(def); + method_resolvers.push_back(shared_from_this()); } + } break; + default: { + TORCH_INTERNAL_ASSERT( + false, + "Unexpected statement kind in class body: ", + kindToString(statement.kind())); } } + } - // Populate class constants - for (const auto& assign : constants) { - auto const_val = type_parser.parseClassConstant(assign); - const auto name = Var(assign.lhs()).name().name(); - class_type->addConstant(name, const_val); + // Populate class attributes + ScriptTypeParser type_parser(shared_from_this()); + for (const auto& assign : attributes) { + switch (assign.lhs().kind()) { + case TK_VAR: { + const auto name = Var(assign.lhs()).name().name(); + TORCH_INTERNAL_ASSERT(name != "__parameters__"); + const auto type = type_parser.parseTypeFromExpr(assign.type().get()); + const bool is_parameter = parameter_names.count(name); + const bool is_buffer = buffer_names.count(name); + class_type->addAttribute(name, type, is_parameter, is_buffer); + } break; + case TK_SUBSCRIPT: { + const auto name = + StringLiteral(Subscript(assign.lhs()).subscript_exprs()[0]).text(); + const auto type = type_parser.parseTypeFromExpr(assign.rhs().get()); + const bool is_parameter = parameter_names.count(name); + const bool is_buffer = buffer_names.count(name); + class_type->addAttribute(name, type, is_parameter, is_buffer); + } } + } - // build pre hook and hook def/resolver pairs - // pairs are dedupped in ir_emitter.cpp's CompilationUnit::define_hooks() - // ordering here is call order for hooks - std::vector hooks; - std::vector hook_resolvers; - for (const std::string& hook_name : hooks_order) { - hooks.emplace_back(hook_def_map.find(hook_name)->second); - hook_resolvers.push_back(hook_resolver_map.find(hook_name)->second); - } - std::vector pre_hooks; - std::vector pre_hook_resolvers; - for (const std::string& pre_hook_name : pre_hooks_order) { - pre_hooks.emplace_back(pre_hook_def_map.find(pre_hook_name)->second); - pre_hook_resolvers.push_back( - pre_hook_resolver_map.find(pre_hook_name)->second); - } + // Populate class constants + for (const auto& assign : constants) { + auto const_val = type_parser.parseClassConstant(assign); + const auto name = Var(assign.lhs()).name().name(); + class_type->addConstant(name, const_val); + } - cu_->register_type(class_type); - const auto self = SimpleSelf(class_type); - cu_->define( - qualified_classname, - /*properties=*/{}, - /*propResolvers=*/{}, - methods, - method_resolvers, - &self); - cu_->define_hooks( - qualified_classname, - hooks, - hook_resolvers, - pre_hooks, - pre_hook_resolvers, - &self); - } - - void importEnum( - const QualifiedName& qualified_name, - const ClassDef& enum_def) { - std::vector names_values; - - TypePtr value_type = nullptr; - auto set_or_check_type = [&value_type]( - const TypePtr& t, const SourceRange& loc) { - if (!value_type) { - value_type = t; - } else if (value_type != t) { - throw ErrorReport(loc) - << "Enum class with varying value types are not supported."; - } - }; + // build pre hook and hook def/resolver pairs + // pairs are dedupped in ir_emitter.cpp's CompilationUnit::define_hooks() + // ordering here is call order for hooks + std::vector hooks; + std::vector hook_resolvers; + for (const std::string& hook_name : hooks_order) { + hooks.emplace_back(hook_def_map.find(hook_name)->second); + hook_resolvers.push_back(hook_resolver_map.find(hook_name)->second); + } + std::vector pre_hooks; + std::vector pre_hook_resolvers; + for (const std::string& pre_hook_name : pre_hooks_order) { + pre_hooks.emplace_back(pre_hook_def_map.find(pre_hook_name)->second); + pre_hook_resolvers.push_back( + pre_hook_resolver_map.find(pre_hook_name)->second); + } - for (const auto& statement : enum_def.body()) { - if (statement.kind() != TK_ASSIGN) { - throw ErrorReport(statement.range()) - << "Unexpected statement in Enum class body: " - "only enum attribute definitions are currently supported."; - } + cu_->register_type(class_type); + const auto self = SimpleSelf(class_type); + cu_->define( + qualified_classname, + /*properties=*/{}, + /*propResolvers=*/{}, + methods, + method_resolvers, + &self); + cu_->define_hooks( + qualified_classname, + hooks, + hook_resolvers, + pre_hooks, + pre_hook_resolvers, + &self); +} - const auto assign = Assign(statement); - const auto name = Var(assign.lhs()).name().name(); - - IValue ivalue; - auto rhs = assign.rhs().get(); - switch (rhs.kind()) { - case TK_STRINGLITERAL: - ivalue = IValue(StringLiteral(rhs).text()); - set_or_check_type(StringType::get(), statement.range()); - break; - case TK_CONST: { - auto numeric_const = Const(rhs); - if (numeric_const.isFloatingPoint()) { - ivalue = IValue(numeric_const.asFloatingPoint()); - set_or_check_type(FloatType::get(), statement.range()); - } else if (numeric_const.isIntegral()) { - ivalue = IValue(numeric_const.asIntegral()); - set_or_check_type(IntType::get(), statement.range()); - } - break; - } - default: - throw ErrorReport(rhs.range()) - << "Unsupported enum value type: " << rhs.kind() - << ". Only Integers, Floats and Strings are supported."; - } +void SourceImporterImpl::importEnum( + const QualifiedName& qualified_name, + const ClassDef& enum_def) { + std::vector names_values; - names_values.emplace_back(std::make_pair(name, ivalue)); + TypePtr value_type = nullptr; + auto set_or_check_type = [&value_type]( + const TypePtr& t, const SourceRange& loc) { + if (!value_type) { + value_type = t; + } else if (value_type != t) { + throw ErrorReport(loc) + << "Enum class with varying value types are not supported."; } + }; - if (!value_type) { - throw ErrorReport(enum_def.range()) - << "No enum values defined for " << qualified_name.qualifiedName(); + for (const auto& statement : enum_def.body()) { + if (statement.kind() != TK_ASSIGN) { + throw ErrorReport(statement.range()) + << "Unexpected statement in Enum class body: " + "only enum attribute definitions are currently supported."; } - auto enum_type = EnumType::create( - qualified_name, std::move(value_type), std::move(names_values), cu_); - cu_->register_type(enum_type); - } - - void importNamedTuple( - const QualifiedName& qualified_name, - const ClassDef& named_tuple_def) { - ScriptTypeParser type_parser(shared_from_this()); - std::vector field_names; - std::vector field_types; - std::vector field_defaults; - for (const auto& statement : named_tuple_def.body()) { - if (statement.kind() != TK_ASSIGN) { - throw ErrorReport(statement.range()) - << "Unexpected statement in NamedTuple body: " - "only attribute annotations are currently supported."; - } - const auto assign = Assign(statement); - - auto name = Var(Assign(statement).lhs()).name().name(); - c10::optional default_val; - if (assign.rhs().present()) { - std::vector parsed = type_parser.evaluateDefaults( - assign.rhs().range(), {assign.rhs().get()}, {assign.type().get()}); - TORCH_INTERNAL_ASSERT(parsed.size() == 1); - default_val = parsed[0]; + const auto assign = Assign(statement); + const auto name = Var(assign.lhs()).name().name(); + + IValue ivalue; + auto rhs = assign.rhs().get(); + switch (rhs.kind()) { + case TK_STRINGLITERAL: + ivalue = IValue(StringLiteral(rhs).text()); + set_or_check_type(StringType::get(), statement.range()); + break; + case TK_CONST: { + auto numeric_const = Const(rhs); + if (numeric_const.isFloatingPoint()) { + ivalue = IValue(numeric_const.asFloatingPoint()); + set_or_check_type(FloatType::get(), statement.range()); + } else if (numeric_const.isIntegral()) { + ivalue = IValue(numeric_const.asIntegral()); + set_or_check_type(IntType::get(), statement.range()); + } + break; } + default: + throw ErrorReport(rhs.range()) + << "Unsupported enum value type: " << rhs.kind() + << ". Only Integers, Floats and Strings are supported."; + } - auto type = type_parser.parseTypeFromExpr(assign.type().get()); + names_values.emplace_back(std::make_pair(name, ivalue)); + } - field_names.emplace_back(std::move(name)); - field_types.emplace_back(std::move(type)); - if (default_val) { - field_defaults.emplace_back(std::move(*default_val)); - } + if (!value_type) { + throw ErrorReport(enum_def.range()) + << "No enum values defined for " << qualified_name.qualifiedName(); + } + + auto enum_type = EnumType::create( + qualified_name, std::move(value_type), std::move(names_values), cu_); + cu_->register_type(enum_type); +} + +void SourceImporterImpl::importNamedTuple( + const QualifiedName& qualified_name, + const ClassDef& named_tuple_def) { + ScriptTypeParser type_parser(shared_from_this()); + std::vector field_names; + std::vector field_types; + std::vector field_defaults; + for (const auto& statement : named_tuple_def.body()) { + if (statement.kind() != TK_ASSIGN) { + throw ErrorReport(statement.range()) + << "Unexpected statement in NamedTuple body: " + "only attribute annotations are currently supported."; + } + const auto assign = Assign(statement); + + auto name = Var(Assign(statement).lhs()).name().name(); + c10::optional default_val; + if (assign.rhs().present()) { + std::vector parsed = type_parser.evaluateDefaults( + assign.rhs().range(), {assign.rhs().get()}, {assign.type().get()}); + TORCH_INTERNAL_ASSERT(parsed.size() == 1); + default_val = parsed[0]; } - auto tt = TupleType::createNamed( - qualified_name, field_names, field_types, field_defaults); - cu_->register_type(tt); - } + auto type = type_parser.parseTypeFromExpr(assign.type().get()); - void parsePossibleVersionNumber(Lexer& L) { - // Older versions of serialization produced an op_version_set string - // per-file We now just use a single version which is handled by - // PyTorchStreamReader. We used to check if op_version_set was _newer_ for - // forward compatibility reasons but now that it doesn't exist there can't - // be a newer one, so we just discard this. - if (L.cur().kind == TK_IDENT && L.cur().text() == "op_version_set") { - auto range = L.cur().range; - L.next(); - L.expect('='); - std::string version_text = L.expect(TK_NUMBER).text(); - L.expect(TK_NEWLINE); + field_names.emplace_back(std::move(name)); + field_types.emplace_back(std::move(type)); + if (default_val) { + field_defaults.emplace_back(std::move(*default_val)); } } - // older versions of serialization required import statements, - // and defined classes file-at-a-time in import order. - // The problem is that in Python - // it is possible to construct cyclic dependencies between files even - // when there are none between individual classes. New versions of loading - // just compile class-at-a-time, so we no longer need to follow the import - // order. Future serialization may stop producing the import code. - void parseImports(Lexer& L) { - while (L.nextIf(TK_IMPORT)) { - std::ostringstream s; - while (L.cur().kind != TK_NEWLINE) { - s << L.cur().text(); - L.next(); - } - L.expect(TK_NEWLINE); - } + auto tt = TupleType::createNamed( + qualified_name, field_names, field_types, field_defaults); + cu_->register_type(tt); +} + +void SourceImporterImpl::parsePossibleVersionNumber(Lexer& L) { + // Older versions of serialization produced an op_version_set string + // per-file We now just use a single version which is handled by + // PyTorchStreamReader. We used to check if op_version_set was _newer_ for + // forward compatibility reasons but now that it doesn't exist there can't + // be a newer one, so we just discard this. + if (L.cur().kind == TK_IDENT && L.cur().text() == "op_version_set") { + auto range = L.cur().range; + L.next(); + L.expect('='); + std::string version_text = L.expect(TK_NUMBER).text(); + L.expect(TK_NEWLINE); } +} - std::shared_ptr cu_; - std::unordered_map> env_; - SourceLoader source_loader_; - std::unordered_set loaded_sources_; - // named types and functions loaded from a file but not yet defined because - // their type has not been requested yet. - std::unordered_map to_be_defined_; -}; +// older versions of serialization required import statements, +// and defined classes file-at-a-time in import order. +// The problem is that in Python +// it is possible to construct cyclic dependencies between files even +// when there are none between individual classes. New versions of loading +// just compile class-at-a-time, so we no longer need to follow the import +// order. Future serialization may stop producing the import code. +void SourceImporterImpl::parseImports(Lexer& L) { + while (L.nextIf(TK_IMPORT)) { + std::ostringstream s; + while (L.cur().kind != TK_NEWLINE) { + s << L.cur().text(); + L.next(); + } + L.expect(TK_NEWLINE); + } +} std::shared_ptr ClassNamespaceValue::attr( const SourceRange& loc, diff --git a/torch/csrc/jit/serialization/import_source.h b/torch/csrc/jit/serialization/import_source.h index e87ab59271594..f52f38afe6b15 100644 --- a/torch/csrc/jit/serialization/import_source.h +++ b/torch/csrc/jit/serialization/import_source.h @@ -1,22 +1,79 @@ #pragma once +#include +#include #include +#include +#include +#include #include +#include +#include #include #include +#include #include #include namespace torch { namespace jit { -struct SourceImporterImpl; +using SourceLoader = std::function(const std::string&)>; + +struct SourceImporterImpl : public Resolver, + std::enable_shared_from_this { + SourceImporterImpl( + std::shared_ptr cu, + const std::vector* constant_table, + SourceLoader source_loader, + size_t version); + TypePtr findNamedType(const QualifiedName& name); + Function* findFunction(const QualifiedName& name); + void parseSourceIfNeeded(const std::string& qualifier); + void LEGACY_import_methods( + const Module& mod, + const std::shared_ptr& src); + + std::shared_ptr resolveValue( + const std::string& name, + Function& m, + const SourceRange& loc) override; + TypePtr resolveType(const std::string& name, const SourceRange& loc) override; + + private: + void importFunction(const std::string& qualifier, const Def& def); + void importNamedType(const std::string& qualifier, const ClassDef& class_def); + c10::optional attributeAssignmentSpecialHandlingHack( + const QualifiedName& qualified_classname, + const Assign& assign); + void importClass( + const QualifiedName& qualified_classname, + const ClassDef& class_def, + bool is_module); + void importEnum( + const QualifiedName& qualified_name, + const ClassDef& enum_def); + void importNamedTuple( + const QualifiedName& qualified_name, + const ClassDef& named_tuple_def); + + void parsePossibleVersionNumber(Lexer& L); + + void parseImports(Lexer& L); + + std::shared_ptr cu_; + std::unordered_map> env_; + SourceLoader source_loader_; + std::unordered_set loaded_sources_; + // named types and functions loaded from a file but not yet defined because + // their type has not been requested yet. + std::unordered_map to_be_defined_; +}; // Given a directory of serialized TorchScript sources, // This class allows the loading of individual named types in source. // Resolves the dependencies between source files and parses // the source files as necessary. -using SourceLoader = std::function(const std::string&)>; struct TORCH_API SourceImporter { SourceImporter( From b2c7c1dfcf9c366ecef5db635b201954981c609f Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 2 Sep 2021 06:12:07 -0700 Subject: [PATCH 466/530] fix copy.deepcopy on LinearPackedParams (#64367) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64367 This is the same thing as https://github.com/pytorch/pytorch/pull/56154 but for quantized linear. It fixes the behavior of `copy.deepcopy` on these modules. Before this PR, copied instances of `LinearPackedParams` were not properly initialized, and inspecting them raised errors of missing `_modules`. After this PR, inspecting and using the copies works. Test Plan: ``` python test/test_quantization.py TestStaticQuantizedModule.test_linear_api ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D30702667 fbshipit-source-id: 38c26d1e72663416eeb989985b77ffc2052c12b9 --- .../core/test_quantized_module.py | 29 +++++++++++++++---- torch/nn/quantized/modules/linear.py | 10 +++++++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/test/quantization/core/test_quantized_module.py b/test/quantization/core/test_quantized_module.py index b0bc78294d9b5..51e62174cc081 100644 --- a/test/quantization/core/test_quantized_module.py +++ b/test/quantization/core/test_quantized_module.py @@ -103,8 +103,7 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, zero_point = 3 qlinear = class_map[use_fused](in_features, out_features) - qlinear_copy = qlinear # deepcopy does not work right now - # qlinear_copy = copy.deepcopy(qlinear) + qlinear_copy = copy.deepcopy(qlinear) self.checkScriptable(qlinear_copy, [[X_q]], check_save_load=True) # Run module with default-initialized parameters. # This tests that the constructor is correct. @@ -155,15 +154,16 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, linear_unpack(loaded_qlinear._packed_params._packed_params)) self.assertEqual(qlinear.scale, loaded_qlinear.scale) self.assertEqual(qlinear.zero_point, loaded_qlinear.zero_point) - # make sure loaded_qlinear has the same dir as qlinear since - # scripting the module will add __overloads__ to __dict__ - self.checkScriptable(loaded_qlinear, [[X_q]], check_save_load=True) + # scripting will add __overloads__ to __dict__, which is why we script a copy + # to be able to do the check in the next line + self.checkScriptable(copy.deepcopy(loaded_qlinear), [[X_q]], check_save_load=True) self.assertTrue(dir(qlinear) == dir(loaded_qlinear)) self.assertEqual(qlinear._weight_bias(), loaded_qlinear._weight_bias()) self.assertEqual(qlinear._weight_bias(), torch.ops.quantized.linear_unpack(qlinear._packed_params._packed_params)) Z_q2 = loaded_qlinear(X_q) self.assertEqual(Z_q, Z_q2) + # Test serialization b = io.BytesIO() torch.save(qlinear, b) b.seek(0) @@ -172,6 +172,25 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, self.assertEqual(qlinear.scale, loaded.scale) self.assertEqual(qlinear.zero_point, loaded.zero_point) + # Test copy and deepcopy + copied_linear = copy.copy(qlinear) + self.assertEqual(copied_linear.bias(), qlinear.bias()) + self.assertEqual(copied_linear.scale, qlinear.scale) + self.assertEqual(copied_linear.zero_point, + qlinear.zero_point) + Y_copied = copied_linear(X_q) + np.testing.assert_array_almost_equal( + Z_q.int_repr().numpy(), Y_copied.int_repr().numpy(), decimal=0) + + deepcopied_linear = copy.deepcopy(qlinear) + self.assertEqual(deepcopied_linear.bias(), qlinear.bias()) + self.assertEqual(deepcopied_linear.scale, qlinear.scale) + self.assertEqual(deepcopied_linear.zero_point, + qlinear.zero_point) + Y_deepcopied = copied_linear(X_q) + np.testing.assert_array_almost_equal( + Z_q.int_repr().numpy(), Y_deepcopied.int_repr().numpy(), decimal=0) + # Test JIT self.checkScriptable(qlinear, [[X_q]], check_save_load=True) diff --git a/torch/nn/quantized/modules/linear.py b/torch/nn/quantized/modules/linear.py index 4abd2115e4125..4df775105ba82 100644 --- a/torch/nn/quantized/modules/linear.py +++ b/torch/nn/quantized/modules/linear.py @@ -94,6 +94,16 @@ def __setstate__(self, state): self.set_weight_bias(state[0], state[1]) self.training = state[2] + def __deepcopy__(self, memo): + new_instance = type(self).__new__(type(self)) + torch.nn.Module.__init__(new_instance) + state = self.__getstate__() + new_instance.__setstate__(state) + return new_instance + + def __copy__(self): + return self.__deepcopy__({}) + def __repr__(self): return self._weight_bias().__repr__() From b737629ff0d4dd82f246b0efa6aef53f15971e78 Mon Sep 17 00:00:00 2001 From: Michael Dagitses Date: Thu, 2 Sep 2021 06:49:09 -0700 Subject: [PATCH 467/530] simplify op name determination into a single forward pass (#64261) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64261 Note that this does not preserve byte-for-byte compatibility with existing names. Test Plan: * Rely on CI to catch gross errors. * Merge after release cut to catch subtle issues. Reviewed By: albanD Differential Revision: D30700647 Pulled By: dagitses fbshipit-source-id: 7b02f34b8fae3041240cc78fbc6bcae498c3acd4 --- ...tAutograd.test_function-x_grad_desc.expect | 2 +- ...tAutograd.test_function-y_grad_desc.expect | 2 +- test/test_autograd.py | 4 +- test/test_cuda.py | 2 +- tools/autograd/load_derivatives.py | 82 +++++++------------ torch/csrc/autograd/variable.cpp | 6 +- 6 files changed, 38 insertions(+), 60 deletions(-) diff --git a/test/expect/TestAutograd.test_function-x_grad_desc.expect b/test/expect/TestAutograd.test_function-x_grad_desc.expect index b6fdb63db272a..68242e2ffae90 100644 --- a/test/expect/TestAutograd.test_function-x_grad_desc.expect +++ b/test/expect/TestAutograd.test_function-x_grad_desc.expect @@ -1 +1 @@ -CopyBackwards(None, AddBackward0(ExpandBackward(AccumulateGrad()), MulBackward0(ExpandBackward(AccumulateGrad()), AccumulateGrad()))) \ No newline at end of file +CopyBackwards(None, AddBackward0(ExpandBackward0(AccumulateGrad()), MulBackward0(ExpandBackward0(AccumulateGrad()), AccumulateGrad()))) \ No newline at end of file diff --git a/test/expect/TestAutograd.test_function-y_grad_desc.expect b/test/expect/TestAutograd.test_function-y_grad_desc.expect index e32d5888e1e7a..88db87320a92e 100644 --- a/test/expect/TestAutograd.test_function-y_grad_desc.expect +++ b/test/expect/TestAutograd.test_function-y_grad_desc.expect @@ -1 +1 @@ -CopyBackwards(None, AddBackward0(MulBackward0(ExpandBackward(AccumulateGrad()), None), MulBackward0(ExpandBackward(AccumulateGrad()), AccumulateGrad()))) \ No newline at end of file +CopyBackwards(None, AddBackward0(MulBackward0(ExpandBackward0(AccumulateGrad()), None), MulBackward0(ExpandBackward0(AccumulateGrad()), AccumulateGrad()))) \ No newline at end of file diff --git a/test/test_autograd.py b/test/test_autograd.py index fde64b0b062d3..2da74cbd01938 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -3450,7 +3450,7 @@ def test_inplace_on_view_backward(self): gradient_penalty.backward() fn = gradient_penalty.grad_fn.next_functions[0][0].next_functions[1][0] - self.assertEqual(fn.name(), "ThresholdBackwardBackward") + self.assertEqual(fn.name(), "ThresholdBackwardBackward0") def test_inplace_on_view_weak_grad_fn(self): # Issue 23502: Test that b's grad_fn is preserved. @@ -4859,7 +4859,7 @@ def maybe_check_raise(fn, should_raise): # The 3 elements are for view_as, first output of unbind and second output of unbind run_test(grad_mode=True, requires_grad=False, is_view=True, should_raise_tuple=(None, None, None)) - inp_change_err = "Output {} of UnbindBackward is a view and is being modified inplace." + inp_change_err = "Output {} of UnbindBackward0 is a view and is being modified inplace." run_test(grad_mode=True, requires_grad=True, is_view=True, should_raise_tuple=(None, inp_change_err.format("0"), inp_change_err.format("1"))) leaf_grad_err = "A view was created in no_grad mode and is being modified inplace" diff --git a/test/test_cuda.py b/test/test_cuda.py index 33dbade7380b8..cddd15a7670e9 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -3049,7 +3049,7 @@ def test_autocast_rnn(self): # Autocast wrapper requires at::_cudnn_rnn is autograd-exposed. This check can't guarantee # at::_cudnn_rnn is autograd-exposed, but if it fires, it indicates some funny business has # occurred and we should double check that at::_cudnn_rnn remains autograd-exposed. - self.assertEqual(out.grad_fn.name(), "CudnnRnnBackward") + self.assertEqual(out.grad_fn.name(), "CudnnRnnBackward0") out.sum().backward() grads = [p.grad.clone() for p in rnn.parameters()] diff --git a/tools/autograd/load_derivatives.py b/tools/autograd/load_derivatives.py index 3ff11f4d18691..8a5904b732918 100644 --- a/tools/autograd/load_derivatives.py +++ b/tools/autograd/load_derivatives.py @@ -2,9 +2,9 @@ # # Each autograd function is represented by `DifferentiabilityInfo` containing # a list of `Derivative`. See `tools.codegen.api.autograd` for the data models. -from collections import defaultdict, Counter +from collections import defaultdict import re -from typing import Sequence, Any, Tuple, List, Set, Dict, Match, Optional +from typing import Counter, Sequence, Any, Tuple, List, Set, Dict, Match, Optional import yaml from tools.codegen.api.autograd import (Derivative, DifferentiabilityInfo, @@ -43,32 +43,15 @@ def load_derivatives(derivatives_yaml_path: str, native_yaml_path: str) -> Seque assert str(function.func) not in functions_by_schema functions_by_schema[str(function.func)] = function + # Keep track of how many of which ops we've seen so we can + # disambiguate them with a numeric suffix. + op_counter = Counter[str]() + infos = [ - create_differentiability_info(defn, functions_by_signature, functions_by_schema) + create_differentiability_info(defn, functions_by_signature, functions_by_schema, op_counter) for defn in definitions] - # To keep it byte-for-byte compatible with the old codegen, we assign op names as a separate - # step. We only assign op names to those with differentiable args, and only append suffix to - # duplicated op names. This can be simplified if the first of the duplicates can be named - # 'XyzBackward' instead of 'XyzBackward0' or unconditionally append '0' to singletons. - op_names = create_op_names(infos) - res = [ - DifferentiabilityInfo( - name=info.name, - func=info.func, - op=op_name, - derivatives=info.derivatives, - forward_derivatives=info.forward_derivatives, - all_saved_inputs=info.all_saved_inputs, - all_saved_outputs=info.all_saved_outputs, - args_with_derivatives=info.args_with_derivatives, - non_differentiable_arg_names=info.non_differentiable_arg_names, - output_differentiability=info.output_differentiability, - output_differentiability_conditions=info.output_differentiability_conditions, - ) - for info, op_name in zip(infos, op_names)] - - _GLOBAL_LOAD_DERIVATIVE_CACHE[key] = res + _GLOBAL_LOAD_DERIVATIVE_CACHE[key] = infos return _GLOBAL_LOAD_DERIVATIVE_CACHE[key] @@ -279,6 +262,7 @@ def create_differentiability_info( defn: Dict[Any, Any], functions_by_signature: Dict[FunctionSchema, List[NativeFunction]], functions_by_schema: Dict[str, NativeFunction], + op_counter: Counter[str], ) -> DifferentiabilityInfo: """Processes a single entry `defn` in derivatives.yaml""" @@ -424,10 +408,17 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[ derivatives, forward_derivatives, args_with_derivatives, non_differentiable_arg_names = set_up_derivatives(canonical) + # only assign an op name if we are actually going to calculate a derivative + op = None + if args_with_derivatives: + op_prefix = _create_op_prefix(defn_name) + op = f'{op_prefix}{op_counter[op_prefix]}' + op_counter[op_prefix] += 1 + return DifferentiabilityInfo( name=defn_name, func=canonical, - op=None, + op=op, derivatives=derivatives, forward_derivatives=forward_derivatives, all_saved_inputs=dedup_vars([v for d in derivatives for v in d.saved_inputs]), @@ -566,35 +557,22 @@ def repl(m: Match[str]) -> str: return formula, tuple(saved) -def create_op_name(info: DifferentiabilityInfo) -> Optional[str]: - # only assign an op name if we are actually going to calculate a derivative - if not info.args_with_derivatives: - return None - name = info.name +def _create_op_prefix(name: str) -> str: + """Takes a native function name converts to a op prefix name. + + Note that the "name" parameter must be the native function name + without the optional variant suffix, so "add" instead of + "add.out". + + OP names correspond to classes, hence the change to title case. + + Example:: + >>> _create_op_prefix('add') + 'AddBackward' + """ camel_case = ''.join([p.title() for p in name.split('_')]) return (camel_case + 'Backward').replace('ForwardBackward', 'Backward') -def create_op_names(infos: Sequence[DifferentiabilityInfo]) -> Sequence[Optional[str]]: - names = list(map(create_op_name, infos)) - dups = set(item for item, count in Counter(names).items() if count > 1) - - # de-duplicate operation names - # you end up with something like: - # AddBackward0 - # AddBackward1 - # one for each overload - counter: Dict[str, int] = Counter() - dedup: List[Optional[str]] = [] - for name in names: - if name is None: - # Keep a placeholder - dedup.append(None) - elif name in dups: - dedup.append(f'{name}{counter[name]}') - counter[name] += 1 - else: - dedup.append(name) - return dedup def dedup_vars(vars: Sequence[SavedAttribute]) -> Sequence[SavedAttribute]: seen: Set[str] = set() diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 32af5f97ad4e4..7ae1ac0bdee8d 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -551,10 +551,10 @@ const std::shared_ptr& VariableHooks::grad_fn(const Tenso // self = view_op_n(view_n-1) // self = inplace_op(self) // - // For CPU/CUDA backends, we employ one AsStridedBackward Node to represent the chain of + // For CPU/CUDA backends, we employ one AsStridedBackward0 Node to represent the chain of // view backward ops for effienciency. // - // However in XLA backend we don't have full support of AsStridedBackward, we instead run a full + // However in XLA backend we don't have full support of AsStridedBackward0, we instead run a full // forward pass with a tensor that requires gradient to get proper grad_fn setup, // then save it to DifferentiableViewMeta for future use. // This is fairly cheap for XLA lazy tensor approach (but would be really expensive for CPU/CUDA). @@ -572,7 +572,7 @@ const std::shared_ptr& VariableHooks::grad_fn(const Tenso auto diff_view = view_fn(view_info.base_); diff_view_meta->grad_fn_ = diff_view.grad_fn(); } else { - auto fn = std::make_shared(); + auto fn = std::make_shared(); fn->self_geometry = at::TensorGeometry(view_info.base_); fn->size = self.sizes().vec(); fn->stride = self.strides().vec(); From f04e6594ed7d7657a059ef63e82e136aa2bbc0fd Mon Sep 17 00:00:00 2001 From: Seth Elliott Date: Thu, 2 Sep 2021 07:48:47 -0700 Subject: [PATCH 468/530] Fix broken caffe2 test: PlanExecutorTest.BlockingErrorPlan (#64401) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64401 PlanExecutorTest.BlockingErrorPlan uses `ASSERT_DEATH` which internally performs a `fork()`. This can cause problems under certain configurations that use threads. This change updates this test to use the "threadsafe" style for GTest death tests in order to improve its quality in multithreaded environments. Test Plan: I confirmed that this change fixes the issue on my devvm with the following command: ``` buck test mode/dev //caffe2/caffe2:caffe2_test_cpu -- PlanExecutorTest.BlockingErrorPlan ``` Reviewed By: praihan Differential Revision: D30709447 fbshipit-source-id: 12ffd9ad0371e2e5b43a9873c80568e5ab02d246 --- caffe2/core/plan_executor_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/caffe2/core/plan_executor_test.cc b/caffe2/core/plan_executor_test.cc index 6f0c237a8b086..7a54403805ecb 100644 --- a/caffe2/core/plan_executor_test.cc +++ b/caffe2/core/plan_executor_test.cc @@ -290,6 +290,8 @@ TEST(PlanExecutorTest, BlockingErrorPlan) { #endif #endif + testing::GTEST_FLAG(death_test_style) = "threadsafe"; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) ASSERT_DEATH( [] { From cd3be4675faddb1ddda5d7d2b7d163574de6f9eb Mon Sep 17 00:00:00 2001 From: CodemodService FBSourceClangFormatLinterBot <> Date: Thu, 2 Sep 2021 08:10:37 -0700 Subject: [PATCH 469/530] [AutoAccept][Codemod][FBSourceClangFormatLinter] Daily `arc lint --take CLANGFORMAT` Reviewed By: zertosh Differential Revision: D30710635 fbshipit-source-id: e8dae05a7e3a19d656067a4f102aab4a3c93ac42 --- torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h index e69a76cddc8ed..22c732862620a 100644 --- a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h +++ b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h @@ -53,8 +53,7 @@ class TORCH_API FaultyTensorPipeAgent : public TensorPipeAgent { const WorkerInfo& to, c10::intrusive_ptr message, const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout, - const DeviceMap& deviceMap = {}) - override; + const DeviceMap& deviceMap = {}) override; // Add delay to writes void pipeWrite( From 616fd9219da18bcfe69da8b0c3a96dd2c6298066 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Thu, 2 Sep 2021 08:12:48 -0700 Subject: [PATCH 470/530] [Static Runtime] Add sign/abs/lop1p/mul fusion pass (#64209) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64209 Add a new fusion pass that turns transforms the following pattern: ``` graph(%input): %0 : Tensor = aten::sign(%input) %1 : Tensor = aten::abs(%input) %2 : Tensor = aten::log1p(%1) %res : Tensor = aten::mul(%0, %2) return (%res) ``` Into a single op: ``` graph(%input): %res : Tensor = static_runtim::signed_log1p(%input) return (%res) ``` The intent is to reduce the number of passes over the tensor. However, enabling this pass actually causes a performance regression, probably due to a lack of vectorization in the fused implementation. Because of this issue, this diff **does not** enable this pass. Followup: navahgar will add an NNC kernel which is faster than the the unfused version and enable this pass. We still need this version as a fallback since the NNC kernel will not support all dtypes. Test Plan: `buck test caffe2/benchmarks/static_runtime:static_runtime_cpptest -- SignedLog1p` Test passed with new graph pass disabled and enabled. Reviewed By: hlu1 Differential Revision: D30559929 fbshipit-source-id: e4e080cb2e6a705cfdde1fc98bee92b723f8132a --- benchmarks/static_runtime/test_scripts.h | 11 ++++ .../static_runtime/test_static_runtime.cc | 8 +++ torch/csrc/jit/runtime/static/ops.cpp | 63 +++++++++++++++++++ torch/csrc/jit/runtime/static/passes.cpp | 22 +++++++ torch/csrc/jit/runtime/static/passes.h | 2 + 5 files changed, 106 insertions(+) diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h index 99b73db79f3d1..b17ddeda45dff 100644 --- a/benchmarks/static_runtime/test_scripts.h +++ b/benchmarks/static_runtime/test_scripts.h @@ -827,3 +827,14 @@ const auto cumsum_script_dtype = R"JIT( def forward(self, a: Tensor, dim: int, dtype: int): return torch.cumsum(a, dim, dtype=dtype).clone() )JIT"; + +const std::string signed_log1p_script = R"IR( + graph(%input): + %0 : Tensor = aten::sign(%input) + %1 : Tensor = aten::abs(%input) + %2 : Tensor = aten::log1p(%1) + %3 : Tensor = aten::mul(%0, %2) + %none : NoneType = prim::Constant() + %res : Tensor = aten::clone(%3, %none) + return (%res) +)IR"; diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 16941dab84760..5eb3dfe28bd84 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1356,3 +1356,11 @@ TEST(StaticRuntime, IndividualOps_Nonzero) { auto b = at::randint(0, 2, {4, 3, 2}); testStaticRuntime(nonzero_tensor, {a}, {b}); } + +TEST(StaticRuntime, SignedLog1p) { + std::vector args1 = {at::randn({2, 2})}; + testStaticRuntime(signed_log1p_script, args1, {}, true); + + std::vector args2 = {at::randn({3, 3, 3})}; + testStaticRuntime(signed_log1p_script, args1, args2, true); +} diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 7ede15c524296..62f5bb28c1553 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -1837,5 +1837,68 @@ REGISTER_OPERATOR_FUNCTOR( } }; }); + +namespace { + +// This template and its specialization help us avoid compiler warnings +// about taking the absolute value of an unsigned type in signed_log1p +template +T abs_if_signed(T val) { + return std::abs(val); +} + +template <> +unsigned char abs_if_signed(unsigned char val) { + return val; +} + +// Computes f(x) = sign(x) * ln(|1 + x|) for each x in the input tensor +void signed_log1p_out(at::Tensor& out, const at::Tensor& input) { + at::native::resize_(out, input.sizes(), c10::nullopt); + + const auto input_contig = input.expect_contiguous(); + auto output_contig = out.expect_contiguous(); + + AT_DISPATCH_ALL_TYPES(input.scalar_type(), "signed_log1p_kernel", [&]() { + const auto input_data = input_contig->data_ptr(); + auto output_data = output_contig->data_ptr(); + const auto N = input.numel(); + + for (const auto i : c10::irange(N)) { + const int sign = input_data[i] < 0 ? -1 : 1; + output_data[i] = std::log1p(abs_if_signed(input_data[i])) * sign; + } + }); +} + +at::Tensor signed_log1p(const at::Tensor& input) { + auto out = create_empty_from(input); + signed_log1p_out(out, input); + return out; +} + +} // namespace + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +REGISTER_OPERATOR_FUNCTOR( + static_runtime::signed_log1p, + static_runtime_signed_log1p, + [](Node* n) -> SROperator { + if (!n->matches(torch::schema( + "static_runtime::signed_log1p(Tensor x) -> Tensor"))) { + LogAndDumpSchema(n); + return nullptr; + } + return [](ProcessedNode* p_node) { + const auto& input = p_node->Input(0).toTensor(); + if (p_node->Output(0).isNone()) { + p_node->Output(0) = signed_log1p(input); + } else { + auto& out = p_node->Output(0).toTensor(); + fastResizeToZero(out); + signed_log1p_out(out, input); + } + }; + }); } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp index 5099dc1ba6e2b..0eaebfdf0e7aa 100644 --- a/torch/csrc/jit/runtime/static/passes.cpp +++ b/torch/csrc/jit/runtime/static/passes.cpp @@ -306,6 +306,28 @@ TORCH_LIBRARY_FRAGMENT(static_runtime, m) { m.def(torch::schema( "static_runtime::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> (Tensor, Tensor, Tensor)", c10::AliasAnalysisKind::PURE_FUNCTION)); + m.def("static_runtime::signed_log1p(Tensor input) -> Tensor"); +} + +void FuseSignLog1P(std::shared_ptr& graph) { + std::string pattern = R"IR( + graph(%input): + %0 : Tensor = aten::sign(%input) + %1 : Tensor = aten::abs(%input) + %2 : Tensor = aten::log1p(%1) + %res : Tensor = aten::mul(%0, %2) + return (%res) + )IR"; + + std::string fused_pattern = R"IR( + graph(%input): + %res : Tensor = static_runtime::signed_log1p(%input) + return (%res) + )IR"; + + SubgraphRewriter fuse; + fuse.RegisterRewritePattern(pattern, fused_pattern); + fuse.runOnGraph(graph); } bool HasInplaceOp(std::shared_ptr& graph, const AliasDb& alias_db) { diff --git a/torch/csrc/jit/runtime/static/passes.h b/torch/csrc/jit/runtime/static/passes.h index a42bc97f19618..0904d37fb02c4 100644 --- a/torch/csrc/jit/runtime/static/passes.h +++ b/torch/csrc/jit/runtime/static/passes.h @@ -20,5 +20,7 @@ TORCH_API bool HasInplaceOp( std::shared_ptr& graph, const AliasDb& alias_db); +TORCH_API void FuseSignLog1P(std::shared_ptr& graph); + } // namespace jit } // namespace torch From 2c258d91cc1dc11c338e97d6970ac77a4f8978ec Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Thu, 2 Sep 2021 08:59:53 -0700 Subject: [PATCH 471/530] Fix torch.istft length mismatch and window runtime error (#63469) Summary: The PR fixes two issues: - See https://github.com/pytorch/pytorch/issues/62747 and https://github.com/pytorch/audio/issues/1409. The length mismatch when the given ``length`` parameter is longer than expected. Add padding logic in consistent with librosa. - See https://github.com/pytorch/pytorch/issues/62323. The current implementations checks if the min value of window_envelop.abs() is greater than zero. In librosa they normalize the signal on non-zero values by indexing. Like ``` approx_nonzero_indices = ifft_window_sum > util.tiny(ifft_window_sum) y[approx_nonzero_indices] /= ifft_window_sum[approx_nonzero_indices] ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/63469 Reviewed By: fmassa Differential Revision: D30695827 Pulled By: nateanl fbshipit-source-id: d034e53f0d65b3fd1dbd150c9c5acf3faf25a164 --- aten/src/ATen/native/SpectralOps.cpp | 10 ++++- test/test_spectral_ops.py | 64 +++++++++++++++++++++++++--- torch/functional.py | 3 +- 3 files changed, 69 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index cd042073794c3..f9472b1f3dd3d 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -920,7 +920,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional ho // We need to trim the front padding away if centered const auto start = center ? n_fft / 2 : 0; - const auto end = lengthOpt.has_value()? start + lengthOpt.value() : - n_fft / 2; + const auto end = lengthOpt.has_value() ? start + lengthOpt.value() : (center ? - n_fft / 2 : -1); y = y.slice(2, start, end, 1); window_envelop = window_envelop.slice(2, start, end, 1); @@ -935,6 +935,14 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional ho if (input_dim == 3) { y = y.squeeze(0); } + // zero padding if the given lengthOpt is longer than expected + if(end > expected_output_signal_len) { + TORCH_WARN_ONCE( + "The length of signal is shorter than the length parameter. Result is being padded with zeros in the tail. " + "Please check your center and hop_length settings." + ); + y = at::constant_pad_nd(y, {0, end - expected_output_signal_len}, 0); + } return y; #undef REPR diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index fdc8c01417fd1..f632e95d9c704 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -1126,9 +1126,6 @@ def _test_istft_is_inverse_of_stft(stft_kwargs): original = torch.randn(*sizes, dtype=dtype, device=device) stft = torch.stft(original, return_complex=True, **stft_kwargs) inversed = torch.istft(stft, length=original.size(1), **istft_kwargs) - - # trim the original for case when constructed signal is shorter than original - original = original[..., :inversed.size(-1)] self.assertEqual( inversed, original, msg='istft comparison against original', atol=7e-6, rtol=0, exact_dtype=True) @@ -1167,21 +1164,63 @@ def _test_istft_is_inverse_of_stft(stft_kwargs): 'normalized': True, 'onesided': False, }, - # hamming_window, not centered, not normalized, onesided + # hamming_window, centered, not normalized, onesided # window same size as n_fft { 'n_fft': 5, 'hop_length': 2, 'win_length': 5, 'window': torch.hamming_window(5, dtype=dtype, device=device), - 'center': False, + 'center': True, 'pad_mode': 'constant', 'normalized': False, 'onesided': True, }, + ] + for i, pattern in enumerate(patterns): + _test_istft_is_inverse_of_stft(pattern) + + @onlyOnCPUAndCUDA + @skipCPUIfNoFFT + @dtypes(torch.double) + def test_istft_round_trip_with_padding(self, device, dtype): + """long hop_length or not centered may cause length mismatch in the inversed signal""" + def _test_istft_is_inverse_of_stft_with_padding(stft_kwargs): + # generates a random sound signal for each tril and then does the stft/istft + # operation to check whether we can reconstruct signal + num_trials = 100 + sizes = stft_kwargs['size'] + del stft_kwargs['size'] + istft_kwargs = stft_kwargs.copy() + del istft_kwargs['pad_mode'] + for i in range(num_trials): + original = torch.randn(*sizes, dtype=dtype, device=device) + stft = torch.stft(original, return_complex=True, **stft_kwargs) + with self.assertWarnsOnceRegex(UserWarning, "The length of signal is shorter than the length parameter."): + inversed = torch.istft(stft, length=original.size(-1), **istft_kwargs) + n_frames = stft.size(-1) + if stft_kwargs["center"] is True: + len_expected = stft_kwargs["n_fft"] // 2 + stft_kwargs["hop_length"] * (n_frames - 1) + else: + len_expected = stft_kwargs["n_fft"] + stft_kwargs["hop_length"] * (n_frames - 1) + # trim the original for case when constructed signal is shorter than original + padding = inversed[..., len_expected:] + inversed = inversed[..., :len_expected] + original = original[..., :len_expected] + # test the padding points of the inversed signal are all zeros + zeros = torch.zeros_like(padding, device=padding.device) + self.assertEqual( + padding, zeros, msg='istft padding values against zeros', + atol=7e-6, rtol=0, exact_dtype=True) + self.assertEqual( + inversed, original, msg='istft comparison against original', + atol=7e-6, rtol=0, exact_dtype=True) + + patterns = [ # hamming_window, not centered, not normalized, not onesided # window same size as n_fft { + 'size': [2, 20], 'n_fft': 3, 'hop_length': 2, 'win_length': 3, @@ -1191,9 +1230,22 @@ def _test_istft_is_inverse_of_stft(stft_kwargs): 'normalized': False, 'onesided': False, }, + # hamming_window, centered, not normalized, onesided, long hop_length + # window same size as n_fft + { + 'size': [2, 500], + 'n_fft': 256, + 'hop_length': 254, + 'win_length': 256, + 'window': torch.hamming_window(256, dtype=dtype, device=device), + 'center': True, + 'pad_mode': 'constant', + 'normalized': False, + 'onesided': True, + }, ] for i, pattern in enumerate(patterns): - _test_istft_is_inverse_of_stft(pattern) + _test_istft_is_inverse_of_stft_with_padding(pattern) @onlyOnCPUAndCUDA def test_istft_throws(self, device): diff --git a/torch/functional.py b/torch/functional.py index 81b3de234e1ca..63470cf2d443f 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -569,7 +569,8 @@ def istft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame, ``istft`` may return a shorter signal than the original signal (can occur if :attr:`center` is False - since the signal isn't padded). + since the signal isn't padded). If `length` is given in the arguments and is longer than expected, + ``istft`` will pad zeros to the end of the returned signal. If :attr:`center` is ``True``, then there will be padding e.g. ``'constant'``, ``'reflect'``, etc. Left padding can be trimmed off exactly because they can be calculated but right padding cannot be From 50067c020a14d183b49861771effa35d472220e9 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 2 Sep 2021 09:02:35 -0700 Subject: [PATCH 472/530] TST Adds __repr__ and str to module info (#63737) Summary: Follow up to https://github.com/pytorch/pytorch/pull/61935 This PR adds `test_repr` to `test_modules`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/63737 Reviewed By: gchanan Differential Revision: D30729642 Pulled By: jbschlosser fbshipit-source-id: c11a28bc0739abd3ed40727389dd28ed4069edad --- test/test_modules.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/test_modules.py b/test/test_modules.py index 52520dad080de..6d6adbc7ac57d 100644 --- a/test/test_modules.py +++ b/test/test_modules.py @@ -110,6 +110,20 @@ def test_factory_kwargs(self, device, dtype, module_info): buffer.dtype, dtype, f'Buffer {name} is of dtype {buffer.dtype} instead of the expected dtype {dtype}') + @modules(module_db) + def test_repr(self, device, dtype, module_info): + # Test module can be represented with repr and str without errors. + module_cls = module_info.module_cls + module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, + requires_grad=False) + for module_input in module_inputs: + args, kwargs = module_input.constructor_input.args, module_input.constructor_input.kwargs + m = module_cls(*args, **kwargs) + + # Check that these methods do not raise errors + m.__repr__() + str(m) + @modules(module_db) def test_pickle(self, device, dtype, module_info): # Test that module can be pickled and unpickled. From 66ddc6ef9e33c4fb3a3694106432c3ba81d1ad90 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 2 Sep 2021 09:27:44 -0700 Subject: [PATCH 473/530] Fixes and details to torchhub docs (#63783) Summary: This PR: - adds a few details regarding the newly added `skip_validation` parameter https://github.com/pytorch/pytorch/pull/62139 - uses double-backticks instead of single-backticks since this is rst, not mardown. - adds a few minor doc nits here and there Pull Request resolved: https://github.com/pytorch/pytorch/pull/63783 Reviewed By: zou3519 Differential Revision: D30696658 Pulled By: NicolasHug fbshipit-source-id: 6f01c7eb3cfcd7e17e4c33c09d193054fa18ad36 --- torch/hub.py | 61 ++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/torch/hub.py b/torch/hub.py index bcd53f79d9c77..82287d84b14f6 100644 --- a/torch/hub.py +++ b/torch/hub.py @@ -257,18 +257,20 @@ def set_dir(d): def list(github, force_reload=False, skip_validation=False): r""" - List all entrypoints available in `github` hubconf. + List all callable entrypoints available in the repo specified by ``github``. Args: github (string): a string with format "repo_owner/repo_name[:tag_name]" with an optional - tag/branch. The default branch is `master` if not specified. + tag/branch. The default branch is ``master`` if not specified. Example: 'pytorch/vision[:hub]' force_reload (bool, optional): whether to discard the existing cache and force a fresh download. - Default is `False`. - skip_validation (bool, optional): whether to check package validity against github. - Default is `False`. + Default is ``False``. + skip_validation (bool, optional): if ``False``, torchhub will check that the branch or commit + specified by the ``github`` argument properly belongs to the repo owner. This will make + requests to the GitHub API; you can specify a non-default GitHub token by setting the + ``GITHUB_TOKEN`` environment variable. Default is ``False``. Returns: - entrypoints: a list of available entrypoint names + list: The available callables entrypoint Example: >>> entrypoints = torch.hub.list('pytorch/vision', force_reload=True) @@ -290,17 +292,19 @@ def list(github, force_reload=False, skip_validation=False): def help(github, model, force_reload=False, skip_validation=False): r""" - Show the docstring of entrypoint `model`. + Show the docstring of entrypoint ``model``. Args: github (string): a string with format with an optional - tag/branch. The default branch is `master` if not specified. + tag/branch. The default branch is ``master`` if not specified. Example: 'pytorch/vision[:hub]' - model (string): a string of entrypoint name defined in repo's hubconf.py + model (string): a string of entrypoint name defined in repo's ``hubconf.py`` force_reload (bool, optional): whether to discard the existing cache and force a fresh download. - Default is `False`. - skip_validation (bool, optional): whether to check package validity against github. - Default is `False`. + Default is ``False``. + skip_validation (bool, optional): if ``False``, torchhub will check that the branch or commit + specified by the ``github`` argument properly belongs to the repo owner. This will make + requests to the GitHub API; you can specify a non-default GitHub token by setting the + ``GITHUB_TOKEN`` environment variable. Default is ``False``. Example: >>> print(torch.hub.help('pytorch/vision', 'resnet18', force_reload=True)) """ @@ -326,11 +330,11 @@ def load(repo_or_dir, model, *args, source='github', force_reload=False, verbose Note: Loading a model is the typical use case, but this can also be used to for loading other objects such as tokenizers, loss functions, etc. - If :attr:`source` is ``'github'``, :attr:`repo_or_dir` is expected to be + If ``source`` is 'github', ``repo_or_dir`` is expected to be of the form ``repo_owner/repo_name[:tag_name]`` with an optional tag/branch. The default branch is ``master`` if not specified. - If :attr:`source` is ``'local'``, :attr:`repo_or_dir` is expected to be a + If ``source`` is 'local', ``repo_or_dir`` is expected to be a path to a local directory. Args: @@ -339,9 +343,9 @@ def load(repo_or_dir, model, *args, source='github', force_reload=False, verbose ``source = 'local'``. model (string): the name of a callable (entrypoint) defined in the repo/dir's ``hubconf.py``. - *args (optional): the corresponding args for callable :attr:`model`. - source (string, optional): ``'github'`` | ``'local'``. Specifies how - ``repo_or_dir`` is to be interpreted. Default is ``'github'``. + *args (optional): the corresponding args for callable ``model``. + source (string, optional): 'github' or 'local'. Specifies how + ``repo_or_dir`` is to be interpreted. Default is 'github'. force_reload (bool, optional): whether to force a fresh download of the github repo unconditionally. Does not have any effect if ``source = 'local'``. Default is ``False``. @@ -349,13 +353,14 @@ def load(repo_or_dir, model, *args, source='github', force_reload=False, verbose local caches. Note that the message about first download cannot be muted. Does not have any effect if ``source = 'local'``. Default is ``True``. - skip_validation (bool, optional): whether to check package validity against github. - Default is `False`. - **kwargs (optional): the corresponding kwargs for callable - :attr:`model`. + skip_validation (bool, optional): if ``False``, torchhub will check that the branch or commit + specified by the ``github`` argument properly belongs to the repo owner. This will make + requests to the GitHub API; you can specify a non-default GitHub token by setting the + ``GITHUB_TOKEN`` environment variable. Default is ``False``. + **kwargs (optional): the corresponding kwargs for callable ``model``. Returns: - The output of the :attr:`model` callable when called with the given + The output of the ``model`` callable when called with the given ``*args`` and ``**kwargs``. Example: @@ -387,7 +392,7 @@ def _load_local(hubconf_dir, model, *args, **kwargs): hubconf_dir (string): path to a local directory that contains a ``hubconf.py``. model (string): name of an entrypoint defined in the directory's - `hubconf.py`. + ``hubconf.py``. *args (optional): the corresponding args for callable ``model``. **kwargs (optional): the corresponding kwargs for callable ``model``. @@ -416,8 +421,8 @@ def download_url_to_file(url, dst, hash_prefix=None, progress=True): Args: url (string): URL of the object to download - dst (string): Full path where object will be saved, e.g. `/tmp/temporary_file` - hash_prefix (string, optional): If not None, the SHA256 downloaded file should start with `hash_prefix`. + dst (string): Full path where object will be saved, e.g. ``/tmp/temporary_file`` + hash_prefix (string, optional): If not None, the SHA256 downloaded file should start with ``hash_prefix``. Default: None progress (bool, optional): whether or not to display a progress bar to stderr Default: True @@ -513,8 +518,8 @@ def load_state_dict_from_url(url, model_dir=None, map_location=None, progress=Tr If the object is already present in `model_dir`, it's deserialized and returned. - The default value of `model_dir` is ``/checkpoints`` where - `hub_dir` is the directory returned by :func:`~torch.hub.get_dir`. + The default value of ``model_dir`` is ``/checkpoints`` where + ``hub_dir`` is the directory returned by :func:`~torch.hub.get_dir`. Args: url (string): URL of the object to download @@ -527,7 +532,7 @@ def load_state_dict_from_url(url, model_dir=None, map_location=None, progress=Tr digits of the SHA256 hash of the contents of the file. The hash is used to ensure unique names and to verify the contents of the file. Default: False - file_name (string, optional): name for the downloaded file. Filename from `url` will be used if not set. + file_name (string, optional): name for the downloaded file. Filename from ``url`` will be used if not set. Example: >>> state_dict = torch.hub.load_state_dict_from_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') From aeafcde087fa76618708b8a2841c450e7e184761 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 2 Sep 2021 09:50:56 -0700 Subject: [PATCH 474/530] CI: Enable using labels to control GHA workflows (#64314) Summary: Fixes https://github.com/pytorch/pytorch/issues/62852 Sets a global environment variable containing a list of PR labels. For this PR, the PR_LABELS variable looks like: ``` [ "cla signed", "ciflow/default" ] ``` confirmed in a run: https://github.com/pytorch/pytorch/runs/3490072161?check_suite_focus=true This information can be used in other workflow steps to control the logic. For example, if I want to force a build, I can label my PR with "force-build" and do something like the following in my build script: ``` if [[ "${PR_LABELS}" = *force-build* ]]; then python setup.py install else #use cached wheel or something fi ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/64314 Reviewed By: driazati Differential Revision: D30714570 Pulled By: janeyx99 fbshipit-source-id: 80b060ee32643ddd22eb7b8ec548579c7ccf6441 --- .github/templates/bazel_ci_workflow.yml.j2 | 2 ++ .github/templates/linux_ci_workflow.yml.j2 | 4 ++++ .github/templates/windows_ci_workflow.yml.j2 | 1 + .../generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml | 2 ++ .../generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml | 2 ++ .../workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 3 +++ .../workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml | 3 +++ .../workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 3 +++ .../workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 3 +++ .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml | 4 ++++ .../generated-linux-xenial-py3.6-gcc7-bazel-test.yml | 3 +++ ...ted-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml | 2 ++ .../generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 3 +++ .../workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml | 1 + .github/workflows/generated-win-vs2019-cpu-py3.yml | 1 + .github/workflows/generated-win-vs2019-cuda10.1-py3.yml | 1 + .github/workflows/generated-win-vs2019-cuda11.3-py3.yml | 1 + 17 files changed, 39 insertions(+) diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2 index 7f9d5230e0d9f..57b4567876e35 100644 --- a/.github/templates/bazel_ci_workflow.yml.j2 +++ b/.github/templates/bazel_ci_workflow.yml.j2 @@ -62,6 +62,7 @@ on: -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e PR_LABELS \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \ @@ -110,6 +111,7 @@ on: -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e CONTINUE_THROUGH_ERROR \ + -e PR_LABELS \ -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 49b6d7dd68c77..01ea7af15305d 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -46,6 +46,7 @@ env: # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }} @@ -144,6 +145,7 @@ jobs: -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ @@ -310,6 +312,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_LABELS \ -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ @@ -405,6 +408,7 @@ jobs: -e CIRCLE_SHA1="$GITHUB_SHA" \ -e DOCS_VERSION="${target}" \ -e DOCS_TYPE \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 38c346c1134f8..b927281b84ff8 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -51,6 +51,7 @@ env: IN_CI: 1 INSTALL_WINDOWS_SDK: 1 PYTHON_VERSION: "3.8" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} SCCACHE_BUCKET: "ossci-compiler-cache" VC_PRODUCT: "BuildTools" VC_VERSION: "" diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml index e621bee2ad666..4a434ac9772a2 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -24,6 +24,7 @@ env: # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: group: libtorch-linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} @@ -196,6 +197,7 @@ jobs: -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index 9daf916ae2642..34fd21e15dcde 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -24,6 +24,7 @@ env: # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: group: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} @@ -196,6 +197,7 @@ jobs: -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 4821c1e306715..f1c0ea491c415 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -24,6 +24,7 @@ env: # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: group: linux-bionic-cuda10.2-py3.9-gcc7-${{ github.event.pull_request.number || github.sha }} @@ -196,6 +197,7 @@ jobs: -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ @@ -414,6 +416,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_LABELS \ -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index 79edf0d741950..e13c6191ac64e 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -24,6 +24,7 @@ env: # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: group: linux-bionic-py3.8-gcc9-coverage-${{ github.event.pull_request.number || github.sha }} @@ -196,6 +197,7 @@ jobs: -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ @@ -414,6 +416,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_LABELS \ -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index 316da3604fc91..e44dfbe44759a 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -24,6 +24,7 @@ env: # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: group: linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} @@ -196,6 +197,7 @@ jobs: -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ @@ -414,6 +416,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_LABELS \ -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 6c9e67d380c29..618b13148567d 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -24,6 +24,7 @@ env: # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: group: linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} @@ -196,6 +197,7 @@ jobs: -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ @@ -414,6 +416,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_LABELS \ -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index 2337b4f5bf429..6ceabb04770e6 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -24,6 +24,7 @@ env: # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: group: linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }} @@ -196,6 +197,7 @@ jobs: -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ @@ -414,6 +416,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_LABELS \ -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ @@ -574,6 +577,7 @@ jobs: -e CIRCLE_SHA1="$GITHUB_SHA" \ -e DOCS_VERSION="${target}" \ -e DOCS_TYPE \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index 17dc3a6742d73..4bfe1d1211786 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -24,6 +24,7 @@ env: # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: group: linux-xenial-py3.6-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }} @@ -214,6 +215,7 @@ jobs: -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e PR_LABELS \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ @@ -264,6 +266,7 @@ jobs: -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e CONTINUE_THROUGH_ERROR \ + -e PR_LABELS \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml index 2f5cab7538601..ce4fe5bc0e250 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -22,6 +22,7 @@ env: # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: group: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} @@ -194,6 +195,7 @@ jobs: -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 8c81ab1717221..5b60a1bcdf900 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -22,6 +22,7 @@ env: # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: group: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} @@ -194,6 +195,7 @@ jobs: -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e PR_LABELS \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ @@ -412,6 +414,7 @@ jobs: -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_LABELS \ -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index 306e93aca7990..de74cdc16f889 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -18,6 +18,7 @@ env: IN_CI: 1 INSTALL_WINDOWS_SDK: 1 PYTHON_VERSION: "3.8" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} SCCACHE_BUCKET: "ossci-compiler-cache" VC_PRODUCT: "BuildTools" VC_VERSION: "" diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index f79cad7b04c00..4f43d2743d18b 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -20,6 +20,7 @@ env: IN_CI: 1 INSTALL_WINDOWS_SDK: 1 PYTHON_VERSION: "3.8" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} SCCACHE_BUCKET: "ossci-compiler-cache" VC_PRODUCT: "BuildTools" VC_VERSION: "" diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index 35c6cede0eefe..ad4cf37f958e1 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -20,6 +20,7 @@ env: IN_CI: 1 INSTALL_WINDOWS_SDK: 1 PYTHON_VERSION: "3.8" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} SCCACHE_BUCKET: "ossci-compiler-cache" VC_PRODUCT: "BuildTools" VC_VERSION: "" diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index 4bfc5654186f7..299b48d66a411 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -20,6 +20,7 @@ env: IN_CI: 1 INSTALL_WINDOWS_SDK: 1 PYTHON_VERSION: "3.8" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} SCCACHE_BUCKET: "ossci-compiler-cache" VC_PRODUCT: "BuildTools" VC_VERSION: "" From 6da7552a8eaae6b85e271bf3edac2fa2ae9f1148 Mon Sep 17 00:00:00 2001 From: Kefei Lu Date: Thu, 2 Sep 2021 10:38:43 -0700 Subject: [PATCH 475/530] Add fx2trt pass for removing duplicate output args (#64433) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64433 Fx2TRT does not support duplicate nodes in the output args tuple. This pass removes duplicate output args from the target subnets and fixes their uses in the top level module where the subnets are called. This pass must be called after acc split on the top-level net and subsequent calls to the acc trace on the subnets. This pass will change both the subnets and top level module. Test Plan: Run: ``` buck run mode/opt -c python.package_style=inplace //caffe2/torch/fb/fx2trt/tests/passes/:test_remove_duplicate_output_args ``` Reviewed By: 842974287 Differential Revision: D30468409 fbshipit-source-id: b4d91b76ab5d8a5275d68dd48d1327a44c22568e --- .../passes/remove_duplicate_output_args.py | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py diff --git a/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py b/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py new file mode 100644 index 0000000000000..488ce45c75952 --- /dev/null +++ b/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 + +import operator +import typing as t +import logging +import torch.fx as fx +import dataclasses as dc + + +_LOGGER = logging.getLogger(__name__) + + +def remove_duplicate_output_args( + top_level: fx.GraphModule, + target_subnets: t.Collection[str] +) -> t.Mapping[str, "RemoveDuplicateResult"]: + """Removes duplicate output args. + + This pass removes duplicate output args from the target subnets and fixes + their uses in the top level module where the subnets are called. This pass + must be called after acc split on the top-level net and subsequent calls to + the acc trace on the subnets. + + This pass will change both the subnets and top level module. + + Returns: + a mapping of the target subnet name to its dedupcate result + """ + + processed_subnets = {} + for node in top_level.graph.nodes: + node: fx.Node + if node.op == "call_module" and node.name in target_subnets: + sub_gm = top_level.get_submodule(node.target) + assert isinstance(sub_gm, fx.GraphModule) + + replace_res = _remove_duplicate_output_args(sub_gm) + processed_subnets[node.name] = replace_res + if replace_res.replacement_map is None: + continue + sub_gm.recompile() + + needs_recompile = False + # iterate on the copy since we will be changing elements of node.users + for user in list(node.users): + idx = _ensure_proper_output_use(user, node) + idx_new = replace_res.replacement_map[idx] + if idx_new != idx: + user.args = (user.args[0], idx_new) + needs_recompile = True + + if needs_recompile: + top_level.recompile() + return processed_subnets + + +@dc.dataclass(frozen=True) +class RemoveDuplicateResult: + replacement_map: t.Optional[t.List[int]] + module: fx.GraphModule + + +def _ensure_proper_output_use(user: fx.Node, target_node: fx.Node) -> int: + """ + Ensures the node looks in proper form of calling the output of an fx2trt + splitter sub-net. Specifically: + + 1. op is call function, target: operator.getitem + 2. args is a 2-element tuple + 3. args[0] is the name of the subnet's output + 4. args[1] is the index into the subnet output tuple + + E.g.: + + %getitem_4 : [#users=1] = call_function[target=operator.getitem](args = (%_run_on_acc_1, 4), kwargs = {}) + + returns the index into the subnet output tuple + """ + _LOGGER.info(f"Checking user node: {user.format_node()}") + assert ( + user.op == "call_function" + and user.target == operator.getitem + and len(user.args) == 2 + and user.args[0].name == target_node.name + and isinstance(user.args[1], int) + ), f"Node is not a proper user of splitter output: {user.format_node()}" + + return user.args[1] + + +def _remove_duplicate_output_args(gm: fx.GraphModule) -> RemoveDuplicateResult: + output_nodes = [n for n in gm.graph.nodes if n.op == "output"] + assert len(output_nodes) == 1, \ + f"Expecting exactly one `output` node, but got {len(output_nodes)}" + + changed = False + # arg node name to its index in the new output args tuple + name_to_idx: t.Dict[str, int] = {} + output_node = output_nodes[0] + + # Output op only uses its `args[0]`, and it does not have `kwargs`. + # https://pytorch.org/docs/stable/fx.html#torch.fx.Node + args = output_node.args[0] + + # Only concern outselves to the case where the args is an iterable of fx.Node. + # Other return cases (e.g., a single value) is possible and we don't handle + # that in this pass. + if not (isinstance(args, t.Iterable) and all(isinstance(a, fx.Node) for a in args)): + return RemoveDuplicateResult(replacement_map=None, module=gm) + + # Map old index of the arg node to the remaining node's idx, + # initialized to `i => i` + replacement_map: t.List[int] = list(range(len(args))) + args_new = [] + for idx, a in enumerate(args): + assert isinstance(a, fx.Node), \ + f"Expecting fx.Node instance, but got: {type(a)}" + + if a.name not in name_to_idx: + args_new.append(a) + name_to_idx[a.name] = len(args_new) - 1 + else: + changed = True + _LOGGER.warning( + f"Replaced duplicate output arg '{a.name}': " + f"{idx} -> {name_to_idx[a.name]}" + ) + replacement_map[idx] = name_to_idx[a.name] + + output_node.args = (tuple(args_new),) + if changed: + gm.recompile() + return RemoveDuplicateResult(replacement_map, module=gm) From 9214450b7fe3113a6078618514199f5af7bf82a0 Mon Sep 17 00:00:00 2001 From: Hui Guo Date: Thu, 2 Sep 2021 10:40:02 -0700 Subject: [PATCH 476/530] [tensorexpr] Wrap error msgs with buildErrorMessages for internal asserts (#64409) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64409 Test Plan: Imported from OSS Reviewed By: navahgar Differential Revision: D30717786 Pulled By: huiguoo fbshipit-source-id: a3b147d339ff4927f14efa24407cd3b63d80001d --- torch/csrc/jit/tensorexpr/graph_opt.cpp | 41 +++++++++++++++------ torch/csrc/jit/tensorexpr/ir_mutator.cpp | 12 ++++-- torch/csrc/jit/tensorexpr/ir_simplifier.cpp | 8 +++- torch/csrc/jit/tensorexpr/kernel.cpp | 36 +++++++++++++----- 4 files changed, 70 insertions(+), 27 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/graph_opt.cpp b/torch/csrc/jit/tensorexpr/graph_opt.cpp index 67f9a671bfa20..d55ea0559e5e1 100644 --- a/torch/csrc/jit/tensorexpr/graph_opt.cpp +++ b/torch/csrc/jit/tensorexpr/graph_opt.cpp @@ -26,14 +26,21 @@ Node* moveCatAfterUse(Node* cat, Node* user, std::shared_ptr subgraph) { // %4 = aten::cat(%3, ...) // return (%4) - TORCH_INTERNAL_ASSERT(cat->output()->hasUses()); - TORCH_INTERNAL_ASSERT(cat->output()->uses().size() == 1); - TORCH_INTERNAL_ASSERT(cat->input(0)->node()->kind() == prim::ListConstruct); + TORCH_INTERNAL_ASSERT( + cat->output()->hasUses(), + buildErrorMessage("aten::cat output is not used.")); + TORCH_INTERNAL_ASSERT( + cat->output()->uses().size() == 1, + buildErrorMessage("aten::cat output is used in multiple places.")); + TORCH_INTERNAL_ASSERT( + cat->input(0)->node()->kind() == prim::ListConstruct, + buildErrorMessage("aten::cat inputs are not expected.")); auto cat_list = cat->input(0)->node(); auto cat_inputs = cat_list->inputs(); auto user_tensor_type = user->output()->type()->cast(); - TORCH_INTERNAL_ASSERT(user_tensor_type); + TORCH_INTERNAL_ASSERT( + user_tensor_type, buildErrorMessage("Unexpected user tensor type")); std::unordered_map new_cat_inputs; for (auto inp : cat_inputs) { auto new_cat_input = subgraph->createClone( @@ -41,7 +48,8 @@ Node* moveCatAfterUse(Node* cat, Node* user, std::shared_ptr subgraph) { // Since we are cloning user, its result should be the same scalar type // as the user. But the dims should correspond to that of the input. auto input_tensor_type = inp->type()->cast(); - TORCH_INTERNAL_ASSERT(input_tensor_type); + TORCH_INTERNAL_ASSERT( + input_tensor_type, buildErrorMessage("Unexpected input tensor type")); auto new_input_type = input_tensor_type->withScalarType(user_tensor_type->scalarType()); new_cat_input->output()->setType(new_input_type); @@ -60,7 +68,9 @@ Node* moveCatAfterUse(Node* cat, Node* user, std::shared_ptr subgraph) { user->output()->replaceAllUsesWith(new_cat->output()); user->destroy(); - TORCH_INTERNAL_ASSERT(!cat->output()->hasUses()); + TORCH_INTERNAL_ASSERT( + !cat->output()->hasUses(), + buildErrorMessage("aten::cat output is not used.")); cat->destroy(); if (!cat_list->output()->hasUses()) { @@ -84,10 +94,15 @@ int numTensorInputs(Node* node) { // If the inputs to `cat` are of different types, then the implementation // of `cat` is expected to promote type. bool doesCatPromoteTypes(Node* node) { - TORCH_INTERNAL_ASSERT(node->kind() == aten::cat); - TORCH_INTERNAL_ASSERT(node->input(0)->node()->kind() == prim::ListConstruct); + TORCH_INTERNAL_ASSERT( + node->kind() == aten::cat, + buildErrorMessage("Graph node is not aten::cat.")); + TORCH_INTERNAL_ASSERT( + node->input(0)->node()->kind() == prim::ListConstruct, + buildErrorMessage("aten::cat inputs are not expected.")); auto inputs = node->input(0)->node()->inputs(); - TORCH_INTERNAL_ASSERT(!inputs.empty()); + TORCH_INTERNAL_ASSERT( + !inputs.empty(), buildErrorMessage("Empty inputs of ListConstruct")); auto scalar_type = inputs.front()->type()->cast()->scalarType(); for (size_t i = 1; i < inputs.size(); ++i) { @@ -122,14 +137,18 @@ bool doesCatPromoteTypes(Node* node) { // it user needs to reflect the original type. This is currently not // handled. TODO void moveCatOpToEnd(Node* cat, std::shared_ptr subgraph) { - TORCH_INTERNAL_ASSERT(cat->kind() == aten::cat); + TORCH_INTERNAL_ASSERT( + cat->kind() == aten::cat, + buildErrorMessage("Graph node is not aten::cat.")); if (cat->output()->uses().size() == 1) { auto use = cat->output()->uses().front(); if (use.user->isMemberOf(supported_eltwise_set()) && numTensorInputs(use.user) == 1) { if (!doesCatPromoteTypes(cat)) { TORCH_INTERNAL_ASSERT( - use.user->output()->owningGraph() == subgraph.get()); + use.user->output()->owningGraph() == subgraph.get(), + buildErrorMessage( + "aten::cat user graph does not math the given subgraph.")); auto new_cat = moveCatAfterUse(cat, use.user, subgraph); moveCatOpToEnd(new_cat, subgraph); } diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp index 71a40a134e0b5..e2e9c46e133a5 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.cpp +++ b/torch/csrc/jit/tensorexpr/ir_mutator.cpp @@ -424,14 +424,16 @@ StmtPtr IRMutator::mutate(SyncThreadsPtr v) { StmtPtr IRMutator::mutate(ExternalCallPtr v) { BufPtr buf = v->buf(); BufPtr buf_new = to(buf->accept_mutator(this)); - TORCH_INTERNAL_ASSERT(buf_new); + TORCH_INTERNAL_ASSERT( + buf_new, buildErrorMessage("IRMutator produced null for Buf.")); bool buf_args_changed = false; std::vector buf_args_new; buf_args_new.reserve(v->buf_args().size()); for (BufPtr buf_arg : v->buf_args()) { BufPtr buf_arg_new = to(buf_arg->accept_mutator(this)); - TORCH_INTERNAL_ASSERT(buf_arg_new); + TORCH_INTERNAL_ASSERT( + buf_arg_new, buildErrorMessage("IRMutator produced null for Buf.")); buf_args_new.push_back(buf_arg_new); buf_args_changed |= buf_arg_new != buf_arg; } @@ -460,7 +462,8 @@ StmtPtr IRMutator::mutate(ExternalCallPtr v) { StmtPtr IRMutator::mutate(AllocatePtr v) { BufPtr buf = v->buf(); BufPtr buf_new = to(buf->accept_mutator(this)); - TORCH_INTERNAL_ASSERT(buf_new); + TORCH_INTERNAL_ASSERT( + buf_new, buildErrorMessage("IRMutator produced null for Buf.")); if (buf != buf_new) { v->set_buf(buf_new); } @@ -470,7 +473,8 @@ StmtPtr IRMutator::mutate(AllocatePtr v) { StmtPtr IRMutator::mutate(FreePtr v) { BufPtr buf = v->buf(); BufPtr buf_new = to(buf->accept_mutator(this)); - TORCH_INTERNAL_ASSERT(buf_new); + TORCH_INTERNAL_ASSERT( + buf_new, buildErrorMessage("IRMutator produced null for Buf.")); if (buf != buf_new) { v->set_buf(buf_new); } diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp index 6820bbb5748a2..3ce194325f08a 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp @@ -2351,7 +2351,9 @@ ExprPtr buf_flat_size(BufPtr v) { StmtPtr TermExpander::mutate(AllocatePtr v) { BufPtr buf = v->buf(); BufPtr buf_new = to(v->buf()->accept_mutator(this)); - TORCH_INTERNAL_ASSERT(buf_new); + TORCH_INTERNAL_ASSERT( + buf_new, + buildErrorMessage("TermExpander mutation produced null for Buf.")); ExprPtr flattened = buf_flat_size(buf_new); if (flattened->isConstant() && immediateEquals(flattened, 0)) { @@ -2368,7 +2370,9 @@ StmtPtr TermExpander::mutate(AllocatePtr v) { StmtPtr TermExpander::mutate(FreePtr v) { BufPtr buf = v->buf(); BufPtr buf_new = to(v->buf()->accept_mutator(this)); - TORCH_INTERNAL_ASSERT(buf_new); + TORCH_INTERNAL_ASSERT( + buf_new, + buildErrorMessage("TermExpander mutation produced null for Buf.")); if (eliminated_allocations_.count(buf_new->base_handle())) { eliminated_allocations_.erase(buf_new->base_handle()); diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index f9653aea68840..a86cb33a1b8bd 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -549,7 +549,7 @@ std::vector bufferSizes(BufPtr b) { std::vector sizes; for (size_t i = 0; i < b->ndim(); i++) { auto dim = intValue(b->dim(i)); - TORCH_INTERNAL_ASSERT(dim); + TORCH_INTERNAL_ASSERT(dim, buildErrorMessage("Non-constant buf dims")); sizes.push_back(*dim); } return sizes; @@ -889,7 +889,8 @@ ExprHandle promoteIntegerToDefaultType(const ExprHandle& e) { // We intend to promote Integers to floating-point types TORCH_INTERNAL_ASSERT( - !c10::isIntegralType(defaultType, /*includeBool*/ true)); + !c10::isIntegralType(defaultType, /*includeBool*/ true), + buildErrorMessage("Non-integer type")); return Cast::make( Dtype( @@ -1165,7 +1166,8 @@ std::pair> processCatList( std::vector nonEmptyInputs; for (auto buf : bufList) { bufInputs.push_back(buf); - TORCH_INTERNAL_ASSERT(buf.node()->dims().size() > 0); + TORCH_INTERNAL_ASSERT( + buf.node()->dims().size() > 0, buildErrorMessage("Invalid buf rank")); if (buf.node()->dims().size() == 1 && immediateAs(buf.node()->dim(0)) == 0) { continue; @@ -1378,7 +1380,9 @@ Tensor tensorexpr::computeOperandValue( auto add_lambda = [](const ExprHandle& lhs, const ExprHandle& rhs) { return boolToInteger(lhs) + boolToInteger(rhs); }; - TORCH_INTERNAL_ASSERT(inputs.size() == 2 || inputs.size() == 3); + TORCH_INTERNAL_ASSERT( + inputs.size() == 2 || inputs.size() == 3, + buildErrorMessage("Invalid number of input operands")); return (inputs.size() > 2) ? computeTwoOperandWithAlpha( "aten_add", inputs, outputShape, outputType, add_lambda) @@ -1390,7 +1394,9 @@ Tensor tensorexpr::computeOperandValue( // NB: sub isn't supported on boolean, no need to promote to integer. return lhs - rhs; }; - TORCH_INTERNAL_ASSERT(inputs.size() == 2 || inputs.size() == 3); + TORCH_INTERNAL_ASSERT( + inputs.size() == 2 || inputs.size() == 3, + buildErrorMessage("Invalid number of input operands")); return (inputs.size() > 2) ? computeTwoOperandWithAlpha( "aten_sub", inputs, outputShape, outputType, sub_lambda) @@ -2153,7 +2159,8 @@ Tensor tensorexpr::computeOperandValue( outputShape, outputType, [outputType](const ExprHandle& a) { - TORCH_INTERNAL_ASSERT(outputType); + TORCH_INTERNAL_ASSERT( + outputType, buildErrorMessage("Output type is null.")); return Cast::make(ToDtype(*outputType), a); }); } break; @@ -2272,7 +2279,9 @@ Tensor tensorexpr::computeOperandValue( "aten_transpose", c10::fmap(outputShape), [&](std::vector axes) { - TORCH_INTERNAL_ASSERT(axes.size() <= 1); + TORCH_INTERNAL_ASSERT( + axes.size() <= 1, + buildErrorMessage("Invalid axes size in transpose")); return A.load(axes); }); } @@ -2935,7 +2944,10 @@ bool denseAndNonOverlapping( Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) { const TensorTypePtr& tt = v->type()->expect(); - TORCH_INTERNAL_ASSERT(bufs_.count(v)); + TORCH_INTERNAL_ASSERT( + bufs_.count(v), + buildErrorMessage( + "Ouput tensor has no corresponding bufs in the fuser.")); BufPtr buf = bufs_.at(v); // No shape info is present in the graph @@ -2945,13 +2957,17 @@ Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) { throw malformed_input(msg); } - TORCH_INTERNAL_ASSERT(tt->sizes().concrete_sizes()); + TORCH_INTERNAL_ASSERT( + tt->sizes().concrete_sizes(), + buildErrorMessage("Output shapes are unknown.")); auto sizes = *tt->sizes().concrete_sizes(); std::vector default_strides = TensorType::contiguousStridesOf(sizes); if (!tt->strides().concrete_sizes()) { return Tensor(buf, nullptr); } - TORCH_INTERNAL_ASSERT(tt->strides().concrete_sizes()); + TORCH_INTERNAL_ASSERT( + tt->strides().concrete_sizes(), + buildErrorMessage("Output strides are unknown.")); const std::vector strides = *tt->strides().concrete_sizes(); // All Tensors in NNC are layed out in default, contiguous layout. // If the output is also default contiguous we don't need to do anything From c0cdbb1cc53e8b55f26604a84135fc22640dec41 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Thu, 2 Sep 2021 10:56:57 -0700 Subject: [PATCH 477/530] Revert D30468409: Add fx2trt pass for removing duplicate output args Test Plan: revert-hammer Differential Revision: D30468409 (https://github.com/pytorch/pytorch/commit/6da7552a8eaae6b85e271bf3edac2fa2ae9f1148) Original commit changeset: b4d91b76ab5d fbshipit-source-id: e138dc425fe55ffe3585ea5fac4db476931bafed --- .../passes/remove_duplicate_output_args.py | 133 ------------------ 1 file changed, 133 deletions(-) delete mode 100644 torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py diff --git a/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py b/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py deleted file mode 100644 index 488ce45c75952..0000000000000 --- a/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env python3 - -import operator -import typing as t -import logging -import torch.fx as fx -import dataclasses as dc - - -_LOGGER = logging.getLogger(__name__) - - -def remove_duplicate_output_args( - top_level: fx.GraphModule, - target_subnets: t.Collection[str] -) -> t.Mapping[str, "RemoveDuplicateResult"]: - """Removes duplicate output args. - - This pass removes duplicate output args from the target subnets and fixes - their uses in the top level module where the subnets are called. This pass - must be called after acc split on the top-level net and subsequent calls to - the acc trace on the subnets. - - This pass will change both the subnets and top level module. - - Returns: - a mapping of the target subnet name to its dedupcate result - """ - - processed_subnets = {} - for node in top_level.graph.nodes: - node: fx.Node - if node.op == "call_module" and node.name in target_subnets: - sub_gm = top_level.get_submodule(node.target) - assert isinstance(sub_gm, fx.GraphModule) - - replace_res = _remove_duplicate_output_args(sub_gm) - processed_subnets[node.name] = replace_res - if replace_res.replacement_map is None: - continue - sub_gm.recompile() - - needs_recompile = False - # iterate on the copy since we will be changing elements of node.users - for user in list(node.users): - idx = _ensure_proper_output_use(user, node) - idx_new = replace_res.replacement_map[idx] - if idx_new != idx: - user.args = (user.args[0], idx_new) - needs_recompile = True - - if needs_recompile: - top_level.recompile() - return processed_subnets - - -@dc.dataclass(frozen=True) -class RemoveDuplicateResult: - replacement_map: t.Optional[t.List[int]] - module: fx.GraphModule - - -def _ensure_proper_output_use(user: fx.Node, target_node: fx.Node) -> int: - """ - Ensures the node looks in proper form of calling the output of an fx2trt - splitter sub-net. Specifically: - - 1. op is call function, target: operator.getitem - 2. args is a 2-element tuple - 3. args[0] is the name of the subnet's output - 4. args[1] is the index into the subnet output tuple - - E.g.: - - %getitem_4 : [#users=1] = call_function[target=operator.getitem](args = (%_run_on_acc_1, 4), kwargs = {}) - - returns the index into the subnet output tuple - """ - _LOGGER.info(f"Checking user node: {user.format_node()}") - assert ( - user.op == "call_function" - and user.target == operator.getitem - and len(user.args) == 2 - and user.args[0].name == target_node.name - and isinstance(user.args[1], int) - ), f"Node is not a proper user of splitter output: {user.format_node()}" - - return user.args[1] - - -def _remove_duplicate_output_args(gm: fx.GraphModule) -> RemoveDuplicateResult: - output_nodes = [n for n in gm.graph.nodes if n.op == "output"] - assert len(output_nodes) == 1, \ - f"Expecting exactly one `output` node, but got {len(output_nodes)}" - - changed = False - # arg node name to its index in the new output args tuple - name_to_idx: t.Dict[str, int] = {} - output_node = output_nodes[0] - - # Output op only uses its `args[0]`, and it does not have `kwargs`. - # https://pytorch.org/docs/stable/fx.html#torch.fx.Node - args = output_node.args[0] - - # Only concern outselves to the case where the args is an iterable of fx.Node. - # Other return cases (e.g., a single value) is possible and we don't handle - # that in this pass. - if not (isinstance(args, t.Iterable) and all(isinstance(a, fx.Node) for a in args)): - return RemoveDuplicateResult(replacement_map=None, module=gm) - - # Map old index of the arg node to the remaining node's idx, - # initialized to `i => i` - replacement_map: t.List[int] = list(range(len(args))) - args_new = [] - for idx, a in enumerate(args): - assert isinstance(a, fx.Node), \ - f"Expecting fx.Node instance, but got: {type(a)}" - - if a.name not in name_to_idx: - args_new.append(a) - name_to_idx[a.name] = len(args_new) - 1 - else: - changed = True - _LOGGER.warning( - f"Replaced duplicate output arg '{a.name}': " - f"{idx} -> {name_to_idx[a.name]}" - ) - replacement_map[idx] = name_to_idx[a.name] - - output_node.args = (tuple(args_new),) - if changed: - gm.recompile() - return RemoveDuplicateResult(replacement_map, module=gm) From 1519b6084f9a215fad407087d7ab2cf55d66b8e0 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Thu, 2 Sep 2021 11:06:34 -0700 Subject: [PATCH 478/530] nn.functional.linear OpInfo (#61971) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61971 Test Plan: - wait for tests Reviewed By: heitorschueroff Differential Revision: D30013750 Pulled By: zou3519 fbshipit-source-id: ca41dbd98176c12e50ad1410a658f4b06fe99a1e --- .../_internal/common_methods_invocations.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 10aae4146bae8..a45bcf54faba8 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -2647,6 +2647,30 @@ def sample_inputs_hardswish(self, device, dtype, requires_grad): requires_grad=requires_grad, low=-5, high=5)) for _ in range(1, N)] return tensors +def sample_inputs_linear(self, device, dtype, requires_grad): + features_options = [[3, 4], [128, 128]] + batch_options: List[List[int]] = [ + [], # no batch + [0], + [64], + [5, 7], + ] + create_tensor = partial(make_tensor, device=device, dtype=dtype, + requires_grad=requires_grad, low=-2, high=2) + + sample_inputs = [] + for has_bias, (in_feat, out_feat), batch_shape in \ + itertools.product([True, False], features_options, batch_options): + input_tensor = create_tensor(batch_shape + [in_feat]) + weight = create_tensor([out_feat, in_feat]) + if not has_bias: + sample_inputs.append(SampleInput(input_tensor, args=(weight,))) + continue + + bias = create_tensor([out_feat]) + sample_inputs.append(SampleInput(input_tensor, args=(weight, bias))) + return sample_inputs + def sample_inputs_interpolate(mode, self, device, dtype, requires_grad): N, C = 2, 3 D = 4 @@ -7519,6 +7543,17 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypesIfCPU=floating_types_and(torch.int64), dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_avgpool2d), + OpInfo('nn.functional.linear', + aten_name='linear', + supports_autograd=True, + sample_inputs_func=sample_inputs_linear, + dtypesIfCPU=all_types_and_complex_and(torch.half, torch.bfloat16), + dtypesIfROCM=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []), + backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16, + *[torch.bfloat16] if CUDA11OrLater else []), + supports_forward_ad=True, + supports_out=False), UnaryUfuncInfo( 'nn.functional.logsigmoid', aten_name="log_sigmoid", From c932afe39b28be3b6d232f629ce597efcfd39815 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Thu, 2 Sep 2021 11:23:38 -0700 Subject: [PATCH 479/530] .github: Move upload-artifact-s3 to common var (#64435) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64435 Move upload-artifact-s3 to a common variable to be used amongst our jinja templates, this should make it easier in the future to update these images Signed-off-by: Eli Uriegas cc ezyang seemethere malfet lg20987 pytorch/pytorch-dev-infra Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D30732777 Pulled By: seemethere fbshipit-source-id: 51cd485f5abae134c3c49dfa878e6303ba8e5f25 --- .github/templates/common.yml.j2 | 2 ++ .github/templates/linux_ci_workflow.yml.j2 | 8 ++++---- .github/templates/windows_ci_workflow.yml.j2 | 2 +- .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 4 ++-- .../generated-linux-bionic-py3.8-gcc9-coverage.yml | 4 ++-- .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 4 ++-- .../generated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 4 ++-- .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml | 4 ++-- ...enerated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 4 ++-- .../generated-periodic-win-vs2019-cuda11.1-py3.yml | 2 +- .github/workflows/generated-win-vs2019-cpu-py3.yml | 2 +- .github/workflows/generated-win-vs2019-cuda10.1-py3.yml | 2 +- .github/workflows/generated-win-vs2019-cuda11.3-py3.yml | 2 +- 13 files changed, 23 insertions(+), 21 deletions(-) diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index f9296e017a1cf..6757785c3cff4 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -1,3 +1,5 @@ +{%- set upload_artifact_s3_action = "seemethere/upload-artifact-s3@v3" -%} + {%- macro display_ec2_information() -%} - name: Display EC2 information shell: bash diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 01ea7af15305d..209e9c34a2d83 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -180,7 +180,7 @@ jobs: - name: Archive artifacts into zip run: | zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: !{{ common.upload_artifact_s3_action }} name: Store PyTorch Build Artifacts on S3 with: name: ${{ env.BUILD_ENVIRONMENT }} @@ -356,7 +356,7 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: !{{ common.upload_artifact_s3_action }} name: Store PyTorch Test Reports on S3 if: always() with: @@ -423,7 +423,7 @@ jobs: run: | # Ensure the working directory gets chowned back to the current user docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 + - uses: !{{ common.upload_artifact_s3_action }} name: Upload Python Docs Preview if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }} with: @@ -431,7 +431,7 @@ jobs: if-no-files-found: error path: pytorch.github.io/docs/merge/ s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }} - - uses: seemethere/upload-artifact-s3@v3 + - uses: !{{ common.upload_artifact_s3_action }} name: Upload C++ Docs Preview if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }} with: diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index b927281b84ff8..1268f275cc755 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -136,7 +136,7 @@ jobs: path: C:\${{ github.run_id }}\build-results - name: Upload artifacts to s3 if: always() - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + uses: !{{ common.upload_artifact_s3_action }} with: retention-days: 14 if-no-files-found: error diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index f1c0ea491c415..5dc8d9f2d19de 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -233,7 +233,7 @@ jobs: - name: Archive artifacts into zip run: | zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Build Artifacts on S3 with: name: ${{ env.BUILD_ENVIRONMENT }} @@ -454,7 +454,7 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Test Reports on S3 if: always() with: diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index e13c6191ac64e..c060b158b6007 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -233,7 +233,7 @@ jobs: - name: Archive artifacts into zip run: | zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Build Artifacts on S3 with: name: ${{ env.BUILD_ENVIRONMENT }} @@ -458,7 +458,7 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Test Reports on S3 if: always() with: diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index e44dfbe44759a..dfbf9bfd25d36 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -233,7 +233,7 @@ jobs: - name: Archive artifacts into zip run: | zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Build Artifacts on S3 with: name: ${{ env.BUILD_ENVIRONMENT }} @@ -454,7 +454,7 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Test Reports on S3 if: always() with: diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 618b13148567d..53b09cdf81ffe 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -233,7 +233,7 @@ jobs: - name: Archive artifacts into zip run: | zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Build Artifacts on S3 with: name: ${{ env.BUILD_ENVIRONMENT }} @@ -454,7 +454,7 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Test Reports on S3 if: always() with: diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index 6ceabb04770e6..eb77554cb4a03 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -233,7 +233,7 @@ jobs: - name: Archive artifacts into zip run: | zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Build Artifacts on S3 with: name: ${{ env.BUILD_ENVIRONMENT }} @@ -454,7 +454,7 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Test Reports on S3 if: always() with: diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 5b60a1bcdf900..694a79ac22c81 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -231,7 +231,7 @@ jobs: - name: Archive artifacts into zip run: | zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Build Artifacts on S3 with: name: ${{ env.BUILD_ENVIRONMENT }} @@ -452,7 +452,7 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + - uses: seemethere/upload-artifact-s3@v3 name: Store PyTorch Test Reports on S3 if: always() with: diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index de74cdc16f889..78e4c7cb42876 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -107,7 +107,7 @@ jobs: path: C:\${{ github.run_id }}\build-results - name: Upload artifacts to s3 if: always() - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + uses: seemethere/upload-artifact-s3@v3 with: retention-days: 14 if-no-files-found: error diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 4f43d2743d18b..bed6da45140e2 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -99,7 +99,7 @@ jobs: path: C:\${{ github.run_id }}\build-results - name: Upload artifacts to s3 if: always() - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + uses: seemethere/upload-artifact-s3@v3 with: retention-days: 14 if-no-files-found: error diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index ad4cf37f958e1..f5b2e6a96e2b4 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -109,7 +109,7 @@ jobs: path: C:\${{ github.run_id }}\build-results - name: Upload artifacts to s3 if: always() - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + uses: seemethere/upload-artifact-s3@v3 with: retention-days: 14 if-no-files-found: error diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index 299b48d66a411..bf1cbe4119483 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -109,7 +109,7 @@ jobs: path: C:\${{ github.run_id }}\build-results - name: Upload artifacts to s3 if: always() - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162 + uses: seemethere/upload-artifact-s3@v3 with: retention-days: 14 if-no-files-found: error From 22f3bcd1643a120d4f3b7c5df59daf668bbbe746 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Thu, 2 Sep 2021 11:23:38 -0700 Subject: [PATCH 480/530] .github: Move squid vars to common vars (#64436) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64436 Moves the squid variables to our common jinja template so that when we have to update them they're all in the same place. Signed-off-by: Eli Uriegas cc ezyang seemethere malfet lg20987 pytorch/pytorch-dev-infra Test Plan: Imported from OSS Reviewed By: malfet, zhouzhuojie Differential Revision: D30732776 Pulled By: seemethere fbshipit-source-id: 22e3757c4eec775baa8abbaac2ba2a0c69c2b2a9 --- .github/templates/bazel_ci_workflow.yml.j2 | 4 ++-- .github/templates/common.yml.j2 | 5 +++++ .github/templates/linux_ci_workflow.yml.j2 | 8 ++------ .github/templates/windows_ci_workflow.yml.j2 | 14 +++++--------- ...d-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml | 3 +-- ...d-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml | 3 +-- .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 3 +-- .../generated-linux-bionic-py3.8-gcc9-coverage.yml | 3 +-- .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 3 +-- .../generated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 3 +-- .../generated-linux-xenial-py3.6-gcc5.4.yml | 3 +-- ...enerated-linux-xenial-py3.6-gcc7-bazel-test.yml | 1 - ...c-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml | 3 +-- ...d-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 3 +-- .../generated-periodic-win-vs2019-cuda11.1-py3.yml | 1 - .github/workflows/generated-win-vs2019-cpu-py3.yml | 1 - .../generated-win-vs2019-cuda10.1-py3.yml | 1 - .../generated-win-vs2019-cuda11.3-py3.yml | 1 - 18 files changed, 23 insertions(+), 40 deletions(-) diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2 index 57b4567876e35..9f982cdd5cb61 100644 --- a/.github/templates/bazel_ci_workflow.yml.j2 +++ b/.github/templates/bazel_ci_workflow.yml.j2 @@ -65,7 +65,7 @@ on: -e PR_LABELS \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \ + -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ @@ -112,7 +112,7 @@ on: -e SCCACHE_BUCKET \ -e CONTINUE_THROUGH_ERROR \ -e PR_LABELS \ - -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \ + -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 6757785c3cff4..07ad771346399 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -1,5 +1,10 @@ {%- set upload_artifact_s3_action = "seemethere/upload-artifact-s3@v3" -%} +{# squid_proxy is an private ELB that only available for GHA custom runners #} +{%- set squid_proxy = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%} +{# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #} +{%- set squid_no_proxy = "localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%} + {%- macro display_ec2_information() -%} - name: Display EC2 information shell: bash diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 209e9c34a2d83..8aa854782de52 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -1,8 +1,4 @@ {% import 'common.yml.j2' as common %} -{# squid_proxy is an private ELB that only available for GHA custom runners #} -{%- set squid_proxy = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%} -{# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #} -{%- set squid_no_proxy = "localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%} {%- block name -%} # Template is at: .github/templates/linux_ci_workflow.yml.j2 @@ -144,8 +140,8 @@ jobs: -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \ -e PR_LABELS \ + -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ @@ -316,7 +312,7 @@ jobs: -e CONTINUE_THROUGH_ERROR \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ - -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \ + -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 1268f275cc755..05d739db523e3 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -1,8 +1,4 @@ {% import 'common.yml.j2' as common %} -{# squid_proxy is an private ELB that only available for GHA custom runners #} -{%- set squid_proxy = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%} -{# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #} -{%- set squid_no_proxy = "localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%} {%- macro wait_and_kill_ssh() -%} - name: Wait until all sessions have drained @@ -58,7 +54,7 @@ env: VS_VERSION: "16.8.6" VC_YEAR: "2019" ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: !{{ squid_no_proxy }} + no_proxy: !{{ common.squid_no_proxy }} {%- if cuda_version != "cpu" %} TORCH_CUDA_ARCH_LIST: "7.0" USE_CUDA: 1 @@ -88,8 +84,8 @@ jobs: {%- endif %} env: JOB_BASE_NAME: !{{ build_environment }}-build - http_proxy: "!{{ squid_proxy }}" - https_proxy: "!{{ squid_proxy }}" + http_proxy: "!{{ common. squid_proxy }}" + https_proxy: "!{{ common.squid_proxy }}" steps: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" uses: seemethere/add-github-ssh-key@v1 @@ -192,8 +188,8 @@ jobs: SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} TEST_CONFIG: ${{ matrix.config }} - http_proxy: "!{{ squid_proxy }}" - https_proxy: "!{{ squid_proxy }}" + http_proxy: "!{{ common.squid_proxy }}" + https_proxy: "!{{ common.squid_proxy }}" RUN_SMOKE_TESTS_ONLY_ON_PR: !{{ only_run_smoke_tests_on_pull_request }} PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml index 4a434ac9772a2..396284cf72f84 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: libtorch-linux-xenial-cuda10.2-py3.6-gcc7 @@ -196,8 +195,8 @@ jobs: -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index 34fd21e15dcde..cb0a98591956b 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: libtorch-linux-xenial-cuda11.3-py3.6-gcc7 @@ -196,8 +195,8 @@ jobs: -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index 5dc8d9f2d19de..ddb38b7c84d5a 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-bionic-cuda10.2-py3.9-gcc7 @@ -196,8 +195,8 @@ jobs: -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index c060b158b6007..c4cdd2cd636d5 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-bionic-py3.8-gcc9-coverage @@ -196,8 +195,8 @@ jobs: -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index dfbf9bfd25d36..2d0d916237676 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-xenial-cuda10.2-py3.6-gcc7 @@ -196,8 +195,8 @@ jobs: -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 53b09cdf81ffe..0e1c7ba6ca6c5 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-xenial-cuda11.3-py3.6-gcc7 @@ -196,8 +195,8 @@ jobs: -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index eb77554cb4a03..6f4e5c2958904 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-xenial-py3.6-gcc5.4 @@ -196,8 +195,8 @@ jobs: -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index 4bfe1d1211786..2d72fab281f6e 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/bazel_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: linux-xenial-py3.6-gcc7-bazel-test diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml index ce4fe5bc0e250..b009d77b3a8d8 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7 @@ -194,8 +193,8 @@ jobs: -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 694a79ac22c81..738e6bb146cdd 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/linux_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: periodic-linux-xenial-cuda11.1-py3.6-gcc7 @@ -194,8 +193,8 @@ jobs: -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index 78e4c7cb42876..61c63f42cadf6 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/windows_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: periodic-win-vs2019-cuda11.1-py3 diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index bed6da45140e2..6ef8e85ac931b 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/windows_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: win-vs2019-cpu-py3 diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index f5b2e6a96e2b4..c158f08731d99 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/windows_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: win-vs2019-cuda10.1-py3 diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index bf1cbe4119483..bd945c3255a0a 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -1,5 +1,4 @@ # @generated DO NOT EDIT MANUALLY - # Template is at: .github/templates/windows_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: win-vs2019-cuda11.3-py3 From d0cb26ba575e489c3835c7741a2f94ea3f365c3f Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Thu, 2 Sep 2021 11:37:54 -0700 Subject: [PATCH 481/530] [DDP] Fix logging iterations (#64411) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64411 These are not actually the training iterations, but are offset by how frequently DDP stats collection actually runs (default being kDDPRuntimeLoggingSampleRate = 100). So with this change, they are actually logged to scuba every: 10, 10 * 100, 40 * 100, etc iterations. Test Plan: CI Reviewed By: zhaojuanmao Differential Revision: D30718274 fbshipit-source-id: 146bd2428753c93363bee37e487f40104fce3c18 --- torch/csrc/distributed/c10d/logger.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp index b1efd0b238378..92e16614a6612 100644 --- a/torch/csrc/distributed/c10d/logger.cpp +++ b/torch/csrc/distributed/c10d/logger.cpp @@ -10,9 +10,12 @@ namespace c10d { -// When training runs at these iterations, log the runtime -// stats. -const int LoggingIterations[] = {10, 20, 100, 1000, 5000, 10000, 20000}; // NOLINT +// Logs runtime stats to configured destination. Note that since data collection +// only runs every ddp_runtime_logging_sample_rate iterations, the actual +// training iterations recorded will be like 10, +// (20-10) * ddp_runtime_logging_sample_rate, +// (50-10) * ddp_runtime_logging_sample_rate and so on. +const int LoggingIterations[] = {10, 20, 50, 100, 500, 800, 1000}; // NOLINT std::ostream& operator<<(std::ostream& output, const Logger& logger) { auto& ddp_logging_data = (*logger.ddp_logging_data_); From 7d010539c9b6356cbaae8f7accc5b5cf8cc0d0cc Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 2 Sep 2021 12:15:03 -0700 Subject: [PATCH 482/530] ENH Adds test and docs for modules that already support no batch dims (#62729) Summary: Towards https://github.com/pytorch/pytorch/issues/60585 Pull Request resolved: https://github.com/pytorch/pytorch/pull/62729 Reviewed By: H-Huang Differential Revision: D30669546 Pulled By: jbschlosser fbshipit-source-id: c771c98c1fd9d28fa984b72893585c738c736505 --- test/cpp_api_parity/parity-tracker.md | 1 + torch/csrc/api/src/nn/modules/activation.cpp | 4 +- torch/nn/modules/activation.py | 8 ++-- torch/testing/_internal/common_nn.py | 50 +++++++++++++++++++- 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/test/cpp_api_parity/parity-tracker.md b/test/cpp_api_parity/parity-tracker.md index 869ef300f6c85..88e1848f7da78 100644 --- a/test/cpp_api_parity/parity-tracker.md +++ b/test/cpp_api_parity/parity-tracker.md @@ -99,6 +99,7 @@ torch::nn::Identity|Yes|No torch::nn::Linear|Yes|No torch::nn::Bilinear|Yes|No torch::nn::Flatten|Yes|No +torch::nn::Unflatten|Yes|No torch::nn::Dropout|Yes|No torch::nn::Dropout2d|Yes|No torch::nn::Dropout3d|Yes|No diff --git a/torch/csrc/api/src/nn/modules/activation.cpp b/torch/csrc/api/src/nn/modules/activation.cpp index 3c4d2b8c98f50..e724a75c58ec9 100644 --- a/torch/csrc/api/src/nn/modules/activation.cpp +++ b/torch/csrc/api/src/nn/modules/activation.cpp @@ -170,8 +170,8 @@ void Softmax2dImpl::pretty_print(std::ostream& stream) const { } Tensor Softmax2dImpl::forward(const Tensor& input) { - TORCH_CHECK(input.dim() == 4, "Softmax2d requires a 4D tensor as input"); - return F::detail::softmax(input, /*dim=*/1, c10::nullopt); + TORCH_CHECK(input.dim() == 4 || input.dim() == 3, "Softmax2d requires a 3D or 4D tensor as input"); + return F::detail::softmax(input, /*dim=*/-3, c10::nullopt); } // ============================================================================ diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 91427c8aea2cd..90b901d9b690a 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -1236,8 +1236,8 @@ class Softmax2d(Module): apply `Softmax` to each location :math:`(Channels, h_i, w_j)` Shape: - - Input: :math:`(N, C, H, W)` - - Output: :math:`(N, C, H, W)` (same shape as input) + - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)`. + - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input) Returns: a Tensor of the same dimension and shape as the input with @@ -1252,8 +1252,8 @@ class Softmax2d(Module): """ def forward(self, input: Tensor) -> Tensor: - assert input.dim() == 4, 'Softmax2d requires a 4D tensor as input' - return F.softmax(input, 1, _stacklevel=5) + assert input.dim() == 4 or input.dim() == 3, 'Softmax2d requires a 3D or 4D tensor as input' + return F.softmax(input, -3, _stacklevel=5) class LogSoftmax(Module): diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 73233df8cc5bb..b22b6ab1d2ec5 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -3659,6 +3659,28 @@ def single_batch_reference_fn(input, parameters, module): fullname='log_softmax_scalar', pickle=False, ), + dict( + module_name='Softmax2d', + input_size=(3, 4, 5), + reference_fn=single_batch_reference_fn, + desc='no_batch_dim', + ), + dict( + module_name='Softmax', + constructor_args=(-1,), + cpp_constructor_args='torch::nn::SoftmaxOptions(-1)', + input_size=(4, 5), + reference_fn=single_batch_reference_fn, + desc='no_batch_dim', + ), + dict( + module_name='LogSoftmax', + constructor_args=(-1,), + cpp_constructor_args='torch::nn::LogSoftmaxOptions(1)', + input_size=(4, 5), + reference_fn=single_batch_reference_fn, + desc='no_batch_dim', + ), dict( @@ -3819,6 +3841,14 @@ def single_batch_reference_fn(input, parameters, module): input_size=(), desc='scalar', ), + dict( + module_name='Softmin', + constructor_args=(-1,), + cpp_constructor_args='torch::nn::SoftminOptions(-1)', + input_size=(3, 4, 10), + reference_fn=single_batch_reference_fn, + desc='no_batch_dim', + ), dict( module_name='Tanhshrink', input_size=(), @@ -3985,6 +4015,22 @@ def single_batch_reference_fn(input, parameters, module): with_tf32=True, tf32_precision=0.005, ), + dict( + module_name='Flatten', + cpp_constructor_args='torch::nn::FlattenOptions().start_dim(-3).end_dim(-1)', + constructor_args=(-3, -1), + input_size=(3, 4, 5), + reference_fn=single_batch_reference_fn, + desc="no_batch_dim", + ), + dict( + module_name='Unflatten', + cpp_constructor_args='torch::nn::UnflattenOptions(-2, {2, 2})', + constructor_args=(-2, torch.Size([2, 2])), + input_size=(3, 4, 5), + reference_fn=single_batch_reference_fn, + desc="no_batch_dim", + ), ] # add conv padding mode tests: @@ -4027,7 +4073,7 @@ def single_batch_reference_fn(input, parameters, module): # Check that non linear activations work with no batch dimensions non_linear_activations_no_batch = [ 'ELU', 'Hardshrink', 'Hardsigmoid', 'Hardtanh', 'Hardswish', 'LeakyReLU', - 'LogSigmoid', 'PReLU', 'ReLU', 'ReLU6', 'RReLU', 'SELU', 'CELU', 'GELU', + 'LogSigmoid', 'PReLU', 'ReLU', 'ReLU6', 'RReLU', 'SELU', 'CELU', 'GELU', 'GLU', 'Sigmoid', 'SiLU', 'Mish', 'Softplus', 'Softshrink', 'Softsign', 'Tanh', 'Tanhshrink', 'Threshold' ] @@ -4043,7 +4089,7 @@ def single_batch_reference_fn(input, parameters, module): for non_linear_activation in non_linear_activations_no_batch: activation_test_info = dict( module_name=non_linear_activation, - input_size=(3,), + input_size=(4,), reference_fn=single_batch_reference_fn, desc='no_batch_dim', test_cpp_api_parity=False, From 3cd0a4ac153ea8404f0e960ddacf00098689e600 Mon Sep 17 00:00:00 2001 From: Erjia Guan Date: Thu, 2 Sep 2021 12:25:15 -0700 Subject: [PATCH 483/530] Fix test_ind_worker_queue by setting max_num_worker based on system resource (#63779) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63779 Fixes #63657 Test Plan: Imported from OSS Reviewed By: gchanan Differential Revision: D30494185 Pulled By: ejguan fbshipit-source-id: d1bd24299b25d589889604aaf18ad347bdff4df4 --- test/test_dataloader.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index c768246ff477c..5050feca3a373 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -2320,9 +2320,24 @@ def _run_ind_worker_queue_test(self, batch_size, num_workers): current_worker_idx = 0 def test_ind_worker_queue(self): + max_num_workers = None + if hasattr(os, 'sched_getaffinity'): + try: + max_num_workers = len(os.sched_getaffinity(0)) + except Exception: + pass + if max_num_workers is None: + cpu_count = os.cpu_count() + if cpu_count is not None: + # Use half number of CPUs + max_num_workers = cpu_count // 2 + + if max_num_workers is None: + max_num_workers = 1 + for batch_size in (8, 16, 32, 64): - for num_workers in range(1, 6): - self._run_ind_worker_queue_test(batch_size=batch_size, num_workers=num_workers) + for num_workers in range(0, min(6, max_num_workers)): + self._run_ind_worker_queue_test(batch_size=batch_size, num_workers=num_workers + 1) class SetAffinityDataset(IterableDataset): From 4f434801866a60823124f3c2bd672d2005220c9c Mon Sep 17 00:00:00 2001 From: Kevin Tse Date: Thu, 2 Sep 2021 13:06:18 -0700 Subject: [PATCH 484/530] [DataPipe] adding/removing __len__ for different DataPipe (#64398) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64398 cc VitalyFedyunin ejguan Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D30710437 Pulled By: NivekT fbshipit-source-id: 524eda43a2faa0db0c1a662bf9bb4283f0ade83c --- test/test_datapipe.py | 21 +++++++++++++++++++ torch/utils/data/datapipes/iter/grouping.py | 6 ++++++ torch/utils/data/datapipes/iter/httpreader.py | 7 ++++++- torch/utils/data/datapipes/iter/selecting.py | 1 - 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 24d0ce20d63dd..f09583b722379 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -1626,6 +1626,27 @@ def test_simple_sharding(self): self.assertEqual(sorted(all_items), sorted(items)) + def test_sharding_length(self): + numbers_dp = IDP(range(13)) + sharded_dp0 = numbers_dp.sharding_filter() + torch.utils.data.sharding.apply_sharding(sharded_dp0, 3, 0) + sharded_dp1 = numbers_dp.sharding_filter() + torch.utils.data.sharding.apply_sharding(sharded_dp1, 3, 1) + sharded_dp2 = numbers_dp.sharding_filter() + torch.utils.data.sharding.apply_sharding(sharded_dp2, 3, 2) + self.assertEqual(13, len(numbers_dp)) + self.assertEqual(5, len(sharded_dp0)) + self.assertEqual(4, len(sharded_dp1)) + self.assertEqual(4, len(sharded_dp2)) + + numbers_dp = IDP(range(1)) + sharded_dp0 = numbers_dp.sharding_filter() + torch.utils.data.sharding.apply_sharding(sharded_dp0, 2, 0) + sharded_dp1 = numbers_dp.sharding_filter() + torch.utils.data.sharding.apply_sharding(sharded_dp1, 2, 1) + self.assertEqual(1, len(sharded_dp0)) + self.assertEqual(0, len(sharded_dp1)) + @skipIfNoDill def test_old_dataloader(self): dp = self._get_pipeline() diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py index aece256d10650..d90ad08814ecf 100644 --- a/torch/utils/data/datapipes/iter/grouping.py +++ b/torch/utils/data/datapipes/iter/grouping.py @@ -28,6 +28,12 @@ def __iter__(self): if i % self.num_of_instances == self.instance_id: yield item + def __len__(self): + if isinstance(self.source_datapipe, Sized): + return len(self.source_datapipe) // self.num_of_instances +\ + (1 if (self.instance_id < len(self.source_datapipe) % self.num_of_instances) else 0) + raise TypeError("{} instance doesn't have valid length".format(type(self).__name__)) + @functional_datapipe('batch') class BatcherIterDataPipe(IterDataPipe[DataChunk]): diff --git a/torch/utils/data/datapipes/iter/httpreader.py b/torch/utils/data/datapipes/iter/httpreader.py index 747b5d567e4cd..0c8e2fc818e9f 100644 --- a/torch/utils/data/datapipes/iter/httpreader.py +++ b/torch/utils/data/datapipes/iter/httpreader.py @@ -1,5 +1,5 @@ from io import IOBase -from typing import Tuple +from typing import Sized, Tuple from urllib.error import HTTPError, URLError import urllib.request as urllib from torch.utils.data import IterDataPipe @@ -39,3 +39,8 @@ def __iter__(self): .format(reason=e.reason, url=furl)) except Exception: raise + + def __len__(self) -> int: + if isinstance(self.datapipe, Sized): + return len(self.datapipe) + raise TypeError("{} instance doesn't have valid length".format(type(self).__name__)) diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py index a89bfdfb39e5c..4e8703c8d3973 100644 --- a/torch/utils/data/datapipes/iter/selecting.py +++ b/torch/utils/data/datapipes/iter/selecting.py @@ -77,6 +77,5 @@ def _isNonEmpty(self, data): not (isinstance(data, list) and len(data) == 0 and self.drop_empty_batches) return r - def __len__(self): raise TypeError("{} instance doesn't have valid length".format(type(self).__name__)) From 4ce9c530d681fd4c860cf78f4497a17df5bdc018 Mon Sep 17 00:00:00 2001 From: Kevin Tse Date: Thu, 2 Sep 2021 13:06:18 -0700 Subject: [PATCH 485/530] [DataPipe] removing filter's inheritance from map (#64404) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64404 This PR remove `filter`'s inheritance from `map`. This allows `filter` to not have a `__len__` function and that behavior is what we would like. cc VitalyFedyunin ejguan Test Plan: Imported from OSS Reviewed By: gchanan Differential Revision: D30713120 Pulled By: NivekT fbshipit-source-id: 4d5d07555297ee2bd4b49842c0d26cdc00638f6c --- test/test_datapipe.py | 2 +- torch/utils/data/datapipes/iter/selecting.py | 54 ++++++++++++++++---- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index f09583b722379..15cb05986b518 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -1033,7 +1033,7 @@ def _filter_fn(data, val, clip=False): for data, exp in zip(filter_dp, range(5, 10)): self.assertEqual(data, exp) - with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"): + with self.assertRaisesRegex(TypeError, r"has no len"): len(filter_dp) def _non_bool_fn(data): diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py index 4e8703c8d3973..f1889e5d7a8e4 100644 --- a/torch/utils/data/datapipes/iter/selecting.py +++ b/torch/utils/data/datapipes/iter/selecting.py @@ -1,13 +1,24 @@ +import warnings from torch.utils.data import IterDataPipe, functional_datapipe, DataChunk from typing import Callable, TypeVar, Iterator, Optional, Tuple, Dict -from .callable import MapperIterDataPipe - T_co = TypeVar('T_co', covariant=True) +try: + import dill + + # XXX: By default, dill writes the Pickler dispatch table to inject its + # own logic there. This globally affects the behavior of the standard library + # pickler for any user who transitively depends on this module! + # Undo this extension to avoid altering the behavior of the pickler globally. + dill.extend(use_dill=False) + DILL_AVAILABLE = True +except ImportError: + DILL_AVAILABLE = False + @functional_datapipe('filter') -class FilterIterDataPipe(MapperIterDataPipe): +class FilterIterDataPipe(IterDataPipe[T_co]): r""" :class:`FilterIterDataPipe`. Iterable DataPipe to filter elements from datapipe according to filter_fn. @@ -22,18 +33,31 @@ class FilterIterDataPipe(MapperIterDataPipe): This also accepts -1 as input to apply filtering to the lowest nesting level. It currently doesn't support argument < -1. """ + datapipe: IterDataPipe + filter_fn: Callable drop_empty_batches: bool def __init__(self, - datapipe: IterDataPipe[T_co], - filter_fn: Callable[..., bool], + datapipe: IterDataPipe, + filter_fn: Callable, fn_args: Optional[Tuple] = None, fn_kwargs: Optional[Dict] = None, drop_empty_batches: bool = True, nesting_level: int = 0, ) -> None: + super().__init__() + self.datapipe = datapipe + # Partial object has no attribute '__name__', but can be pickled + if hasattr(filter_fn, '__name__') and filter_fn.__name__ == '' and not DILL_AVAILABLE: + warnings.warn("Lambda function is not supported for pickle, please use " + "regular python function or functools.partial instead.") + self.filter_fn = filter_fn # type: ignore[assignment] + self.args = () if fn_args is None else fn_args + self.kwargs = {} if fn_kwargs is None else fn_kwargs + if nesting_level < -1: + raise ValueError("nesting_level must be -1 or >= 0") + self.nesting_level = nesting_level self.drop_empty_batches = drop_empty_batches - super().__init__(datapipe, fn=filter_fn, fn_args=fn_args, fn_kwargs=fn_kwargs, nesting_level=nesting_level) def __iter__(self) -> Iterator[T_co]: res: bool @@ -66,7 +90,7 @@ def _applyFilter(self, data, nesting_level): return self._returnIfTrue(data) def _returnIfTrue(self, data): - condition = self.fn(data, *self.args, **self.kwargs) + condition = self.filter_fn(data, *self.args, **self.kwargs) if not isinstance(condition, bool): raise ValueError("Boolean output is required for `filter_fn` of FilterIterDataPipe") if condition: @@ -77,5 +101,17 @@ def _isNonEmpty(self, data): not (isinstance(data, list) and len(data) == 0 and self.drop_empty_batches) return r - def __len__(self): - raise TypeError("{} instance doesn't have valid length".format(type(self).__name__)) + def __getstate__(self): + if DILL_AVAILABLE: + dill_function = dill.dumps(self.filter_fn) + else: + dill_function = self.filter_fn + state = (self.datapipe, dill_function, self.args, self.kwargs, self.drop_empty_batches, self.nesting_level) + return state + + def __setstate__(self, state): + (self.datapipe, dill_function, self.args, self.kwargs, self.drop_empty_batches, self.nesting_level) = state + if DILL_AVAILABLE: + self.filter_fn = dill.loads(dill_function) # type: ignore[assignment] + else: + self.filter_fn = dill_function # type: ignore[assignment] From f555348aaa7abdbcaaa7bfdb06b33c2edf93b172 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 2 Sep 2021 13:30:51 -0700 Subject: [PATCH 486/530] Disable CircleCI ROCm build (#64434) Summary: Per jithunnair-amd suggestion Pull Request resolved: https://github.com/pytorch/pytorch/pull/64434 Reviewed By: seemethere, janeyx99 Differential Revision: D30732289 Pulled By: malfet fbshipit-source-id: 1932d0a7d1e648006f8030c8237b187d0709f688 --- .circleci/cimodel/data/pytorch_build_data.py | 15 ++++++++------- .circleci/config.yml | 8 -------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index df0cfa0027554..7d43a73f622a0 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -63,13 +63,14 @@ ]), ]), ]), - ("rocm", [ - ("3.9", [ - ("3.6", [ - ('build_only', [XImportant(True)]), - ]), - ]), - ]), + # @jithunnair-amd believes Jenkins builds are sufficient + # ("rocm", [ + # ("3.9", [ + # ("3.6", [ + # ('build_only', [XImportant(True)]), + # ]), + # ]), + # ]), ]), ] diff --git a/.circleci/config.yml b/.circleci/config.yml index 9989f1a289b7d..3e175764cad1e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7446,14 +7446,6 @@ workflows: build_environment: "pytorch-vulkan-linux-bionic-py3.6-clang9-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9" resource_class: large - - pytorch_linux_build: - name: pytorch_linux_bionic_rocm3_9_py3_6_build - requires: - - "docker-pytorch-linux-bionic-rocm3.9-py3.6" - build_environment: "pytorch-linux-bionic-rocm3.9-py3.6-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm3.9-py3.6" - resource_class: xlarge - build_only: "1" - pytorch_macos_10_15_py3_build: name: pytorch_macos_10_15_py3_build - pytorch_macos_10_13_py3_build: From 257623da39c9ecc63025e90a418852ef3200b57f Mon Sep 17 00:00:00 2001 From: Erjia Guan Date: Thu, 2 Sep 2021 13:35:05 -0700 Subject: [PATCH 487/530] Switch Shuffler to use iter-local buffer (#64195) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64195 Test Plan: Imported from OSS Reviewed By: H-Huang Differential Revision: D30642947 Pulled By: ejguan fbshipit-source-id: d4b52479b4ae37ad693388b9cdb8eed83a136474 --- .../data/datapipes/iter/combinatorics.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/torch/utils/data/datapipes/iter/combinatorics.py b/torch/utils/data/datapipes/iter/combinatorics.py index 4d6fac749729d..5e17a3ef56c33 100644 --- a/torch/utils/data/datapipes/iter/combinatorics.py +++ b/torch/utils/data/datapipes/iter/combinatorics.py @@ -1,7 +1,7 @@ import random from torch.utils.data import IterDataPipe, Sampler, SequentialSampler, functional_datapipe -from typing import TypeVar, Type, Iterator, Sized, Optional, Tuple, Dict, List +from typing import Dict, Iterator, List, Optional, Sized, Tuple, Type, TypeVar T_co = TypeVar('T_co', covariant=True) @@ -72,7 +72,6 @@ class ShufflerIterDataPipe(IterDataPipe[T_co]): """ datapipe: IterDataPipe[T_co] buffer_size: int - _buffer: List[T_co] def __init__(self, datapipe: IterDataPipe[T_co], @@ -87,24 +86,24 @@ def __init__(self, else: self.datapipe = datapipe.unbatch(unbatch_level=unbatch_level) self.buffer_size = buffer_size - self._buffer = [] - def buffer_replace(self, x): - idx = random.randint(0, self.buffer_size - 1) - val = self._buffer[idx] - self._buffer[idx] = x + @staticmethod + def buffer_replace(buffer, x): + idx = random.randint(0, len(buffer) - 1) + val = buffer[idx] + buffer[idx] = x return val def __iter__(self) -> Iterator[T_co]: - # TODO: Buffer is global, should be per __iter__ !!! + buffer: List[T_co] = [] for x in self.datapipe: - if len(self._buffer) == self.buffer_size: - yield self.buffer_replace(x) + if len(buffer) == self.buffer_size: + yield ShufflerIterDataPipe.buffer_replace(buffer, x) else: - self._buffer.append(x) - random.shuffle(self._buffer) - while self._buffer: - yield self._buffer.pop() + buffer.append(x) + random.shuffle(buffer) + while buffer: + yield buffer.pop() def __len__(self) -> int: if isinstance(self.datapipe, Sized): From 3c79e0b314c56c01e119b22e834922923a63ad9e Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Thu, 2 Sep 2021 14:49:47 -0700 Subject: [PATCH 488/530] .github: Migrate pytorch_linux_bionic_py_3_6_clang9 to GHA (#64218) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64218 Relies on https://github.com/fairinternal/pytorch-gha-infra/pull/11 Signed-off-by: Eli Uriegas cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra bdhirsh Test Plan: Imported from OSS Reviewed By: malfet, H-Huang, janeyx99 Differential Revision: D30651516 Pulled By: seemethere fbshipit-source-id: e5843dfe84f096f2872d88f2e53e9408ad2fe399 --- .circleci/cimodel/data/pytorch_build_data.py | 5 - .circleci/config.yml | 13 - .github/generated-ciflow-ruleset.json | 10 + .github/scripts/generate_ci_workflows.py | 36 +- .../scripts/generate_pytorch_test_matrix.py | 4 + .github/templates/linux_ci_workflow.yml.j2 | 5 + ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml | 2 + ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml | 2 + ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 5 + .../generated-linux-bionic-py3.6-clang9.yml | 506 ++++++++++++++++++ ...rated-linux-bionic-py3.8-gcc9-coverage.yml | 5 + ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 5 + ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 5 + .../generated-linux-xenial-py3.6-gcc5.4.yml | 5 + ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml | 1 + ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml | 2 + ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 5 + 17 files changed, 580 insertions(+), 36 deletions(-) create mode 100644 .github/workflows/generated-linux-bionic-py3.6-clang9.yml diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index 7d43a73f622a0..dbe17bf4f15f5 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -51,11 +51,6 @@ ]), ("bionic", [ ("clang", [ - ("9", [ - ("3.6", [ - ("noarch", [XImportant(True)]), - ]), - ]), ("9", [ ("3.6", [ ("xla", [XImportant(True)]), diff --git a/.circleci/config.yml b/.circleci/config.yml index 3e175764cad1e..ffc67a14ec5ad 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7407,19 +7407,6 @@ workflows: build_environment: "pytorch-linux-pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_distributed-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7" resource_class: large - - pytorch_linux_build: - name: pytorch_linux_bionic_py3_6_clang9_noarch_build - requires: - - "docker-pytorch-linux-bionic-py3.6-clang9" - build_environment: "pytorch-linux-bionic-py3.6-clang9-noarch-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9" - - pytorch_linux_test: - name: pytorch_linux_bionic_py3_6_clang9_noarch_test - requires: - - pytorch_linux_bionic_py3_6_clang9_noarch_build - build_environment: "pytorch-linux-bionic-py3.6-clang9-noarch-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9" - resource_class: large - pytorch_linux_build: name: pytorch_xla_linux_bionic_py3_6_clang9_build requires: diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index d13561190d01f..d3ebad35a5303 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -5,6 +5,7 @@ "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", "libtorch-linux-xenial-cuda11.3-py3.6-gcc7", "linux-bionic-cuda10.2-py3.9-gcc7", + "linux-bionic-py3.6-clang9", "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-cuda10.2-py3.6-gcc7", "linux-xenial-cuda11.3-py3.6-gcc7", @@ -24,6 +25,7 @@ "linux-bionic-py3.8-gcc9-coverage" ], "ciflow/cpu": [ + "linux-bionic-py3.6-clang9", "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", @@ -42,6 +44,7 @@ "win-vs2019-cuda11.3-py3" ], "ciflow/default": [ + "linux-bionic-py3.6-clang9", "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-cuda11.3-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", @@ -58,6 +61,7 @@ "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", "libtorch-linux-xenial-cuda11.3-py3.6-gcc7", "linux-bionic-cuda10.2-py3.9-gcc7", + "linux-bionic-py3.6-clang9", "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-cuda10.2-py3.6-gcc7", "linux-xenial-cuda11.3-py3.6-gcc7", @@ -66,6 +70,9 @@ "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", "periodic-linux-xenial-cuda11.1-py3.6-gcc7" ], + "ciflow/noarch": [ + "linux-bionic-py3.6-clang9" + ], "ciflow/scheduled": [ "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", "periodic-linux-xenial-cuda11.1-py3.6-gcc7", @@ -80,6 +87,9 @@ "win-vs2019-cpu-py3", "win-vs2019-cuda10.1-py3", "win-vs2019-cuda11.3-py3" + ], + "ciflow/xla": [ + "linux-bionic-py3.6-clang9" ] }, "version": "v1" diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 467d13d0dc45d..16100f72a527c 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -49,6 +49,8 @@ LABEL_CIFLOW_SCHEDULED = "ciflow/scheduled" LABEL_CIFLOW_SLOW = "ciflow/slow" LABEL_CIFLOW_WIN = "ciflow/win" +LABEL_CIFLOW_XLA = "ciflow/xla" +LABEL_CIFLOW_NOARCH = "ciflow/noarch" @dataclass @@ -150,6 +152,8 @@ class CIWorkflow: enable_slow_test: YamlShellBool = "''" enable_docs_test: YamlShellBool = "''" enable_backwards_compat_test: YamlShellBool = "''" + enable_xla_test: YamlShellBool = "''" + enable_noarch_test: YamlShellBool = "''" def __post_init__(self) -> None: if self.is_libtorch: @@ -409,24 +413,6 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_CUDA}, ), ), - # CIWorkflow( - # arch="linux", - # build_environment="linux-bionic-py3.6-clang9-noarch", - # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9", - # test_runner_type=LINUX_CPU_TEST_RUNNER, - # ), - # CIWorkflow( - # arch="linux", - # build_environment="xla-linux-bionic-py3.6-clang9", - # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9", - # test_runner_type=LINUX_CPU_TEST_RUNNER, - # ), - # CIWorkflow( - # arch="linux", - # build_environment="vulkan-linux-bionic-py3.6-clang9", - # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9", - # test_runner_type=LINUX_CPU_TEST_RUNNER, - # ), CIWorkflow( arch="linux", build_environment="linux-bionic-py3.8-gcc9-coverage", @@ -440,6 +426,20 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_COVERAGE, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, ), ), + CIWorkflow( + arch="linux", + build_environment="linux-bionic-py3.6-clang9", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9", + test_runner_type=LINUX_CPU_TEST_RUNNER, + on_pull_request=True, + num_test_shards=2, + distributed_test=False, + enable_noarch_test=1, + ciflow_config=CIFlowConfig( + enabled=True, + labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_XLA, LABEL_CIFLOW_NOARCH}, + ), + ), # CIWorkflow( # arch="linux", # build_environment="linux-bionic-rocm3.9-py3.6", diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py index cb71f588ece5e..beb1b9d90e62f 100755 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ b/.github/scripts/generate_pytorch_test_matrix.py @@ -59,6 +59,10 @@ def main() -> None: configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} if os.getenv('ENABLE_BACKWARDS_COMPAT_TEST'): configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} + if os.getenv('ENABLE_XLA_TEST'): + configs['xla'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} + if os.getenv('ENABLE_NOARCH_TEST'): + configs['noarch'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} matrix = { 'include': [ { diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 8aa854782de52..314122b699c86 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -35,6 +35,7 @@ env: BUILD_ENVIRONMENT: !{{ build_environment }} DOCKER_IMAGE_BASE: !{{ docker_image_base }} SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 # This is used for the phase of adding wheel tests only, will be removed once completed @@ -137,6 +138,7 @@ jobs: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ @@ -214,6 +216,8 @@ jobs: ENABLE_SLOW_TEST: !{{ enable_slow_test }} ENABLE_DOCS_TEST: !{{ enable_docs_test }} ENABLE_BACKWARDS_COMPAT_TEST: !{{ enable_backwards_compat_test }} + ENABLE_XLA_TEST: !{{ enable_xla_test }} + ENABLE_NOARCH_TEST: !{{ enable_noarch_test }} NUM_TEST_SHARDS: !{{ num_test_shards }} MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -313,6 +317,7 @@ jobs: -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml index 396284cf72f84..d2111896319df 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -16,6 +16,7 @@ env: BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda10.2-py3.6-gcc7 DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 # This is used for the phase of adding wheel tests only, will be removed once completed @@ -192,6 +193,7 @@ jobs: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index cb0a98591956b..51811ae9eca58 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -16,6 +16,7 @@ env: BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda11.3-py3.6-gcc7 DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 # This is used for the phase of adding wheel tests only, will be removed once completed @@ -192,6 +193,7 @@ jobs: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index ddb38b7c84d5a..f410f1fc0be20 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -16,6 +16,7 @@ env: BUILD_ENVIRONMENT: linux-bionic-cuda10.2-py3.9-gcc7 DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 # This is used for the phase of adding wheel tests only, will be removed once completed @@ -192,6 +193,7 @@ jobs: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ @@ -282,6 +284,8 @@ jobs: ENABLE_SLOW_TEST: '' ENABLE_DOCS_TEST: '' ENABLE_BACKWARDS_COMPAT_TEST: '' + ENABLE_XLA_TEST: '' + ENABLE_NOARCH_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -420,6 +424,7 @@ jobs: -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml new file mode 100644 index 0000000000000..e54555d12cf62 --- /dev/null +++ b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml @@ -0,0 +1,506 @@ +# @generated DO NOT EDIT MANUALLY +# Template is at: .github/templates/linux_ci_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-bionic-py3.6-clang9 + +on: + pull_request: + types: [opened, synchronize, reopened, unassigned] + push: + branches: + - master + - release/* + workflow_dispatch: + +env: + BUILD_ENVIRONMENT: linux-bionic-py3.6-clang9 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9 + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + TORCH_CUDA_ARCH_LIST: 5.2 + IN_CI: 1 + # This is used for the phase of adding wheel tests only, will be removed once completed + IN_WHEEL_TEST: 1 + # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + +concurrency: + group: linux-bionic-py3.6-clang9-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + +jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla')) }} + steps: + - name: noop + run: echo running ciflow_should_run + calculate-docker-image: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.2xlarge + needs: [ciflow_should_run] + env: + DOCKER_BUILDKIT: 1 + timeout-minutes: 90 + outputs: + docker_image: ${{ steps.calculate-tag.outputs.docker_image }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: false + - name: Calculate docker image tag + id: calculate-tag + run: | + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "::set-output name=docker_tag::${DOCKER_TAG}" + echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" + - name: Check if image should be built + id: check + env: + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} + BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} + run: | + set -x + # Check if image already exists, if it does then skip building it + if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then + exit 0 + fi + if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then + # if we're on the base branch then use the parent commit + MERGE_BASE=$(git rev-parse HEAD~) + else + # otherwise we're on a PR, so use the most recent base commit + MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") + fi + # Covers the case where a previous tag doesn't exist for the tree + # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly + if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then + echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" + exit 1 + fi + PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") + # If no image exists but the hash is the same as the previous hash then we should error out here + if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then + echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" + echo " contact the PyTorch team to restore the original images" + exit 1 + fi + echo ::set-output name=rebuild::yes + - name: Build and push docker image + if: ${{ steps.check.outputs.rebuild }} + env: + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} + DOCKER_SKIP_S3_UPLOAD: 1 + run: | + export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} + cd .circleci/docker && ./build_docker.sh + + build: + runs-on: linux.2xlarge + needs: [calculate-docker-image, ciflow_should_run] + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-py3.6-clang9-build + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch + run: | + docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} + CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} + CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} + CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + run: | + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Archive artifacts into zip + run: | + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json + - uses: seemethere/upload-artifact-s3@v3 + name: Store PyTorch Build Artifacts on S3 + with: + name: ${{ env.BUILD_ENVIRONMENT }} + retention-days: 14 + if-no-files-found: error + path: + artifacts.zip + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Prune all of the docker images + docker system prune -af + + generate-test-matrix: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: ubuntu-18.04 + needs: [ciflow_should_run] + env: + TEST_RUNNER_TYPE: linux.2xlarge + ENABLE_DISTRIBUTED_TEST: '' + ENABLE_JIT_LEGACY_TEST: '' + ENABLE_MULTIGPU_TEST: '' + ENABLE_NOGPU_NO_AVX_TEST: '' + ENABLE_NOGPU_NO_AVX2_TEST: '' + ENABLE_SLOW_TEST: '' + ENABLE_DOCS_TEST: '' + ENABLE_BACKWARDS_COMPAT_TEST: '' + ENABLE_XLA_TEST: '' + ENABLE_NOARCH_TEST: 1 + NUM_TEST_SHARDS: 2 + MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu + NOGPU_RUNNER_TYPE: linux.2xlarge + PR_BODY: ${{ github.event.pull_request.body }} + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} + ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} + container: + image: python:3.9 + steps: + - name: Install dependencies + run: pip install typing-extensions + - name: Clone pytorch/pytorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + - name: Generating test matrix + id: set-matrix + run: .github/scripts/generate_pytorch_test_matrix.py + + test: + needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run] + strategy: + matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-py3.6-clang9-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Test PyTorch + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + run: | + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + if [[ $NUM_TEST_SHARDS -ne 2 ]]; then + export SHARD_NUMBER=0 + fi + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086 + docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_LABELS \ + -e CONTINUE_THROUGH_ERROR \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Zip test reports for upload + if: always() + env: + COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml' + - uses: actions/upload-artifact@v2 + name: Store PyTorch Test Reports + if: always() + with: + name: test-reports-${{ matrix.config }} + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - uses: seemethere/upload-artifact-s3@v3 + name: Store PyTorch Test Reports on S3 + if: always() + with: + name: test-reports-${{ matrix.config }} + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-py3.6-clang9-test + CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} + CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} + CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index c4cdd2cd636d5..2bcb4261816de 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -16,6 +16,7 @@ env: BUILD_ENVIRONMENT: linux-bionic-py3.8-gcc9-coverage DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.8-gcc9 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 # This is used for the phase of adding wheel tests only, will be removed once completed @@ -192,6 +193,7 @@ jobs: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ @@ -282,6 +284,8 @@ jobs: ENABLE_SLOW_TEST: '' ENABLE_DOCS_TEST: '' ENABLE_BACKWARDS_COMPAT_TEST: '' + ENABLE_XLA_TEST: '' + ENABLE_NOARCH_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -420,6 +424,7 @@ jobs: -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index 2d0d916237676..656b56548b26b 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -16,6 +16,7 @@ env: BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7 DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 # This is used for the phase of adding wheel tests only, will be removed once completed @@ -192,6 +193,7 @@ jobs: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ @@ -282,6 +284,8 @@ jobs: ENABLE_SLOW_TEST: 1 ENABLE_DOCS_TEST: '' ENABLE_BACKWARDS_COMPAT_TEST: '' + ENABLE_XLA_TEST: '' + ENABLE_NOARCH_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -420,6 +424,7 @@ jobs: -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 0e1c7ba6ca6c5..68b42858c98e3 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -16,6 +16,7 @@ env: BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7 DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 # This is used for the phase of adding wheel tests only, will be removed once completed @@ -192,6 +193,7 @@ jobs: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ @@ -282,6 +284,8 @@ jobs: ENABLE_SLOW_TEST: '' ENABLE_DOCS_TEST: '' ENABLE_BACKWARDS_COMPAT_TEST: '' + ENABLE_XLA_TEST: '' + ENABLE_NOARCH_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -420,6 +424,7 @@ jobs: -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index 6f4e5c2958904..b347b52be9509 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -16,6 +16,7 @@ env: BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc5.4 DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 # This is used for the phase of adding wheel tests only, will be removed once completed @@ -192,6 +193,7 @@ jobs: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ @@ -282,6 +284,8 @@ jobs: ENABLE_SLOW_TEST: '' ENABLE_DOCS_TEST: 1 ENABLE_BACKWARDS_COMPAT_TEST: 1 + ENABLE_XLA_TEST: '' + ENABLE_NOARCH_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -420,6 +424,7 @@ jobs: -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index 2d72fab281f6e..e86cc563a77f8 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -16,6 +16,7 @@ env: BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc7-bazel-test DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 # This is used for the phase of adding wheel tests only, will be removed once completed diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml index b009d77b3a8d8..b591519e3219c 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -14,6 +14,7 @@ env: BUILD_ENVIRONMENT: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7 DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 # This is used for the phase of adding wheel tests only, will be removed once completed @@ -190,6 +191,7 @@ jobs: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 738e6bb146cdd..07593ad07a941 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -14,6 +14,7 @@ env: BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.6-gcc7 DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 IN_CI: 1 # This is used for the phase of adding wheel tests only, will be removed once completed @@ -190,6 +191,7 @@ jobs: -e JOB_BASE_NAME \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e TORCH_CUDA_ARCH_LIST \ @@ -280,6 +282,8 @@ jobs: ENABLE_SLOW_TEST: '' ENABLE_DOCS_TEST: '' ENABLE_BACKWARDS_COMPAT_TEST: '' + ENABLE_XLA_TEST: '' + ENABLE_NOARCH_TEST: '' NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge @@ -418,6 +422,7 @@ jobs: -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ From 795387477fe90e03cb598f3077a32222896e65dd Mon Sep 17 00:00:00 2001 From: James Reed Date: Thu, 2 Sep 2021 15:15:24 -0700 Subject: [PATCH 489/530] [FX] Prototype for guarding against mutable operations in tracing (#64295) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64295 Test Plan: Imported from OSS Reviewed By: zou3519 Differential Revision: D30675780 Pulled By: jamesr66a fbshipit-source-id: b2116b51dcc87357f0c84192c4c336680875e27a --- ..._compat-fx_backcompat_class_members.expect | 2 +- test/test_fx.py | 67 +++++++++++++++++-- torch/csrc/jit/python/init.cpp | 14 ++-- torch/fx/operator_schemas.py | 44 ++++++++++-- torch/fx/proxy.py | 7 ++ 5 files changed, 120 insertions(+), 14 deletions(-) diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect index 88e4654b568df..5c3630a3169f7 100644 --- a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect +++ b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect @@ -15,5 +15,5 @@ torch.fx.proxy.Attribute ['node'] torch.fx.proxy.GraphAppendingTracer [] torch.fx.proxy.Proxy ['keys'] torch.fx.proxy.TraceError [] -torch.fx.proxy.TracerBase ['create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool'] +torch.fx.proxy.TracerBase ['check_mutable_operations', 'create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool'] torch.fx.subgraph_rewriter.Match ['anchor', 'nodes_map'] \ No newline at end of file diff --git a/test/test_fx.py b/test/test_fx.py index 5220f67ebf309..57a2960a409c3 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -132,10 +132,17 @@ def __init__(self, a, b): class TestFX(JitTestCase): def setUp(self): - if TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS: - return - lib_file_path = find_library_location('libtorchbind_test.so') - torch.ops.load_library(str(lib_file_path)) + # Checking for mutable operations whil tracing is feature flagged + # Enable it in testing but not by default + self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations + torch.fx.proxy.TracerBase.check_mutable_operations = True + + if not (TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS): + lib_file_path = find_library_location('libtorchbind_test.so') + torch.ops.load_library(str(lib_file_path)) + + def tearDown(self): + torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None): """Check that an nn.Module's results match the GraphModule version @@ -2367,6 +2374,19 @@ def forward(self, x: torch.Tensor, y: int, z: int): traced.graph.lint() + def test_throw_out_variant(self): + def foo(x): + y = torch.rand_like(x) + torch.sigmoid(x, out=y) + return y + + class MyTracer(torch.fx.Tracer): + check_mutable_operations = True + + tracer = MyTracer() + with self.assertRaisesRegex(RuntimeError, 'mutable operation aten::sigmoid.out'): + traced_graph = tracer.trace(foo) + def test_ast_rewriter_reassigns_submodules(self): class M(torch.nn.Module): def __init__(self): @@ -3021,6 +3041,15 @@ def run_getitem_target(): class TestOperatorSignatures(JitTestCase): + def setUp(self): + # Checking for mutable operations whil tracing is feature flagged + # Enable it in testing but not by default + self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations + torch.fx.proxy.TracerBase.check_mutable_operations = True + + def tearDown(self): + torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag + @onlyCPU @ops(op_db, allowed_dtypes=(torch.float,)) def test_get_torch_func_signature_exhaustive(self, device, dtype, op): @@ -3090,6 +3119,15 @@ class TestFXAPIBackwardCompatibility(JitTestCase): def setUp(self): self.maxDiff = None + # Checking for mutable operations whil tracing is feature flagged + # Enable it in testing but not by default + self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations + torch.fx.proxy.TracerBase.check_mutable_operations = True + + def tearDown(self): + torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag + + def _fn_to_stable_annotation_str(self, obj): """ Unfortunately we have to serialize function signatures manually since @@ -3326,6 +3364,15 @@ def check_symbols_have_bc_designation(m, prefix): f"BC guarantees.") class TestFunctionalTracing(JitTestCase): + def setUp(self): + # Checking for mutable operations whil tracing is feature flagged + # Enable it in testing but not by default + self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations + torch.fx.proxy.TracerBase.check_mutable_operations = True + + def tearDown(self): + torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag + IGNORE_FUNCS = ("has_torch_function", "has_torch_function_unary", "has_torch_function_variadic", "handle_torch_function", "boolean_dispatch") @@ -3340,6 +3387,7 @@ class TestFunctionalTracing(JitTestCase): ARG_TYPE_MISMATCH = (TypeError, r", not Proxy$") CONTROL_FLOW = (TraceError, r"symbolically traced variables cannot be used as inputs to control flow") INTERPOLATE_ARGS_CONFLICT = (ValueError, r"only one of size or scale_factor should be defined") + MUTABLE = (RuntimeError, r"Tried to trace mutable operation") UNTRACEABLE_FUNCTIONALS = { "adaptive_avg_pool1d": BUILT_IN_FUNC, @@ -3459,6 +3507,8 @@ class TestFunctionalTracing(JitTestCase): "upsample_bilinear": INTERPOLATE_ARGS_CONFLICT, "upsample_nearest": INTERPOLATE_ARGS_CONFLICT, + + "normalize" : MUTABLE, } # List of nn.functionals with Tensor inputs but not with type annotation @@ -3573,6 +3623,15 @@ def tearDownClass(cls): @skipIfNoTorchVision class TestVisionTracing(JitTestCase): + def setUp(self): + # Checking for mutable operations whil tracing is feature flagged + # Enable it in testing but not by default + self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations + torch.fx.proxy.TracerBase.check_mutable_operations = True + + def tearDown(self): + torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag + PROXY_ITERATED = (TraceError, r"Proxy object cannot be iterated") INCONSISTENT_TYPE = ( RuntimeError, diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 7e43e511c786f..35197e4ea1423 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -1280,11 +1280,15 @@ void initJITBindings(PyObject* module) { [](const FunctionSchema& self, const FunctionSchema& other) { return self == other; }) - .def("__str__", [](FunctionSchema& self) { - std::stringstream ss; - ss << self; - return ss.str(); - }); + .def( + "__str__", + [](FunctionSchema& self) { + std::stringstream ss; + ss << self; + return ss.str(); + }) + .def_property_readonly( + "is_mutable", [](FunctionSchema& self) { return self.is_mutable(); }); py::class_(m, "Argument") .def_property_readonly("name", [](Argument& self) { return self.name(); }) .def_property_readonly("type", [](Argument& self) { return self.type(); }) diff --git a/torch/fx/operator_schemas.py b/torch/fx/operator_schemas.py index ac559b19530c7..5e024e8624cca 100644 --- a/torch/fx/operator_schemas.py +++ b/torch/fx/operator_schemas.py @@ -79,7 +79,43 @@ def _torchscript_schema_to_signature(ts_schema : torch._C.FunctionSchema) -> ins return inspect.Signature(parameters, return_annotation=return_type) @compatibility(is_backward_compatible=False) -def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature]]: +def check_for_mutable_operation(target : Callable, args : Tuple['Argument', ...], kwargs : Dict[str, 'Argument']): + signatures, schemas = get_signature_for_torch_op(target, return_schemas=True) + + if signatures and schemas: + matched_schemas = [] + + # Iterate through all of the schema until we find one that matches + # If one matches, populate `new_args_and_kwargs` with the new args/kwargs + # values. If none matches, `new_args_and_kwargs` will be None + for candidate_signature, schema in zip(signatures, schemas): + try: + candidate_signature.bind(*args, **kwargs) + matched_schemas.append((candidate_signature, schema)) + except TypeError as e: + continue + + def throw_if_mutable(schema): + if schema.is_mutable: + raise RuntimeError(f'Tried to trace mutable operation {schema}. FX only supports functional ' + f'code, so operations that mutate operands in-place (e.g. via `out` arguments) ' + f'are not supported') + + if len(matched_schemas) == 0: + # Did not match any schema. Cannot check for mutation + pass + elif len(matched_schemas) == 1: + # Matched exactly one schema, unambiguous + _, schema_to_check = matched_schemas[0] + throw_if_mutable(schema_to_check) + pass + else: + # Ambiguous schema match. Since mutability checking is best effort, + # do nothing. + pass + +@compatibility(is_backward_compatible=False) +def get_signature_for_torch_op(op : Callable, return_schemas : bool = False) -> Optional[List[inspect.Signature]]: """ Given an operator on the `torch` namespace, return a list of `inspect.Signature` objects corresponding to the overloads of that op.. May return `None` if a signature @@ -94,17 +130,17 @@ def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature """ override = _manual_overrides.get(op) if override: - return override + return (override, None) if return_schemas else None aten_fn = torch.jit._builtins._find_builtin(op) if aten_fn is None: - return None + return (None, None) if return_schemas else None schemas = torch._C._jit_get_schemas_for_operator(aten_fn) signatures = [_torchscript_schema_to_signature(schema) for schema in schemas] - return signatures + return (signatures, schemas) if return_schemas else signatures @compatibility(is_backward_compatible=False) def create_type_hint(x): diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py index 61b039f8b7219..b25e45d206a51 100644 --- a/torch/fx/proxy.py +++ b/torch/fx/proxy.py @@ -8,11 +8,15 @@ from typing import Tuple, Dict, Optional, Iterable, Any, Iterator, Callable from .node import Target, Node, Argument, base_types, map_aggregate from ._compatibility import compatibility +from .operator_schemas import check_for_mutable_operation @compatibility(is_backward_compatible=True) class TracerBase: graph: Graph record_stack_traces : bool = False + # Feature flag for mutable schema checking + # Enableby default in 1.12 + check_mutable_operations : bool = False @compatibility(is_backward_compatible=True) def create_node(self, kind : str, target : Target, @@ -25,6 +29,9 @@ def create_node(self, kind : str, target : Target, modification of values used in node creation. For example, one might want to disallow in-place operations from being recorded. """ + if kind == 'call_function' and self.check_mutable_operations: + check_for_mutable_operation(target, args, kwargs) + return self.graph.create_node(kind, target, args, kwargs, name, type_expr) @compatibility(is_backward_compatible=True) From 116142143cc2d66c7e582d9f96e00862456fd736 Mon Sep 17 00:00:00 2001 From: Zafar Takhirov Date: Thu, 2 Sep 2021 15:56:54 -0700 Subject: [PATCH 490/530] [quant] Enable jit tracing on quantizable LSTM (#64438) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64438 The quantizable LSTM didn't support jit tracing because it had several non taceable paths. We sacrifice some of the user experience to enable the tracing. The main UX feature removed is a user-friendly message when trying to access the backwards path in a bidirectional LSTM: When the bidirectional flag is `False`, we used to throw a nice error message when the user tried accessing backwards weights. Now the message is default (removed properties). Test Plan: `buck test mode/dev //caffe2/test:quantization -- test_custom_module_lstm` Reviewed By: mtl67 Differential Revision: D30732630 fbshipit-source-id: 443e351ebb0e2b636c86dea9691b9bf42ffe618f --- test/quantization/core/test_quantized_op.py | 7 +++ torch/nn/quantizable/modules/rnn.py | 59 ++++----------------- 2 files changed, 17 insertions(+), 49 deletions(-) diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index 49b7c96847612..6275174d8e43a 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -2476,6 +2476,13 @@ def test_custom_module_lstm(self): msg=(f"Error is too high: SNR(dB): {power}, " f"Signal: {signal}, MSE: {mse}")) + # Trace + jit_qmodule = torch.jit.trace(lstm_quantized, qx) + + # Script + # TODO: Fix the scripting in the torch/nn/quantizable/modules/rnn.py + # jit_qmodule = torch.jit.script(lstm_quantized) + @override_qengines def test_custom_module_multi_head_attention(self): class MultiheadAttentionModel(torch.nn.Module): diff --git a/torch/nn/quantizable/modules/rnn.py b/torch/nn/quantizable/modules/rnn.py index bdfd7788533b5..cd0d094d086a7 100644 --- a/torch/nn/quantizable/modules/rnn.py +++ b/torch/nn/quantizable/modules/rnn.py @@ -48,7 +48,7 @@ def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True, self.ogate_cy = torch.nn.quantized.FloatFunctional() def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]: - if hidden is None or hidden == (None, None): + if hidden is None or hidden[0] is None or hidden[1] is None: hidden = self.initialize_hidden(x.shape[0], x.is_quantized) hx, cx = hidden @@ -175,10 +175,13 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): cx_bw = cx_fw[1] cx_fw = cx_fw[0] hidden_bw = hx_bw, cx_bw - hidden_fw = hx_fw, cx_fw + if hx_fw is None and cx_fw is None: + hidden_fw = None + else: + hidden_fw = torch.jit._unwrap_optional(hx_fw), torch.jit._unwrap_optional(cx_fw) result_fw, hidden_fw = self.layer_fw(x, hidden_fw) - if self.bidirectional: + if hasattr(self, 'layer_bw') and self.bidirectional: x_reversed = x.flip(0) result_bw, hidden_bw = self.layer_bw(x_reversed, hidden_bw) result_bw = result_bw.flip(0) @@ -188,7 +191,7 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): c = torch.stack([hidden_fw[1], hidden_bw[1]], 0) # type: ignore[list-item] else: result = result_fw - h, c = hidden_fw # type: ignore[assignment] + h, c = torch.jit._unwrap_optional(hidden_fw) # type: ignore[assignment] if self.batch_first: result.transpose_(0, 1) @@ -227,46 +230,6 @@ def from_float(cls, other, layer_idx=0, qconfig=None, **kwargs): layer.layer_bw = _LSTMSingleLayer.from_params(wi, wh, bi, bh) return layer - # Getters for the weights and biases - # Note that jit currently doesn't support the `porperty`, so if you need to - # access the weights/biases you would need to navigate manually to the - # `layer_fw.cell.igates.*`: https://github.com/pytorch/pytorch/issues/37883 - @property - def weight_ih(self): - return self.layer_fw.cell.igates.weight - - @property - def weight_hh(self): - return self.layer_fw.cell.hgates.weight - - @property - def bias_ih(self): - return self.layer_fw.cell.igates.bias - - @property - def bias_hh(self): - return self.layer_fw.cell.hgates.bias - - @property - def weight_ih_reverse(self): - assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' - return self.layer_bw.cell.igates.weight - - @property - def weight_hh_reverse(self): - assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' - return self.layer_bw.cell.hgates.weight - - @property - def bias_ih_reverse(self): - assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' - return self.layer_bw.cell.igates.bias - - @property - def bias_hh_reverse(self): - assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' - return self.layer_bw.cell.hgates.bias - class LSTM(torch.nn.Module): r"""A quantizable long short-term memory (LSTM). @@ -362,14 +325,12 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): cx = hidden_non_opt[1].reshape(self.num_layers, num_directions, max_batch_size, self.hidden_size).unbind(0) - hxcx = [] - for idx in range(self.num_layers): - hxcx.append((hx[idx].squeeze_(0), cx[idx].squeeze_(0))) + hxcx = [(hx[idx].squeeze_(0), cx[idx].squeeze_(0)) for idx in range(self.num_layers)] else: hxcx = hidden_non_opt - for idx in range(self.num_layers): - x, hxcx[idx] = self.layers[idx](x, hxcx[idx]) + for idx, layer in enumerate(self.layers): + x, hxcx[idx] = layer(x, hxcx[idx]) hx_list = [] cx_list = [] From 32a93c2424c7c165a3f52a6dc8cee83aab4d7b63 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Thu, 2 Sep 2021 16:06:17 -0700 Subject: [PATCH 491/530] Revert D30675780: [FX] Prototype for guarding against mutable operations in tracing Test Plan: revert-hammer Differential Revision: D30675780 (https://github.com/pytorch/pytorch/commit/795387477fe90e03cb598f3077a32222896e65dd) Original commit changeset: b2116b51dcc8 fbshipit-source-id: d4f1173f4989556ea54974f4c2739ef85a705fae --- ..._compat-fx_backcompat_class_members.expect | 2 +- test/test_fx.py | 67 ++----------------- torch/csrc/jit/python/init.cpp | 14 ++-- torch/fx/operator_schemas.py | 44 ++---------- torch/fx/proxy.py | 7 -- 5 files changed, 14 insertions(+), 120 deletions(-) diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect index 5c3630a3169f7..88e4654b568df 100644 --- a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect +++ b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect @@ -15,5 +15,5 @@ torch.fx.proxy.Attribute ['node'] torch.fx.proxy.GraphAppendingTracer [] torch.fx.proxy.Proxy ['keys'] torch.fx.proxy.TraceError [] -torch.fx.proxy.TracerBase ['check_mutable_operations', 'create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool'] +torch.fx.proxy.TracerBase ['create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool'] torch.fx.subgraph_rewriter.Match ['anchor', 'nodes_map'] \ No newline at end of file diff --git a/test/test_fx.py b/test/test_fx.py index 57a2960a409c3..5220f67ebf309 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -132,17 +132,10 @@ def __init__(self, a, b): class TestFX(JitTestCase): def setUp(self): - # Checking for mutable operations whil tracing is feature flagged - # Enable it in testing but not by default - self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations - torch.fx.proxy.TracerBase.check_mutable_operations = True - - if not (TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS): - lib_file_path = find_library_location('libtorchbind_test.so') - torch.ops.load_library(str(lib_file_path)) - - def tearDown(self): - torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag + if TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS: + return + lib_file_path = find_library_location('libtorchbind_test.so') + torch.ops.load_library(str(lib_file_path)) def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None): """Check that an nn.Module's results match the GraphModule version @@ -2374,19 +2367,6 @@ def forward(self, x: torch.Tensor, y: int, z: int): traced.graph.lint() - def test_throw_out_variant(self): - def foo(x): - y = torch.rand_like(x) - torch.sigmoid(x, out=y) - return y - - class MyTracer(torch.fx.Tracer): - check_mutable_operations = True - - tracer = MyTracer() - with self.assertRaisesRegex(RuntimeError, 'mutable operation aten::sigmoid.out'): - traced_graph = tracer.trace(foo) - def test_ast_rewriter_reassigns_submodules(self): class M(torch.nn.Module): def __init__(self): @@ -3041,15 +3021,6 @@ def run_getitem_target(): class TestOperatorSignatures(JitTestCase): - def setUp(self): - # Checking for mutable operations whil tracing is feature flagged - # Enable it in testing but not by default - self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations - torch.fx.proxy.TracerBase.check_mutable_operations = True - - def tearDown(self): - torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag - @onlyCPU @ops(op_db, allowed_dtypes=(torch.float,)) def test_get_torch_func_signature_exhaustive(self, device, dtype, op): @@ -3119,15 +3090,6 @@ class TestFXAPIBackwardCompatibility(JitTestCase): def setUp(self): self.maxDiff = None - # Checking for mutable operations whil tracing is feature flagged - # Enable it in testing but not by default - self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations - torch.fx.proxy.TracerBase.check_mutable_operations = True - - def tearDown(self): - torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag - - def _fn_to_stable_annotation_str(self, obj): """ Unfortunately we have to serialize function signatures manually since @@ -3364,15 +3326,6 @@ def check_symbols_have_bc_designation(m, prefix): f"BC guarantees.") class TestFunctionalTracing(JitTestCase): - def setUp(self): - # Checking for mutable operations whil tracing is feature flagged - # Enable it in testing but not by default - self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations - torch.fx.proxy.TracerBase.check_mutable_operations = True - - def tearDown(self): - torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag - IGNORE_FUNCS = ("has_torch_function", "has_torch_function_unary", "has_torch_function_variadic", "handle_torch_function", "boolean_dispatch") @@ -3387,7 +3340,6 @@ def tearDown(self): ARG_TYPE_MISMATCH = (TypeError, r", not Proxy$") CONTROL_FLOW = (TraceError, r"symbolically traced variables cannot be used as inputs to control flow") INTERPOLATE_ARGS_CONFLICT = (ValueError, r"only one of size or scale_factor should be defined") - MUTABLE = (RuntimeError, r"Tried to trace mutable operation") UNTRACEABLE_FUNCTIONALS = { "adaptive_avg_pool1d": BUILT_IN_FUNC, @@ -3507,8 +3459,6 @@ def tearDown(self): "upsample_bilinear": INTERPOLATE_ARGS_CONFLICT, "upsample_nearest": INTERPOLATE_ARGS_CONFLICT, - - "normalize" : MUTABLE, } # List of nn.functionals with Tensor inputs but not with type annotation @@ -3623,15 +3573,6 @@ def tearDownClass(cls): @skipIfNoTorchVision class TestVisionTracing(JitTestCase): - def setUp(self): - # Checking for mutable operations whil tracing is feature flagged - # Enable it in testing but not by default - self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations - torch.fx.proxy.TracerBase.check_mutable_operations = True - - def tearDown(self): - torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag - PROXY_ITERATED = (TraceError, r"Proxy object cannot be iterated") INCONSISTENT_TYPE = ( RuntimeError, diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 35197e4ea1423..7e43e511c786f 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -1280,15 +1280,11 @@ void initJITBindings(PyObject* module) { [](const FunctionSchema& self, const FunctionSchema& other) { return self == other; }) - .def( - "__str__", - [](FunctionSchema& self) { - std::stringstream ss; - ss << self; - return ss.str(); - }) - .def_property_readonly( - "is_mutable", [](FunctionSchema& self) { return self.is_mutable(); }); + .def("__str__", [](FunctionSchema& self) { + std::stringstream ss; + ss << self; + return ss.str(); + }); py::class_(m, "Argument") .def_property_readonly("name", [](Argument& self) { return self.name(); }) .def_property_readonly("type", [](Argument& self) { return self.type(); }) diff --git a/torch/fx/operator_schemas.py b/torch/fx/operator_schemas.py index 5e024e8624cca..ac559b19530c7 100644 --- a/torch/fx/operator_schemas.py +++ b/torch/fx/operator_schemas.py @@ -79,43 +79,7 @@ def _torchscript_schema_to_signature(ts_schema : torch._C.FunctionSchema) -> ins return inspect.Signature(parameters, return_annotation=return_type) @compatibility(is_backward_compatible=False) -def check_for_mutable_operation(target : Callable, args : Tuple['Argument', ...], kwargs : Dict[str, 'Argument']): - signatures, schemas = get_signature_for_torch_op(target, return_schemas=True) - - if signatures and schemas: - matched_schemas = [] - - # Iterate through all of the schema until we find one that matches - # If one matches, populate `new_args_and_kwargs` with the new args/kwargs - # values. If none matches, `new_args_and_kwargs` will be None - for candidate_signature, schema in zip(signatures, schemas): - try: - candidate_signature.bind(*args, **kwargs) - matched_schemas.append((candidate_signature, schema)) - except TypeError as e: - continue - - def throw_if_mutable(schema): - if schema.is_mutable: - raise RuntimeError(f'Tried to trace mutable operation {schema}. FX only supports functional ' - f'code, so operations that mutate operands in-place (e.g. via `out` arguments) ' - f'are not supported') - - if len(matched_schemas) == 0: - # Did not match any schema. Cannot check for mutation - pass - elif len(matched_schemas) == 1: - # Matched exactly one schema, unambiguous - _, schema_to_check = matched_schemas[0] - throw_if_mutable(schema_to_check) - pass - else: - # Ambiguous schema match. Since mutability checking is best effort, - # do nothing. - pass - -@compatibility(is_backward_compatible=False) -def get_signature_for_torch_op(op : Callable, return_schemas : bool = False) -> Optional[List[inspect.Signature]]: +def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature]]: """ Given an operator on the `torch` namespace, return a list of `inspect.Signature` objects corresponding to the overloads of that op.. May return `None` if a signature @@ -130,17 +94,17 @@ def get_signature_for_torch_op(op : Callable, return_schemas : bool = False) -> """ override = _manual_overrides.get(op) if override: - return (override, None) if return_schemas else None + return override aten_fn = torch.jit._builtins._find_builtin(op) if aten_fn is None: - return (None, None) if return_schemas else None + return None schemas = torch._C._jit_get_schemas_for_operator(aten_fn) signatures = [_torchscript_schema_to_signature(schema) for schema in schemas] - return (signatures, schemas) if return_schemas else signatures + return signatures @compatibility(is_backward_compatible=False) def create_type_hint(x): diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py index b25e45d206a51..61b039f8b7219 100644 --- a/torch/fx/proxy.py +++ b/torch/fx/proxy.py @@ -8,15 +8,11 @@ from typing import Tuple, Dict, Optional, Iterable, Any, Iterator, Callable from .node import Target, Node, Argument, base_types, map_aggregate from ._compatibility import compatibility -from .operator_schemas import check_for_mutable_operation @compatibility(is_backward_compatible=True) class TracerBase: graph: Graph record_stack_traces : bool = False - # Feature flag for mutable schema checking - # Enableby default in 1.12 - check_mutable_operations : bool = False @compatibility(is_backward_compatible=True) def create_node(self, kind : str, target : Target, @@ -29,9 +25,6 @@ def create_node(self, kind : str, target : Target, modification of values used in node creation. For example, one might want to disallow in-place operations from being recorded. """ - if kind == 'call_function' and self.check_mutable_operations: - check_for_mutable_operation(target, args, kwargs) - return self.graph.create_node(kind, target, args, kwargs, name, type_expr) @compatibility(is_backward_compatible=True) From b12f34e8c2ae0a183abc48e65815480bf4c44fbe Mon Sep 17 00:00:00 2001 From: Garrett Cramer Date: Thu, 2 Sep 2021 16:11:10 -0700 Subject: [PATCH 492/530] update rpc tensorpipe logic for sparse tensors (#62960) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62960 A bug was filed a few years ago for sending sparse tensor over rpc #30807. This pr updates rpc/tensorpipe logic for CUDA sparse tensors. During the serialization process, the pickler.cpp implementation breaks down the sparse tensor into two tensors and metadata. torch/csrc/distributed/rpc/tensorpipe_agent.cpp needs to be updated because it does not have logic sparse tensors. It pushes a single device for a sparse tensor. This is wrong because after the sparse tensor has been serialized, there will be two tensors. The second tensor will not have a device. This will cause the second tensor to have the wrong target device. tensorpipe_utils.cpp needs to be updated because deserialization happens after the data is received on the target pipe. This takes the two tensors and metadata sent and rebuilds the sparse tensor. There will be two tpDescriptors but only one tensor after deserialization. The logic is updated to verify the sparse tensor is on the correct device using the first tpDescriptor. This pr also updates ivalue.cpp and ivalue.h to support more paths for Sparse COO tensors. I tested these changes by adding sparse tests to rpc_test.py and dist_autograd_test.py. Test Plan: Imported from OSS Reviewed By: gchanan Differential Revision: D30717285 Pulled By: gcramer23 fbshipit-source-id: daee9a56764550f56b131f9dd8e74e23113d6714 --- aten/src/ATen/core/ivalue.cpp | 41 ++- aten/src/ATen/core/ivalue.h | 9 +- .../csrc/distributed/rpc/tensorpipe_agent.cpp | 9 +- .../csrc/distributed/rpc/tensorpipe_utils.cpp | 10 +- .../distributed/rpc/dist_autograd_test.py | 312 ++++++++++-------- .../_internal/distributed/rpc/rpc_test.py | 98 ++++-- 6 files changed, 273 insertions(+), 206 deletions(-) diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 1404e01fa2434..b81c50f063b19 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -946,36 +946,25 @@ getClassConverter() { } // Needs to be in this .cpp file to access the full definition of PyObjectHolder -std::vector> ivalue::Future::extractStorages( - const at::IValue& value) { +std::vector> ivalue::Future:: + extractStorages(const at::IValue& value) { std::vector> weakStorageImpls; // getSubValues works poorly on Python objects: it only works if they can be // converted to a "regular" IValue type hence, for example, it doesn't support // custom subclasses. Thus, instead, we extract the tensors through pickling. + // Sparse tensors do not have storage. Instead, a sparse tensor + // contains two tensors indices and values, and both contain storage. if (value.isPyObject()) { std::vector tensors = value.toPyObjectHolder()->extractTensors(); - size_t num_storages = 0; - for (const at::Tensor& tensor : tensors) { + weakStorageImpls.reserve(2 * tensors.size()); + for (const auto& tensor : tensors) { if (tensor.is_sparse()) { - // Sparse tensor is indices and values. Both are tensors - // and contain storage. Therefore num_storages needs to be - // incremented by 2. - num_storages += 2; + weakStorageImpls.push_back( + tensor._indices().storage().getWeakStorageImpl()); + weakStorageImpls.push_back( + tensor._values().storage().getWeakStorageImpl()); } else { - // A dense/strided tensor contains 1 storage. - num_storages += 1; - } - } - weakStorageImpls.reserve(num_storages); - for (const at::Tensor& tensor : tensors) { - if (tensor.is_sparse()) { - // Sparse tensor is indices and values. Both are tensors - // and contain storage. - weakStorageImpls.push_back(tensor.indices().storage().getWeakStorageImpl()); - weakStorageImpls.push_back(tensor.values().storage().getWeakStorageImpl()); - } else { - // A dense/strided tensor contains 1 storage weakStorageImpls.push_back(tensor.storage().getWeakStorageImpl()); } } @@ -986,7 +975,15 @@ std::vector> ivalue::Future::extractSt value.getSubValues(sub_values); for (const at::IValue& sub_value : sub_values) { if (sub_value.isTensor()) { - weakStorageImpls.push_back(sub_value.toTensor().storage().getWeakStorageImpl()); + auto& tensor = sub_value.toTensor(); + if (tensor.is_sparse()) { + weakStorageImpls.push_back( + tensor._indices().storage().getWeakStorageImpl()); + weakStorageImpls.push_back( + tensor._values().storage().getWeakStorageImpl()); + } else { + weakStorageImpls.push_back(tensor.storage().getWeakStorageImpl()); + } } } } diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 188a619307185..6574187d06f8b 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -872,14 +872,17 @@ struct TORCH_API IValue final { struct HashAliasedIValue { size_t operator()(const IValue& val) const { if (val.isTensor()) { - if (val.toTensor().is_mkldnn()) { + auto& tensor = val.toTensor(); + if (tensor.is_mkldnn() || tensor.is_sparse()) { // MKLDNN tensors dont have storage and dont create views // or aliasing so we can just use Tensor pointer, TODO: find way // to use mkldnn storage - return reinterpret_cast(val.toTensor().unsafeGetTensorImpl()); + // Sparse tensors don't have storage use unsafeGetTensorImpl + // instead of using the storage of indices or values. + return reinterpret_cast(tensor.unsafeGetTensorImpl()); } else { return reinterpret_cast( - val.toTensor().storage().unsafeGetStorageImpl()); + tensor.storage().unsafeGetStorageImpl()); } } // If it is not a Tensor, then two mutable IValues alias each other only diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 8e7ad18c575f8..3769db054ab45 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -48,7 +48,7 @@ std::vector getDevicesForTensors( "Request device mapping is not available for destination ", remoteName); std::vector devices; - devices.reserve(tensors.size()); + devices.reserve(2 * tensors.size()); bool hasMappedDevice = false; for (const auto& t : tensors) { if (t.device().is_cpu()) { @@ -67,7 +67,12 @@ std::vector getDevicesForTensors( " for device ", t.device(), " but received a tensor on that device."); - devices.push_back(deviceIter->second); + if (t.is_sparse()) { + devices.push_back(deviceIter->second); + devices.push_back(deviceIter->second); + } else { + devices.push_back(deviceIter->second); + } hasMappedDevice = true; } } diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp index ee66f3108e522..aa21fdf65c0f9 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp @@ -311,8 +311,9 @@ c10::intrusive_ptr tensorpipeDeserialize( tensors.emplace_back(std::move(t)); } - for (const auto i : c10::irange(tpDescriptor.tensors.size())) { - auto& tensor = tpDescriptor.tensors[i]; + size_t tpDescriptorIndex = 0; + for (size_t i = 0; i < tensors.size(); i++) { + auto& tensor = tpDescriptor.tensors[tpDescriptorIndex]; if (tensor.targetDevice.has_value() && tensor.targetDevice->type == tensorpipe::kCudaDeviceType) { TORCH_INTERNAL_ASSERT( @@ -326,6 +327,11 @@ c10::intrusive_ptr tensorpipeDeserialize( ", but got it on ", tensors[i].device()); } + if (tensors[i].is_sparse()) { + tpDescriptorIndex += 2; + } else { + tpDescriptorIndex += 1; + } } return c10::make_intrusive( diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index fba50303068e7..2ba25a591ae0f 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -78,14 +78,20 @@ def create_tensor(): return torch.ones((3, 3), requires_grad=True) -def build_sparse_tensor(coalesce=False, requires_grad=True, dtype=torch.float32): +def build_sparse_tensor(coalesce=False, requires_grad=True, dtype=torch.float32, device=None): i = [[0, 1, 1], [2, 0, 2]] v = [3.2, 4.1, 5.3] - tensor = torch.sparse_coo_tensor(i, v, (3, 3), requires_grad=requires_grad, dtype=dtype) + tensor = torch.sparse_coo_tensor(i, v, (3, 3), requires_grad=requires_grad, dtype=dtype, device=device) if coalesce: tensor = tensor.coalesce() return tensor +def build_sparse_one_gradient(dtype=torch.float32): + i = [[0, 1, 1], [2, 0, 2]] + v = [1, 1, 1] + tensor = torch.sparse_coo_tensor(i, v, (3, 3), dtype=dtype) + return tensor + @torch.jit.script def create_torchscript_tensor() -> torch.Tensor: @@ -104,6 +110,9 @@ def my_rref_add(rref_t1, t2): ret = torch.add(rref_t1.local_value(), t2) return ret +def my_sum(t): + return torch.sparse.sum(t) if t.is_sparse else t.sum() + @torch.jit.script def my_script_add(t1, t2): @@ -159,13 +168,10 @@ def _all_contexts_cleaned_up(timeout_seconds=10): # This function creates a dis atugorad context, run rpc_sync on the given ps, # and then blocks until the ps has verified the grads are correctly accumulated. -def _run_trainer(rref_t1, t2, ps, rank_diff, sparse): +def _run_trainer(rref_t1, t2, ps, rank_diff): with dist_autograd.context() as context_id: ret = rpc.rpc_sync(ps, my_rref_add, args=(rref_t1, t2)) - if sparse: - loss = torch.sparse.sum(ret) - else: - loss = ret.sum() + loss = my_sum(ret) dist_autograd.backward(context_id, [loss]) # prevent deleting dist autograd context rpc.rpc_sync(ps, _set_rpc_done, args=(context_id, rank_diff)) @@ -173,13 +179,10 @@ def _run_trainer(rref_t1, t2, ps, rank_diff, sparse): # This function is the same as _run_trainer, except rpc calls torchscript # function "my_script_ref_add" instead of python funciton "my_rref_add" -def _run_trainer_torchscript(rref_t1, t2, ps, rank_diff, sparse): +def _run_trainer_torchscript(rref_t1, t2, ps, rank_diff): with dist_autograd.context() as context_id: ret = rpc.rpc_sync(ps, my_script_ref_add, args=(rref_t1, t2)) - if sparse: - loss = torch.sparse.sum(ret) - else: - loss = ret.sum() + loss = my_sum(ret) dist_autograd.backward(context_id, [loss]) # prevent deleting dist autograd context rpc.rpc_sync(ps, _set_rpc_done, args=(context_id, rank_diff)) @@ -990,25 +993,19 @@ def test_error_in_context(self): def _backward_no_grad_on_tensor(self, t1, t2, sparse): with dist_autograd.context() as context_id: - loss = rpc.rpc_sync( + ret = rpc.rpc_sync( worker_name(self._next_rank()), torch.add, args=(t1, t2)) - if sparse: - loss = torch.sparse.sum(loss) - else: - loss = loss.sum() + loss = my_sum(ret) dist_autograd.backward(context_id, [loss], retain_graph=True) self.assertIsNone(t1.grad) self.assertIsNone(t2.grad) # Now populate .grad with local autograd engine and # verify dist autograd doesn't mess with it. - loss_local = torch.add(t1, t2) - if sparse: - loss_local = torch.sparse.sum(loss_local) - else: - loss_local = loss_local.sum() + ret = torch.add(t1, t2) + loss_local = my_sum(ret) loss_local.backward() self.assertIsNotNone(t1.grad) self.assertIsNotNone(t2.grad) @@ -1043,10 +1040,7 @@ def _backward_simple(self, dst, t1, t2, local_grads, sparse): ret = self._exec_func_with_dst( dst, exec_mode, torch.add, t1, t2 ) - if sparse: - loss = torch.sparse.sum(ret) - else: - loss = ret.sum() + loss = my_sum(ret) ret = self._verify_backwards( exec_mode, [loss], context_id, local_grads, t1, t2 ) @@ -1099,10 +1093,7 @@ def test_backward_simple_self_sparse(self): # tensor lives on the rref owner. def _backward_rref(self, callee, rref_owner, t1, t2, local_grads, sparse): local_ret = torch.add(t1, t2) - if sparse: - local_ret = torch.sparse.sum(local_ret) - else: - local_ret = local_ret.sum() + local_ret = my_sum(local_ret) local_ret.backward() with dist_autograd.context() as context_id: if sparse: @@ -1120,10 +1111,7 @@ def _backward_rref(self, callee, rref_owner, t1, t2, local_grads, sparse): callee, my_nested_rref_add, args=(rref_owner, rref_t1, t2) ) ret = rref.to_here() - if sparse: - ret = torch.sparse.sum(ret) - else: - ret = ret.sum() + ret = my_sum(ret) dist_autograd.backward(context_id, [ret]) # verify grads on caller @@ -1238,10 +1226,7 @@ def _test_trainer_ps(self, create_ref_fn, trainer_fn, sparse): t2 = torch.zeros((3, 3), requires_grad=True) local_ret = torch.add(t1, t2) - if sparse: - torch.sparse.sum(local_ret).backward() - else: - local_ret.sum().backward() + my_sum(local_ret).backward() # create rref on self rref_t1 = rpc.remote( @@ -1257,7 +1242,7 @@ def _test_trainer_ps(self, create_ref_fn, trainer_fn, sparse): rpc.rpc_async( worker_name((self.rank + rank_diff) % self.world_size), trainer_fn, - args=(rref_t1, t2, worker_name(self.rank), rank_diff, sparse), + args=(rref_t1, t2, worker_name(self.rank), rank_diff), ) ) @@ -1309,7 +1294,7 @@ def test_trainer_ps_torchscript_functions(self): self._test_trainer_ps(create_torchscript_tensor, _run_trainer_torchscript, False) - def _backward_multiple_round_trips(self, t1, t2, t3, t4, t5, local_grads, sparse): + def _backward_multiple_round_trips(self, t1, t2, t3, t4, t5, local_grads): for exec_mode in [ExecMode.LOCAL, ExecMode.RPC_SYNC, ExecMode.REMOTE]: with dist_autograd.context() as context_id: # Multiple RPCs between different nodes. @@ -1317,7 +1302,7 @@ def _backward_multiple_round_trips(self, t1, t2, t3, t4, t5, local_grads, sparse val = self._exec_func(exec_mode, torch.mul, t3, val) s1 = self._exec_func(exec_mode, torch.stack, (t4, val)) s2 = self._exec_func(exec_mode, torch.stack, (t5, val)) - if sparse: + if s1.is_sparse: val = self._exec_func(exec_mode, torch.mul, s1, s2) val = self._exec_func(exec_mode, torch.mul, val, val) loss = torch.sparse.sum(val) @@ -1339,8 +1324,7 @@ def test_backward_multiple_round_trips(self): torch.rand((3, 3), requires_grad=True), torch.rand((3, 3)), torch.rand((3, 3), requires_grad=True), - None, - False + None ) @dist_init @@ -1351,8 +1335,7 @@ def test_backward_multiple_round_trips_sparse(self): build_sparse_tensor(requires_grad=True), build_sparse_tensor(requires_grad=False), build_sparse_tensor(requires_grad=True), - None, - True + None ) @dist_init @@ -1589,15 +1572,12 @@ def test_backward_multiple_roots(self): exec_mode, [r1, r2, r3, r4], context_id, local_grads, t1, t2 ) - def _backward_different_dtypes(self, t1, t2, sparse): + def _backward_different_dtypes(self, t1, t2): local_grads = None for exec_mode in [ExecMode.LOCAL, ExecMode.REMOTE]: with dist_autograd.context() as context_id: loss = self._exec_func(exec_mode, torch.add, t1, t2) - if sparse: - loss = torch.sparse.sum(loss) - else: - loss = loss.sum() + loss = my_sum(loss) local_grads = self._verify_backwards( exec_mode, [loss], context_id, local_grads, t1, t2 ) @@ -1606,29 +1586,24 @@ def _backward_different_dtypes(self, t1, t2, sparse): def test_backward_different_dtypes(self): self._backward_different_dtypes( torch.rand((3, 3), requires_grad=True, dtype=torch.float32), - torch.rand((3, 3), requires_grad=True, dtype=torch.float64), - False + torch.rand((3, 3), requires_grad=True, dtype=torch.float64) ) @dist_init def test_backward_different_dtypes_sparse(self): self._backward_different_dtypes( build_sparse_tensor(requires_grad=True, dtype=torch.float32), - build_sparse_tensor(requires_grad=True, dtype=torch.float64), - True + build_sparse_tensor(requires_grad=True, dtype=torch.float64) ) # Run the same code locally and with dist autograd and verify gradients # are same. - def _backward_simple_python_udf(self, t1, t2, sparse): + def _backward_simple_python_udf(self, t1, t2): local_grads = None for exec_mode in [ExecMode.LOCAL, ExecMode.REMOTE]: with dist_autograd.context() as context_id: ret = self._exec_func(exec_mode, my_py_add, t1, t2) - if sparse: - loss = torch.sparse.sum(ret) - else: - loss = ret.sum() + loss = my_sum(ret) local_grads = self._verify_backwards( exec_mode, [loss], context_id, local_grads, t1, t2 ) @@ -1637,21 +1612,19 @@ def _backward_simple_python_udf(self, t1, t2, sparse): def test_backward_simple_python_udf(self): self._backward_simple_python_udf( torch.rand(3, 3, requires_grad=True), - torch.rand(3, 3, requires_grad=True), - False + torch.rand(3, 3, requires_grad=True) ) @dist_init def test_backward_simple_python_udf_sparse(self): self._backward_simple_python_udf( build_sparse_tensor(requires_grad=True), - build_sparse_tensor(requires_grad=True), - True + build_sparse_tensor(requires_grad=True) ) # Run the same code locally and with dist autograd and verify gradients # are same. - def _backward_simple_script_call(self, t1, t2, sparse): + def _backward_simple_script_call(self, t1, t2): local_grads = None for exec_mode in [ ExecMode.LOCAL, @@ -1661,10 +1634,7 @@ def _backward_simple_script_call(self, t1, t2, sparse): ]: with dist_autograd.context() as context_id: forward_ret = self._exec_func(exec_mode, my_script_add, t1, t2) - if sparse: - loss = torch.sparse.sum(forward_ret) - else: - loss = forward_ret.sum() + loss = my_sum(forward_ret) ret = self._verify_backwards( exec_mode, [loss], context_id, local_grads, t1, t2 ) @@ -1674,16 +1644,14 @@ def _backward_simple_script_call(self, t1, t2, sparse): def test_backward_simple_script_call(self): self._backward_simple_script_call( torch.rand(3, 3, requires_grad=True), - torch.rand(3, 3, requires_grad=True), - False + torch.rand(3, 3, requires_grad=True) ) @dist_init def test_backward_simple_script_call_sparse(self): self._backward_simple_script_call( build_sparse_tensor(requires_grad=True), - build_sparse_tensor(requires_grad=True), - True + build_sparse_tensor(requires_grad=True) ) @staticmethod @@ -1796,28 +1764,22 @@ def _nested_python_udf(t1, t2, dst): res = rpc.rpc_sync(worker_name(dst), my_py_add, args=(t3, t4)) return t1 * t2 * t3 * t4 * res - def _backwards_nested_python_udf(self, t1, t2, sparse): + def _backwards_nested_python_udf(self, t1, t2): t3 = t1 * t2 t4 = t1 + t2 res = t3 + t4 - loss = t1 * t2 * t3 * t4 * res - if sparse: - loss = torch.sparse.sum(loss) - else: - loss = loss.sum() + ret = t1 * t2 * t3 * t4 * res + loss = my_sum(ret) torch.autograd.backward([loss]) # Now run distributed autograd. with dist_autograd.context() as context_id: - loss = rpc.rpc_sync( + ret = rpc.rpc_sync( worker_name(self._next_rank()), DistAutogradTest._nested_python_udf, args=(t1, t2, self._next_rank()), ) - if sparse: - loss = torch.sparse.sum(loss) - else: - loss = loss.sum() + loss = my_sum(ret) dist_autograd.backward(context_id, [loss]) grads = dist_autograd.get_gradients(context_id) self.assertEqual(t1.grad, grads[t1]) @@ -1828,8 +1790,7 @@ def test_backwards_nested_python_udf(self): # Run equivalent of _nested_python_udf locally. self._backwards_nested_python_udf( torch.rand(3, 3, requires_grad=True), - torch.rand(3, 3, requires_grad=True), - False + torch.rand(3, 3, requires_grad=True) ) @dist_init @@ -1837,8 +1798,7 @@ def test_backwards_nested_python_udf_sparse(self): # Run equivalent of _nested_python_udf locally. self._backwards_nested_python_udf( build_sparse_tensor(requires_grad=True), - build_sparse_tensor(requires_grad=True), - True + build_sparse_tensor(requires_grad=True) ) _test_clean_context_backward_context_id = None @@ -1986,17 +1946,14 @@ def _mixed_requires_grad_operaton(cls, t1, t2): else: return t1 * t2 - def _mixed_requires_grad(self, t1, t2, sparse): + def _mixed_requires_grad(self, t1, t2): for exec_mode in [ExecMode.RPC_SYNC, ExecMode.REMOTE]: with dist_autograd.context() as context_id: ret = self._exec_func( exec_mode, DistAutogradTest._mixed_requires_grad_operaton, t1, t2 ) self.assertEqual(t1 * t2, ret) - if sparse: - loss = torch.sparse.sum(ret) - else: - loss = ret.sum() + loss = my_sum(ret) dist_autograd.backward(context_id, [loss]) self.assertTrue(t1.requires_grad) self.assertFalse(t2.requires_grad) @@ -2009,16 +1966,14 @@ def _mixed_requires_grad(self, t1, t2, sparse): def test_mixed_requires_grad(self): self._mixed_requires_grad( torch.rand(3, 3, requires_grad=True), - torch.rand(3, 3, requires_grad=False), - False + torch.rand(3, 3, requires_grad=False) ) @dist_init def test_mixed_requires_grad_sparse(self): self._mixed_requires_grad( build_sparse_tensor(requires_grad=True), - build_sparse_tensor(requires_grad=False), - True + build_sparse_tensor(requires_grad=False) ) class TestDebugInfoFunc(Function): @@ -2160,17 +2115,14 @@ def test_backward_accumulate_grads(self): def _test_nested_backward_accumulate_grads(t1, t2, dst_rank): return rpc.rpc_sync(worker_name(dst_rank), torch.add, args=(t1, t2)) - def _nested_backward_accumulate_grads(self, t1, t2, sparse): + def _nested_backward_accumulate_grads(self, t1, t2): with dist_autograd.context() as context_id: ret = rpc.rpc_sync( worker_name(self._next_rank()), DistAutogradTest._test_nested_backward_accumulate_grads, args=(t1, t2, self._next_rank()), ) - if sparse: - loss = torch.sparse.sum(ret) - else: - loss = ret.sum() + loss = my_sum(ret) # Run backward twice. dist_autograd.backward(context_id, [loss], retain_graph=True) dist_autograd.backward(context_id, [loss]) @@ -2179,28 +2131,23 @@ def _nested_backward_accumulate_grads(self, t1, t2, sparse): def test_nested_backward_accumulate_grads(self): self._nested_backward_accumulate_grads( torch.rand(3, 3, requires_grad=True), - torch.rand(3, 3, requires_grad=True), - False + torch.rand(3, 3, requires_grad=True) ) @dist_init def test_nested_backward_accumulate_grads_sparse(self): self._nested_backward_accumulate_grads( build_sparse_tensor(requires_grad=True), - build_sparse_tensor(requires_grad=True), - True + build_sparse_tensor(requires_grad=True) ) - def _multiple_backward(self, t1, t2, sparse): + def _multiple_backward(self, t1, t2): with dist_autograd.context() as context_id: - loss = rpc.rpc_sync( + ret = rpc.rpc_sync( worker_name(self._next_rank()), torch.add, args=(t1, t2)) - if sparse: - loss = torch.sparse.sum(loss) - else: - loss = loss.sum() + loss = my_sum(ret) # Run backward in a loop multiple times. for i in range(1000): dist_autograd.backward(context_id, [loss], retain_graph=True) @@ -2209,16 +2156,14 @@ def _multiple_backward(self, t1, t2, sparse): def test_multiple_backward(self): self._multiple_backward( torch.rand(3, 3, requires_grad=True), - torch.rand(3, 3, requires_grad=True), - False + torch.rand(3, 3, requires_grad=True) ) @dist_init def test_multiple_backward_sparse(self): self._multiple_backward( build_sparse_tensor(requires_grad=True), - build_sparse_tensor(requires_grad=True), - True + build_sparse_tensor(requires_grad=True) ) @dist_init(clean_shutdown=False) @@ -2524,15 +2469,13 @@ def test_thread_local_context_id(self): class CudaDistAutogradTest(CommonDistAutogradTest): - @skip_if_lt_x_gpu(1) - @dist_init - def test_gpu_simple(self): - t1 = torch.rand(3, 3, requires_grad=True, device="cuda:0") - t2 = torch.rand(3, 3, requires_grad=True, device="cuda:0") - (t1 + t2).sum().backward() + + def _gpu_simple(self, t1, t2): + my_sum(t1 + t2).backward() with dist_autograd.context() as context_id: t3 = t1 + t2 - dist_autograd.backward(context_id, [t3.sum()]) + loss = my_sum(t3) + dist_autograd.backward(context_id, [loss]) grads = dist_autograd.get_gradients(context_id) self.assertEqual(2, len(grads)) self.assertEqual(t1.grad, grads[t1]) @@ -2540,9 +2483,22 @@ def test_gpu_simple(self): @skip_if_lt_x_gpu(1) @dist_init - def test_gpu_to_cpu_continuation(self): - t1 = torch.rand(3, 3, requires_grad=True, device="cuda:0") - t2 = torch.rand(3, 3, requires_grad=True) + def test_gpu_simple(self): + self._gpu_simple( + torch.rand(3, 3, requires_grad=True, device="cuda:0"), + torch.rand(3, 3, requires_grad=True, device="cuda:0") + ) + + @skip_if_lt_x_gpu(1) + @dist_init + def test_gpu_simple_sparse(self): + self._gpu_simple( + build_sparse_tensor(requires_grad=True, device="cuda:0"), + build_sparse_tensor(requires_grad=True, device="cuda:0") + ) + + + def _gpu_to_cpu_continuation(self, t1, t2): # Run a few iterations. for i in range(3): t1.grad = None @@ -2557,16 +2513,29 @@ def test_gpu_to_cpu_continuation(self): t6 = t5.cuda(0) + t4 t7 = self._exec_func(exec_mode, torch.add, t6.cpu(), t5) # Autograd graph consists of CPU -> GPU -> CPU execution. + loss = my_sum(t7) ret = self._verify_backwards( - exec_mode, [t7.sum()], context_id, local_grads, t1, t2 + exec_mode, [loss], context_id, local_grads, t1, t2 ) local_grads = ret if ret else local_grads @skip_if_lt_x_gpu(1) @dist_init - def test_gpu_to_cpu_continuation_gpu_root(self): - t1 = torch.rand(3, 3, requires_grad=True, device="cuda:0") - t2 = torch.rand(3, 3, requires_grad=True) + def test_gpu_to_cpu_continuation(self): + self._gpu_to_cpu_continuation( + torch.rand(3, 3, requires_grad=True, device="cuda:0"), + torch.rand(3, 3, requires_grad=True) + ) + + @skip_if_lt_x_gpu(1) + @dist_init + def test_gpu_to_cpu_continuation_sparse(self): + self._gpu_to_cpu_continuation( + build_sparse_tensor(requires_grad=True, device="cuda:0"), + build_sparse_tensor(requires_grad=True) + ) + + def _gpu_to_cpu_continuation_gpu_root(self, t1, t2): # Run a few iterations. for i in range(3): t1.grad = None @@ -2580,11 +2549,28 @@ def test_gpu_to_cpu_continuation_gpu_root(self): t5 = self._exec_func(exec_mode, torch.add, t4.cpu(), t2) t6 = t5.cuda(0) + t4 # Autograd graph consists of CPU -> GPU -> CPU execution. + loss = my_sum(t6) ret = self._verify_backwards( - exec_mode, [t6.sum()], context_id, local_grads, t1, t2 + exec_mode, [loss], context_id, local_grads, t1, t2 ) local_grads = ret if ret else local_grads + @skip_if_lt_x_gpu(1) + @dist_init + def test_gpu_to_cpu_continuation_gpu_root(self): + self._gpu_to_cpu_continuation_gpu_root( + torch.rand(3, 3, requires_grad=True, device="cuda:0"), + torch.rand(3, 3, requires_grad=True) + ) + + @skip_if_lt_x_gpu(1) + @dist_init + def test_gpu_to_cpu_continuation_gpu_root_sparse(self): + self._gpu_to_cpu_continuation_gpu_root( + build_sparse_tensor(requires_grad=True, device="cuda:0"), + build_sparse_tensor(requires_grad=True) + ) + class FaultyAgentDistAutogradTest(RpcAgentTestFixture): # Reusing a simplified helper function from DistAutogradTest to ensure @@ -2646,8 +2632,7 @@ def gradients(self, ctx_id): class TensorPipeCudaDistAutogradTest(RpcAgentTestFixture): - @skip_if_lt_x_gpu(4) - def test_device_maps_backward_pass(self): + def _device_maps_backward_pass(self, t1, t2): options = self.rpc_backend_options dst = worker_name((self.rank + 1) % self.world_size) @@ -2662,19 +2647,36 @@ def test_device_maps_backward_pass(self): rpc_backend_options=options, ) - t1 = torch.rand(10, device=self.rank, requires_grad=True) - t2 = torch.rand(10, device=self.rank, requires_grad=True) with dist_autograd.context() as context_id: res = rpc.rpc_sync(dst, torch.add, args=(t1, t2)) - dist_autograd.backward(context_id, [res.sum()]) + loss = my_sum(res) + dist_autograd.backward(context_id, [loss]) grads = dist_autograd.get_gradients(context_id) - self.assertEqual(torch.ones(10), grads[t1]) - self.assertEqual(torch.ones(10), grads[t2]) + if t1.is_sparse: + self.assertEqual(build_sparse_one_gradient(), grads[t1]) + self.assertEqual(build_sparse_one_gradient(), grads[t2]) + else: + self.assertEqual(torch.ones(10), grads[t1]) + self.assertEqual(torch.ones(10), grads[t2]) self.assertEqual(t1.device, grads[t1].device) self.assertEqual(t2.device, grads[t2].device) rpc.shutdown() + @skip_if_lt_x_gpu(4) + def test_device_maps_backward_pass(self): + self._device_maps_backward_pass( + torch.rand(10, requires_grad=True, device=self.rank), + torch.ones(10, requires_grad=True, device=self.rank) + ) + + @skip_if_lt_x_gpu(4) + def test_device_maps_backward_pass_sparse(self): + self._device_maps_backward_pass( + build_sparse_tensor(requires_grad=True, device=self.rank), + build_sparse_tensor(requires_grad=True, device=self.rank) + ) + class MyRemoteCompute(torch.nn.Module): def __init__(self): super().__init__() @@ -2691,9 +2693,7 @@ def __init__(self, next_stage): def forward(self, input): return self.next_stage.rpc_sync().forward(input) - @skip_if_lt_x_gpu(4) - def test_dist_autograd_sync_streams(self): - + def _dist_autograd_sync_streams(self, sparse): options = self.rpc_backend_options dst = worker_name((self.rank + 1) % self.world_size) @@ -2711,17 +2711,20 @@ def test_dist_autograd_sync_streams(self): remote_compute = rpc.remote(dst, TensorPipeCudaDistAutogradTest.MyRemoteCompute) local_compute = TensorPipeCudaDistAutogradTest.MyLocalCompute(remote_compute) for _ in range(10): - input = torch.rand([1000, 10000], device=self.rank, requires_grad=True) + if sparse: + input = build_sparse_tensor(requires_grad=True, device=self.rank) + else: + input = torch.rand([1000, 10000], device=self.rank, requires_grad=True) # Run local autograd result = input * 2.0 r = random.random() - loss = result.sum() * r + loss = my_sum(result) * r loss.backward() # Run distributed autograd with dist_autograd.context() as context_id: result = local_compute(input) - loss = result.sum() * r + loss = my_sum(result) * r dist_autograd.backward(context_id, [loss]) # Compare grads. @@ -2731,7 +2734,14 @@ def test_dist_autograd_sync_streams(self): rpc.shutdown() @skip_if_lt_x_gpu(4) - def test_gradients_synchronizations(self): + def test_dist_autograd_sync_streams(self): + self._dist_autograd_sync_streams(False) + + @skip_if_lt_x_gpu(4) + def test_dist_autograd_sync_streams_sparse(self): + self._dist_autograd_sync_streams(True) + + def _gradients_synchronizations(self, x): options = self.rpc_backend_options for peer_rank in range(self.world_size): options.set_device_map(worker_name(peer_rank), {self.rank: peer_rank}) @@ -2755,8 +2765,8 @@ def test_gradients_synchronizations(self): WrapperModule, args=(layers[rank - 1], rank) )) + x = x.to(0) - x = torch.randn(5000, 2000).to(0) # local iteration local_model = nn.Sequential(*local_layers) local_model(x).sum().backward() @@ -2778,3 +2788,15 @@ def test_gradients_synchronizations(self): self.assertEqual(g1, g2) rpc.shutdown() + + @skip_if_lt_x_gpu(4) + def test_gradients_synchronizations(self): + self._gradients_synchronizations( + torch.randn(5000, 2000) + ) + + @skip_if_lt_x_gpu(4) + def test_gradients_synchronizations_sparse(self): + self._gradients_synchronizations( + torch.randn(5000, 2000).to_sparse() + ) diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index e0ef915ee8937..23759f1e292ad 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -194,6 +194,14 @@ def my_slow_method(self, my_tensor_arg): return torch.add(self.a, my_tensor_arg) +def _run_func_in_mode(to, fn, mode, args=None, kwargs=None): + if mode == RPCExecMode.SYNC: + return rpc.rpc_sync(to, fn, args=args, kwargs=kwargs) + elif mode == RPCExecMode.ASYNC: + return rpc.rpc_async(to, fn, args=args, kwargs=kwargs).wait() + elif mode == RPCExecMode.REMOTE: + return rpc.remote(to, fn, args=args, kwargs=kwargs).to_here() + def _call_method_on_rref(method, rref, *args, **kwargs): return method(rref.local_value(), *args, **kwargs) @@ -736,7 +744,7 @@ def test_send_to_rank(self): # Test dense tensor for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: - ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) + ret = _run_func_in_mode(dst_rank, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) self.assertEqual(ret, torch.ones(2, 2) + 1) # Test sparse tensor @@ -744,32 +752,32 @@ def test_send_to_rank(self): x = build_sparse_tensor() y = build_sparse_tensor() expected_tensor = (x + y) - ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y)) + ret = _run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y)) self.assertEqual(expected_tensor, ret) for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: x = build_sparse_tensor(coalesce=True) y = build_sparse_tensor(coalesce=True) expected_tensor = (x + y) - ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y)) + ret = _run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y)) self.assertEqual(expected_tensor, ret) # Test invalid ranks for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: with self.assertRaises(RuntimeError): - self._run_func_in_mode(self.world_size + 1, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) + _run_func_in_mode(self.world_size + 1, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: with self.assertRaises(RuntimeError): - self._run_func_in_mode(-1, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) + _run_func_in_mode(-1, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: with self.assertRaises(ValueError): - self._run_func_in_mode(dst_rank + 0.5, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) + _run_func_in_mode(dst_rank + 0.5, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: with self.assertRaises(ValueError): - self._run_func_in_mode(dst_rank - 0.5, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) + _run_func_in_mode(dst_rank - 0.5, torch.add, exec_mode, args=(torch.ones(2, 2), 1)) def _self_py_udf_remote(self, worker_info, x, y, z): rref = rpc.remote(worker_info, my_function, args=(x, y, z)) @@ -4025,17 +4033,9 @@ def test_future_in_rpc(self): def test_future_nested_callback(self): self._test_future_cb(add_use_future_nested_cb) - def _run_func_in_mode(self, to, fn, mode, args=None, kwargs=None): - if mode == RPCExecMode.SYNC: - return rpc.rpc_sync(to, fn, args=args, kwargs=kwargs) - elif mode == RPCExecMode.ASYNC: - return rpc.rpc_async(to, fn, args=args, kwargs=kwargs).wait() - elif mode == RPCExecMode.REMOTE: - return rpc.remote(to, fn, args=args, kwargs=kwargs).to_here() - def _test_async_function_raise(self, mode): with self.assertRaisesRegex(RuntimeError, "Expected error"): - self._run_func_in_mode( + _run_func_in_mode( worker_name((self.rank + 1) % self.world_size), async_raise_func, mode @@ -4059,7 +4059,7 @@ def _test_async_function_wrong_return_type(self, mode): "torch\\.futures\\.Future object," ) with self.assertRaisesRegex(RuntimeError, errMsg): - self._run_func_in_mode( + _run_func_in_mode( worker_name((self.rank + 1) % self.world_size), async_wrong_type, mode @@ -4090,7 +4090,7 @@ def _test_async_function(self, fn, mode=RPCExecMode.SYNC): dst2 = worker_name((self.rank + 2) % self.world_size) args = (dst2, torch.ones(2, 2), 1, 2) - ret = self._run_func_in_mode(dst1, fn, mode, args=args) + ret = _run_func_in_mode(dst1, fn, mode, args=args) self.assertEqual(ret, torch.ones(2, 2) + 3) @dist_init @@ -4183,7 +4183,7 @@ def _test_async_function_multi(self, fn, mode=RPCExecMode.SYNC): num = 20 step = 3 args = (dst2, torch.ones(2, 2), num, step) - ret = self._run_func_in_mode(dst1, fn, mode, args=args) + ret = _run_func_in_mode(dst1, fn, mode, args=args) self.assertEqual(ret, torch.ones(2, 2) + num * step) @dist_init @@ -4227,7 +4227,7 @@ def _test_return_future(self, mode): RuntimeError, "Can not pickle torch.futures.Future" ): - self._run_func_in_mode( + _run_func_in_mode( worker_name((self.rank + 1) % self.world_size), return_future, mode @@ -5217,13 +5217,33 @@ def test_device_maps_gpu(self): rpc_backend_options=options, ) - ret = rpc.rpc_sync( - dst, - TensorPipeAgentCudaRpcTest._gpu_add, - args=(torch.zeros(2).to(0), torch.ones(2).to(0)) - ) - self.assertEqual(ret.device, torch.device(1)) - self.assertEqual(ret, (torch.zeros(2) + torch.ones(2)).to(1)) + # Test dense tensor + for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: + x = torch.ones(2, 2) + y = torch.ones(2, 2) + expected_tensor = (x + y) + ret = _run_func_in_mode(dst, TensorPipeAgentCudaRpcTest._gpu_add, exec_mode, args=(x.to(0), y.to(0))) + self.assertEqual(ret.device, torch.device(1)) + self.assertEqual(ret, expected_tensor.to(1)) + + # Test sparse tensor uncoalesced + for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: + x = build_sparse_tensor() + y = build_sparse_tensor() + expected_tensor = (x + y) + ret = _run_func_in_mode(dst, TensorPipeAgentCudaRpcTest._gpu_add, exec_mode, args=(x.to(0), y.to(0))) + self.assertEqual(ret.device, torch.device(1)) + self.assertEqual(ret, expected_tensor.to(1)) + + # Test sparse tensor coalesced + for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]: + x = build_sparse_tensor().coalesce() + y = build_sparse_tensor().coalesce() + expected_tensor = (x + y) + ret = _run_func_in_mode(dst, TensorPipeAgentCudaRpcTest._gpu_add, exec_mode, args=(x.to(0), y.to(0))) + self.assertEqual(ret.device, torch.device(1)) + self.assertEqual(ret, expected_tensor.to(1)) + rpc.shutdown() @staticmethod @@ -5722,8 +5742,7 @@ def test_device_maps_missing_config_remote(self): def test_device_maps_missing_config_remote_response(self): self._test_device_maps_missing_config_response(RPCExecMode.REMOTE) - @skip_if_lt_x_gpu(2) - def test_device_maps_remote(self): + def _device_maps_remote(self, x, y, expected): options = self.rpc_backend_options dst = worker_name((self.rank + 1) % self.world_size) options.set_device_map(dst, {1: 0}) @@ -5739,14 +5758,29 @@ def test_device_maps_remote(self): rref = rpc.remote( dst, TensorPipeAgentCudaRpcTest._add_to_gpu, - args=(torch.zeros(2), 1) + args=(x, y) ) - self.assertEqual(rref.to_here().device.index, 1) - self.assertEqual(rref.to_here(), torch.ones(2).to(1)) + self.assertEqual(rref.to_here(), expected.to(1)) rpc.shutdown() + @skip_if_lt_x_gpu(2) + def test_device_maps_remote(self): + self._device_maps_remote( + torch.ones(3, 3), + torch.ones(3, 3), + torch.ones(3, 3) + torch.ones(3, 3) + ) + + @skip_if_lt_x_gpu(2) + def test_device_maps_remote_sparse(self): + self._device_maps_remote( + build_sparse_tensor(), + build_sparse_tensor(), + build_sparse_tensor() + build_sparse_tensor() + ) + @staticmethod def _slow_add_on_user_stream(x, y): s0 = torch.cuda.current_stream(x.device) From 4968d0b34ffd624445de0137924288bccf28f51a Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 2 Sep 2021 16:21:52 -0700 Subject: [PATCH 493/530] [POC] .github: Add event name to concurrency (#64402) Summary: This would ensure that manually/API triggered workflows would not cancel other triggered workflows. For example, the manually triggered periodic 11.1 linux job cancelled the scheduled one here, which we may not want: ![image](https://user-images.githubusercontent.com/31798555/131752175-1c99d56e-d344-46e1-b8ac-9c12bba0569a.png). This would be helpful later as we use more dispatched workflows (e.g., for bisect functionality) Pull Request resolved: https://github.com/pytorch/pytorch/pull/64402 Reviewed By: malfet Differential Revision: D30734860 Pulled By: janeyx99 fbshipit-source-id: 220016716094666e9af836fcd716dd529cf23d8a --- .github/scripts/ensure_actions_will_cancel.py | 3 ++- .github/templates/linux_ci_workflow.yml.j2 | 6 +++++- .github/templates/windows_ci_workflow.yml.j2 | 2 +- .github/workflows/add_annotations.yml | 2 +- .github/workflows/auto_label.yml | 2 +- .github/workflows/build_linux_conda.yml | 2 +- .github/workflows/build_linux_libtorch.yml | 2 +- .github/workflows/build_linux_wheels.yml | 2 +- .github/workflows/create_release.yml | 2 +- .../generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml | 6 +++++- .../generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml | 6 +++++- .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 6 +++++- .../generated-linux-bionic-py3.8-gcc9-coverage.yml | 6 +++++- .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 6 +++++- .../generated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 6 +++++- .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml | 6 +++++- .../generated-linux-xenial-py3.6-gcc7-bazel-test.yml | 6 +++++- ...d-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml | 6 +++++- .../generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 6 +++++- .../generated-periodic-win-vs2019-cuda11.1-py3.yml | 2 +- .github/workflows/generated-win-vs2019-cpu-py3.yml | 2 +- .github/workflows/generated-win-vs2019-cuda10.1-py3.yml | 2 +- .github/workflows/generated-win-vs2019-cuda11.3-py3.yml | 2 +- .github/workflows/lint.yml | 2 +- .github/workflows/run_torchbench.yml | 2 +- .github/workflows/test_tools.yml | 2 +- 26 files changed, 71 insertions(+), 26 deletions(-) diff --git a/.github/scripts/ensure_actions_will_cancel.py b/.github/scripts/ensure_actions_will_cancel.py index 18e61d264f514..8af3e80702e56 100755 --- a/.github/scripts/ensure_actions_will_cancel.py +++ b/.github/scripts/ensure_actions_will_cancel.py @@ -15,7 +15,8 @@ def concurrency_key(filename: Path) -> str: workflow_name = filename.with_suffix("").name.replace("_", "-") if workflow_name.startswith("generated-"): workflow_name = workflow_name[len("generated-"):] - return f"{workflow_name}-${{{{ github.event.pull_request.number || github.sha }}}}" + return f"{workflow_name}-${{{{ github.event.pull_request.number || github.sha }}}}" \ + "-${{ github.event_name == 'workflow_dispatch' }}" def should_check(filename: Path) -> bool: diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 314122b699c86..2d856704c3137 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -46,7 +46,7 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }} + group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: @@ -54,9 +54,13 @@ jobs: !{{ ciflow_config.root_job_name }}: runs-on: ubuntu-18.04 if: ${{ !{{ ciflow_config.root_job_condition }} }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running !{{ ciflow_config.root_job_name }} + - name: print labels + run: echo "${LABELS}" {%- endif %} calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 05d739db523e3..4f486dd75f3b7 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -61,7 +61,7 @@ env: {%- endif %} concurrency: - group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }} + group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: diff --git a/.github/workflows/add_annotations.yml b/.github/workflows/add_annotations.yml index 9bb3c1b46e7b4..76f7307e3fb77 100644 --- a/.github/workflows/add_annotations.yml +++ b/.github/workflows/add_annotations.yml @@ -9,7 +9,7 @@ on: concurrency: - group: add-annotations-${{ github.event.pull_request.number || github.sha }} + group: add-annotations-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/auto_label.yml b/.github/workflows/auto_label.yml index 1616ea9c90b8a..6dcb29a70f57a 100644 --- a/.github/workflows/auto_label.yml +++ b/.github/workflows/auto_label.yml @@ -8,7 +8,7 @@ on: concurrency: - group: auto-label-${{ github.event.pull_request.number || github.sha }} + group: auto-label-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/build_linux_conda.yml b/.github/workflows/build_linux_conda.yml index 536a18771831e..b43c2013327ba 100644 --- a/.github/workflows/build_linux_conda.yml +++ b/.github/workflows/build_linux_conda.yml @@ -111,5 +111,5 @@ jobs: python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 concurrency: - group: build-linux-conda-${{ github.event.pull_request.number || github.sha }} + group: build-linux-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/build_linux_libtorch.yml b/.github/workflows/build_linux_libtorch.yml index 9321c6ac8bf88..0a1c653375f9c 100644 --- a/.github/workflows/build_linux_libtorch.yml +++ b/.github/workflows/build_linux_libtorch.yml @@ -110,5 +110,5 @@ jobs: python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 concurrency: - group: build-linux-libtorch-${{ github.event.pull_request.number || github.sha }} + group: build-linux-libtorch-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/build_linux_wheels.yml b/.github/workflows/build_linux_wheels.yml index 15a38f6cee0fe..1f8e5f02e2220 100644 --- a/.github/workflows/build_linux_wheels.yml +++ b/.github/workflows/build_linux_wheels.yml @@ -109,5 +109,5 @@ jobs: python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 concurrency: - group: build-linux-wheels-${{ github.event.pull_request.number || github.sha }} + group: build-linux-wheels-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index 4cd0568be5aad..eea423c00505c 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -48,5 +48,5 @@ jobs: files: ${{env.PT_RELEASE_FILE}} concurrency: - group: create-release-${{ github.event.pull_request.number || github.sha }} + group: create-release-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml index d2111896319df..477fe1bac6fe2 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -27,16 +27,20 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: libtorch-linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} + group: libtorch-linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: ciflow_should_run: runs-on: ubuntu-18.04 if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index 51811ae9eca58..9fd6d7ff8d140 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -27,16 +27,20 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} + group: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: ciflow_should_run: runs-on: ubuntu-18.04 if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index f410f1fc0be20..ee0ca4cf76ce0 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -27,16 +27,20 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: linux-bionic-cuda10.2-py3.9-gcc7-${{ github.event.pull_request.number || github.sha }} + group: linux-bionic-cuda10.2-py3.9-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: ciflow_should_run: runs-on: ubuntu-18.04 if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml index 2bcb4261816de..2103f2b66bdbf 100644 --- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml +++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml @@ -27,16 +27,20 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: linux-bionic-py3.8-gcc9-coverage-${{ github.event.pull_request.number || github.sha }} + group: linux-bionic-py3.8-gcc9-coverage-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: ciflow_should_run: runs-on: ubuntu-18.04 if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/coverage') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml index 656b56548b26b..187f9c1ccfdfb 100644 --- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -27,16 +27,20 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} + group: linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: ciflow_should_run: runs-on: ubuntu-18.04 if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 68b42858c98e3..9fff700c56e84 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -27,16 +27,20 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} + group: linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: ciflow_should_run: runs-on: ubuntu-18.04 if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index b347b52be9509..d1187de624f17 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -27,16 +27,20 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }} + group: linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: ciflow_should_run: runs-on: ubuntu-18.04 if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml index e86cc563a77f8..49d2cd2f2267c 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml @@ -27,16 +27,20 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: linux-xenial-py3.6-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }} + group: linux-xenial-py3.6-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: ciflow_should_run: runs-on: ubuntu-18.04 if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml index b591519e3219c..620e4c3d2d318 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -25,16 +25,20 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} + group: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: ciflow_should_run: runs-on: ubuntu-18.04 if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml index 07593ad07a941..e318e665c9156 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml @@ -25,16 +25,20 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }} + group: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: ciflow_should_run: runs-on: ubuntu-18.04 if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index 61c63f42cadf6..360fdc38c86ad 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -29,7 +29,7 @@ env: USE_CUDA: 1 concurrency: - group: periodic-win-vs2019-cuda11.1-py3-${{ github.event.pull_request.number || github.sha }} + group: periodic-win-vs2019-cuda11.1-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 6ef8e85ac931b..1277a69f1d13d 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -29,7 +29,7 @@ env: no_proxy: localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock concurrency: - group: win-vs2019-cpu-py3-${{ github.event.pull_request.number || github.sha }} + group: win-vs2019-cpu-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml index c158f08731d99..185cb5903e189 100644 --- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml @@ -31,7 +31,7 @@ env: USE_CUDA: 1 concurrency: - group: win-vs2019-cuda10.1-py3-${{ github.event.pull_request.number || github.sha }} + group: win-vs2019-cuda10.1-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index bd945c3255a0a..b339e79926f53 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -31,7 +31,7 @@ env: USE_CUDA: 1 concurrency: - group: win-vs2019-cuda11.3-py3-${{ github.event.pull_request.number || github.sha }} + group: win-vs2019-cuda11.3-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index f69d2b01ab35a..a1b6182aedaf4 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -467,5 +467,5 @@ jobs: fi concurrency: - group: lint-${{ github.event.pull_request.number || github.sha }} + group: lint-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml index 786d25f4e3b0f..cee27e1866282 100644 --- a/.github/workflows/run_torchbench.yml +++ b/.github/workflows/run_torchbench.yml @@ -66,5 +66,5 @@ jobs: path: ~/.torchbench/bisection/pr${{ github.event.number }} concurrency: - group: run-torchbench-${{ github.event.pull_request.number || github.sha }} + group: run-torchbench-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/test_tools.yml b/.github/workflows/test_tools.yml index afc790bb10e2b..02ae0dd34e4fd 100644 --- a/.github/workflows/test_tools.yml +++ b/.github/workflows/test_tools.yml @@ -31,5 +31,5 @@ jobs: run: python -m unittest discover -vs tools/test -p 'test_*.py' concurrency: - group: test-tools-${{ github.event.pull_request.number || github.sha }} + group: test-tools-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true From 046ed57a4d0f8296eb6863a835407cbd9711d832 Mon Sep 17 00:00:00 2001 From: Zafar Takhirov Date: Thu, 2 Sep 2021 16:58:36 -0700 Subject: [PATCH 494/530] Revert D30055886: [quant] AO migration of the `quantize.py` Test Plan: revert-hammer Differential Revision: D30055886 (https://github.com/pytorch/pytorch/commit/44e3ed88c9a1bd9ee6b0168ba5271a2c6b006cc8) Original commit changeset: 8ef7470f9fa6 fbshipit-source-id: c5bd3ead43a2d44b9e56872ec5bd7a195bdac725 --- test/quantization/ao_migration/__init__.py | 0 .../ao_migration/test_quantize_py.py | 63 -- test/test_quantization.py | 2 - torch/ao/quantization/__init__.py | 0 torch/ao/quantization/quantize.py | 580 ----------------- torch/quantization/fx/convert.py | 2 +- torch/quantization/fx/prepare.py | 2 +- .../quantization/fx/quantization_patterns.py | 2 +- torch/quantization/fx/utils.py | 2 +- torch/quantization/quantize.py | 604 +++++++++++++++++- 10 files changed, 581 insertions(+), 676 deletions(-) delete mode 100644 test/quantization/ao_migration/__init__.py delete mode 100644 test/quantization/ao_migration/test_quantize_py.py delete mode 100644 torch/ao/quantization/__init__.py delete mode 100644 torch/ao/quantization/quantize.py diff --git a/test/quantization/ao_migration/__init__.py b/test/quantization/ao_migration/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/quantization/ao_migration/test_quantize_py.py b/test/quantization/ao_migration/test_quantize_py.py deleted file mode 100644 index 086364bef9d56..0000000000000 --- a/test/quantization/ao_migration/test_quantize_py.py +++ /dev/null @@ -1,63 +0,0 @@ -from torch.testing._internal.common_utils import TestCase - -import importlib -from typing import List - - -class AOMigrationTestCase(TestCase): - def _test_package_import(self, package_name: str): - r"""Tests the module import by making sure that all the internals match - (except the dunder methods).""" - old_module = importlib.import_module(f'torch.quantization.{package_name}') - new_module = importlib.import_module(f'torch.ao.quantization.{package_name}') - old_module_dir = set(dir(old_module)) - new_module_dir = set(dir(new_module)) - # Remove magic modules from checking in subsets - for el in list(old_module_dir): - if el[:2] == '__' and el[-2:] == '__': - old_module_dir.remove(el) - assert (old_module_dir <= new_module_dir), \ - f"Importing {old_module} vs. {new_module} does not match: " \ - f"{old_module_dir - new_module_dir}" - - def _test_function_import(self, package_name: str, function_list: List[str]): - r"""Tests individual function list import by comparing the functions - and their hashes.""" - old_location = importlib.import_module(f'torch.quantization.{package_name}') - new_location = importlib.import_module(f'torch.ao.quantization.{package_name}') - for fn_name in function_list: - old_function = getattr(old_location, fn_name) - new_function = getattr(new_location, fn_name) - assert old_function == new_function, f"Functions don't match: {fn_name}" - assert hash(old_function) == hash(new_function), \ - f"Hashes don't match: {old_function}({hash(old_function)}) vs. " \ - f"{new_function}({hash(new_function)})" - - -class TestAOMigrationQuantizePy(AOMigrationTestCase): - def test_package_import(self): - self._test_package_import('quantize') - - def test_function_import(self): - function_list = [ - '_convert', - '_observer_forward_hook', - '_propagate_qconfig_helper', - '_remove_activation_post_process', - '_remove_qconfig', - 'add_observer_', - 'add_quant_dequant', - 'convert', - 'get_observer_dict', - 'get_unique_devices_', - 'is_activation_post_process', - 'prepare', - 'prepare_qat', - 'propagate_qconfig_', - 'quantize', - 'quantize_dynamic', - 'quantize_qat', - 'register_activation_post_process_hook', - 'swap_module', - ] - self._test_function_import('quantize', function_list) diff --git a/test/test_quantization.py b/test/test_quantization.py index ffc242ed77e33..867151373a5b6 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -100,8 +100,6 @@ from quantization.jit.test_fusion_passes import TestFusionPasses # noqa: F401 from quantization.jit.test_deprecated_jit_quant import TestDeprecatedJitQuantized # noqa: F401 -# AO Migration tests -from quantization.ao_migration.test_quantize_py import TestAOMigrationQuantizePy # noqa: F401 if __name__ == '__main__': run_tests() diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py deleted file mode 100644 index 92a794ed7b631..0000000000000 --- a/torch/ao/quantization/quantize.py +++ /dev/null @@ -1,580 +0,0 @@ -import copy -import itertools -import warnings - -import torch -import torch.nn as nn -import torch.nn.quantized as nnq -from torch.nn.intrinsic import _FusedModule - -# Import the duplicated API -from torch.quantization.quantization_mappings import ( - get_default_dynamic_quant_module_mappings, - get_default_static_quant_module_mappings, - get_default_qat_module_mappings, - get_default_qconfig_propagation_list, - no_observer_set, - _has_special_act_post_process, - _get_special_act_post_process, -) -from torch.quantization.stubs import DeQuantStub, QuantWrapper -from torch.quantization.qconfig import ( - add_module_to_qconfig_obs_ctr, - default_dynamic_qconfig, - float16_dynamic_qconfig, - float_qparams_weight_only_qconfig) - -def is_activation_post_process(module): - return (isinstance(module, torch.quantization.ObserverBase) or - isinstance(module, torch.quantization.FakeQuantizeBase)) - -def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None, - qconfig_parent=None, prefix=''): - r"""This is a helper function for `propagate_qconfig_` - - Args: - module: input module - qconfig_dict: dictionary that maps from name of submodule to quantization - configuration - allow_list: list of quantizable modules - qconfig_parent: quantization config of parent module, we will fallback to - this config when there is no specified config for current - module - prefix: corresponding prefix of the current module, used as key in - qconfig_dict - - Return: - None, module is modified inplace with qconfig attached - """ - # TODO: Add test - if allow_list is None: - allow_list = get_default_qconfig_propagation_list() - - module_qconfig = qconfig_dict.get(type(module), qconfig_parent) - module_qconfig = qconfig_dict.get(prefix, module_qconfig) - module_qconfig = getattr(module, 'qconfig', module_qconfig) - - torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module) - - qconfig_with_device_check = add_module_to_qconfig_obs_ctr(module_qconfig, module) - module.qconfig = qconfig_with_device_check - - for name, child in module.named_children(): - module_prefix = prefix + '.' + name if prefix else name - _propagate_qconfig_helper(child, qconfig_dict, allow_list, - qconfig_with_device_check, module_prefix) - -# TODO(jerryzh): expose allow_list -def propagate_qconfig_(module, qconfig_dict=None, allow_list=None): - r"""Propagate qconfig through the module hierarchy and assign `qconfig` - attribute on each leaf module - - Args: - module: input module - qconfig_dict: dictionary that maps from name or type of submodule to - quantization configuration, qconfig applies to all submodules of a - given module unless qconfig for the submodules are specified (when - the submodule already has qconfig attribute) - - Return: - None, module is modified inplace with qconfig attached - """ - if qconfig_dict is None: - qconfig_dict = {} - _propagate_qconfig_helper(module, qconfig_dict, allow_list) - -def _observer_forward_hook(self, input, output): - r"""Forward hook that calls observer on the output - """ - return self.activation_post_process(output) - -def register_activation_post_process_hook(module): - assert hasattr(module, 'activation_post_process'), \ - 'Expect activation_post_process attribut already attached to the module' - return module.register_forward_hook(_observer_forward_hook) - -def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=None, device=None, custom_module_class_mapping=None): - r"""Add observer for the leaf child of the module. - - This function insert observer module to all leaf child module that - has a valid qconfig attribute. - - Args: - module: input module with qconfig attributes for all the leaf modules that we want to quantize - device: parent device, if any - non_leaf_module_list: list of non-leaf modules we want to add observer - - Return: - None, module is modified inplace with added observer modules and forward_hooks - """ - if qconfig_propagation_list is None: - qconfig_propagation_list = get_default_qconfig_propagation_list() - - if custom_module_class_mapping is None: - custom_module_class_mapping = {} - - # respect device affinity when adding observers - if device is None: - devices = get_unique_devices_(module) - assert len(devices) <= 1, ( - "add_observer_ only works with cpu or single-device CUDA modules, " - "but got devices {}".format(devices) - ) - device = next(iter(devices)) if len(devices) > 0 else None - - def get_activation_post_process(qconfig, device, special_act_post_process=None): - activation = qconfig.activation() if special_act_post_process is None else special_act_post_process() - if device is not None: - activation.to(device) - return activation - - def needs_observation(m): - return hasattr(m, 'qconfig') and m.qconfig is not None - - def insert_activation_post_process(m, special_act_post_process=None): - """ Adds an activation post process module and register - a post hook that calls the module - """ - # We don't insert observer/fake_quantize for DeQuantStub - if needs_observation(m) and not isinstance(m, DeQuantStub): - # observer and hook will be gone after we swap the module - m.add_module('activation_post_process', get_activation_post_process( - m.qconfig, device, special_act_post_process)) - # Register observer as the first entry in the hook list - # All post forward hooks are preserved and will be executed after the observer before convert - handle = register_activation_post_process_hook(m) - m._forward_hooks.move_to_end(handle.id, last=False) - - for name, child in module.named_children(): - if type(child) in [nnq.FloatFunctional, nnq.QFunctional]: - if needs_observation(child): - child.activation_post_process = get_activation_post_process(child.qconfig, device) - elif isinstance(child, _FusedModule): - # activation_post_process are now added directly to nn.Sequentail/_FusedModule - if needs_observation(child): - insert_activation_post_process(child) - elif _has_special_act_post_process(child): - special_act_post_process = _get_special_act_post_process(child) - insert_activation_post_process(child, special_act_post_process) - elif non_leaf_module_list is not None and type(child) in non_leaf_module_list: - if needs_observation(child): - insert_activation_post_process(child) - elif needs_observation(child) and type(child) in custom_module_class_mapping: - observed_child = custom_module_class_mapping[type(child)].from_float(child) - setattr(module, name, observed_child) - # TODO: These are the modules that cannot be observed - # Once there are more, we should move them to a separate list - if custom_module_class_mapping[type(child)] not in no_observer_set(): - insert_activation_post_process(observed_child) - else: - add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping) - - # Insert observers only for leaf nodes, note that this observer is for - # the output of the module, for input QuantStub will observe them - if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ - and type(module) in qconfig_propagation_list: - insert_activation_post_process(module) - -def get_unique_devices_(module): - return {p.device for p in module.parameters()} | \ - {p.device for p in module.buffers()} - -def add_quant_dequant(module): - r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig - Note that this function will modify the children of module inplace and it - can return a new module which wraps the input module as well. - - Args: - module: input module with qconfig attributes for all the leaf modules - that we want to quantize - - Return: - Either the inplace modified module with submodules wrapped in - `QuantWrapper` based on qconfig or a new `QuantWrapper` module which - wraps the input module, the latter case only happens when the input - module is a leaf module and we want to quantize it. - """ - if len(module._modules) == 0 and hasattr(module, 'qconfig') and module.qconfig: - return QuantWrapper(module) - - for name, child in module.named_children(): - module._modules[name] = add_quant_dequant(child) - return module - -def prepare(model, inplace=False, allow_list=None, - observer_non_leaf_module_list=None, - prepare_custom_config_dict=None): - r"""Prepares a copy of the model for quantization calibration or quantization-aware training. - - Quantization configuration should be assigned preemptively - to individual submodules in `.qconfig` attribute. - - The model will be attached with observer or fake quant modules, and qconfig - will be propagated. - - Args: - `model`: input model to be modified in-place - `inplace`: carry out model transformations in-place, the original module is mutated - `allow_list`: list of quantizable modules - `observer_non_leaf_module_list`: list of non-leaf modules we want to add observer - `prepare_custom_config_dict`: customization configuration dictionary for prepare function - - .. code-block:: python - - # Example of prepare_custom_config_dict: - prepare_custom_config_dict = { - # user will manually define the corresponding observed - # module class which has a from_float class method that converts - # float custom module to observed custom module - "float_to_observed_custom_module_class": { - CustomModule: ObservedCustomModule - } - } - - """ - torch._C._log_api_usage_once("quantization_api.quantize.prepare") - if prepare_custom_config_dict is None: - prepare_custom_config_dict = {} - custom_module_class_mapping = prepare_custom_config_dict.get("float_to_observed_custom_module_class", {}) - - if not inplace: - model = copy.deepcopy(model) - - # TODO: remove allow_list - qconfig_propagation_list = allow_list - if qconfig_propagation_list is None: - qconfig_propagation_list = get_default_qconfig_propagation_list() - propagate_qconfig_(model, qconfig_dict=None) - - # sanity check common API misusage - if not any(hasattr(m, 'qconfig') and m.qconfig for m in model.modules()): - warnings.warn("None of the submodule got qconfig applied. Make sure you " - "passed correct configuration through `qconfig_dict` or " - "by assigning the `.qconfig` attribute directly on submodules") - - add_observer_( - model, qconfig_propagation_list, observer_non_leaf_module_list, - custom_module_class_mapping=custom_module_class_mapping) - return model - -def _remove_activation_post_process(module): - # TODO: maybe we should change activation_post_process to _activation_post_process - # to prevent it from being used by user - if hasattr(module, 'activation_post_process') and \ - is_activation_post_process(module.activation_post_process): - delattr(module, 'activation_post_process') - - # remove activation_post_proceess hook - handle_ids_to_remove = set() - for handle_id, hook_fn in module._forward_hooks.items(): - if hook_fn is _observer_forward_hook: - handle_ids_to_remove.add(handle_id) - for handle_id in handle_ids_to_remove: - module._forward_hooks.pop(handle_id) - -# TODO: rename to something more general -def _remove_qconfig(module): - r"""Clean up the qconfig left in the module so that new qconfig can be - propagated. - - Args: - module: module to be cleaned up - """ - for child in module.children(): - _remove_qconfig(child) - - if hasattr(module, "qconfig"): - del module.qconfig - - _remove_activation_post_process(module) - -def quantize(model, run_fn, run_args, mapping=None, inplace=False): - r"""Quantize the input float model with post training static quantization. - - First it will prepare the model for calibration, then it calls - `run_fn` which will run the calibration step, after that we will - convert the model to a quantized model. - - Args: - model: input float model - run_fn: a calibration function for calibrating the prepared model - run_args: positional arguments for `run_fn` - inplace: carry out model transformations in-place, the original module is mutated - mapping: correspondence between original module types and quantized counterparts - - Return: - Quantized model. - """ - torch._C._log_api_usage_once("quantization_api.quantize.quantize") - if mapping is None: - mapping = get_default_static_quant_module_mappings() - if not inplace: - model = copy.deepcopy(model) - model.eval() - prepare(model, inplace=True) - run_fn(model, *run_args) - convert(model, mapping, inplace=True) - return model - -def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8, - mapping=None, inplace=False): - r"""Converts a float model to dynamic (i.e. weights-only) quantized model. - - Replaces specified modules with dynamic weight-only quantized versions and output the quantized model. - - For simplest usage provide `dtype` argument that can be float16 or qint8. Weight-only quantization - by default is performed for layers with large weights size - i.e. Linear and RNN variants. - - Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`. - If `qconfig` is provided, the `dtype` argument is ignored. - - Args: - model: input model - qconfig_spec: Either: - - - A dictionary that maps from name or type of submodule to quantization - configuration, qconfig applies to all submodules of a given - module unless qconfig for the submodules are specified (when the - submodule already has qconfig attribute). Entries in the dictionary - need to be QConfigDynamic instances. - - - A set of types and/or submodule names to apply dynamic quantization to, - in which case the `dtype` argument is used to specify the bit-width - - inplace: carry out model transformations in-place, the original module is mutated - mapping: maps type of a submodule to a type of corresponding dynamically quantized version - with which the submodule needs to be replaced - - """ - torch._C._log_api_usage_once("quantization_api.quantize.quantize_dynamic") - if qconfig_spec is None: - if dtype == torch.qint8: - qconfig_spec = { - nn.Linear : default_dynamic_qconfig, - nn.LSTM : default_dynamic_qconfig, - nn.GRU : default_dynamic_qconfig, - nn.LSTMCell : default_dynamic_qconfig, - nn.RNNCell : default_dynamic_qconfig, - nn.GRUCell : default_dynamic_qconfig, - } - elif dtype == torch.float16: - qconfig_spec = { - nn.Linear : float16_dynamic_qconfig, - nn.LSTM : float16_dynamic_qconfig, - nn.GRU : float16_dynamic_qconfig, - nn.LSTMCell : float16_dynamic_qconfig, - nn.RNNCell : float16_dynamic_qconfig, - nn.GRUCell : float16_dynamic_qconfig, - } - elif dtype == torch.quint8: - qconfig_spec = { - nn.EmbeddingBag : float_qparams_weight_only_qconfig, - } - else: - raise ValueError( - "Don't know how to quantize with default settings for {}. Provide full qconfig please".format(dtype)) - elif isinstance(qconfig_spec, set): - if dtype is torch.qint8: - default_qconfig = default_dynamic_qconfig - elif dtype is torch.float16: - default_qconfig = float16_dynamic_qconfig - elif dtype is torch.quint8: - default_qconfig = float_qparams_weight_only_qconfig - else: - raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype)) - qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig))) - - if mapping is None: - mapping = get_default_dynamic_quant_module_mappings() - - if not inplace: - model = copy.deepcopy(model) - model.eval() - propagate_qconfig_(model, qconfig_spec) - convert(model, mapping, inplace=True) - return model - -def prepare_qat(model, mapping=None, inplace=False): - r""" - Prepares a copy of the model for quantization calibration or - quantization-aware training and converts it to quantized version. - - Quantization configuration should be assigned preemptively - to individual submodules in `.qconfig` attribute. - - Args: - model: input model to be modified in-place - mapping: dictionary that maps float modules to quantized modules to be - replaced. - inplace: carry out model transformations in-place, the original module - is mutated - """ - torch._C._log_api_usage_once("quantization_api.quantize.prepare_qat") - if mapping is None: - mapping = get_default_qat_module_mappings() - - if not inplace: - model = copy.deepcopy(model) - - propagate_qconfig_(model, qconfig_dict=None) - convert(model, mapping=mapping, inplace=True, remove_qconfig=False) - prepare(model, observer_non_leaf_module_list=set(mapping.values()), inplace=True) - return model - -def quantize_qat(model, run_fn, run_args, inplace=False): - r"""Do quantization aware training and output a quantized model - - Args: - model: input model - run_fn: a function for evaluating the prepared model, can be a - function that simply runs the prepared model or a training - loop - run_args: positional arguments for `run_fn` - - Return: - Quantized model. - """ - torch._C._log_api_usage_once("quantization_api.quantize.quantize_qat") - if not inplace: - model = copy.deepcopy(model) - model.train() - prepare_qat(model, inplace=True) - run_fn(model, *run_args) - convert(model, inplace=True) - return model - -def convert( - module, mapping=None, inplace=False, remove_qconfig=True, - convert_custom_config_dict=None): - r"""Converts submodules in input module to a different module according to `mapping` - by calling `from_float` method on the target module class. And remove qconfig at the - end if remove_qconfig is set to True. - - Args: - `module`: prepared and calibrated module - `mapping`: a dictionary that maps from source module type to target - module type, can be overwritten to allow swapping user defined - Modules - `inplace`: carry out model transformations in-place, the original module - is mutated - `convert_custom_config_dict`: custom configuration dictionary for convert function - - .. code-block:: python - - # Example of convert_custom_config_dict: - convert_custom_config_dict = { - # user will manually define the corresponding quantized - # module class which has a from_observed class method that converts - # observed custom module to quantized custom module - "observed_to_quantized_custom_module_class": { - ObservedCustomModule: QuantizedCustomModule - } - } - - """ - torch._C._log_api_usage_once("quantization_api.quantize.convert") - if not inplace: - module = copy.deepcopy(module) - _convert( - module, mapping, inplace=True, - convert_custom_config_dict=convert_custom_config_dict) - if remove_qconfig: - _remove_qconfig(module) - return module - -def _convert( - module, mapping=None, inplace=False, - convert_custom_config_dict=None): - r"""Converts submodules in input module to a different module according to `mapping` - by calling `from_float` method on the target module class - - Args: - module: input module - mapping: a dictionary that maps from source module type to target - module type, can be overwritten to allow swapping user defined - Modules - inplace: carry out model transformations in-place, the original module - is mutated - - """ - if mapping is None: - mapping = get_default_static_quant_module_mappings() - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {}) - - if not inplace: - module = copy.deepcopy(module) - reassign = {} - for name, mod in module.named_children(): - # both fused modules and observed custom modules are - # swapped as one unit - if not isinstance(mod, _FusedModule) and \ - type(mod) not in custom_module_class_mapping: - _convert(mod, mapping, True, # inplace - convert_custom_config_dict) - reassign[name] = swap_module(mod, mapping, custom_module_class_mapping) - - for key, value in reassign.items(): - module._modules[key] = value - - return module - -def swap_module(mod, mapping, custom_module_class_mapping): - r"""Swaps the module if it has a quantized counterpart and it has an - `observer` attached. - - Args: - mod: input module - mapping: a dictionary that maps from nn module to nnq module - - Return: - The corresponding quantized module of `mod` - """ - new_mod = mod - if hasattr(mod, 'qconfig') and mod.qconfig is not None: - swapped = False - if type(mod) in custom_module_class_mapping: - new_mod = custom_module_class_mapping[type(mod)].from_observed(mod) - swapped = True - elif type(mod) in mapping: - new_mod = mapping[type(mod)].from_float(mod) - swapped = True - - if swapped: - # Preserve module's pre forward hooks. They'll be called on quantized input - for pre_hook_fn in mod._forward_pre_hooks.values(): - new_mod.register_forward_pre_hook(pre_hook_fn) - # Preserve module's post forward hooks except _observer_forward_hook - # After convert they'll work with quantized output - for hook_fn in mod._forward_hooks.values(): - if hook_fn is not _observer_forward_hook: - new_mod.register_forward_hook(hook_fn) - - # respect device affinity when swapping modules - devices = get_unique_devices_(mod) - assert len(devices) <= 1, ( - "swap_module only works with cpu or single-device CUDA modules, " - "but got devices {}".format(devices) - ) - device = next(iter(devices)) if len(devices) > 0 else None - if device: - new_mod.to(device) - return new_mod - -def get_observer_dict(mod, target_dict, prefix=""): - r"""Traverse the modules and save all observers into dict. - This is mainly used for quantization accuracy debug - Args: - mod: the top module we want to save all observers - prefix: the prefix for the current module - target_dict: the dictionary used to save all the observers - """ - def get_prefix(prefix): - return prefix if prefix == "" else prefix + '.' - - if hasattr(mod, 'activation_post_process'): - target_dict[get_prefix(prefix) + 'activation_post_process'] = mod.activation_post_process - for name, child in mod.named_children(): - module_prefix = get_prefix(prefix) + name if prefix else name - get_observer_dict(child, target_dict, module_prefix) diff --git a/torch/quantization/fx/convert.py b/torch/quantization/fx/convert.py index e00e4aaad1b68..867b0b24cf7ad 100644 --- a/torch/quantization/fx/convert.py +++ b/torch/quantization/fx/convert.py @@ -36,7 +36,7 @@ WEIGHT_INDEX_DICT, ) -from torch.ao.quantization.quantize import ( +from ..quantize import ( _remove_qconfig, is_activation_post_process, ) diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py index 0b65e339ce0a3..d2bb96ab7a5c0 100644 --- a/torch/quantization/fx/prepare.py +++ b/torch/quantization/fx/prepare.py @@ -67,7 +67,7 @@ get_default_qat_module_mappings, ) -from torch.ao.quantization.quantize import ( +from ..quantize import ( is_activation_post_process, convert ) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 418cae1511c35..3f54a6a1e2743 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -24,7 +24,7 @@ get_qparam_dict, ) -from torch.ao.quantization.quantize import ( +from ..quantize import ( is_activation_post_process, ) diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 3c9adc2bc311b..10f8b06b6dfed 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -2,7 +2,7 @@ import torch import torch.nn as nn from ..utils import is_per_tensor, is_per_channel -from torch.ao.quantization.quantize import is_activation_post_process +from ..quantize import is_activation_post_process from torch.fx import GraphModule, map_arg diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index 5b0f4ed8779ab..674ed59ac86ed 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -1,30 +1,580 @@ -# flake8: noqa: F401 -r""" -This file is in the process of migration to `torch/ao/quantization`, and -is kept here for compatibility while the migration process is ongoing. -If you are adding a new entry/functionality, please, add it to the -`torch/ao/quantization/quantize.py`, while adding an import statement -here. -""" +import copy +import itertools +import warnings import torch +import torch.nn as nn +import torch.nn.quantized as nnq +from torch.nn.intrinsic import _FusedModule -from torch.ao.quantization.quantize import _convert -from torch.ao.quantization.quantize import _observer_forward_hook -from torch.ao.quantization.quantize import _propagate_qconfig_helper -from torch.ao.quantization.quantize import _remove_activation_post_process -from torch.ao.quantization.quantize import _remove_qconfig -from torch.ao.quantization.quantize import add_observer_ -from torch.ao.quantization.quantize import add_quant_dequant -from torch.ao.quantization.quantize import convert -from torch.ao.quantization.quantize import get_observer_dict -from torch.ao.quantization.quantize import get_unique_devices_ -from torch.ao.quantization.quantize import is_activation_post_process -from torch.ao.quantization.quantize import prepare -from torch.ao.quantization.quantize import prepare_qat -from torch.ao.quantization.quantize import propagate_qconfig_ -from torch.ao.quantization.quantize import quantize -from torch.ao.quantization.quantize import quantize_dynamic -from torch.ao.quantization.quantize import quantize_qat -from torch.ao.quantization.quantize import register_activation_post_process_hook -from torch.ao.quantization.quantize import swap_module +from .quantization_mappings import ( + get_default_dynamic_quant_module_mappings, + get_default_static_quant_module_mappings, + get_default_qat_module_mappings, + get_default_qconfig_propagation_list, + no_observer_set, + _has_special_act_post_process, + _get_special_act_post_process, +) + +from .stubs import DeQuantStub, QuantWrapper +from .qconfig import ( + add_module_to_qconfig_obs_ctr, + default_dynamic_qconfig, + float16_dynamic_qconfig, + float_qparams_weight_only_qconfig) + +def is_activation_post_process(module): + return (isinstance(module, torch.quantization.ObserverBase) or + isinstance(module, torch.quantization.FakeQuantizeBase)) + +def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None, + qconfig_parent=None, prefix=''): + r"""This is a helper function for `propagate_qconfig_` + + Args: + module: input module + qconfig_dict: dictionary that maps from name of submodule to quantization + configuration + allow_list: list of quantizable modules + qconfig_parent: quantization config of parent module, we will fallback to + this config when there is no specified config for current + module + prefix: corresponding prefix of the current module, used as key in + qconfig_dict + + Return: + None, module is modified inplace with qconfig attached + """ + # TODO: Add test + if allow_list is None: + allow_list = get_default_qconfig_propagation_list() + + module_qconfig = qconfig_dict.get(type(module), qconfig_parent) + module_qconfig = qconfig_dict.get(prefix, module_qconfig) + module_qconfig = getattr(module, 'qconfig', module_qconfig) + + torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module) + + qconfig_with_device_check = add_module_to_qconfig_obs_ctr(module_qconfig, module) + module.qconfig = qconfig_with_device_check + + for name, child in module.named_children(): + module_prefix = prefix + '.' + name if prefix else name + _propagate_qconfig_helper(child, qconfig_dict, allow_list, + qconfig_with_device_check, module_prefix) + +# TODO(jerryzh): expose allow_list +def propagate_qconfig_(module, qconfig_dict=None, allow_list=None): + r"""Propagate qconfig through the module hierarchy and assign `qconfig` + attribute on each leaf module + + Args: + module: input module + qconfig_dict: dictionary that maps from name or type of submodule to + quantization configuration, qconfig applies to all submodules of a + given module unless qconfig for the submodules are specified (when + the submodule already has qconfig attribute) + + Return: + None, module is modified inplace with qconfig attached + """ + if qconfig_dict is None: + qconfig_dict = {} + _propagate_qconfig_helper(module, qconfig_dict, allow_list) + +def _observer_forward_hook(self, input, output): + r"""Forward hook that calls observer on the output + """ + return self.activation_post_process(output) + +def register_activation_post_process_hook(module): + assert hasattr(module, 'activation_post_process'), \ + 'Expect activation_post_process attribut already attached to the module' + return module.register_forward_hook(_observer_forward_hook) + +def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=None, device=None, custom_module_class_mapping=None): + r"""Add observer for the leaf child of the module. + + This function insert observer module to all leaf child module that + has a valid qconfig attribute. + + Args: + module: input module with qconfig attributes for all the leaf modules that we want to quantize + device: parent device, if any + non_leaf_module_list: list of non-leaf modules we want to add observer + + Return: + None, module is modified inplace with added observer modules and forward_hooks + """ + if qconfig_propagation_list is None: + qconfig_propagation_list = get_default_qconfig_propagation_list() + + if custom_module_class_mapping is None: + custom_module_class_mapping = {} + + # respect device affinity when adding observers + if device is None: + devices = get_unique_devices_(module) + assert len(devices) <= 1, ( + "add_observer_ only works with cpu or single-device CUDA modules, " + "but got devices {}".format(devices) + ) + device = next(iter(devices)) if len(devices) > 0 else None + + def get_activation_post_process(qconfig, device, special_act_post_process=None): + activation = qconfig.activation() if special_act_post_process is None else special_act_post_process() + if device is not None: + activation.to(device) + return activation + + def needs_observation(m): + return hasattr(m, 'qconfig') and m.qconfig is not None + + def insert_activation_post_process(m, special_act_post_process=None): + """ Adds an activation post process module and register + a post hook that calls the module + """ + # We don't insert observer/fake_quantize for DeQuantStub + if needs_observation(m) and not isinstance(m, DeQuantStub): + # observer and hook will be gone after we swap the module + m.add_module('activation_post_process', get_activation_post_process( + m.qconfig, device, special_act_post_process)) + # Register observer as the first entry in the hook list + # All post forward hooks are preserved and will be executed after the observer before convert + handle = register_activation_post_process_hook(m) + m._forward_hooks.move_to_end(handle.id, last=False) + + for name, child in module.named_children(): + if type(child) in [nnq.FloatFunctional, nnq.QFunctional]: + if needs_observation(child): + child.activation_post_process = get_activation_post_process(child.qconfig, device) + elif isinstance(child, _FusedModule): + # activation_post_process are now added directly to nn.Sequentail/_FusedModule + if needs_observation(child): + insert_activation_post_process(child) + elif _has_special_act_post_process(child): + special_act_post_process = _get_special_act_post_process(child) + insert_activation_post_process(child, special_act_post_process) + elif non_leaf_module_list is not None and type(child) in non_leaf_module_list: + if needs_observation(child): + insert_activation_post_process(child) + elif needs_observation(child) and type(child) in custom_module_class_mapping: + observed_child = custom_module_class_mapping[type(child)].from_float(child) + setattr(module, name, observed_child) + # TODO: These are the modules that cannot be observed + # Once there are more, we should move them to a separate list + if custom_module_class_mapping[type(child)] not in no_observer_set(): + insert_activation_post_process(observed_child) + else: + add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping) + + # Insert observers only for leaf nodes, note that this observer is for + # the output of the module, for input QuantStub will observe them + if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ + and type(module) in qconfig_propagation_list: + insert_activation_post_process(module) + +def get_unique_devices_(module): + return {p.device for p in module.parameters()} | \ + {p.device for p in module.buffers()} + +def add_quant_dequant(module): + r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig + Note that this function will modify the children of module inplace and it + can return a new module which wraps the input module as well. + + Args: + module: input module with qconfig attributes for all the leaf modules + that we want to quantize + + Return: + Either the inplace modified module with submodules wrapped in + `QuantWrapper` based on qconfig or a new `QuantWrapper` module which + wraps the input module, the latter case only happens when the input + module is a leaf module and we want to quantize it. + """ + if len(module._modules) == 0 and hasattr(module, 'qconfig') and module.qconfig: + return QuantWrapper(module) + + for name, child in module.named_children(): + module._modules[name] = add_quant_dequant(child) + return module + +def prepare(model, inplace=False, allow_list=None, + observer_non_leaf_module_list=None, + prepare_custom_config_dict=None): + r"""Prepares a copy of the model for quantization calibration or quantization-aware training. + + Quantization configuration should be assigned preemptively + to individual submodules in `.qconfig` attribute. + + The model will be attached with observer or fake quant modules, and qconfig + will be propagated. + + Args: + `model`: input model to be modified in-place + `inplace`: carry out model transformations in-place, the original module is mutated + `allow_list`: list of quantizable modules + `observer_non_leaf_module_list`: list of non-leaf modules we want to add observer + `prepare_custom_config_dict`: customization configuration dictionary for prepare function + + .. code-block:: python + + # Example of prepare_custom_config_dict: + prepare_custom_config_dict = { + # user will manually define the corresponding observed + # module class which has a from_float class method that converts + # float custom module to observed custom module + "float_to_observed_custom_module_class": { + CustomModule: ObservedCustomModule + } + } + + """ + torch._C._log_api_usage_once("quantization_api.quantize.prepare") + if prepare_custom_config_dict is None: + prepare_custom_config_dict = {} + custom_module_class_mapping = prepare_custom_config_dict.get("float_to_observed_custom_module_class", {}) + + if not inplace: + model = copy.deepcopy(model) + + # TODO: remove allow_list + qconfig_propagation_list = allow_list + if qconfig_propagation_list is None: + qconfig_propagation_list = get_default_qconfig_propagation_list() + propagate_qconfig_(model, qconfig_dict=None) + + # sanity check common API misusage + if not any(hasattr(m, 'qconfig') and m.qconfig for m in model.modules()): + warnings.warn("None of the submodule got qconfig applied. Make sure you " + "passed correct configuration through `qconfig_dict` or " + "by assigning the `.qconfig` attribute directly on submodules") + + add_observer_( + model, qconfig_propagation_list, observer_non_leaf_module_list, + custom_module_class_mapping=custom_module_class_mapping) + return model + +def _remove_activation_post_process(module): + # TODO: maybe we should change activation_post_process to _activation_post_process + # to prevent it from being used by user + if hasattr(module, 'activation_post_process') and \ + is_activation_post_process(module.activation_post_process): + delattr(module, 'activation_post_process') + + # remove activation_post_proceess hook + handle_ids_to_remove = set() + for handle_id, hook_fn in module._forward_hooks.items(): + if hook_fn is _observer_forward_hook: + handle_ids_to_remove.add(handle_id) + for handle_id in handle_ids_to_remove: + module._forward_hooks.pop(handle_id) + +# TODO: rename to something more general +def _remove_qconfig(module): + r"""Clean up the qconfig left in the module so that new qconfig can be + propagated. + + Args: + module: module to be cleaned up + """ + for child in module.children(): + _remove_qconfig(child) + + if hasattr(module, "qconfig"): + del module.qconfig + + _remove_activation_post_process(module) + +def quantize(model, run_fn, run_args, mapping=None, inplace=False): + r"""Quantize the input float model with post training static quantization. + + First it will prepare the model for calibration, then it calls + `run_fn` which will run the calibration step, after that we will + convert the model to a quantized model. + + Args: + model: input float model + run_fn: a calibration function for calibrating the prepared model + run_args: positional arguments for `run_fn` + inplace: carry out model transformations in-place, the original module is mutated + mapping: correspondence between original module types and quantized counterparts + + Return: + Quantized model. + """ + torch._C._log_api_usage_once("quantization_api.quantize.quantize") + if mapping is None: + mapping = get_default_static_quant_module_mappings() + if not inplace: + model = copy.deepcopy(model) + model.eval() + prepare(model, inplace=True) + run_fn(model, *run_args) + convert(model, mapping, inplace=True) + return model + +def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8, + mapping=None, inplace=False): + r"""Converts a float model to dynamic (i.e. weights-only) quantized model. + + Replaces specified modules with dynamic weight-only quantized versions and output the quantized model. + + For simplest usage provide `dtype` argument that can be float16 or qint8. Weight-only quantization + by default is performed for layers with large weights size - i.e. Linear and RNN variants. + + Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`. + If `qconfig` is provided, the `dtype` argument is ignored. + + Args: + model: input model + qconfig_spec: Either: + + - A dictionary that maps from name or type of submodule to quantization + configuration, qconfig applies to all submodules of a given + module unless qconfig for the submodules are specified (when the + submodule already has qconfig attribute). Entries in the dictionary + need to be QConfigDynamic instances. + + - A set of types and/or submodule names to apply dynamic quantization to, + in which case the `dtype` argument is used to specify the bit-width + + inplace: carry out model transformations in-place, the original module is mutated + mapping: maps type of a submodule to a type of corresponding dynamically quantized version + with which the submodule needs to be replaced + + """ + torch._C._log_api_usage_once("quantization_api.quantize.quantize_dynamic") + if qconfig_spec is None: + if dtype == torch.qint8: + qconfig_spec = { + nn.Linear : default_dynamic_qconfig, + nn.LSTM : default_dynamic_qconfig, + nn.GRU : default_dynamic_qconfig, + nn.LSTMCell : default_dynamic_qconfig, + nn.RNNCell : default_dynamic_qconfig, + nn.GRUCell : default_dynamic_qconfig, + } + elif dtype == torch.float16: + qconfig_spec = { + nn.Linear : float16_dynamic_qconfig, + nn.LSTM : float16_dynamic_qconfig, + nn.GRU : float16_dynamic_qconfig, + nn.LSTMCell : float16_dynamic_qconfig, + nn.RNNCell : float16_dynamic_qconfig, + nn.GRUCell : float16_dynamic_qconfig, + } + elif dtype == torch.quint8: + qconfig_spec = { + nn.EmbeddingBag : float_qparams_weight_only_qconfig, + } + else: + raise ValueError( + "Don't know how to quantize with default settings for {}. Provide full qconfig please".format(dtype)) + elif isinstance(qconfig_spec, set): + if dtype is torch.qint8: + default_qconfig = default_dynamic_qconfig + elif dtype is torch.float16: + default_qconfig = float16_dynamic_qconfig + elif dtype is torch.quint8: + default_qconfig = float_qparams_weight_only_qconfig + else: + raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype)) + qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig))) + + if mapping is None: + mapping = get_default_dynamic_quant_module_mappings() + + if not inplace: + model = copy.deepcopy(model) + model.eval() + propagate_qconfig_(model, qconfig_spec) + convert(model, mapping, inplace=True) + return model + +def prepare_qat(model, mapping=None, inplace=False): + r""" + Prepares a copy of the model for quantization calibration or + quantization-aware training and converts it to quantized version. + + Quantization configuration should be assigned preemptively + to individual submodules in `.qconfig` attribute. + + Args: + model: input model to be modified in-place + mapping: dictionary that maps float modules to quantized modules to be + replaced. + inplace: carry out model transformations in-place, the original module + is mutated + """ + torch._C._log_api_usage_once("quantization_api.quantize.prepare_qat") + if mapping is None: + mapping = get_default_qat_module_mappings() + + if not inplace: + model = copy.deepcopy(model) + + propagate_qconfig_(model, qconfig_dict=None) + convert(model, mapping=mapping, inplace=True, remove_qconfig=False) + prepare(model, observer_non_leaf_module_list=set(mapping.values()), inplace=True) + return model + +def quantize_qat(model, run_fn, run_args, inplace=False): + r"""Do quantization aware training and output a quantized model + + Args: + model: input model + run_fn: a function for evaluating the prepared model, can be a + function that simply runs the prepared model or a training + loop + run_args: positional arguments for `run_fn` + + Return: + Quantized model. + """ + torch._C._log_api_usage_once("quantization_api.quantize.quantize_qat") + if not inplace: + model = copy.deepcopy(model) + model.train() + prepare_qat(model, inplace=True) + run_fn(model, *run_args) + convert(model, inplace=True) + return model + +def convert( + module, mapping=None, inplace=False, remove_qconfig=True, + convert_custom_config_dict=None): + r"""Converts submodules in input module to a different module according to `mapping` + by calling `from_float` method on the target module class. And remove qconfig at the + end if remove_qconfig is set to True. + + Args: + `module`: prepared and calibrated module + `mapping`: a dictionary that maps from source module type to target + module type, can be overwritten to allow swapping user defined + Modules + `inplace`: carry out model transformations in-place, the original module + is mutated + `convert_custom_config_dict`: custom configuration dictionary for convert function + + .. code-block:: python + + # Example of convert_custom_config_dict: + convert_custom_config_dict = { + # user will manually define the corresponding quantized + # module class which has a from_observed class method that converts + # observed custom module to quantized custom module + "observed_to_quantized_custom_module_class": { + ObservedCustomModule: QuantizedCustomModule + } + } + + """ + torch._C._log_api_usage_once("quantization_api.quantize.convert") + if not inplace: + module = copy.deepcopy(module) + _convert( + module, mapping, inplace=True, + convert_custom_config_dict=convert_custom_config_dict) + if remove_qconfig: + _remove_qconfig(module) + return module + +def _convert( + module, mapping=None, inplace=False, + convert_custom_config_dict=None): + r"""Converts submodules in input module to a different module according to `mapping` + by calling `from_float` method on the target module class + + Args: + module: input module + mapping: a dictionary that maps from source module type to target + module type, can be overwritten to allow swapping user defined + Modules + inplace: carry out model transformations in-place, the original module + is mutated + + """ + if mapping is None: + mapping = get_default_static_quant_module_mappings() + if convert_custom_config_dict is None: + convert_custom_config_dict = {} + custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {}) + + if not inplace: + module = copy.deepcopy(module) + reassign = {} + for name, mod in module.named_children(): + # both fused modules and observed custom modules are + # swapped as one unit + if not isinstance(mod, _FusedModule) and \ + type(mod) not in custom_module_class_mapping: + _convert(mod, mapping, True, # inplace + convert_custom_config_dict) + reassign[name] = swap_module(mod, mapping, custom_module_class_mapping) + + for key, value in reassign.items(): + module._modules[key] = value + + return module + +def swap_module(mod, mapping, custom_module_class_mapping): + r"""Swaps the module if it has a quantized counterpart and it has an + `observer` attached. + + Args: + mod: input module + mapping: a dictionary that maps from nn module to nnq module + + Return: + The corresponding quantized module of `mod` + """ + new_mod = mod + if hasattr(mod, 'qconfig') and mod.qconfig is not None: + swapped = False + if type(mod) in custom_module_class_mapping: + new_mod = custom_module_class_mapping[type(mod)].from_observed(mod) + swapped = True + elif type(mod) in mapping: + new_mod = mapping[type(mod)].from_float(mod) + swapped = True + + if swapped: + # Preserve module's pre forward hooks. They'll be called on quantized input + for pre_hook_fn in mod._forward_pre_hooks.values(): + new_mod.register_forward_pre_hook(pre_hook_fn) + # Preserve module's post forward hooks except _observer_forward_hook + # After convert they'll work with quantized output + for hook_fn in mod._forward_hooks.values(): + if hook_fn is not _observer_forward_hook: + new_mod.register_forward_hook(hook_fn) + + # respect device affinity when swapping modules + devices = get_unique_devices_(mod) + assert len(devices) <= 1, ( + "swap_module only works with cpu or single-device CUDA modules, " + "but got devices {}".format(devices) + ) + device = next(iter(devices)) if len(devices) > 0 else None + if device: + new_mod.to(device) + return new_mod + +def get_observer_dict(mod, target_dict, prefix=""): + r"""Traverse the modules and save all observers into dict. + This is mainly used for quantization accuracy debug + Args: + mod: the top module we want to save all observers + prefix: the prefix for the current module + target_dict: the dictionary used to save all the observers + """ + def get_prefix(prefix): + return prefix if prefix == "" else prefix + '.' + + if hasattr(mod, 'activation_post_process'): + target_dict[get_prefix(prefix) + 'activation_post_process'] = mod.activation_post_process + for name, child in mod.named_children(): + module_prefix = get_prefix(prefix) + name if prefix else name + get_observer_dict(child, target_dict, module_prefix) From e161872aab00f3ca347ea32b972aab53660fc382 Mon Sep 17 00:00:00 2001 From: David Riazati Date: Thu, 2 Sep 2021 16:58:59 -0700 Subject: [PATCH 495/530] Revert D30732630: [quant] Enable jit tracing on quantizable LSTM Test Plan: revert-hammer Differential Revision: D30732630 (https://github.com/pytorch/pytorch/commit/116142143cc2d66c7e582d9f96e00862456fd736) Original commit changeset: 443e351ebb0e fbshipit-source-id: 49001392f01366f3b1ccc31139f824c80b86cd40 --- test/quantization/core/test_quantized_op.py | 7 --- torch/nn/quantizable/modules/rnn.py | 59 +++++++++++++++++---- 2 files changed, 49 insertions(+), 17 deletions(-) diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index 6275174d8e43a..49b7c96847612 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -2476,13 +2476,6 @@ def test_custom_module_lstm(self): msg=(f"Error is too high: SNR(dB): {power}, " f"Signal: {signal}, MSE: {mse}")) - # Trace - jit_qmodule = torch.jit.trace(lstm_quantized, qx) - - # Script - # TODO: Fix the scripting in the torch/nn/quantizable/modules/rnn.py - # jit_qmodule = torch.jit.script(lstm_quantized) - @override_qengines def test_custom_module_multi_head_attention(self): class MultiheadAttentionModel(torch.nn.Module): diff --git a/torch/nn/quantizable/modules/rnn.py b/torch/nn/quantizable/modules/rnn.py index cd0d094d086a7..bdfd7788533b5 100644 --- a/torch/nn/quantizable/modules/rnn.py +++ b/torch/nn/quantizable/modules/rnn.py @@ -48,7 +48,7 @@ def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True, self.ogate_cy = torch.nn.quantized.FloatFunctional() def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]: - if hidden is None or hidden[0] is None or hidden[1] is None: + if hidden is None or hidden == (None, None): hidden = self.initialize_hidden(x.shape[0], x.is_quantized) hx, cx = hidden @@ -175,13 +175,10 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): cx_bw = cx_fw[1] cx_fw = cx_fw[0] hidden_bw = hx_bw, cx_bw - if hx_fw is None and cx_fw is None: - hidden_fw = None - else: - hidden_fw = torch.jit._unwrap_optional(hx_fw), torch.jit._unwrap_optional(cx_fw) + hidden_fw = hx_fw, cx_fw result_fw, hidden_fw = self.layer_fw(x, hidden_fw) - if hasattr(self, 'layer_bw') and self.bidirectional: + if self.bidirectional: x_reversed = x.flip(0) result_bw, hidden_bw = self.layer_bw(x_reversed, hidden_bw) result_bw = result_bw.flip(0) @@ -191,7 +188,7 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): c = torch.stack([hidden_fw[1], hidden_bw[1]], 0) # type: ignore[list-item] else: result = result_fw - h, c = torch.jit._unwrap_optional(hidden_fw) # type: ignore[assignment] + h, c = hidden_fw # type: ignore[assignment] if self.batch_first: result.transpose_(0, 1) @@ -230,6 +227,46 @@ def from_float(cls, other, layer_idx=0, qconfig=None, **kwargs): layer.layer_bw = _LSTMSingleLayer.from_params(wi, wh, bi, bh) return layer + # Getters for the weights and biases + # Note that jit currently doesn't support the `porperty`, so if you need to + # access the weights/biases you would need to navigate manually to the + # `layer_fw.cell.igates.*`: https://github.com/pytorch/pytorch/issues/37883 + @property + def weight_ih(self): + return self.layer_fw.cell.igates.weight + + @property + def weight_hh(self): + return self.layer_fw.cell.hgates.weight + + @property + def bias_ih(self): + return self.layer_fw.cell.igates.bias + + @property + def bias_hh(self): + return self.layer_fw.cell.hgates.bias + + @property + def weight_ih_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.igates.weight + + @property + def weight_hh_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.hgates.weight + + @property + def bias_ih_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.igates.bias + + @property + def bias_hh_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.hgates.bias + class LSTM(torch.nn.Module): r"""A quantizable long short-term memory (LSTM). @@ -325,12 +362,14 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): cx = hidden_non_opt[1].reshape(self.num_layers, num_directions, max_batch_size, self.hidden_size).unbind(0) - hxcx = [(hx[idx].squeeze_(0), cx[idx].squeeze_(0)) for idx in range(self.num_layers)] + hxcx = [] + for idx in range(self.num_layers): + hxcx.append((hx[idx].squeeze_(0), cx[idx].squeeze_(0))) else: hxcx = hidden_non_opt - for idx, layer in enumerate(self.layers): - x, hxcx[idx] = layer(x, hxcx[idx]) + for idx in range(self.num_layers): + x, hxcx[idx] = self.layers[idx](x, hxcx[idx]) hx_list = [] cx_list = [] From db692ec0b3448ce07fd52e1400a0c733cad85821 Mon Sep 17 00:00:00 2001 From: driazati Date: Thu, 2 Sep 2021 17:09:48 -0700 Subject: [PATCH 496/530] Regenerate generated github workflows (#64465) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64465 These were out of date and causing master failures Test Plan: Imported from OSS Reviewed By: zhouzhuojie Differential Revision: D30744594 Pulled By: driazati fbshipit-source-id: 09a21c3c5d9bc83b368d66cabbafd1ba83302dd3 --- .github/workflows/generated-linux-bionic-py3.6-clang9.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml index e54555d12cf62..3aedb76b3e665 100644 --- a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml +++ b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml @@ -27,16 +27,20 @@ env: PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} concurrency: - group: linux-bionic-py3.6-clang9-${{ github.event.pull_request.number || github.sha }} + group: linux-bionic-py3.6-clang9-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: ciflow_should_run: runs-on: ubuntu-18.04 if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} steps: - name: noop run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" calculate-docker-image: if: ${{ github.repository_owner == 'pytorch' }} runs-on: linux.2xlarge From c19bd05e8480cb597e4bb574400cad8e18a4dc25 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Thu, 2 Sep 2021 17:43:59 -0700 Subject: [PATCH 497/530] THC: Cleanup dead code (#64441) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64441 Test Plan: Imported from OSS Reviewed By: gchanan Differential Revision: D30735342 Pulled By: ngimel fbshipit-source-id: 84ab36f7aec6b8cd7f1f34c19a58a382c06ad68d --- BUILD.bazel | 2 - aten/src/THC/CMakeLists.txt | 8 --- aten/src/THC/THC.h | 1 - aten/src/THC/THCTensorMath.cu | 39 ----------- aten/src/THC/THCTensorMath.h | 22 ------ aten/src/THC/THCTensorMathPairwise.cu | 24 ------- aten/src/THC/THCTensorMathReduce.cu | 2 - aten/src/THC/generic/THCTensorMath.cu | 70 ------------------- aten/src/THC/generic/THCTensorMath.h | 10 --- aten/src/THC/generic/THCTensorMathPairwise.cu | 29 -------- aten/src/THC/generic/THCTensorMathPairwise.h | 15 ---- 11 files changed, 222 deletions(-) delete mode 100644 aten/src/THC/THCTensorMath.cu delete mode 100644 aten/src/THC/THCTensorMath.h delete mode 100644 aten/src/THC/THCTensorMathPairwise.cu delete mode 100644 aten/src/THC/THCTensorMathReduce.cu delete mode 100644 aten/src/THC/generic/THCTensorMath.cu delete mode 100644 aten/src/THC/generic/THCTensorMath.h delete mode 100644 aten/src/THC/generic/THCTensorMathPairwise.cu delete mode 100644 aten/src/THC/generic/THCTensorMathPairwise.h diff --git a/BUILD.bazel b/BUILD.bazel index a5f20c2020181..36b29379a5c2f 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -394,8 +394,6 @@ filegroup( "aten/src/THC/THCStorageCopy.cu.cc", "aten/src/THC/THCTensor.cu.cc", "aten/src/THC/THCTensorCopy.cu.cc", - "aten/src/THC/THCTensorMath.cu.cc", - "aten/src/THC/THCTensorMathPairwise.cu.cc", "aten/src/THC/THCTensorMathScan.cu.cc", "aten/src/THC/THCTensorScatterGather.cu.cc", "aten/src/THC/THCTensorSort.cu.cc", diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt index f34b040e484ce..82828a7ac1c89 100644 --- a/aten/src/THC/CMakeLists.txt +++ b/aten/src/THC/CMakeLists.txt @@ -17,10 +17,7 @@ set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/THCStorageCopy.cu ${CMAKE_CURRENT_SOURCE_DIR}/THCTensor.cu ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorCopy.cu - ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMath.cu ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathMagma.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathPairwise.cu - ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathReduce.cu PARENT_SCOPE) install(FILES @@ -33,7 +30,6 @@ install(FILES THCTensor.h THCTensorCopy.h THCTensorCopy.hpp - THCTensorMath.h THCApply.cuh THCReduceApplyUtils.cuh THCTensorMathReduce.cuh @@ -85,8 +81,4 @@ install(FILES generic/THCStorageCopy.h generic/THCTensorCopy.cu generic/THCTensorCopy.h - generic/THCTensorMath.h - generic/THCTensorMath.cu - generic/THCTensorMathPairwise.h - generic/THCTensorMathPairwise.cu DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THC/generic") diff --git a/aten/src/THC/THC.h b/aten/src/THC/THC.h index 717442db9eaa1..59e2f5de69fe0 100644 --- a/aten/src/THC/THC.h +++ b/aten/src/THC/THC.h @@ -11,6 +11,5 @@ #include #include -#include #endif diff --git a/aten/src/THC/THCTensorMath.cu b/aten/src/THC/THCTensorMath.cu deleted file mode 100644 index 418bfa9e14919..0000000000000 --- a/aten/src/THC/THCTensorMath.cu +++ /dev/null @@ -1,39 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__ -#include -#endif -#include - -template -struct TensorFillOp { - TensorFillOp(T v) : val(v) {} - __device__ __forceinline__ void operator()(T* v) { *v = val; } - - const T val; -}; - -#include -#include - -#include -#include - -#include -#include diff --git a/aten/src/THC/THCTensorMath.h b/aten/src/THC/THCTensorMath.h deleted file mode 100644 index b70d4d14b02d9..0000000000000 --- a/aten/src/THC/THCTensorMath.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef TH_CUDA_TENSOR_MATH_INC -#define TH_CUDA_TENSOR_MATH_INC - -#include -#include - -#include -#include - -#include -#include - -#include -#include - -#include -#include - -#include -#include - -#endif diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu deleted file mode 100644 index 6fd026aa8966d..0000000000000 --- a/aten/src/THC/THCTensorMathPairwise.cu +++ /dev/null @@ -1,24 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -template -struct TensorMulConstantOp { - TensorMulConstantOp(T v) : val(v) {} - __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *in * val; - } - - __device__ __forceinline__ void operator()(T* v) { - *v *= val; - } - - const T val; -}; - -#include -#include diff --git a/aten/src/THC/THCTensorMathReduce.cu b/aten/src/THC/THCTensorMathReduce.cu deleted file mode 100644 index 1a2c626537156..0000000000000 --- a/aten/src/THC/THCTensorMathReduce.cu +++ /dev/null @@ -1,2 +0,0 @@ -#include -#include diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu deleted file mode 100644 index d07a3e3a62cdc..0000000000000 --- a/aten/src/THC/generic/THCTensorMath.cu +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THC/generic/THCTensorMath.cu" -#else - -#include - -#include -#include - -void THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); - - if (!THC_pointwiseApply1( - state, self_, TensorFillOp(value))) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); -} - -void THCTensor_(zero)(THCState *state, THCTensor *self_) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); - if (THCTensor_(isContiguous)(state, self_)) { - THCudaCheck(cudaMemsetAsync(THCTensor_(data)(state, self_), - 0, - sizeof(scalar_t) * THCTensor_(nElement)(state, self_), - c10::cuda::getCurrentCUDAStream())); - } else { - if (!THC_pointwiseApply1( - state, self_, - TensorFillOp(ScalarConvert::to(0)))) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - -ptrdiff_t -THCTensor_(numel)(THCState *state, THCTensor *t) -{ - return THCTensor_(nElement)(state, t); -} - -void THCTensor_(check_shape_except_dim)(THCState *state, - THCTensor *first, THCTensor *second, int dimension, int index); -inline void THCTensor_(check_shape_except_dim)(THCState *state, - THCTensor *first, THCTensor *second, int dimension, int index) -{ - int first_dims = first->dim(); - int second_dims = second->dim(); - THArgCheck(first_dims == second_dims, 0, - "Tensors must have same number of dimensions: got %d and %d", - first_dims, second_dims); - for (int dim = 0; dim < first_dims; dim++) { - if (dim == dimension) { - continue; - } - int64_t first_dim_size = THCTensor_(size)(state, first, dim); - int64_t second_dim_size = THCTensor_(size)(state, second, dim); - THArgCheck(first_dim_size == second_dim_size, 0, - "Sizes of tensors must match except in dimension %d. Got %lld and %lld in dimension %d (The offending index is %d)", - dimension, (long long)first_dim_size, (long long)second_dim_size, dim, index); - } -} - - -#endif diff --git a/aten/src/THC/generic/THCTensorMath.h b/aten/src/THC/generic/THCTensorMath.h deleted file mode 100644 index 58ec1567aed9b..0000000000000 --- a/aten/src/THC/generic/THCTensorMath.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THC/generic/THCTensorMath.h" -#else - -TORCH_CUDA_CU_API void THCTensor_( - fill)(THCState* state, THCTensor* self, scalar_t value); -TORCH_CUDA_CU_API void THCTensor_(zero)(THCState* state, THCTensor* self); -TORCH_CUDA_CU_API ptrdiff_t THCTensor_(numel)(THCState* state, THCTensor* t); - -#endif diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu deleted file mode 100644 index aba731c725423..0000000000000 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THC/generic/THCTensorMathPairwise.cu" -#else - -#include - -#if !defined(THC_REAL_IS_BOOL) - -void THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); - if (self_ == src_) { - if (!THC_pointwiseApply1(state, self_, TensorMulConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src_); - - if (!THC_pointwiseApply2(state, self_, src_, TensorMulConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - -#endif - -#endif diff --git a/aten/src/THC/generic/THCTensorMathPairwise.h b/aten/src/THC/generic/THCTensorMathPairwise.h deleted file mode 100644 index deeafb1291fbd..0000000000000 --- a/aten/src/THC/generic/THCTensorMathPairwise.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THC/generic/THCTensorMathPairwise.h" -#else - -TORCH_CUDA_CU_API int THCTensor_( - equal)(THCState* state, THCTensor* self, THCTensor* src); - -#if !defined(THC_REAL_IS_BOOL) - -TORCH_CUDA_CU_API void THCTensor_( - mul)(THCState* state, THCTensor* self, THCTensor* src, scalar_t value); - -#endif - -#endif From cd82bc1af901f86c562663eb1c09413c7d4a19b2 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Thu, 2 Sep 2021 20:51:38 -0700 Subject: [PATCH 498/530] Skips layer norm OpInfo on tbb platform (#64469) Summary: The OpInfo tests appear to be discovering a layer norm x tbb issue that requires investigation. Skipping tests on that platform for now to restore CI signal. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64469 Reviewed By: ngimel Differential Revision: D30745746 Pulled By: mruberry fbshipit-source-id: 282484cc00b867fac85b7df61430d64277da6421 --- torch/testing/_internal/common_methods_invocations.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index a45bcf54faba8..221cb29ec5eb6 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -7,6 +7,7 @@ import random import numbers import unittest +import os import torch import numpy as np @@ -7384,6 +7385,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): toleranceOverride({torch.float32: tol(atol=1e-05, rtol=1e-03)}), 'TestCommon', 'test_reference_testing' ), + unittest.skipIf("tbb" in os.getenv("BUILD_ENVIRONMENT", ""), "This test makes TBB Sad"), ], sample_inputs_func=sample_inputs_layer_norm,), OpInfo('nn.functional.pad', From e1c3e5f8308e6a47b76c8a22f7bc8cbc29aa4ae3 Mon Sep 17 00:00:00 2001 From: James Reed Date: Thu, 2 Sep 2021 21:11:57 -0700 Subject: [PATCH 499/530] [resubmit][FX] Prototype for guarding against mutable operations in tracing (#64467) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64467 Test Plan: Imported from OSS Reviewed By: driazati Differential Revision: D30744870 Pulled By: jamesr66a fbshipit-source-id: fc652f8b17748f90dbeb83fabf3bd5bb57d6ff1a --- ..._compat-fx_backcompat_class_members.expect | 2 +- test/test_fx.py | 67 +++++++++++++++++-- torch/csrc/jit/python/init.cpp | 14 ++-- torch/fx/operator_schemas.py | 53 +++++++++++++-- torch/fx/proxy.py | 7 ++ 5 files changed, 127 insertions(+), 16 deletions(-) diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect index 88e4654b568df..5c3630a3169f7 100644 --- a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect +++ b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect @@ -15,5 +15,5 @@ torch.fx.proxy.Attribute ['node'] torch.fx.proxy.GraphAppendingTracer [] torch.fx.proxy.Proxy ['keys'] torch.fx.proxy.TraceError [] -torch.fx.proxy.TracerBase ['create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool'] +torch.fx.proxy.TracerBase ['check_mutable_operations', 'create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool'] torch.fx.subgraph_rewriter.Match ['anchor', 'nodes_map'] \ No newline at end of file diff --git a/test/test_fx.py b/test/test_fx.py index 5220f67ebf309..57a2960a409c3 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -132,10 +132,17 @@ def __init__(self, a, b): class TestFX(JitTestCase): def setUp(self): - if TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS: - return - lib_file_path = find_library_location('libtorchbind_test.so') - torch.ops.load_library(str(lib_file_path)) + # Checking for mutable operations whil tracing is feature flagged + # Enable it in testing but not by default + self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations + torch.fx.proxy.TracerBase.check_mutable_operations = True + + if not (TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS): + lib_file_path = find_library_location('libtorchbind_test.so') + torch.ops.load_library(str(lib_file_path)) + + def tearDown(self): + torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None): """Check that an nn.Module's results match the GraphModule version @@ -2367,6 +2374,19 @@ def forward(self, x: torch.Tensor, y: int, z: int): traced.graph.lint() + def test_throw_out_variant(self): + def foo(x): + y = torch.rand_like(x) + torch.sigmoid(x, out=y) + return y + + class MyTracer(torch.fx.Tracer): + check_mutable_operations = True + + tracer = MyTracer() + with self.assertRaisesRegex(RuntimeError, 'mutable operation aten::sigmoid.out'): + traced_graph = tracer.trace(foo) + def test_ast_rewriter_reassigns_submodules(self): class M(torch.nn.Module): def __init__(self): @@ -3021,6 +3041,15 @@ def run_getitem_target(): class TestOperatorSignatures(JitTestCase): + def setUp(self): + # Checking for mutable operations whil tracing is feature flagged + # Enable it in testing but not by default + self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations + torch.fx.proxy.TracerBase.check_mutable_operations = True + + def tearDown(self): + torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag + @onlyCPU @ops(op_db, allowed_dtypes=(torch.float,)) def test_get_torch_func_signature_exhaustive(self, device, dtype, op): @@ -3090,6 +3119,15 @@ class TestFXAPIBackwardCompatibility(JitTestCase): def setUp(self): self.maxDiff = None + # Checking for mutable operations whil tracing is feature flagged + # Enable it in testing but not by default + self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations + torch.fx.proxy.TracerBase.check_mutable_operations = True + + def tearDown(self): + torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag + + def _fn_to_stable_annotation_str(self, obj): """ Unfortunately we have to serialize function signatures manually since @@ -3326,6 +3364,15 @@ def check_symbols_have_bc_designation(m, prefix): f"BC guarantees.") class TestFunctionalTracing(JitTestCase): + def setUp(self): + # Checking for mutable operations whil tracing is feature flagged + # Enable it in testing but not by default + self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations + torch.fx.proxy.TracerBase.check_mutable_operations = True + + def tearDown(self): + torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag + IGNORE_FUNCS = ("has_torch_function", "has_torch_function_unary", "has_torch_function_variadic", "handle_torch_function", "boolean_dispatch") @@ -3340,6 +3387,7 @@ class TestFunctionalTracing(JitTestCase): ARG_TYPE_MISMATCH = (TypeError, r", not Proxy$") CONTROL_FLOW = (TraceError, r"symbolically traced variables cannot be used as inputs to control flow") INTERPOLATE_ARGS_CONFLICT = (ValueError, r"only one of size or scale_factor should be defined") + MUTABLE = (RuntimeError, r"Tried to trace mutable operation") UNTRACEABLE_FUNCTIONALS = { "adaptive_avg_pool1d": BUILT_IN_FUNC, @@ -3459,6 +3507,8 @@ class TestFunctionalTracing(JitTestCase): "upsample_bilinear": INTERPOLATE_ARGS_CONFLICT, "upsample_nearest": INTERPOLATE_ARGS_CONFLICT, + + "normalize" : MUTABLE, } # List of nn.functionals with Tensor inputs but not with type annotation @@ -3573,6 +3623,15 @@ def tearDownClass(cls): @skipIfNoTorchVision class TestVisionTracing(JitTestCase): + def setUp(self): + # Checking for mutable operations whil tracing is feature flagged + # Enable it in testing but not by default + self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations + torch.fx.proxy.TracerBase.check_mutable_operations = True + + def tearDown(self): + torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag + PROXY_ITERATED = (TraceError, r"Proxy object cannot be iterated") INCONSISTENT_TYPE = ( RuntimeError, diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 7e43e511c786f..35197e4ea1423 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -1280,11 +1280,15 @@ void initJITBindings(PyObject* module) { [](const FunctionSchema& self, const FunctionSchema& other) { return self == other; }) - .def("__str__", [](FunctionSchema& self) { - std::stringstream ss; - ss << self; - return ss.str(); - }); + .def( + "__str__", + [](FunctionSchema& self) { + std::stringstream ss; + ss << self; + return ss.str(); + }) + .def_property_readonly( + "is_mutable", [](FunctionSchema& self) { return self.is_mutable(); }); py::class_(m, "Argument") .def_property_readonly("name", [](Argument& self) { return self.name(); }) .def_property_readonly("type", [](Argument& self) { return self.type(); }) diff --git a/torch/fx/operator_schemas.py b/torch/fx/operator_schemas.py index ac559b19530c7..d7ddc3e0360c7 100644 --- a/torch/fx/operator_schemas.py +++ b/torch/fx/operator_schemas.py @@ -4,10 +4,13 @@ import typing import enum import warnings -from typing import Any, Callable, Dict, List, Optional, Tuple, NamedTuple, cast +from typing import Any, Callable, Dict, List, Optional, Tuple, NamedTuple, cast, TYPE_CHECKING from torch._jit_internal import boolean_dispatched from ._compatibility import compatibility +if TYPE_CHECKING: + from .node import Argument + @compatibility(is_backward_compatible=False) class ArgsKwargsPair(NamedTuple): """ @@ -79,7 +82,43 @@ def _torchscript_schema_to_signature(ts_schema : torch._C.FunctionSchema) -> ins return inspect.Signature(parameters, return_annotation=return_type) @compatibility(is_backward_compatible=False) -def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature]]: +def check_for_mutable_operation(target : Callable, args : Tuple['Argument', ...], kwargs : Dict[str, 'Argument']): + signatures, schemas = get_signature_for_torch_op(target, return_schemas=True) + + if signatures and schemas: + matched_schemas = [] + + # Iterate through all of the schema until we find one that matches + # If one matches, populate `new_args_and_kwargs` with the new args/kwargs + # values. If none matches, `new_args_and_kwargs` will be None + for candidate_signature, schema in zip(signatures, schemas): + try: + candidate_signature.bind(*args, **kwargs) + matched_schemas.append((candidate_signature, schema)) + except TypeError as e: + continue + + def throw_if_mutable(schema): + if schema.is_mutable: + raise RuntimeError(f'Tried to trace mutable operation {schema}. FX only supports functional ' + f'code, so operations that mutate operands in-place (e.g. via `out` arguments) ' + f'are not supported') + + if len(matched_schemas) == 0: + # Did not match any schema. Cannot check for mutation + pass + elif len(matched_schemas) == 1: + # Matched exactly one schema, unambiguous + _, schema_to_check = matched_schemas[0] + throw_if_mutable(schema_to_check) + pass + else: + # Ambiguous schema match. Since mutability checking is best effort, + # do nothing. + pass + +@compatibility(is_backward_compatible=False) +def get_signature_for_torch_op(op : Callable, return_schemas : bool = False): """ Given an operator on the `torch` namespace, return a list of `inspect.Signature` objects corresponding to the overloads of that op.. May return `None` if a signature @@ -90,21 +129,23 @@ def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature Returns: Optional[List[inspect.Signature]]: A list of signatures for the overloads of this - operator, or None if the operator signatures could not be retrieved. + operator, or None if the operator signatures could not be retrieved. If + return_schemas=True, returns a tuple containing the optional Python signatures + and the optional TorchScript Function signature """ override = _manual_overrides.get(op) if override: - return override + return (override, None) if return_schemas else None aten_fn = torch.jit._builtins._find_builtin(op) if aten_fn is None: - return None + return (None, None) if return_schemas else None schemas = torch._C._jit_get_schemas_for_operator(aten_fn) signatures = [_torchscript_schema_to_signature(schema) for schema in schemas] - return signatures + return (signatures, schemas) if return_schemas else signatures @compatibility(is_backward_compatible=False) def create_type_hint(x): diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py index 61b039f8b7219..b25e45d206a51 100644 --- a/torch/fx/proxy.py +++ b/torch/fx/proxy.py @@ -8,11 +8,15 @@ from typing import Tuple, Dict, Optional, Iterable, Any, Iterator, Callable from .node import Target, Node, Argument, base_types, map_aggregate from ._compatibility import compatibility +from .operator_schemas import check_for_mutable_operation @compatibility(is_backward_compatible=True) class TracerBase: graph: Graph record_stack_traces : bool = False + # Feature flag for mutable schema checking + # Enableby default in 1.12 + check_mutable_operations : bool = False @compatibility(is_backward_compatible=True) def create_node(self, kind : str, target : Target, @@ -25,6 +29,9 @@ def create_node(self, kind : str, target : Target, modification of values used in node creation. For example, one might want to disallow in-place operations from being recorded. """ + if kind == 'call_function' and self.check_mutable_operations: + check_for_mutable_operation(target, args, kwargs) + return self.graph.create_node(kind, target, args, kwargs, name, type_expr) @compatibility(is_backward_compatible=True) From 7031fbdc63b0334543e257c064aad3f85b9a102c Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 2 Sep 2021 22:16:22 -0700 Subject: [PATCH 500/530] update optimize_for_inference docs (#64428) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64428 Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D30740898 Pulled By: eellison fbshipit-source-id: b94d2c3deb661a6ba048f19e8c1d5e1799667eeb --- docs/source/jit.rst | 1 + torch/jit/_freeze.py | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/docs/source/jit.rst b/docs/source/jit.rst index f791c1c687153..8a80b6471e1a7 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -60,6 +60,7 @@ Creating TorchScript Code ScriptModule ScriptFunction freeze + optimize_for_inference save load ignore diff --git a/torch/jit/_freeze.py b/torch/jit/_freeze.py index cab6d3c8f71ef..582baf7422343 100644 --- a/torch/jit/_freeze.py +++ b/torch/jit/_freeze.py @@ -179,6 +179,18 @@ def optimize_for_inference(mod: ScriptModule) -> ScriptModule: This is still in prototype, and may have the potential to slow down your model. Primary use cases that have been targeted so far have been vision models on cpu and gpu to a lesser extent. + + Example (optimizing a module with Conv->Batchnorm):: + + import torch + in_channels, out_channels = 3, 32 + conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=True) + bn = torch.nn.BatchNorm2d(out_channels, eps=.001) + mod = torch.nn.Sequential(conv, bn) + frozen_mod = torch.jit.optimize_for_inference(torch.jit.script(mod.eval())) + assert "batch_norm" not in str(frozen_mod.graph) + # if built with MKLDNN, convolution will be run with MKLDNN weights + assert "MKLDNN" in frozen_mod.graph """ if not isinstance(mod, ScriptModule): raise RuntimeError( From 39aeb3bf63f61664bc6c4a929a80a660365c2a5e Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 2 Sep 2021 22:16:22 -0700 Subject: [PATCH 501/530] Add fusion enabled apis (#64429) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64429 Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D30740897 Pulled By: eellison fbshipit-source-id: 446aa63b5d763f1cfffea62547db7294368e3438 --- docs/source/jit.rst | 2 ++ torch/jit/__init__.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 8a80b6471e1a7..97a0615812830 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -61,6 +61,8 @@ Creating TorchScript Code ScriptFunction freeze optimize_for_inference + enable_fusion + fusion_enabled save load ignore diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f7fa58bd36434..f804d3c72ce8e 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -4,6 +4,7 @@ from typing import Iterator from torch.utils import set_module +from typing import Optional # These are imported so users can access them from the `torch.jit` module from torch._jit_internal import ( @@ -197,6 +198,34 @@ def _hide_source_ranges() -> Iterator[None]: finally: torch._C.Graph.set_global_print_source_ranges(old_enable_source_ranges) # type: ignore[attr-defined] +def enable_fusion(enabled: bool, device: Optional[str] = None): + """ + Enables or disables JIT fusion based on the parameter `enabled`. + + If `device` is None, both CPU and GPU fusion will be turned on or off. + Otherwise, device must be equal to "cpu" or "cuda", and will turn on or off + CPU and GPU fusion respectively. + """ + + if device is None: + torch._C._jit_override_can_fuse_on_cpu(enabled) + torch._C._jit_override_can_fuse_on_gpu(enabled) + else: + assert device in ["cpu", "cuda"], "Device-specific fusion must be equal to 'cpu' or 'cuda' if not None" + if device == "cuda": + torch._C._jit_override_can_fuse_on_gpu(enabled) + else: + torch._C._jit_override_can_fuse_on_cpu(enabled) + +def fusion_enabled(device: str): + """ + Returns whether JIT fusion is enabled for "cpu" or "cuda" + """ + assert device == "cpu" or device == "cuda" + if device == "cpu": + return torch._C._jit_can_fuse_on_cpu() + else: + return torch._C._jit_can_fuse_on_gpu() if not torch._C._jit_init(): raise RuntimeError("JIT initialization failed") From 91b926fab3f7fec9055a0425f55443dee25afbad Mon Sep 17 00:00:00 2001 From: Kefei Lu Date: Thu, 2 Sep 2021 23:03:02 -0700 Subject: [PATCH 502/530] Add fx2trt pass for removing duplicate output args (#64461) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64461 Fx2TRT does not support duplicate nodes in the output args tuple. This pass removes duplicate output args from the target subnets and fixes their uses in the top level module where the subnets are called. This pass must be called after acc split on the top-level net and subsequent calls to the acc trace on the subnets. This pass will change both the subnets and top level module. Test Plan: Run: ``` buck run mode/opt -c python.package_style=inplace //caffe2/torch/fb/fx2trt/tests/passes/:test_remove_duplicate_output_args ``` Reviewed By: yinghai Differential Revision: D30740499 fbshipit-source-id: 98459f7677980b21c7bffda918158001285572db --- .../passes/remove_duplicate_output_args.py | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py diff --git a/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py b/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py new file mode 100644 index 0000000000000..bfddab57c0935 --- /dev/null +++ b/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +import operator +import typing as t +import logging +import torch.fx as fx +import dataclasses as dc + + +_LOGGER = logging.getLogger(__name__) + + +def remove_duplicate_output_args( + top_level: fx.GraphModule, + target_subnets: t.Collection[str] +) -> t.Mapping[str, "RemoveDuplicateResult"]: + """Removes duplicate output args. + + This pass removes duplicate output args from the target subnets and fixes + their uses in the top level module where the subnets are called. This pass + must be called after acc split on the top-level net and subsequent calls to + the acc trace on the subnets. + + This pass will change both the subnets and top level module. + + Returns: + a mapping of the target subnet name to its dedupcate result + """ + + processed_subnets = {} + for node in top_level.graph.nodes: # type: fx.Node + if node.op == "call_module" and node.name in target_subnets: + assert isinstance(node.target, str) + sub_gm = top_level.get_submodule(node.target) + assert isinstance(sub_gm, fx.GraphModule) + + replace_res = _remove_duplicate_output_args(sub_gm) + processed_subnets[node.name] = replace_res + if replace_res.replacement_map is None: + continue + sub_gm.recompile() + + needs_recompile = False + # iterate on the copy since we will be changing elements of node.users + for user in list(node.users): + idx = _ensure_proper_output_use(user, node) + idx_new = replace_res.replacement_map[idx] + if idx_new != idx: + user.args = (user.args[0], idx_new) + needs_recompile = True + + if needs_recompile: + top_level.recompile() + return processed_subnets + + +@dc.dataclass(frozen=True) +class RemoveDuplicateResult: + replacement_map: t.Optional[t.List[int]] + module: fx.GraphModule + + +def _ensure_proper_output_use(user: fx.Node, target_node: fx.Node) -> int: + """ + Ensures the node looks in proper form of calling the output of an fx2trt + splitter sub-net. Specifically: + + 1. op is call function, target: operator.getitem + 2. args is a 2-element tuple + 3. args[0] is the name of the subnet's output + 4. args[1] is the index into the subnet output tuple + + E.g.: + + %getitem_4 : [#users=1] = call_function[target=operator.getitem](args = (%_run_on_acc_1, 4), kwargs = {}) + + returns the index into the subnet output tuple + """ + _LOGGER.info(f"Checking user node: {user.format_node()}") + assert ( + user.op == "call_function" + and user.target == operator.getitem + and len(user.args) == 2 + and isinstance(user.args[0], fx.Node) + and user.args[0].name == target_node.name + and isinstance(user.args[1], int) + ), f"Node is not a proper user of splitter output: {user.format_node()}" + + return user.args[1] + + +def _remove_duplicate_output_args(gm: fx.GraphModule) -> RemoveDuplicateResult: + output_nodes = [n for n in gm.graph.nodes if n.op == "output"] + assert len(output_nodes) == 1, \ + f"Expecting exactly one `output` node, but got {len(output_nodes)}" + + changed = False + # arg node name to its index in the new output args tuple + name_to_idx: t.Dict[str, int] = {} + output_node = output_nodes[0] + + # Output op only uses its `args[0]`, and it does not have `kwargs`. + # https://pytorch.org/docs/stable/fx.html#torch.fx.Node + args: t.Sequence[t.Any] = output_node.args[0] + + # Only concern outselves to the case where the args is an iterable of fx.Node. + # Other return cases (e.g., a single value) is possible and we don't handle + # that in this pass. + if not (isinstance(args, t.Iterable) and all(isinstance(a, fx.Node) for a in args)): + return RemoveDuplicateResult(replacement_map=None, module=gm) + + # Map old index of the arg node to the remaining node's idx, + # initialized to `i => i` + replacement_map: t.List[int] = list(range(len(args))) + args_new = [] + for idx, a in enumerate(args): + assert isinstance(a, fx.Node), \ + f"Expecting fx.Node instance, but got: {type(a)}" + + if a.name not in name_to_idx: + args_new.append(a) + name_to_idx[a.name] = len(args_new) - 1 + else: + changed = True + _LOGGER.warning( + f"Replaced duplicate output arg '{a.name}': " + f"{idx} -> {name_to_idx[a.name]}" + ) + replacement_map[idx] = name_to_idx[a.name] + + output_node.args = (tuple(args_new),) + if changed: + gm.recompile() + return RemoveDuplicateResult(replacement_map, module=gm) From 6831d8e379392da1340a28fdb3e7e1382176d1d4 Mon Sep 17 00:00:00 2001 From: Ansley Ussery Date: Fri, 3 Sep 2021 06:10:37 -0700 Subject: [PATCH 503/530] Support Union in TorchScript (#64234) Summary: This PR is created to replace https://github.com/pytorch/pytorch/pull/53180 PR stack, which has all the review discussions. Reason for needing a replacement is due to a messy Sandcastle issue. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64234 Reviewed By: gmagogsfm Differential Revision: D30656444 Pulled By: ansley fbshipit-source-id: 77536c8bcc88162e2c72636026ca3c16891d669a --- CONTRIBUTING.md | 4 +- aten/src/ATen/core/jit_type.h | 172 +++-- aten/src/ATen/core/jit_type_base.h | 7 +- aten/src/ATen/core/type.cpp | 488 ++++++++++++- aten/src/ATen/native/README.md | 4 +- c10/mobile/CPUProfilingAllocator.h | 2 +- docs/source/rpc/distributed_autograd.rst | 4 +- test/cpp/jit/CMakeLists.txt | 1 + test/cpp/jit/test_alias_analysis.cpp | 25 + test/cpp/jit/test_union.cpp | 149 ++++ .../TestScript.test_annot_ast_mypy_fn.expect | 2 +- ...stScript.test_annot_ast_mypy_method.expect | 2 +- .../TestScript.test_annot_ast_py3_fn.expect | 2 +- ...estScript.test_annot_ast_py3_method.expect | 2 +- ...estScript.test_annot_string_mypy_fn.expect | 2 +- ...cript.test_annot_string_mypy_method.expect | 2 +- ...TestScript.test_annot_string_py3_fn.expect | 2 +- ...Script.test_annot_string_py3_method.expect | 2 +- test/jit/test_list_dict.py | 6 +- test/jit/test_typing.py | 16 +- test/jit/test_union.py | 657 ++++++++++++++++++ test/test_jit.py | 57 +- test/test_ops.py | 2 + test/test_public_bindings.py | 1 + torch/_C/__init__.pyi.in | 3 + torch/_jit_internal.py | 54 +- torch/csrc/jit/OVERVIEW.md | 9 +- torch/csrc/jit/frontend/convert_to_ssa.cpp | 6 +- torch/csrc/jit/frontend/exit_transforms.cpp | 6 +- torch/csrc/jit/frontend/ir_emitter.cpp | 307 +++++--- torch/csrc/jit/frontend/schema_matching.h | 7 +- .../csrc/jit/frontend/schema_type_parser.cpp | 13 + .../csrc/jit/frontend/script_type_parser.cpp | 33 +- torch/csrc/jit/ir/alias_analysis.cpp | 292 +++++--- torch/csrc/jit/ir/alias_analysis.h | 30 +- torch/csrc/jit/ir/ir.cpp | 17 +- torch/csrc/jit/ir/ir.h | 29 +- torch/csrc/jit/mobile/type_parser.cpp | 11 + torch/csrc/jit/passes/shape_analysis.cpp | 18 + torch/csrc/jit/passes/utils/memory_dag.cpp | 11 +- torch/csrc/jit/passes/utils/memory_dag.h | 26 +- torch/csrc/jit/python/pybind_utils.cpp | 13 + torch/csrc/jit/python/python_ir.cpp | 6 + torch/csrc/jit/serialization/import.cpp | 3 +- .../csrc/jit/serialization/import_source.cpp | 13 +- torch/csrc/jit/serialization/python_print.cpp | 28 +- torch/csrc/jit/serialization/unpickler.cpp | 17 +- torch/jit/_script.py | 2 +- torch/jit/annotations.py | 28 +- torch/jit/frontend.py | 1 + 50 files changed, 2132 insertions(+), 462 deletions(-) create mode 100644 test/cpp/jit/test_union.cpp create mode 100644 test/jit/test_union.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 93de9b022ee6f..e102de7be6334 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -435,12 +435,12 @@ is `./build/bin/FILENAME --gtest_filter=TESTSUITE.TESTNAME`, where `TESTNAME` is the name of the test you'd like to run and `TESTSUITE` is the suite that test is defined in. -For example, if you wanted to run the test ` MayContainAlias`, which +For example, if you wanted to run the test `MayContainAlias`, which is part of the test suite `ContainerAliasingTest` in the file `test/cpp/jit/test_alias_analysis.cpp`, the command would be: ```bash -./build/bin/test_jit --gtest_filter=ContainerAliasingTest.UnionAliasing +./build/bin/test_jit --gtest_filter=ContainerAliasingTest.MayContainAlias ``` diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index eee5acaccd655..4284e296229cc 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -30,6 +30,9 @@ struct FunctionSchema; struct NamedType; using OptNameList = c10::optional>; +void standardizeVectorForUnion(std::vector& reference, std::vector* to_fill); +void standardizeVectorForUnion(std::vector* to_flatten); + struct AnyType; using AnyTypePtr = std::shared_ptr; // Any is the top of the type hierarchy, all other types are subtypes @@ -94,25 +97,84 @@ struct SingleElementType : public Type { TypePtr elem; }; +struct UnionType; +using UnionTypePtr = std::shared_ptr; +struct TORCH_API UnionType : public Type { + friend struct Type; + + static const TypeKind Kind = TypeKind::UnionType; + + bool isSubtypeOfExt(const TypePtr& rhs_, std::ostream* why_not) const override; + + std::string str() const override; + + static UnionTypePtr create(std::vector reference); + + bool operator==(const Type& rhs) const override; + + at::ArrayRef containedTypes() const override { + return types_; + } + + // For testing purposes only + at::ArrayRef getTypes() const { + return types_; + } + + TypePtr createWithContained(std::vector contained_types) const override { + return create(contained_types); + } + + bool canHoldType(TypePtr type) const; + + bool hasFreeVariables() const override { + return has_free_variables_; + } + + c10::optional toOptional() const; + + c10::optional subtractTypeSet(std::vector& to_subtract) const; + + protected: + explicit UnionType(std::vector types, TypeKind kind=TypeKind::UnionType); + std::string annotation_str_impl(TypePrinter printer = nullptr) const override; + std::string unionStr(TypePrinter printer = nullptr, bool is_annotation_str = false) const; + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) + bool has_free_variables_; + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) + std::vector types_; + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) + bool can_hold_none_; + +}; + struct OptionalType; using OptionalTypePtr = std::shared_ptr; -// This type represents an optional type, for each element type. -// Optional[T] can accept both T and None(nullopt in C++) +// This type represents an optional type. There is one `Optional` for +// each element type. `Optional[T]` can accept both `T` and +// `None`(`c10::nullopt` in C++) // Subtype hierarchy for Optional: -// 1. Optional[T] <: Optional[R] iff T <: R -// 2. T <: Optional[R] if T <: R -// 3. None <: Optional[T] for all T -struct TORCH_API OptionalType - : public SingleElementType { - static OptionalTypePtr create(TypePtr element) { - TORCH_INTERNAL_ASSERT(element, "OptionalType requires valid TypePtr"); - // Optional is a union of [None, T], so Optional[[Optional[T]]] -> - // Optional[T] - if (auto opt_ptr = element->cast()) { - return opt_ptr; - } - return OptionalTypePtr( - new OptionalType(std::move(element))); // NOLINT(modernize-make-shared) +// - Optional[T] <: Optional[R] iff T <: R +// - T <: Optional[R] if T <: R +// - None <: Optional[T] for all T +// - Optional[T] == Union[T, None] for all T +struct TORCH_API OptionalType : public UnionType { + static OptionalTypePtr create(TypePtr contained) { + return OptionalTypePtr(new OptionalType(std::move(contained))); + } + + static const TypeKind Kind = TypeKind::OptionalType; + + friend struct Type; + + bool operator==(const Type& rhs) const override; + + TypePtr getElementType() const { + return contained_; + } + + at::ArrayRef containedTypes() const override { + return contained_; } std::string str() const override { @@ -127,20 +189,15 @@ struct TORCH_API OptionalType return create(contained_types[0]); } - bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override { - if (Type::isSubtypeOfExt(rhs, why_not)) { - return true; - } - if (auto rhs_ = rhs->cast()) { - return getElementType()->isSubtypeOfExt(rhs_->getElementType(), why_not); - } - return false; - } + bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override; + // common cast Optional[Tensor] for undefined tensor type static OptionalTypePtr ofTensor(); private: - OptionalType(TypePtr elem) : SingleElementType(elem) {} + explicit OptionalType(TypePtr contained); + + TypePtr contained_; std::string annotation_str_impl(TypePrinter printer = nullptr) const override { std::stringstream ss; @@ -908,7 +965,6 @@ struct TORCH_API RRefType } }; - struct NamedType; using NamedTypePtr = std::shared_ptr; using ConstNamedTypePtr = std::shared_ptr; @@ -1112,7 +1168,6 @@ struct TORCH_API EnumType : public NamedType { std::weak_ptr<::torch::jit::CompilationUnit> cu_; }; - // the common supertype of all Enums, only used in operator registraion. // EnumType <: AnyEnumType for all Enums struct AnyEnumType; @@ -1132,7 +1187,6 @@ struct TORCH_API AnyEnumType : public Type { : Type(TypeKind::AnyEnumType) {} }; - struct NumberType; using NumberTypePtr = std::shared_ptr; // This type represents a Python number @@ -1141,9 +1195,10 @@ using NumberTypePtr = std::shared_ptr; // FloatType <: NumberType // ComplexType <:NumberType struct TORCH_API NumberType : public Type { - bool operator==(const Type& rhs) const override { - return rhs.kind() == kind(); - } + bool operator==(const Type& rhs) const override; + + bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override; + std::string str() const override { return "Scalar"; // match what PythonArgParser says for clarity } @@ -1172,7 +1227,8 @@ struct TORCH_API FloatType : public NumberType { return "float"; } bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override { - return rhs->kind() == TypeKind::NumberType || NumberType::isSubtypeOfExt(rhs, why_not); + // NOLINTNEXTLINE(bugprone-parent-virtual-call) + return rhs->kind() == TypeKind::NumberType || Type::isSubtypeOfExt(rhs, why_not); } static const TypeKind Kind = TypeKind::FloatType; // global singleton @@ -1196,7 +1252,8 @@ struct TORCH_API ComplexType : public NumberType { return "complex"; } bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override { - return rhs->kind() == TypeKind::NumberType || NumberType::isSubtypeOfExt(rhs, why_not); + // NOLINTNEXTLINE(bugprone-parent-virtual-call) + return rhs->kind() == TypeKind::NumberType || Type::isSubtypeOfExt(rhs, why_not); } static const TypeKind Kind = TypeKind::ComplexType; // global singleton @@ -1220,7 +1277,8 @@ struct TORCH_API IntType : public NumberType { return "int"; } bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override { - return rhs->kind() == TypeKind::NumberType || NumberType::isSubtypeOfExt(rhs, why_not); + // NOLINTNEXTLINE(bugprone-parent-virtual-call) + return rhs->kind() == TypeKind::NumberType || Type::isSubtypeOfExt(rhs, why_not); } static const TypeKind Kind = TypeKind::IntType; // global singleton @@ -1334,12 +1392,8 @@ struct TORCH_API NoneType : public Type { std::string str() const override { return "NoneType"; } - bool isSubtypeOfExt(const TypePtr& rhs, std::ostream *why_not) const override { - if (rhs->kind() == OptionalType::Kind) { - return true; - } - return Type::isSubtypeOfExt(rhs, why_not); - } + bool isSubtypeOfExt(const TypePtr& rhs, std::ostream *why_not) const override; + static const TypeKind Kind = TypeKind::NoneType; // global singleton static NoneTypePtr get(); @@ -1524,8 +1578,15 @@ TORCH_API std::ostream& operator<<(std::ostream& os, const Stride& s); // what is the type, ignoring extra size/shape information? // e.g. Tensor(2x3) -> Dynamic, and Tuple(Tensor(2x3),...) -> Tuple(Dynamic,...) -// xxx: be careful with calls because this can be very slow. If calling this on a graph -// use `EraseShapeInformation` in shape_analysis.h +// `unshapedType` is used to remove Tensor subtypes. We treat all Tensor +// subtypes as simply "Tensor"; we also create a new version of any +// container types in which internal Tensors have undergone the same +// operation. This is used for type comparisons between two Tensor types +// (`unshapedType` means that we don't falsely return `false` for e.g. +// Tensors of different dimensions). It's also used in the alias +// analysis pass. +// Be careful with calls because this can be very slow. If calling this +// on a graph, use `EraseShapeInformation` in shape_analysis.h inline TypePtr unshapedType(const TypePtr& type) { if (type->isSubtypeOf(TensorType::get())) { return TensorType::get(); @@ -1569,27 +1630,32 @@ inline at::ScalarType scalarTypeFromJitType(const c10::TypePtr& type) { return *result; } -// Attempt to find the correct supertype of t1 and t2. If none is found then -// nullopt will be returned if default_to_any is false, and Any will be returned -// if it is true. If t1 == t2, or t1 is a type refinement of t2, -// then t2 will be returned (and vice versa). +// Attempt to find the correct supertype of the two types `t1` and `t2`. +// If no supertype is found, then nullopt will be returned if +// `default_to_union` is false, and `Union[t1, t2]` will be returned +// if it is true. If `t1 == t2`, or `t1` is a type refinement of `t2`, +// then `t2` will be returned (and vice versa). +// // Two different tensortypes will return dynamic. -// Currently we chose not to support returning a NumberType for a float & int -// input because of a lack of operator support for NumberType. +// +// Currently we chose not to support returning a NumberType for +// two types from the set of {FloatType, IntType, ComplexType}, because +// there is a lack of operator support for NumberType. +// // If `type_hint` is an `InterfaceType`, then we can use that as a // potential supertype for `ClassType`s in the list. Otherwise, we have // no way to find and use some common interface type TORCH_API c10::optional unifyTypes( const TypePtr& t1, const TypePtr& t2, - bool default_to_any = false, - TypePtr type_hint=nullptr); + bool default_to_union = false, + TypePtr type_hint = nullptr); TORCH_API c10::optional unifyTypeList( at::ArrayRef elements, std::ostream& why_not, - bool default_to_any=false, - TypePtr type_hint=nullptr); + bool default_to_union = false, + TypePtr type_hint = nullptr); namespace detail { template diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index dbb4a62f73088..a9be1e8d68658 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -21,7 +21,7 @@ namespace c10 { _(DictType) \ _(NumberType) \ _(FloatType) \ - _(ComplexType) \ + _(ComplexType) \ _(FutureType) \ _(RRefType) \ _(IntType) \ @@ -44,7 +44,8 @@ namespace c10 { _(ScalarTypeType) \ _(AnyListType) \ _(AnyTupleType) \ - _(AnyClassType) + _(AnyClassType) \ + _(UnionType) enum class TypeKind { #define DEFINE_TYPE(T) T, @@ -203,7 +204,7 @@ struct TORCH_API Type : std::enable_shared_from_this { // contained_types TypePtr withContained(std::vector contained_types) { auto current_contained = containedTypes(); - AT_ASSERT(current_contained.size() == contained_types.size()); + TORCH_INTERNAL_ASSERT(current_contained.size() == contained_types.size()); if (current_contained.equals(contained_types)) { return shared_from_this(); } diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 6bfba7b6d181a..fec0cb086ee51 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -265,7 +265,7 @@ AnyEnumTypePtr AnyEnumType::get() { return value; } -c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool default_to_any=false, TypePtr type_hint=nullptr) { +c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool default_to_union=false, TypePtr type_hint=nullptr) { // check direct subtyping relation if (t1->isSubtypeOf(t2)) { return t2; @@ -308,7 +308,7 @@ c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool } std::vector elements; for (size_t i = 0; i < tuple1->elements().size(); i++) { - if (auto elem = unifyTypes(tuple1->elements().at(i), tuple2->elements().at(i), default_to_any)) { + if (auto elem = unifyTypes(tuple1->elements().at(i), tuple2->elements().at(i), default_to_union)) { elements.push_back(*elem); } else { return c10::nullopt; @@ -347,11 +347,11 @@ c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool return c10::nullopt; } -c10::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool default_to_any, TypePtr type_hint) { - auto unified = unifyTypesImpl(t1, t2, default_to_any, type_hint); +c10::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool default_to_union, TypePtr type_hint) { + auto unified = unifyTypesImpl(t1, t2, default_to_union, type_hint); - if (default_to_any && !unified) { - return AnyType::get(); + if (default_to_union && !unified) { + return UnionType::create({t1, t2}); } return unified; @@ -360,7 +360,7 @@ c10::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool def c10::optional unifyTypeList( at::ArrayRef elements, std::ostream& why_not, - bool default_to_any, + bool default_to_union, TypePtr type_hint) { if (elements.size() == 0) { why_not << "Cannot get unified type from empty list"; @@ -369,7 +369,7 @@ c10::optional unifyTypeList( TypePtr ret_type = elements.at(0); for (size_t i = 1; i < elements.size() && ret_type; ++i) { - c10::optional maybe_unified = unifyTypes(ret_type, elements.at(i), default_to_any, type_hint); + c10::optional maybe_unified = unifyTypes(ret_type, elements.at(i), default_to_union, type_hint); if (!maybe_unified) { why_not << "Could not unify type list since element " << i << " of type " << elements.at(i)->repr_str() @@ -547,8 +547,9 @@ TORCH_API TypePtr tryEvalTypeVariables(TypePtr type, std::unordered_mapkind() == OptionalType::Kind || - elem_type->kind() == NumberType::Kind) { + if (elem_type->kind() == UnionType::Kind + || elem_type->kind() == OptionalType::Kind + || elem_type->kind() == NumberType::Kind) { // Builtin Union types return false; } @@ -577,8 +578,16 @@ bool Type::isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const { if (rhs->kind() == TypeKind::AnyType || *this == *rhs) { return true; } - if(auto rhs_ = rhs->cast()) { - return this->isSubtypeOfExt(rhs_->getElementType(), why_not); + if (auto opt_rhs = rhs->cast()) { + return this->isSubtypeOfExt(opt_rhs->getElementType(), why_not); + } + if (auto union_rhs = rhs->cast()) { + // Check if `this` is a subtype of any of the types within the Union + return std::any_of(union_rhs->containedTypes().begin(), + union_rhs->containedTypes().end(), + [&](TypePtr inner) { + return this->isSubtypeOfExt(inner, why_not); + }); } return false; } @@ -808,6 +817,453 @@ TupleTypePtr TupleType::createNamed(const c10::optional& qua field_types, qualName, schema)); // NOLINT(modernize-make-shared) } +bool NoneType::isSubtypeOfExt(const TypePtr& rhs, std::ostream *why_not) const { + if (rhs->kind() == OptionalType::Kind) { + return true; + } + return Type::isSubtypeOfExt(rhs, why_not); +} + +// Remove nested Optionals/Unions during the instantiation of a Union or +// an Optional. This populates `types` with all the types found during +// flattening. At the end of `flattenUnion`, `types` may have +// duplicates, but it will not have nested Optionals/Unions +void flattenUnion(TypePtr& type, std::vector* to_fill) { + if (auto union_type = type->cast()) { + for (auto inner : union_type->containedTypes()) { + flattenUnion(inner, to_fill); + } + } else if (auto opt_type = type->cast()) { + auto inner = opt_type->getElementType(); + flattenUnion(inner, to_fill); + to_fill->emplace_back(NoneType::get()); + } else if (type->kind() == NumberType::Kind) { + to_fill->emplace_back(IntType::get()); + to_fill->emplace_back(FloatType::get()); + to_fill->emplace_back(ComplexType::get()); + } else { + to_fill->emplace_back(type); + } +} + +// Helper function for `standardizeUnion` +// +// NB: If we have types `T1`, `T2`, `T3`, and `PARENT_T` such that `T1`, +// `T2`, and `T2` are children of `PARENT_T`, then `unifyTypes(T1, T2)` +// will return `PARENT_T`. This could be a problem if we didn't want our +// Union to also be able to take `T3 `. In our current type hierarchy, +// this isn't an issue--most types SHOULD be unified even if the parent +// type wasn't in the original vector. However, later additions to the +// type system might necessitate reworking `get_supertype` +void filterDuplicateSubtypes(std::vector* types) { + if (types->empty()) { + return; + } + auto get_supertype = [](const TypePtr t1, const TypePtr t2) -> c10::optional { + // We don't want nested Optionals. Also, prematurely unifying to + // `Optional` could prevent us from coalescing other types + if ((t1->isSubtypeOf(NoneType::get()) && !t2->isSubtypeOf(NoneType::get())) + || (!t1->isSubtypeOf(NoneType::get()) && t2->isSubtypeOf(NoneType::get()))) { + return c10::nullopt; + } else { + return unifyTypes(t1, t2, /*default_to_union=*/false); + } + }; + + // Coalesce types and delete all duplicates. Moving from right to left + // through the vector, we try to unify the current element (`i`) with + // each element (`j`) before the "new" end of the vector (`end`). + // If we're able to unify the types at `types[i]` and `types[j]`, we + // decrement `end`, swap `types[j]` with the unified type, and + // break. Otherwise, we keep `end` where it is to signify that the + // new end of the vector hasn't shifted + size_t end_idx = types->size()-1; + for (size_t i = types->size()-1; i > 0; --i) { + for (size_t j = std::min(i-1, end_idx); ; --j) { + c10::optional unified; + unified = get_supertype((*types)[i], (*types)[j]); + if (unified) { + (*types)[j] = *unified; + (*types)[i] = (*types)[end_idx]; + --end_idx; + break; + } + // Break condition here so we don't get `j = 0; j = j-1` and end + // up with MAX_INT + if (j == 0) { + break; + } + } + } + // Cut off the vector's tail so that `end` is the real last element + types->erase(types->begin() + end_idx + 1, types->end()); + +} + +void sortUnion(std::vector* types) { + // We want the elements to be sorted so we can easily compare two + // UnionType objects for equality in the future. Note that this order + // is guaranteed to be stable since we've already coalesced any + // possible types + std::sort(types->begin(), types->end(), + [](const TypePtr a, const TypePtr b) -> bool { + if (a->kind() != b->kind()) { + return a->kind() < b->kind(); + } + return a->str() < b->str(); + }); +} + +void standardizeVectorForUnion(std::vector& reference, std::vector* to_fill) { + for (auto type : reference) { + flattenUnion(type, to_fill); + } + filterDuplicateSubtypes(to_fill); + sortUnion(to_fill); +} + +void standardizeVectorForUnion(std::vector* to_flatten) { + TORCH_INTERNAL_ASSERT(to_flatten, "`standardizeVectorForUnion` was ", + "passed a `nullptr`"); + std::vector to_fill; + standardizeVectorForUnion(*to_flatten, &to_fill); + *to_flatten = to_fill; +} + +UnionType::UnionType(std::vector reference, TypeKind kind) : Type(kind) { + TORCH_INTERNAL_ASSERT(!reference.empty(), "Cannot create an empty Union"); + + standardizeVectorForUnion(reference, &types_); + + // Gate the assert in a regular conditional so that we don't create + // this long error message unnecessarily + if (types_.size() == 1) { + std::stringstream msg; + msg << "After type unification was performed, the Union with the " + << "original types {"; + for (auto i = 0; i < reference.size(); ++i) { + msg << reference[i]->repr_str(); + if (i > 0) { + msg << ","; + } + msg << " "; + } + msg << "} has the single type " << types_[0]->repr_str() + << ". Use the common supertype instead of creating a Union" + << "type"; + TORCH_INTERNAL_ASSERT(false, msg.str()); + } + + can_hold_none_ = false; + has_free_variables_ = false; + + for (const TypePtr& type : types_) { + if (type->kind() == NoneType::Kind) { + can_hold_none_ = true; + } + if (type->hasFreeVariables()) { + has_free_variables_ = true; + } + } + +} + +UnionTypePtr UnionType::create(std::vector reference) { + auto union_type = new UnionType(std::move(reference)); + + // Some very special-cased logic for `Optional`. This will be deleted + // in a later PR + bool int_found = false; + bool float_found = false; + bool complex_found = false; + bool nonetype_found = false; + + auto update_is_opt_flags = [&](TypePtr t) { + if (t == IntType::get()) { + int_found = true; + } else if (t == FloatType::get()) { + float_found = true; + } else if (t == ComplexType::get()) { + complex_found = true; + } else if (t == NoneType::get()) { + nonetype_found = true; + } + }; + + for (const auto& t : union_type->containedTypes()) { + update_is_opt_flags(t); + } + + bool numbertype_found = int_found && float_found && complex_found; + + if (nonetype_found) { + if (union_type->containedTypes().size() == 4 && numbertype_found) { + return OptionalType::create(NumberType::get()); + } + if (union_type->containedTypes().size() == 2) { + auto not_none = union_type->containedTypes()[0] != NoneType::get() + ? union_type->containedTypes()[0] + : union_type->containedTypes()[1]; + return OptionalType::create(not_none); + } + } + + return UnionTypePtr(union_type); +} + +bool UnionType::operator==(const Type& rhs) const { + if (auto union_rhs = rhs.cast()) { + // We can't compare the type vectors for equality using `operator=`, + // because the vectors hold `TypePtr`s and we want to compare `Type` + // equality + if (union_rhs->containedTypes().size() != this->containedTypes().size()) { + return false; + } + // Check that all the types in `this->types_` are also in + // `union_rhs->types_` + return std::all_of(this->containedTypes().begin(), this->containedTypes().end(), + [&](TypePtr lhs_type) { + return std::any_of(union_rhs->containedTypes().begin(), + union_rhs->containedTypes().end(), + [&](TypePtr rhs_type) { + return *lhs_type == *rhs_type; + }); + }); + } else if (auto optional_rhs = rhs.cast()) { + if (optional_rhs->getElementType() == NumberType::get()) { + return this->containedTypes().size() == 4 + && this->can_hold_none_ + && this->canHoldType(NumberType::get()); + } + auto optional_lhs = this->toOptional(); + return optional_lhs && *optional_rhs == *((optional_lhs.value())->expect()); + } else if (rhs.kind() == NumberType::Kind) { + return this->containedTypes().size() == 3 && canHoldType(NumberType::get()); + } else { + return false; + } +} + +bool UnionType::isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const { + std::vector rhs_types; + if (const auto union_rhs = rhs->cast()) { + // Fast path + if (this->containedTypes() == rhs->containedTypes()) { + return true; + } + rhs_types = rhs->containedTypes().vec(); + } else if (const auto optional_rhs = rhs->cast()) { + rhs_types.push_back(NoneType::get()); + if (optional_rhs->getElementType() == NumberType::get()) { + std::vector number_types{IntType::get(), FloatType::get(), ComplexType::get()}; + rhs_types.insert(rhs_types.end(), number_types.begin(), number_types.end()); + } else { + rhs_types.push_back(optional_rhs->getElementType()); + } + } else if (const auto number_rhs = rhs->cast()) { + std::vector number_types{IntType::get(), FloatType::get(), ComplexType::get()}; + rhs_types.insert(rhs_types.end(), number_types.begin(), number_types.end()); + } else { + rhs_types.push_back(rhs); + } + return std::all_of(this->containedTypes().begin(), this->containedTypes().end(), + [&](TypePtr lhs_type) -> bool { + return std::any_of(rhs_types.begin(), + rhs_types.end(), + [&](TypePtr rhs_type) -> bool { + return lhs_type->isSubtypeOfExt(rhs_type, why_not); + }); + }); +} + + +std::string UnionType::unionStr(TypePrinter printer, bool is_annotation_str) const { + std::stringstream ss; + + bool can_hold_numbertype = this->canHoldType(NumberType::get()); + + std::vector number_types{IntType::get(), FloatType::get(), ComplexType::get()}; + + auto is_numbertype = [&](TypePtr lhs) { + for (const auto& rhs : number_types) { + if (*lhs == *rhs) { + return true; + } + } + return false; + }; + + ss << "Union["; + bool printed = false; + for (size_t i = 0; i < types_.size(); ++i) { + if (!can_hold_numbertype || !is_numbertype(types_[i])) { + if (i > 0) { + ss << ", "; + printed = true; + } + if (is_annotation_str) { + ss << this->containedTypes()[i]->annotation_str(printer); + } else { + ss << this->containedTypes()[i]->str(); + } + } + } + if (can_hold_numbertype) { + if (printed) { + ss << ", "; + } + if (is_annotation_str) { + ss << NumberType::get()->annotation_str(printer); + } else { + ss << NumberType::get()->str(); + } + } + ss << "]"; + return ss.str(); +} + +std::string UnionType::str() const { + return this->unionStr(nullptr, /*is_annotation_str=*/false); +} + +std::string UnionType::annotation_str_impl(TypePrinter printer) const { + return this->unionStr(printer, /*is_annotation_str=*/true); +} + +bool UnionType::canHoldType(TypePtr type) const { + if (type == NumberType::get()) { + return canHoldType(IntType::get()) + && canHoldType(FloatType::get()) + && canHoldType(ComplexType::get()); + } else { + return std::any_of(this->containedTypes().begin(), this->containedTypes().end(), + [&](TypePtr inner) { + return type->isSubtypeOf(inner); + }); + } +} + +c10::optional UnionType::toOptional() const { + if (!canHoldType(NoneType::get())) { + return c10::nullopt; + } + + std::vector copied_types = this->containedTypes().vec(); + + auto maybe_opt = UnionType::create(std::move(copied_types)); + + if (maybe_opt->kind() == UnionType::Kind) { + return c10::nullopt; + } else { + return maybe_opt; + } +} + +c10::optional UnionType::subtractTypeSet(std::vector& to_subtract) const { + std::vector types; + + // Given a TypePtr `lhs`, this function says whether or not `lhs` (or + // one of its parent types) is in the `to_subtract` vector + auto should_subtract = [&](TypePtr lhs) -> bool { + return std::any_of(to_subtract.begin(), to_subtract.end(), + [&](TypePtr rhs) { + return lhs->isSubtypeOf(rhs); + }); + }; + + // Copy all the elements that should NOT be subtracted to the `types` + // vector + std::copy_if(this->containedTypes().begin(), this->containedTypes().end(), + std::back_inserter(types), + [&](const TypePtr t) { + return !should_subtract(t); + }); + + if (types.size() == 0) { + return c10::nullopt; + } else if (types.size() == 1) { + return types[0]; + } else { + return UnionType::create(std::move(types)); + } +} + +OptionalType::OptionalType(TypePtr contained) + : UnionType({contained, NoneType::get()}, TypeKind::OptionalType) { + bool is_numbertype = false; + if (auto as_union = contained->cast()) { + is_numbertype = as_union->containedTypes().size() == 3 && + as_union->canHoldType(NumberType::get()); + } + if (UnionType::containedTypes().size() == 2) { + contained_ = UnionType::containedTypes()[0]->kind()!= NoneType::Kind + ? UnionType::containedTypes()[0] + : UnionType::containedTypes()[1]; + } else if (contained == NumberType::get() || is_numbertype) { + contained_ = NumberType::get(); + types_.clear(); + types_.push_back(NumberType::get()); + types_.push_back(NoneType::get()); + } else { + std::vector to_subtract{NoneType::get()}; + auto without_none = this->subtractTypeSet(to_subtract); + contained_ = UnionType::create({*without_none}); + } + has_free_variables_ = contained_->hasFreeVariables(); +} + +bool OptionalType::operator==(const Type& rhs) const { + if (auto union_rhs = rhs.cast()) { + auto optional_rhs = union_rhs->toOptional(); + // `**optional_rhs` = `*` to get value of `c10::optional`, + // then `*` to dereference the pointer + return optional_rhs && *this == **optional_rhs; + } else if (auto optional_rhs = rhs.cast()) { + return *this->getElementType() == *optional_rhs->getElementType(); + } else { + return false; + } +} + +bool OptionalType::isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const { + if (OptionalTypePtr optional_rhs = rhs->cast()) { + return getElementType()->isSubtypeOfExt(optional_rhs->getElementType(), why_not); + } else if (UnionTypePtr union_rhs = rhs->cast()) { + if (!union_rhs->canHoldType(NoneType::get())) { + if (why_not) { + *why_not << rhs->repr_str() << " cannot hold None"; + } + return false; + } else if (!union_rhs->canHoldType(this->getElementType())) { + if (why_not) { + *why_not << rhs->repr_str() << " cannot hold " << this->getElementType(); + } + return false; + } else { + return true; + } + } else { + // NOLINTNEXTLINE(bugprone-argument-comment) + return Type::isSubtypeOfExt(rhs, why_not); + } +} + +bool NumberType::operator==(const Type& rhs) const { + if (auto union_type = rhs.cast()) { + return union_type->containedTypes().size() == 3 && union_type->canHoldType(NumberType::get()); + } else { + return rhs.kind() == this->kind(); + } +} + +bool NumberType::isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const { + if (auto union_type = rhs->cast()) { + return union_type->canHoldType(NumberType::get()); + } else { + return Type::isSubtypeOfExt(rhs, why_not); + } +} + TupleType::TupleType( std::vector elements, c10::optional name, @@ -1732,8 +2188,10 @@ size_t ClassType::addAttribute( TORCH_CHECK( (type->kind() == TensorType::Kind) || (type->kind() == OptionalType::Kind && - type->expectRef().getElementType()->kind() == + type->expect()->getElementType()->kind() == TensorType::Kind) || + (type->kind() == UnionType::Kind && + TensorType::get()->isSubtypeOf(type->expect())) || (type->kind() == NoneType::Kind), "Expecting parameter or buffer to have either None, Tensor or Optional[Tensor] type, but got: ", toString(type)); @@ -1880,7 +2338,9 @@ void SymbolicShape::dump() const { bool EnumType::isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const { return rhs->kind() == TypeKind::AnyType || - rhs->kind() == TypeKind::AnyEnumType || *this == *rhs; + rhs->kind() == TypeKind::AnyEnumType || + *this == *rhs || + Type::isSubtypeOfExt(rhs, why_not); } } // namespace c10 diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 8d2f2de7367d7..2c77fb348228b 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -7,8 +7,8 @@ Like all ATen methods/functions, native functions are made available from both ATen's C++ and Python APIs. In C++, they are made available either as methods on `Tensor` (`t.mymeth()`) and functions in the ATen namespace (`at::myfunc()`). In PyTorch, they are made available as -methods on `Variable` or as functions on `torch._C._FunctionBase` -(it is the user's responsibility to re-exporting these functions in +methods on `Variable` or as functions on `torch._C._FunctionBase`. +(It is the user's responsibility to re-export these functions in a more user-facing module.) The rest of this document describes how to implement an ATen function. diff --git a/c10/mobile/CPUProfilingAllocator.h b/c10/mobile/CPUProfilingAllocator.h index 5112691a62d6f..bb080d9df97c3 100644 --- a/c10/mobile/CPUProfilingAllocator.h +++ b/c10/mobile/CPUProfilingAllocator.h @@ -50,7 +50,7 @@ class C10_API AllocationPlanner { private: AllocationPlan* allocation_plan_{nullptr}; // Maps allocated ptr to its allocation id. - // This is used when freeing the memory to lookup the allocation id + // This is used when freeing the memory to look up the allocation id // in order to establish the lifetime of a particular allocation. ska::flat_hash_map allocation_ptr_to_id_; uint64_t allocation_id_{0}; diff --git a/docs/source/rpc/distributed_autograd.rst b/docs/source/rpc/distributed_autograd.rst index 61af22b9486f5..71cf1f2fd3178 100644 --- a/docs/source/rpc/distributed_autograd.rst +++ b/docs/source/rpc/distributed_autograd.rst @@ -65,7 +65,7 @@ an RPC. input tensors. The output gradients of this function are sent to the source node to the appropriate ``send`` function during the backward pass. - Each ``send-recv`` pair is assigned a globally unique ``autograd_message_id`` - to uniquely identify the pair. This is useful to lookup the corresponding + to uniquely identify the pair. This is useful to look up the corresponding function on a remote node during the backward pass. - For :ref:`rref`, whenever we call :meth:`torch.distributed.rpc.RRef.to_here` we attach an appropriate ``send-recv`` pair for the tensors involved. @@ -98,7 +98,7 @@ This context serves the following purpose: 2. During the forward pass we store the ``send`` and ``recv`` functions for each autograd pass in this context. This ensures we hold references to the appropriate nodes in the autograd graph to keep it alive. In addition to - this, it is easy to lookup the appropriate ``send`` and ``recv`` functions + this, it is easy to look up the appropriate ``send`` and ``recv`` functions during the backward pass. 3. In general we also use this context to store some metadata for each distributed autograd pass. diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index e766f33a250b2..8bd37a1fb8a59 100644 --- a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -66,6 +66,7 @@ set(JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_subgraph_matcher.cpp ${JIT_TEST_ROOT}/test_subgraph_rewriter.cpp ${JIT_TEST_ROOT}/test_subgraph_utils.cpp + ${JIT_TEST_ROOT}/test_union.cpp ${JIT_TEST_ROOT}/test_utils.cpp ${JIT_TEST_ROOT}/test_script_profile.cpp ${JIT_TEST_ROOT}/test_jit_logging_levels.cpp diff --git a/test/cpp/jit/test_alias_analysis.cpp b/test/cpp/jit/test_alias_analysis.cpp index eef529d8d5d33..c92cb4da46dde 100644 --- a/test/cpp/jit/test_alias_analysis.cpp +++ b/test/cpp/jit/test_alias_analysis.cpp @@ -660,6 +660,31 @@ TEST(ContainerAliasingTest, PrimitveValuesDontAliasContainers) { } } +TEST(ContainerAliasingTest, UnionAliasing) { + auto graph = std::make_shared(); + parseIR( + R"IR( + graph(%a : Dict(str, Tensor), + %b : Tensor[], + %c : Union(Dict(str, Tensor), Tensor[])): + return (%a, %b, %c) + )IR", + &*graph); + + AliasDb aliasDb(graph); + auto a = graph->outputs().at(0); + auto b = graph->outputs().at(1); + auto c = graph->outputs().at(2); + + EXPECT_TRUE(aliasDb.mayAlias(a, c)); + EXPECT_TRUE(aliasDb.mayAlias(b, c)); + EXPECT_TRUE(aliasDb.mayAlias(c, c)); + EXPECT_FALSE(aliasDb.mayAlias(a, b)); + EXPECT_TRUE(aliasDb.mayContainAlias(a, b)); + EXPECT_TRUE(aliasDb.mayContainAlias(a, c)); + EXPECT_TRUE(aliasDb.mayContainAlias(b, c)); +} + TEST(ContainerAliasingTest, InputsCanAliasOutputs) { // Test input aliasing auto graph = std::make_shared(); diff --git a/test/cpp/jit/test_union.cpp b/test/cpp/jit/test_union.cpp new file mode 100644 index 0000000000000..f35acd35d1ed6 --- /dev/null +++ b/test/cpp/jit/test_union.cpp @@ -0,0 +1,149 @@ +#include + +#include +#include +#include + +namespace torch { +namespace jit { + +class UnionTypeTest : public ::testing::Test { + public: + // None + const TypePtr none = NoneType::get(); + + // List[str] + const TypePtr l1 = ListType::ofStrings(); + + // Optional[int] + const TypePtr opt1 = OptionalType::create(IntType::get()); + + // Optional[float] + const TypePtr opt2 = OptionalType::create(FloatType::get()); + + // Optional[List[str]] + const TypePtr opt3 = OptionalType::create(ListType::ofStrings()); + + // Tuple[Optional[int], int] + const TypePtr tup1 = + TupleType::create({OptionalType::create(IntType::get()), IntType::get()}); + + // Tuple[int, int] + const TypePtr tup2 = TupleType::create({IntType::get(), IntType::get()}); + + bool hasType(UnionTypePtr u, TypePtr t) { + auto res = std::find(u->getTypes().begin(), u->getTypes().end(), t); + return res != u->getTypes().end(); + } +}; + +TEST_F(UnionTypeTest, UnionOperatorEquals) { + const UnionTypePtr u1 = UnionType::create({l1, tup2, StringType::get()}); + + // Same thing, but using different TypePtrs + const TypePtr l1_ = ListType::ofStrings(); + const TypePtr tup2_ = TupleType::create({IntType::get(), IntType::get()}); + const UnionTypePtr u2 = UnionType::create({l1_, tup2_, StringType::get()}); + + ASSERT_TRUE(*u1 == *u2); +} + +TEST_F(UnionTypeTest, UnionCreate_OptionalT1AndOptionalT2) { + // Goal: Union[int, float, None] + const UnionTypePtr u = UnionType::create({opt1, opt2}); + + ASSERT_EQ(u->getTypes().size(), 3); + ASSERT_TRUE(UnionTypeTest::hasType(u, IntType::get())); + ASSERT_TRUE(UnionTypeTest::hasType(u, FloatType::get())); + ASSERT_TRUE(UnionTypeTest::hasType(u, NoneType::get())); +} + +TEST_F(UnionTypeTest, UnionCreate_OptionalTAndT) { + // Goal: Union[int, None] + const UnionTypePtr u = UnionType::create({opt1, IntType::get()}); + + ASSERT_EQ(u->getTypes().size(), 2); + ASSERT_TRUE(UnionTypeTest::hasType(u, IntType::get())); + ASSERT_TRUE(UnionTypeTest::hasType(u, NoneType::get())); +} + +TEST_F(UnionTypeTest, UnionCreate_TupleWithSubtypingRelationship) { + // Goal: Union[Tuple[Optional[int], int], str] + const UnionTypePtr u = UnionType::create({StringType::get(), tup1, tup2}); + + ASSERT_EQ(u->getTypes().size(), 2); + ASSERT_TRUE(UnionTypeTest::hasType(u, StringType::get())); + ASSERT_TRUE(UnionTypeTest::hasType(u, tup1)); +} + +TEST_F(UnionTypeTest, UnionCreate_ContainerTAndT) { + // Goal: Union[List[str], str] + const UnionTypePtr u = UnionType::create({l1, StringType::get()}); + + ASSERT_EQ(u->getTypes().size(), 2); + ASSERT_TRUE(UnionTypeTest::hasType(u, StringType::get())); + ASSERT_TRUE(UnionTypeTest::hasType(u, ListType::ofStrings())); +} + +TEST_F(UnionTypeTest, UnionCreate_OptionalContainerTAndContainerTAndT) { + // Goal: Union[List[str], None, str] + const UnionTypePtr u = UnionType::create({l1, opt3, StringType::get()}); + + ASSERT_EQ(u->getTypes().size(), 3); + ASSERT_TRUE(UnionTypeTest::hasType(u, StringType::get())); + ASSERT_TRUE(UnionTypeTest::hasType(u, ListType::ofStrings())); +} + +TEST_F(UnionTypeTest, Subtyping_NumberType) { + // Union[int, float, Complex] + const UnionTypePtr union1 = + UnionType::create({IntType::get(), FloatType::get(), ComplexType::get()}); + + // Union[int, float, Complex, None] + const UnionTypePtr union2 = UnionType::create( + {IntType::get(), FloatType::get(), ComplexType::get(), NoneType::get()}); + + const NumberTypePtr num = NumberType::get(); + + ASSERT_TRUE(num->isSubtypeOf(union1)); + ASSERT_TRUE(union1->isSubtypeOf(num)); + ASSERT_TRUE(*num == *union1); + + ASSERT_TRUE(num->isSubtypeOf(union2)); + ASSERT_FALSE(union2->isSubtypeOf(num)); + ASSERT_FALSE(*num == *union2); +} + +TEST_F(UnionTypeTest, Subtyping_OptionalType) { + // Union[int, None] + const UnionTypePtr union1 = + UnionType::create({IntType::get(), NoneType::get()}); + + // Union[int, str, None] + const UnionTypePtr union2 = + UnionType::create({IntType::get(), StringType::get(), NoneType::get()}); + + // Union[int, str, List[str]] + const UnionTypePtr union3 = UnionType::create( + {IntType::get(), StringType::get(), ListType::ofStrings()}); + + ASSERT_TRUE(none->isSubtypeOf(opt1)); + ASSERT_TRUE(none->isSubtypeOf(union1)); + ASSERT_TRUE(none->isSubtypeOf(union2)); + ASSERT_FALSE(none->isSubtypeOf(union3)); + + ASSERT_FALSE(opt1->isSubtypeOf(none)); + ASSERT_TRUE(opt1->isSubtypeOf(union1)); + ASSERT_TRUE(opt1->isSubtypeOf(union2)); + ASSERT_FALSE(opt1->isSubtypeOf(union3)); + + ASSERT_FALSE(union1->isSubtypeOf(none)); + ASSERT_TRUE(union1->isSubtypeOf(opt1)); + ASSERT_TRUE(union1->isSubtypeOf(union2)); + ASSERT_FALSE(union1->isSubtypeOf(union3)); + + ASSERT_FALSE(union2->isSubtypeOf(union1)); +} + +} // namespace jit +} // namespace torch diff --git a/test/expect/TestScript.test_annot_ast_mypy_fn.expect b/test/expect/TestScript.test_annot_ast_mypy_fn.expect index 4b15b27b48112..36888d04876ef 100644 --- a/test/expect/TestScript.test_annot_ast_mypy_fn.expect +++ b/test/expect/TestScript.test_annot_ast_mypy_fn.expect @@ -6,4 +6,4 @@ foo(bool x, (Tensor, Tensor) y) -> ((bool, bool)) foo(float[3] x, (Tensor, Tensor) y) -> ((float[], float[])) foo(int[2] x, (Tensor, Tensor) y) -> ((int[], int[])) foo(int[] x, (Tensor, Tensor) y) -> ((int[], int[])) -foo(int? x, (Tensor, Tensor) y) -> ((int?, int?)) \ No newline at end of file +foo(int? x, (Tensor, Tensor) y) -> ((int?, int?)) diff --git a/test/expect/TestScript.test_annot_ast_mypy_method.expect b/test/expect/TestScript.test_annot_ast_mypy_method.expect index 9c0dcd14deeec..b6c19a6002483 100644 --- a/test/expect/TestScript.test_annot_ast_mypy_method.expect +++ b/test/expect/TestScript.test_annot_ast_mypy_method.expect @@ -6,4 +6,4 @@ foo( self, bool x, (Tensor, Tensor) y) -> ((bool, bool)) foo( self, float[3] x, (Tensor, Tensor) y) -> ((float[], float[])) foo( self, int[2] x, (Tensor, Tensor) y) -> ((int[], int[])) foo( self, int[] x, (Tensor, Tensor) y) -> ((int[], int[])) -foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?)) \ No newline at end of file +foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?)) diff --git a/test/expect/TestScript.test_annot_ast_py3_fn.expect b/test/expect/TestScript.test_annot_ast_py3_fn.expect index 4b15b27b48112..36888d04876ef 100644 --- a/test/expect/TestScript.test_annot_ast_py3_fn.expect +++ b/test/expect/TestScript.test_annot_ast_py3_fn.expect @@ -6,4 +6,4 @@ foo(bool x, (Tensor, Tensor) y) -> ((bool, bool)) foo(float[3] x, (Tensor, Tensor) y) -> ((float[], float[])) foo(int[2] x, (Tensor, Tensor) y) -> ((int[], int[])) foo(int[] x, (Tensor, Tensor) y) -> ((int[], int[])) -foo(int? x, (Tensor, Tensor) y) -> ((int?, int?)) \ No newline at end of file +foo(int? x, (Tensor, Tensor) y) -> ((int?, int?)) diff --git a/test/expect/TestScript.test_annot_ast_py3_method.expect b/test/expect/TestScript.test_annot_ast_py3_method.expect index 9c0dcd14deeec..b6c19a6002483 100644 --- a/test/expect/TestScript.test_annot_ast_py3_method.expect +++ b/test/expect/TestScript.test_annot_ast_py3_method.expect @@ -6,4 +6,4 @@ foo( self, bool x, (Tensor, Tensor) y) -> ((bool, bool)) foo( self, float[3] x, (Tensor, Tensor) y) -> ((float[], float[])) foo( self, int[2] x, (Tensor, Tensor) y) -> ((int[], int[])) foo( self, int[] x, (Tensor, Tensor) y) -> ((int[], int[])) -foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?)) \ No newline at end of file +foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?)) diff --git a/test/expect/TestScript.test_annot_string_mypy_fn.expect b/test/expect/TestScript.test_annot_string_mypy_fn.expect index 4b15b27b48112..36888d04876ef 100644 --- a/test/expect/TestScript.test_annot_string_mypy_fn.expect +++ b/test/expect/TestScript.test_annot_string_mypy_fn.expect @@ -6,4 +6,4 @@ foo(bool x, (Tensor, Tensor) y) -> ((bool, bool)) foo(float[3] x, (Tensor, Tensor) y) -> ((float[], float[])) foo(int[2] x, (Tensor, Tensor) y) -> ((int[], int[])) foo(int[] x, (Tensor, Tensor) y) -> ((int[], int[])) -foo(int? x, (Tensor, Tensor) y) -> ((int?, int?)) \ No newline at end of file +foo(int? x, (Tensor, Tensor) y) -> ((int?, int?)) diff --git a/test/expect/TestScript.test_annot_string_mypy_method.expect b/test/expect/TestScript.test_annot_string_mypy_method.expect index 9c0dcd14deeec..b6c19a6002483 100644 --- a/test/expect/TestScript.test_annot_string_mypy_method.expect +++ b/test/expect/TestScript.test_annot_string_mypy_method.expect @@ -6,4 +6,4 @@ foo( self, bool x, (Tensor, Tensor) y) -> ((bool, bool)) foo( self, float[3] x, (Tensor, Tensor) y) -> ((float[], float[])) foo( self, int[2] x, (Tensor, Tensor) y) -> ((int[], int[])) foo( self, int[] x, (Tensor, Tensor) y) -> ((int[], int[])) -foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?)) \ No newline at end of file +foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?)) diff --git a/test/expect/TestScript.test_annot_string_py3_fn.expect b/test/expect/TestScript.test_annot_string_py3_fn.expect index 4b15b27b48112..36888d04876ef 100644 --- a/test/expect/TestScript.test_annot_string_py3_fn.expect +++ b/test/expect/TestScript.test_annot_string_py3_fn.expect @@ -6,4 +6,4 @@ foo(bool x, (Tensor, Tensor) y) -> ((bool, bool)) foo(float[3] x, (Tensor, Tensor) y) -> ((float[], float[])) foo(int[2] x, (Tensor, Tensor) y) -> ((int[], int[])) foo(int[] x, (Tensor, Tensor) y) -> ((int[], int[])) -foo(int? x, (Tensor, Tensor) y) -> ((int?, int?)) \ No newline at end of file +foo(int? x, (Tensor, Tensor) y) -> ((int?, int?)) diff --git a/test/expect/TestScript.test_annot_string_py3_method.expect b/test/expect/TestScript.test_annot_string_py3_method.expect index 9c0dcd14deeec..b6c19a6002483 100644 --- a/test/expect/TestScript.test_annot_string_py3_method.expect +++ b/test/expect/TestScript.test_annot_string_py3_method.expect @@ -6,4 +6,4 @@ foo( self, bool x, (Tensor, Tensor) y) -> ((bool, bool)) foo( self, float[3] x, (Tensor, Tensor) y) -> ((float[], float[])) foo( self, int[2] x, (Tensor, Tensor) y) -> ((int[], int[])) foo( self, int[] x, (Tensor, Tensor) y) -> ((int[], int[])) -foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?)) \ No newline at end of file +foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?)) diff --git a/test/jit/test_list_dict.py b/test/jit/test_list_dict.py index d8434515291ab..10f5e879099a0 100644 --- a/test/jit/test_list_dict.py +++ b/test/jit/test_list_dict.py @@ -92,7 +92,7 @@ def reassign_from_empty_literal(): if 1 == 1: x = [1, 2, 3] return - with self.assertRaisesRegexWithHighlight(RuntimeError, r"previously has type List\[Tensor\]", "x"): + with self.assertRaisesRegexWithHighlight(RuntimeError, r"previously had type List\[Tensor\]", "x"): self.checkScript(reassign_from_empty_literal, (), optimize=False) def reassign_from_empty_builtin(): @@ -113,7 +113,7 @@ def reassign_bad_type(): if 1 == 1: x = [1.0] return - with self.assertRaisesRegexWithHighlight(RuntimeError, "previously has type", "x"): + with self.assertRaisesRegexWithHighlight(RuntimeError, "previously had type", "x"): self.checkScript(reassign_bad_type, (), optimize=False) def reassign_nested(): @@ -123,7 +123,7 @@ def reassign_nested(): if 1 == 1: x = [1.0] return - with self.assertRaisesRegexWithHighlight(RuntimeError, "previously has type", "x"): + with self.assertRaisesRegexWithHighlight(RuntimeError, "previously had type", "x"): self.checkScript(reassign_nested, (), optimize=False) def test_del(self): diff --git a/test/jit/test_typing.py b/test/jit/test_typing.py index f60f25f782e95..125197c87bbb1 100644 --- a/test/jit/test_typing.py +++ b/test/jit/test_typing.py @@ -92,10 +92,9 @@ def fn(x): graph = torch.jit.script(fn).graph - print(graph) - # Check that we're making a `List[Tuple[str, Any]]` - FileCheck().check(r"(str, Any)[] = prim::ListConstruct").run(graph) + FileCheck().check("(str, Union[Tensor, Dict(str, Tensor)])" + "[] = prim::ListConstruct()").run(graph) def test_list_type_refinement_defaults_to_Any_list_comprehension(self): def fn(x): @@ -116,10 +115,9 @@ def fn(x): graph = torch.jit.script(fn).graph - print(graph) - # Check that we're making a `List[Tuple[str, Any]]` - FileCheck().check(r"(str, Any)[] = prim::ListConstruct").run(graph) + FileCheck().check("(str, Union[Tensor, Dict(str, Tensor)])" + "[] = prim::ListConstruct()").run(graph) def test_list_type_refinement_annotation_element_mismatch(self): def fn(): @@ -145,7 +143,8 @@ def fn(x): graph = torch.jit.script(fn).graph - FileCheck().check(r"Dict(str, Any) = prim::DictConstruct").run(graph) + FileCheck().check("Dict(str, Union[Tensor, Dict(str, Tensor)])" + " = prim::DictConstruct").run(graph) def test_dict_type_refinement_defaults_to_Any_dict_comprehension(self): def fn(x): @@ -161,7 +160,8 @@ def fn(x): graph = torch.jit.script(fn).graph - FileCheck().check("Dict(str, Any) = prim::DictConstruct").run(graph) + FileCheck().check("Dict(str, Union[Tensor, Dict(str, Tensor)])" + " = prim::DictConstruct").run(graph) def test_dict_type_refinement_annotation_key_mismatch(self): def fn(): diff --git a/test/jit/test_union.py b/test/jit/test_union.py new file mode 100644 index 0000000000000..df909a6e8100f --- /dev/null +++ b/test/jit/test_union.py @@ -0,0 +1,657 @@ +import io +import os +import sys + +import torch +from torch.testing import FileCheck +from enum import Enum +from typing import Dict, List, Optional, Tuple, Union + +# Make the helper files in test/ importable +pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(pytorch_test_dir) +from torch.testing._internal.jit_utils import JitTestCase, make_global + +if __name__ == '__main__': + raise RuntimeError("This test file is not meant to be run directly, use:\n\n" + "\tpython test/test_jit.py TESTNAME\n\n" + "instead.") + +class TestUnion(JitTestCase): + """ + This class tests the functionality of `Union`. + + Note: It's important to be able to refine the type of a `Union` to + one of its internal types. Currently, there are differences in the + way Python expects `isinstance` checks and the way TorchScript + expects `isinstance` checks. This means that we can't use + `checkScript` in our test cases because either the eager mode or the + script mode wouldn't run! So, some test cases have separate but + equivalent functions to emulate `checkScript`. + """ + + def test_union_with_scalar_values(self): + def fn(x: Union[int, float]) -> str: + return "foo" + + self.checkScript(fn, (1,)) + self.checkScript(fn, (1.0,)) + + scripted = torch.jit.script(fn) + + with self.assertRaisesRegex(RuntimeError, "Expected a member of" + r" Union\[float, int\] but " + "instead found type str"): + scripted("1") + + def test_union_with_collections(self): + def fn(x: Union[Dict[str, int], List[int]]) -> str: + return "foo" + + self.checkScript(fn, ({"foo": 1, "bar": 2, "baz": 3},)) + self.checkScript(fn, ([1, 2, 3],)) + + scripted = torch.jit.script(fn) + + with self.assertRaisesRegex(RuntimeError, "Expected a member of" + r" Union\[List\[int\], Dict\[str, " + r"int\]\] but instead found type " + r"Dict\[str, str\]"): + scripted({"foo": "bar", "baz": "qux"}) + + with self.assertRaisesRegex(RuntimeError, "Expected a member of" + r" Union\[List\[int\], Dict\[str, " + r"int\]\] but instead found type " + r"List\[str\]"): + scripted(["foo", "bar", "baz"]) + + with self.assertRaisesRegex(RuntimeError, "Expected a member of" + r" Union\[List\[int\], Dict\[str, " + r"int\]\] but instead found type " + "str"): + scripted("1") + + def test_union_with_enum(self): + class Color(Enum): + RED = 1 + GREEN = 2 + + make_global(Color) + + def fn(x: Union[str, Color]) -> str: + return "foo" + + self.checkScript(fn, (Color.RED,)) + self.checkScript(fn, ("red",)) + + scripted = torch.jit.script(fn) + + with self.assertRaisesRegex(RuntimeError, "Expected a member of" + r" Union\[__torch__.jit.test_union." + r"Color, str\] but instead found " + "type int"): + scripted(1) + + def test_union_in_class_constructor(self): + + @torch.jit.script + class A(object): # noqa: B903 + def __init__(self, x: Union[int, str]) -> None: + self.x = x + + def fn(x: Union[str, int]) -> A: + return A(x) + + self.assertEqual(fn("foo").x, "foo") + self.assertEqual(fn(1).x, 1) + + scripted = torch.jit.script(fn) + + with self.assertRaisesRegex(RuntimeError, "Expected a member of" + r" Union\[int, str\] but instead " + r"found type List\[str\]"): + scripted(["foo", "bar", "baz"]) + + def test_union_return_type(self): + def fn(x: int) -> Union[int, str]: + return "foo" + + self.checkScript(fn, (1,)) + + def test_union_as_annotation(self): + def fn() -> Union[int, str]: + x: Union[int, str] = "foo" + return x + + self.checkScript(fn, ()) + + def test_union_as_annotation_in_typed_container(self): + def fn() -> None: + l: List[Union[int, str]] = [] + u1: Union[int, str] = "foo" + u2: Union[int, str] = 1 + l.append(u1) + l.append(u2) + + self.checkScript(fn, ()) + + def test_union_as_annotation_py2(self): + def fn(): + # type: () -> Union[int, str] + x: Union[int, str] = "foo" + return x + + self.checkScript(fn, ()) + + def test_union_as_internal_tuple_type(self): + def fn(): + t: Tuple[Union[int, str], Union[int, str]] = (1, "foo") + return t + + self.checkScript(fn, ()) + + def test_union_variable_can_be_reassigned(self): + @torch.jit.script + def aux1(i: int): + return int(i ** 2) + + @torch.jit.script + def aux2(s: str): + return s + s + + def fn() -> Union[int, str]: + x: Union[int, str] = "foo" + i: int = 1 + x = i + y: int = aux1(x) + z: str = aux2(str(y)) + x = z + return x + + self.checkScript(fn, ()) + + def test_union_does_not_replace_existing_annotated_type(self): + def fn(): + x: List[int] = [1, 2, 3] + x.append("foo") + return x + + with self.assertRaisesRegex(RuntimeError, "Could not match type str"): + scripted = torch.jit.script(fn) + scripted() + + def test_union_does_not_replace_existing_annotated_type_union(self): + def fn(): + x: List[Union[int, str]] = [1, "foo", 3] + x.append(2.0) + return x + + with self.assertRaisesRegex(RuntimeError, "Could not match type float"): + scripted = torch.jit.script(fn) + scripted() + + def test_union_does_not_replace_existing_annotated_type_empty_container(self): + def fn(): + x: List[int] = [] + x.append("foo") + return x + + with self.assertRaisesRegex(RuntimeError, "Could not match type str"): + scripted = torch.jit.script(fn) + scripted() + + def test_unions_of_unions_are_flattened(self): + @torch.jit.script + def fn(x: Union[Union[int, str], float]) -> str: + return "foo" + + s = fn.graph + + FileCheck().check("x : Union[float, int, str]") \ + .run(s) + + def test_unions_of_a_single_argument_vanish(self): + @torch.jit.script + def fn(x: Union[int]) -> str: + return "foo" + + s = fn.graph + + FileCheck().check("x : int") \ + .run(s) + + def test_union_redundant_arguments_are_skipped(self): + @torch.jit.script + def fn(x: Union[int, str, int]) -> str: + return "foo" + + s = fn.graph + + FileCheck().check("x : Union[int, str]") \ + .run(s) + + def test_union_redundant_arguments_are_skipped_optional(self): + @torch.jit.script + def fn(x: Union[int, Optional[float], Optional[int]]) -> str: + return "foo" + + s = fn.graph + + FileCheck().check("x : Union[float, int, NoneType]") \ + .run(s) + + def test_union_redundant_arguments_are_skipped_subtyping(self): + @torch.jit.script + def fn(x: Union[str, Tuple[Optional[int], int], Tuple[int, int]]) -> str: + return "foo" + + s = fn.graph + + FileCheck().check("x : Union[(int?, int), str]") \ + .run(s) + + def test_union_redundant_arguments_are_skipped_container(self): + @torch.jit.script + def fn(x: Union[List[str], List[float], List[str]]) -> str: + return "foo" + + s = fn.graph + + FileCheck().check("x : Union[float[], str[]]") \ + .run(s) + + def test_union_argument_order_is_ignored(self): + @torch.jit.script + def fn1(x: Union[int, str]) -> str: + return "foo" + + @torch.jit.script + def fn2(x: Union[str, int]) -> str: + return "foo" + + for s in (fn1.graph, fn2.graph): + FileCheck().check("x : Union[int, str]") \ + .run(s) + + def test_union_argument_order_is_ignored_container(self): + @torch.jit.script + def fn1(x: Union[List[str], List[int]]) -> str: + return "foo" + + @torch.jit.script + def fn2(x: Union[List[int], List[str]]) -> str: + return "foo" + + for s in (fn1.graph, fn2.graph): + FileCheck().check("x : Union[int[], str[]]") \ + .run(s) + + def test_union_T_None_is_equivalent_to_optional_T(self): + @torch.jit.script + def inner(x: Union[int, None]) -> int: + if x is not None: + return x + else: + return 5 + + @torch.jit.script + def fn1() -> int: + a: Optional[int] = 5 + b: Optional[int] = None + a_ = inner(a) + b_ = inner(b) + return a_ + b_ + + self.assertEqual(fn1(), 10) + + @torch.jit.script + def inner2(x: Optional[int]) -> int: + if x is not None: + return x + else: + return 5 + + @torch.jit.script + def fn2() -> int: + a: Union[int, None] = 5 + b: Union[int, None] = None + a_ = inner(a) + b_ = inner(b) + return a_ + b_ + + self.assertEqual(fn2(), 10) + + def test_union_optional_of_union_is_flattened(self): + @torch.jit.script + def fn(flag: int) -> Union[str, int, None]: + y: Union[int, str, None] = "foo" + if flag == 0: + x: Optional[Union[int, str]] = y + elif flag == 1: + x: Optional[Union[int, str]] = 1 + else: + x: Optional[Union[int, str]] = None + return x + + # Can't use `checkScript` because it will flag the fact that + # the original code has `Optional[Union[int, str]]` but the + # saved/loaded code has `Union[int, NoneType, str]` (even + # though this is exactly what we want) + self.assertEqual(fn(0), "foo") + self.assertEqual(fn(1), 1) + self.assertEqual(fn(2), None) + + buffer = io.BytesIO() + torch.jit.save(fn, buffer) + buffer = io.BytesIO(buffer.getvalue()) + l = torch.jit.load(buffer) + + s = l.code + + FileCheck().check("Union[int, NoneType, str]") \ + .check("Union[int, NoneType, str]") \ + .run(s) + + def test_union_subclasses_larger_union(self): + def fn() -> Union[int, str, torch.Tensor]: + x: Union[int, str] = "foo" + return x + + self.checkScript(fn, ()) + + # TODO: We would like to eventually support this. The issue is being + # tracked at https://github.com/pytorch/pytorch/issues/58167 + def test_union_as_dict_key(self): + def fn(): + x: Dict[Union[int, str], str] = {} + x["foo"] = "bar" + x[1] = 2 + return x[1] + + with self.assertRaisesRegex(RuntimeError, "only int, float, " + "complex, Tensor and string keys " + "are supported"): + torch.jit.script(fn) + + def test_union_as_dict_value(self): + def fn(): + x: Dict[str, Union[int, str]] = {} + x["foo"] = "bar" + x["baz"] = 2 + return x["baz"] + + self.checkScript(fn, ()) + + def test_union_module_with_union_instance_variable(self): + class M(torch.nn.Module): + + x: Union[int, str] + + def __init__(self, x: Union[int, str]): + super().__init__() + self.x: Union[int, str] = x + + def forward(self, y: Union[int, str]): + self.x = y + return self.x + + self.checkModule(M(2,), (1,)) + self.checkModule(M("bar"), ("foo",)) + + def test_union_module_with_union_class_variable(self): + class M(torch.nn.Module): + x: Union[int, str] = "foo" + + def __init__(self, y: int): + super().__init__() + x = y + + def forward(self, z: str): + x = z + return x + + self.checkModule(M(1), ("foo",)) + + def test_union_type_refinement(self): + def fn(x: Union[int, str]) -> str: + if isinstance(x, str): + z = x + "bar" + return x + else: + return "baz" + + self.checkScript(fn, ("foo",)) + self.checkScript(fn, (1,)) + + def test_union_type_refinement_union_rhs(self): + def fn(x: int) -> str: + if torch.jit.isinstance(x, Union[int, str]): + return "bar" + else: + return "baz" + + self.checkScript(fn, (1,)) + + def test_union_type_refinement_tuple_rhs(self): + def fn(x: Union[int, float, List[str]]) -> str: + if isinstance(x, (int, float)): + if isinstance(x, int): + return str(x) + else: + return "foo" + else: + if len(x): + return x[0] + else: + return "bar" + + self.checkScript(fn, (1,)) + self.checkScript(fn, (1.0,)) + self.checkScript(fn, (["a", "b", "c"],)) + + def test_union_type_refinement_tuple_rhs_noncontained_type(self): + def fn(x: Union[int, List[str]]) -> str: + if isinstance(x, (int, float)): + y = x + x + return str(y) + else: + if len(x): + return x[0] + else: + return "bar" + + self.checkScript(fn, (1,)) + self.checkScript(fn, (["a", "b", "c"],)) + + def test_union_type_refinement_tuple_rhs_union(self): + @torch.jit.script + def fn(x: int) -> str: + if torch.jit.isinstance(x, (Union[int, str], float)): + y = x + x + return str(y) + else: + return "foo" + + # TODO: There's currently an unrelated bug in + # `torch.jit.isinstance` that makes it fail for tuple literals. + # Posted here: https://github.com/pytorch/pytorch/issues/60095 + # Change `assertEqual` to `checkScript` when the bug is fixed + self.assertEqual(fn(1), "2") + + def test_union_type_refinement_statically_false(self): + @torch.jit.script + def fn(x: int) -> str: + if torch.jit.isinstance(x, (Union[str, float], List[str], str)): + z = x + "foo" + return z + else: + return "bar" + + s = fn.graph + + # Check that we don't have any branching statements + FileCheck().check_not("block0()") \ + .check_not("block1()") \ + .run(s) + + def test_union_type_refinement_statically_true(self): + @torch.jit.script + def fn(x: Union[List[int], int]) -> Union[List[int], int]: + if not torch.jit.isinstance(x, (int, List[int])): + return x + else: + l = [1, 2, 3] + y: Union[List[int], int] = l + return y + + s = fn.graph + + # Check that we don't have any branching statements + FileCheck().check_not("block0()") \ + .check_not("block1()") \ + .run(s) + + def test_union_type_refinement_partial_static_refinement_tuple_rhs(self): + def fn(x: Union[List[int], int]) -> int: + if torch.jit.isinstance(x, (int, float, str)): + # We should know that `x` is an `int` here + z = x + 1 + return z + else: + return 100 + + self.checkScript(fn, ([1, 2, 3],)) + self.checkScript(fn, (1,)) + + def test_union_type_refinement_partial_static_refinement_union_rhs(self): + def fn(x: Union[List[int], int]) -> int: + if torch.jit.isinstance(x, Union[int, float, str]): + # We should know that `x` is an `int` here + z = x + 1 + return z + else: + return 100 + + self.checkScript(fn, ([1, 2, 3],)) + self.checkScript(fn, (1,)) + + def test_union_type_refinement_internal_declaration(self): + def fn(flag: bool) -> str: + x: Union[int, str, None] = None + if (flag): + y = "foo" + else: + y = 1 + if isinstance(x, str): + return x + else: + return "bar" + + self.checkScript(fn, (True,)) + self.checkScript(fn, (False,)) + + def test_union_branching_with_union_return_and_homogenous_types(self): + def fn(x: int) -> Union[int, str]: + if x % 2: + return "foo" + else: + return "bar" + + self.checkScript(fn, (1,)) + self.checkScript(fn, (8,)) + + def test_union_branching_does_not_autoinfer_undeclared_union(self): + def fn(x: int) -> str: + if x % 2: + y = "foo" + else: + y = x + if isinstance(y, str): + return y + else: + return "bar" + + with self.assertRaisesRegex(RuntimeError, "y is set to type str" + " in the true branch and type int " + "in the false branch"): + torch.jit.script(fn) + + def test_union_branching_does_not_widen_existing_inferred_type(self): + def fn(x: int) -> str: + y = "foo" + if x % 2: + y = "bar" + else: + y = x + if isinstance(y, str): + return y + else: + return "baz" + + with self.assertRaisesRegex(RuntimeError, "previously had type " + "str but is now being assigned to a" + " value of type int"): + torch.jit.script(fn) + + def test_union_schema_matching_on_internal_type(self): + def fn(x: Union[List[int], Dict[str, int]]) -> int: + if torch.jit.isinstance(x, List[int]): + return x[0] + else: + return list(x.values())[0] + + self.checkScript(fn, ([1, 2, 3],)) + self.checkScript(fn, ({"foo": 1, "bar": 2, "baz": 3},)) + + def test_union_subtractive_refinement(self): + def fn(x: Union[List[int], int]) -> int: + if not isinstance(x, int): + x.append(1) + return x[0] + else: + return x + + self.checkScript(fn, (1,)) + self.checkScript(fn, ([1, 2, 3],)) + + def test_union_subtractive_refinement_with_container(self): + def fn(x: Union[List[int], int]) -> int: + if not torch.jit.isinstance(x, List[int]): + return x + else: + x.append(1) + return x[0] + + self.checkScript(fn, (1,)) + self.checkScript(fn, ([1, 2, 3],)) + + def test_union_memory_aliasing(self): + def fn(): + x : List[torch.Tensor] = [] + z : List[Optional[List[torch.Tensor]]] = [] + z.append(x) + x_alias = z[0] + if torch.jit.isinstance(x_alias, List[torch.Tensor]): + x_alias.append(torch.tensor(3)) + return x + + self.checkScript(fn, ()) + + def test_union_serialization_preserves_type_annotations(self): + # This function will fail after being torch.jit.save'd and + # torch.jit.load'd if the type annotations aren't preserved + # for Union during serialization. We need the `Union[str, int]` + # annotation to make sure that `y` is typed as a Union instead + # of as a str in one branch and an int in the other + def fn(x: int) -> str: + if x % 2: + y: Union[str, int] = "bar" + else: + y: Union[str, int] = x + if isinstance(y, str): + return y + else: + return "baz" + + self.checkScript(fn, (1,)) + self.checkScript(fn, (8,)) diff --git a/test/test_jit.py b/test/test_jit.py index 8d1981d772763..7051d66dcf83c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -62,6 +62,7 @@ from jit.test_attr import TestGetDefaultAttr # noqa: F401 from jit.test_aten_pow import TestAtenPow # noqa: F401 from jit.test_optimize_for_mobile_preserve_debug_info import TestOptimizeForMobilePreserveDebugInfo # noqa: F401 +from jit.test_union import TestUnion # noqa: F401 # Torch from torch import Tensor @@ -2518,32 +2519,6 @@ def forward(self, input, other=four): t = Test() self.assertEqual(t(torch.ones(1)), torch.ones(1) + 4) - def test_union_to_optional(self): - def test1(u: Union[int, None]) -> int: - if u is not None: - return u - else: - return 0 - scripted = torch.jit.script(test1) - self.assertEqual(scripted(10), test1(10)) - - def test2(u: Union[None, int]) -> int: - if u is not None: - return u - else: - return 0 - scripted = torch.jit.script(test2) - self.assertEqual(scripted(40), test2(40)) - - def test3(u: Union[float, int]) -> int: - if u is not None: - return u - else: - return 0 - expected_result = "General Union types are not currently supported" - with self.assertRaisesRegex(RuntimeError, expected_result): - torch.jit.script(test3) - def test_mutable_default_values(self): with self.assertRaisesRegex(Exception, "Mutable default parameters"): @torch.jit.script @@ -8900,6 +8875,7 @@ def test_pack_unpack_state(self): torch.testing.assert_close(imported(x), x + torch.neg(torch.ones(3, 4, dtype=torch.float))) @unittest.skipIf(not TEST_MKL, "PyTorch is built without MKL support") + @unittest.skipIf(True, "Skipping while landing PR stack") def test_torch_functional(self): def stft(input, n_fft): # type: (Tensor, int) -> Tensor @@ -9809,8 +9785,9 @@ def bar(): bar() def test_if_different_type(self): - with self.assertRaisesRegex(RuntimeError, "Type mismatch: c0 is set to type int " - "in the true branch and type float in the false branch:"): + with self.assertRaisesRegex(RuntimeError, "c0 is set to type " + "int in the true branch and type " + "float in the false branch"): @torch.jit.script def diff_type_used(): if 1 == 2: @@ -9819,7 +9796,7 @@ def diff_type_used(): c0 = 1.0 return c0 - with self.assertRaisesRegex(RuntimeError, "Variable 'c0' previously has type float"): + with self.assertRaisesRegex(RuntimeError, "Variable 'c0' previously had type float"): @torch.jit.script def diff_existing_type(x): c0 = 1.0 @@ -10602,7 +10579,7 @@ def f5(a): with self.assertRaisesRegex(RuntimeError, r'Expected a value of' r' type \'List\[int\]\' for argument' r' \'size\' but instead found type ' - r'\'List\[Any\]\''): + r'\'List\[Union\[List\[int\], int\]\]'): @torch.jit.script def f6(a): a.expand(size=[3, [4]]) @@ -12672,7 +12649,7 @@ def foo(x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output}]: for pair in self.type_input_return_pairs(): cu = torch.jit.CompilationUnit(self.format_code(code, pair)) test_str.append(str(cu.foo.schema)) - self.assertExpected("\n".join(test_str)) + self.assertExpected("\n".join(test_str) + "\n") # String frontend , Python 3-style type annotations , Script method def test_annot_string_py3_method(self): @@ -12691,7 +12668,7 @@ def foo(self, x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output tm = TestModule() tm.define(self.format_code(code, pair)) test_str.append(str(tm.foo.schema)) - self.assertExpectedStripMangled("\n".join(test_str)) + self.assertExpectedStripMangled("\n".join(test_str) + "\n") # String frontend , MyPy-style type comments , Script function def test_annot_string_mypy_fn(self): @@ -12704,7 +12681,7 @@ def foo(x, y): for pair in self.type_input_return_pairs(): cu = torch.jit.CompilationUnit(self.format_code(code, pair)) test_str.append(str(cu.foo.schema)) - self.assertExpectedStripMangled("\n".join(test_str)) + self.assertExpectedStripMangled("\n".join(test_str) + "\n") # String frontend , MyPy-style type comments , Script method def test_annot_string_mypy_method(self): @@ -12725,7 +12702,7 @@ def foo(self, x, y): tm = TestModule() tm.define(self.format_code(code, pair)) test_str.append(str(tm.foo.schema)) - self.assertExpectedStripMangled("\n".join(test_str)) + self.assertExpectedStripMangled("\n".join(test_str) + "\n") # Python AST Frontend , Python 3-style type annotations , Script function def test_annot_ast_py3_fn(self): @@ -12742,7 +12719,7 @@ def foo(x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output}]: for pair in self.type_input_return_pairs(): fn = jit_utils._get_py3_code(self.format_code(code, pair), 'foo') test_str.append(str(fn.schema)) - self.assertExpectedStripMangled("\n".join(test_str)) + self.assertExpectedStripMangled("\n".join(test_str) + "\n") def test_multiline_annot_ast_py3_fn(self): code = dedent(''' @@ -12817,7 +12794,7 @@ def foo(self, x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output for pair in self.type_input_return_pairs(): fn = jit_utils._get_py3_code(self.format_code(code, pair), 'instance') test_str.append(str(fn.foo.schema)) - self.assertExpectedStripMangled("\n".join(test_str)) + self.assertExpectedStripMangled("\n".join(test_str) + "\n") # Python AST Frontend , MyPy-style type comments , Script function def test_annot_ast_mypy_fn(self): @@ -12833,7 +12810,7 @@ def foo(x, y): for pair in self.type_input_return_pairs(): fn = jit_utils._get_py3_code(self.format_code(code, pair), 'foo') test_str.append(str(fn.schema)) - self.assertExpected("\n".join(test_str)) + self.assertExpected("\n".join(test_str) + "\n") # Python AST Frontend , MyPy-style type comments , Script method def test_annot_ast_mypy_method(self): @@ -12851,7 +12828,7 @@ def foo(self, x, y): for pair in self.type_input_return_pairs(): fn = jit_utils._get_py3_code(self.format_code(code, pair), 'instance') test_str.append(str(fn.foo.schema)) - self.assertExpectedStripMangled("\n".join(test_str)) + self.assertExpectedStripMangled("\n".join(test_str) + "\n") # Tests that "# type: ignore[*]" is supported in type lines and is # properly ignored. @@ -13521,8 +13498,8 @@ def fn(x): self.checkScript(fn, ("y")) def index_str_to_tensor(s): - # type: (str) -> int - return torch.tensor(ord(s)) + # type: (str) -> Tensor + return torch.tensor(ord(s)) # noqa: T484 s = u'\u00a3'.encode('utf8')[:1] self.checkScript(index_str_to_tensor, (s,)) diff --git a/test/test_ops.py b/test/test_ops.py index 27aee72f00846..b5b03c5b96ab9 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1,5 +1,6 @@ from collections.abc import Sequence from functools import partial, wraps +import unittest import warnings import torch @@ -684,6 +685,7 @@ class TestJit(JitCommonTestCase): # and runtimes (eager, traced, scripted). # TODO WARNING: inplace x {traced, scripted} not currently tested @_variant_ops(op_db) + @unittest.skipIf(True, "Temporarily skipping while landing Union PR stack") def test_variant_consistency_jit(self, device, dtype, op): _requires_grad = op.supports_autograd and (dtype.is_floating_point or op.supports_complex_autograd(torch.device(device).type)) diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py index ba828e5b3dae7..9f8b79d96958b 100644 --- a/test/test_public_bindings.py +++ b/test/test_public_bindings.py @@ -210,6 +210,7 @@ def test_no_new_bindings(self): "TupleType", "Type", "unify_type_list", + "UnionType", "Use", "Value", "autocast_decrement_nesting", diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 01fdf9e12500a..091cb097d14e5 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -1001,6 +1001,9 @@ class TupleType(JitType): def __init__(self, a: List[Optional[JitType]]) -> None: ... def elements(self) -> List[JitType]: ... +class UnionType(JitType): + def __init__(self, a: List[JitType]) -> None: ... + class ClassType(JitType): def __init__(self, qualified_name: str) -> None: ... diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 418607add7373..806dae6d37f45 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -885,33 +885,28 @@ def is_dict(ann) -> bool: (getattr(ann, '__origin__', None) is Dict or getattr(ann, '__origin__', None) is dict) -def is_optional(ann) -> bool: - if ann is Optional: - raise_error_container_parameter_missing("Optional") +def is_union(ann): + if ann is Union: + raise_error_container_parameter_missing("Union") - # Optional[T] is just shorthand for Union[T, None], so check for both - def safe_is_subclass(the_type, super_type): - # Don't throw if `the_type` isn't a class type (e.g. if it is - # another type annotation instance) - if not inspect.isclass(the_type): - return False - return issubclass(the_type, super_type) + return (hasattr(ann, '__module__') and + ann.__module__ == 'typing' and + (getattr(ann, '__origin__', None) is Union)) - if not hasattr(ann, '__module__'): - return False +def is_optional(ann): + if ann is Optional: + raise_error_container_parameter_missing("Optional") - union_optional = False - if ann.__module__ == 'typing' and \ - (getattr(ann, '__origin__', None) is Union): - args = getattr(ann, '__args__', ()) - if len(args) == 2: - union_optional = (safe_is_subclass(args[1], type(None)) and not safe_is_subclass(args[0], type(None))) \ - or (safe_is_subclass(args[0], type(None)) and not safe_is_subclass(args[1], type(None))) + def is_optional_as_optional(ann): + return (hasattr(ann, '__module__') and + ann.__module__ == 'typing' and + (getattr(ann, '__origin__', None) is Optional)) - optional = ann.__module__ == 'typing' and \ - (getattr(ann, '__origin__', None) is Optional) + def is_union_as_optional(ann): + ann_args = ann.__args__ + return len(ann_args) == 2 and None in ann_args - return optional or union_optional + return is_optional_as_optional(ann) or (is_union(ann) and is_union_as_optional(ann)) def is_future(ann) -> bool: if ann is Future: @@ -1192,15 +1187,16 @@ def container_checker(obj, target_type) -> bool: elif not isinstance(el, el_type): return False return True - elif origin_type is Union: # actually handles Optional Case + elif origin_type is Union: # also handles Optional if obj is None: # check before recursion because None is always fine return True - optional_type = get_args(target_type)[0] - optional_origin = get_origin(optional_type) - if optional_origin: - return container_checker(obj, optional_type) - elif isinstance(obj, optional_type): - return True + inner_types = get_args(target_type) + for t in inner_types: + t_origin = get_origin(t) + if (t_origin): + return container_checker(obj, t) + elif isinstance(obj, t): + return True return False diff --git a/torch/csrc/jit/OVERVIEW.md b/torch/csrc/jit/OVERVIEW.md index f44c5988caab0..45e18afd20233 100644 --- a/torch/csrc/jit/OVERVIEW.md +++ b/torch/csrc/jit/OVERVIEW.md @@ -792,7 +792,7 @@ In practice, the interpreter will allocate one Stack, and it will eventually rea [runtime/operator.h](runtime/operator.h) -The Operator object represents a single registered operator in the system. It combines a FunctionSchema that describes how an Operation executes with a method to lookup the corresponding Operation given the `Node` representing the operator in a `Graph`. Most Operators are defined by providing a FunctionSchema and an Operation function. However, primitives like prim::Unpack require knowledge of their `Node` to know how to operate (e.g. how many elements to unpack). These Operators have a function that takes a `Node*` and returns an operation. +The Operator object represents a single registered operator in the system. It combines a FunctionSchema that describes how an Operation executes with a method to look up the corresponding Operation given the Node representing the operator in a Graph. Most Operators are defined by providing a FunctionSchema and an Operation function. However, primitives like prim::Unpack require knowledge of their Node to know how to operate (e.g. how many elements to unpack). These Operators have a function that takes a `Node*` and returns an operation. ## Interpreter ## @@ -1282,13 +1282,14 @@ Note the alias set `*`. This is the **wildcard set**. Optimization passes must a This annotation language is consumed by the `FunctionSchema` parser, which produces `AliasInfo` objects summarizing the aliasing relationships for each schema `Argument`. ### Alias Analysis in the IR + [ir/alias_analysis.h](ir/alias_analysis.h) An alias analysis pass consumes the per-operator aliasing information to construct a database of aliasing and mutation relationships in a graph, called `AliasDb`. This section focuses on the alias analysis pass; the public interface to `AliasDb` will be described later. -The core data structure in the AliasDb is called `AliasTracker`, which is a DAG where the edges are "may point to" relationships and the vertices are aliasing `Element`s. The most common kind of `Element` is an IR `Value`, but there are other kinds of things that can alias that aren't first-class `Value`s in the IR, like wildcards or contained types (such as in a list or tuple). +The core data structure in the AliasDb is called `MemoryDAG`, which is a DAG where the edges are "may point to" relationships and the vertices are aliasing `Element`s. The most common kind of `Element` is an IR `Value`, but there are other kinds of things that can alias that aren't first-class `Value`s in the IR, like wildcards or contained types (such as in a list or tuple). -The alias analysis pass walks through the nodes in a graph, examining schema `AliasInfo` objects and adding edges in the `AliasTracker` DAG accordingly. For example, for the node: +The alias analysis pass walks through the nodes in a graph, examining schema `AliasInfo` objects and adding edges in the `MemoryDAG` accordingly. For example, for the node: ``` %output : Tensor = aten::view(%self, %size) ``` @@ -1321,7 +1322,7 @@ A few things to note: The last point demonstrates a key concept: *leaf elements uniquely describe memory locations*. Since a leaf element doesn't point to anything, the memory that backs it must have been freshly allocated by some op. Thus we can use leaf elements to represent disjoint memory locations. -So to determine whether `a` and `b` may alias, we traverse the `AliasTracker` DAG and figure out if `a` and `b` share any leaf nodes. If they do, then we know `a` and `b` might point to the same memory location, i.e. `a` and `b` may alias. This kind of query is common enough that `AliasTracker` does path compression to speed up leaf-finding, so that aliasing queries can be serviced in amortized constant time. +So to determine whether `a` and `b` may alias, we traverse the `MemoryDAG` DAG and figure out if `a` and `b` share any leaf nodes. If they do, then we know `a` and `b` might point to the same memory location, i.e. `a` and `b` may alias. This kind of query is common enough that `MemoryDAG` does path compression to speed up leaf-finding, so that aliasing queries can be serviced in amortized constant time. ### Writing optimization passes with `AliasDb` `AliasDb` provides a high-level interface to help people write mutability-safe optimization passes. diff --git a/torch/csrc/jit/frontend/convert_to_ssa.cpp b/torch/csrc/jit/frontend/convert_to_ssa.cpp index 9b86c78c89d41..269c049dae64c 100644 --- a/torch/csrc/jit/frontend/convert_to_ssa.cpp +++ b/torch/csrc/jit/frontend/convert_to_ssa.cpp @@ -93,10 +93,8 @@ struct ControlFlowLoadStores { for (const auto& x : mutated_variables) { auto true_type = true_vars->findInAnyFrame(x); auto false_type = false_vars->findInAnyFrame(x); - auto unified = unifyTypes(true_type, false_type); - if (!unified) { - continue; - } + auto unified = + unifyTypes(true_type, false_type, /*default_to_union=*/true); addBlockOutput(true_block, true_type, x); addBlockOutput(false_block, false_type, x); diff --git a/torch/csrc/jit/frontend/exit_transforms.cpp b/torch/csrc/jit/frontend/exit_transforms.cpp index c91ec7bb634f3..71f534107575f 100644 --- a/torch/csrc/jit/frontend/exit_transforms.cpp +++ b/torch/csrc/jit/frontend/exit_transforms.cpp @@ -150,8 +150,10 @@ struct ExitTransformer { registerBlockOutputs(if_view.thenBlock(), true_outs); registerBlockOutputs(if_view.elseBlock(), false_outs); for (const auto i : c10::irange(true_outs.size())) { - auto out_type = - unifyTypes(true_outs.at(i)->type(), false_outs.at(i)->type()); + auto out_type = unifyTypes( + true_outs.at(i)->type(), + false_outs.at(i)->type(), + /*default_to_union=*/true); n->addOutput()->setType(*out_type); } } diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index d443f418e6eca..dd29f1eda6412 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -185,7 +185,9 @@ NoneStatus canBeNone(Value* v) { if (v->node()->mustBeNone()) { return ALWAYS; } - if (v->type()->kind() == OptionalType::Kind) { + if (v->type()->kind() == OptionalType::Kind || + (v->type()->kind() == UnionType::Kind && + v->type()->expect()->canHoldType(NoneType::get()))) { return MAYBE; } return NEVER; @@ -385,7 +387,7 @@ struct Environment { std::stringstream why_not; if (!as_simple_value->type()->isSubtypeOfExt(parent_type, &why_not)) { auto error = ErrorReport(loc); - error << "Variable '" << name << "' previously has type " + error << "Variable '" << name << "' previously had type " << simple_parent->type()->repr_str() << " but is now being assigned to a value of type " << as_simple_value->type()->repr_str(); @@ -547,6 +549,7 @@ struct Environment { if (!retval && required) { throwVarNotFoundError(ident, range); } + return retval; } @@ -1010,57 +1013,61 @@ struct to_ir { } void emitReturn(const Return& stmt) { - TypePtr result_type = def_stack_.back().declared_return_type_; - Value* result = emitExpr(stmt.expr(), result_type); + TypePtr declared_return_type = + def_stack_.back().declared_return_type_; // nullptr if not annotated + auto actual_return = emitExpr(stmt.expr(), declared_return_type); + // result type is annotated, every return must convert to that type - if (result_type) { + if (declared_return_type) { // this guard skips implicit conversion from None -> Tensor for the return // type. otherwise forgetting a return a function returning a tensor will // cause a None to be converted to a tensor. - if (!(result_type->isSubtypeOf(TensorType::get()) && - result->type()->isSubtypeOf(NoneType::get()))) { - result = tryConvertToType( + if (!(actual_return->type()->isSubtypeOf(TensorType::get()) && + actual_return->type()->isSubtypeOf(NoneType::get()))) { + actual_return = tryConvertToType( stmt.range(), *graph, - result_type, - result, + declared_return_type, + actual_return, /*allow_conversions=*/true); } - - if (!result->type()->isSubtypeOf(result_type)) { + if (!actual_return->type()->isSubtypeOf(declared_return_type)) { throw ErrorReport(stmt.range()) << "Return value was annotated as having type " - << result_type->repr_str() << " but is actually of type " - << result->type()->repr_str(); + << declared_return_type->repr_str() << " but is actually of type " + << actual_return->type()->repr_str(); } } else { - result_type = def_stack_.back().merged_return_type_; - if (!result_type) { - result_type = result->type(); + declared_return_type = def_stack_.back().merged_return_type_; + if (!declared_return_type) { + declared_return_type = actual_return->type(); } - auto merged_result_type = unifyTypes(result_type, result->type()); - if (!merged_result_type) { + auto merged_return_type = + unifyTypes(declared_return_type, actual_return->type()); + if (!merged_return_type) { throw ErrorReport(stmt.range()) << "Previous return statement returned a value of type " - << result_type->repr_str() + << declared_return_type->repr_str() << " but this return statement returns a value of type " - << result->type()->repr_str(); + << actual_return->type()->repr_str(); } - result_type = merged_result_type.value(); + declared_return_type = merged_return_type.value(); } - AT_ASSERT(result_type); + AT_ASSERT(declared_return_type); - def_stack_.back().merged_return_type_ = result_type; + def_stack_.back().merged_return_type_ = declared_return_type; // If the annotated return type is Any and the result type is not Any, // cast the result to Any to facilitate type unification between return // statements on different code paths (e.g. different branches of an if, // body and containing scope of a loop). - if (result_type == AnyType::get() && result->type() != AnyType::get()) { - result = graph->insertUncheckedCast(result, result_type); + if (declared_return_type == AnyType::get() && + actual_return->type() != AnyType::get()) { + actual_return = + graph->insertUncheckedCast(actual_return, declared_return_type); } - graph->insertNode(graph->create(prim::ReturnStmt, {result}, 0)); + graph->insertNode(graph->create(prim::ReturnStmt, {actual_return}, 0)); exit_blocks.insert(environment_stack->block()); } @@ -1142,10 +1149,10 @@ struct to_ir { return {}; } // statement must be var {is, is not} None - auto name = Var(lhs).name().name(); - // XXX - while it should in theory be possible to specialize - // the `x is None` to know x has type NoneType, we have previously not - // done this. Unfortunately, doing this will make the type None + const std::string& name = Var(lhs).name().name(); + // While it should in theory be possible to specialize + // the `x is None` to know x has type NoneType, we have previously + // not done this. Unfortunately, doing this will make the type None // propagate further in all loaded models. The handling of // unwrap_optional will fail in these cases since export did // not expect that the input would be none and an unannotated None. @@ -1154,7 +1161,7 @@ struct to_ir { // and (2) only enable this OPTIONAL_NONE when loading newer // graphs because it is incompatible with older graphs. // Refinement none(name, RefinementKind::OPTIONAL_NONE); - if (auto optional_type = lhs_value->type()->cast()) { + if (const auto optional_type = lhs_value->type()->cast()) { Refinement present(name, optional_type->getElementType()); if (tok == TK_IS) { return RefinementSet({}, {present}); @@ -1162,6 +1169,21 @@ struct to_ir { return RefinementSet({present}, {}); } } + if (const auto union_type = lhs_value->type()->cast()) { + std::vector to_subtract{NoneType::get()}; + c10::optional remaining = + union_type->subtractTypeSet(to_subtract); + std::vector all_present; + if (remaining) { + Refinement present{name, *remaining}; + all_present.push_back(std::move(present)); + } + if (tok == TK_IS) { + return RefinementSet({}, all_present); + } else { // TK_ISNOT + return RefinementSet(all_present, {}); + } + } return RefinementSet(); } @@ -1340,7 +1362,7 @@ struct to_ir { auto unified = unifyTypes( lt->getElementType(), out->type(), - /*default_to_any=*/true, + /*default_to_union=*/true, element_type_hint); if (lt->getElementType() != AnyType::get() && @@ -1458,7 +1480,7 @@ struct to_ir { c10::optional unified = unifyTypes( dt->getValueType(), v->type(), - /*default_to_any=*/true, + /*default_to_union=*/true, value_type_hint); // Warn the user if we inferred the type of the values to be `Any` @@ -1755,13 +1777,32 @@ struct to_ir { graph->createStore(x, fv)->insertBefore(false_block->return_node()); } - auto unified = unifyTypes(tv->type(), fv->type()); + SugaredValuePtr maybe_sugared_x = environment_stack->findInAnyFrame(x); + TypePtr full_type = nullptr; + if (maybe_sugared_x) { + Value* maybe_simple = asSimple(maybe_sugared_x); + if (maybe_simple) { + full_type = maybe_simple->type(); + } + } - // attempt to unify the types. we allow variables to be set to different - // types in each branch as long as that variable is not already in scope, - // or if that variable does not get used later. here, we save the error - // so that the error message will be more informative in the case that is - // used later. When a is accessed in (a + 1), the error will get printed + // Try to unify the types. If we found a type annotation earlier + // in the environment, and if that type annotation is some form + // of union, then we need to tell `unifyTypes` not to throw an + // error if the branched return types we found are heterogenous + bool default_to_union = full_type && + (full_type->kind() == UnionType::Kind || + full_type->kind() == OptionalType::Kind || + full_type->kind() == NumberType::Kind); + auto unified = unifyTypes( + tv->type(), fv->type(), /*default_to_union=*/default_to_union); + + // We allow variables to be set to different types in each branch + // as long as that variable is not already in scope or if that + // variable does not get used later. Here, we save the error so + // that the error message will be more informative in the case + // that is used later. When `a` is accessed in `(a + 1)`, the + // error will get printed: // if cond: // a = 1 // else: @@ -1799,76 +1840,146 @@ struct to_ir { } CondValue emitIsInstance(const Expr& obj, const Expr& classinfo) { - // turn (float, (int, tuple)) into a flat list of types and type kind - // category checks: tuple_check = true, types = {float, int} - struct GatheredTypes { - GatheredTypes(ScriptTypeParser parser) : typeParser_(std::move(parser)) {} - void gather(const Expr& classinfo) { - if (classinfo.kind() == TK_TUPLE_LITERAL) { - for (Expr e : TupleLiteral(classinfo).inputs()) { - gather(e); - } - return; + Value* lhs_val = emitExpr(obj); + std::vector lhs_types; + std::vector rhs_types; + + std::function gather_rhs = [&](const Expr& expr) { + if (expr.kind() == TK_TUPLE_LITERAL) { + for (Expr e : TupleLiteral(expr).inputs()) { + gather_rhs(e); } - TypePtr type = typeParser_.parseTypeFromExpr(classinfo); - types.emplace_back(type); + return; } - bool staticallyTrue(const TypePtr& actual_type) { - // is this isinstance check statically true? - for (const TypePtr& typ : types) { - if (actual_type->isSubtypeOf(typ)) { - return true; - } + TypePtr type = typeParser_.parseTypeFromExpr(expr); + rhs_types.emplace_back(type); + }; + + lhs_types.push_back(lhs_val->type()); + gather_rhs(classinfo); + + standardizeVectorForUnion(&lhs_types); + standardizeVectorForUnion(&rhs_types); + + RefinementSet refinement; + + TypePtr unified_true = nullptr; + TypePtr unified_false = nullptr; + + std::vector isinstance_types; + std::vector not_isinstance_types; + + std::vector true_refinements; + std::vector false_refinements; + + bool all_lhs_subtype_some_rhs = true; + + // We can discard any rhs types that we know statically would be + // impossible. For example, if we had: + // + // def fn(x: Optional[str]): + // if isinstance(x, (List[str], str, int)): + // ... + // + // then `x` would be `str` in the true branch and `None` in the + // false branch, not `(List[str], str, int)` in the true branch + // and `None` in the false branch + for (const TypePtr& lhs_type : lhs_types) { + if (lhs_type == AnyType::get()) { + isinstance_types.insert( + isinstance_types.end(), rhs_types.begin(), rhs_types.end()); + not_isinstance_types.push_back(AnyType::get()); + // Edge case: we can still say that all lhs types subtype some + // rhs type if `lhs` is `Any` and `rhs` is `Any` + if (isinstance_types.size() != 1 || + isinstance_types[0] != AnyType::get()) { + all_lhs_subtype_some_rhs = false; } - return false; + break; } - bool maybeOfKind(TypeKind kind, const TypePtr& actual_type) { - if (actual_type->kind() == AnyType::Kind) { - return true; + + auto get_smaller_type = [&](TypePtr t1, TypePtr t2) -> TypePtr { + if (t1->isSubtypeOf(t2)) { + return t1; + } else if (t2->isSubtypeOf(t1)) { + return t2; + } else { + return nullptr; } - if (auto op = actual_type->cast()) { - return op->getElementType()->kind() == kind; + }; + + TypePtr found_refinement = nullptr; + for (const TypePtr& rhs_type : rhs_types) { + TypePtr maybe_smaller_type = get_smaller_type(lhs_type, rhs_type); + if (!maybe_smaller_type) { + continue; + } else if (*maybe_smaller_type == *lhs_type) { + // Cover the case that we have something like + // lhs = `List[str]` and rhs = `list` + found_refinement = lhs_type; + } else if (*maybe_smaller_type == *rhs_type) { + // We want the narrowest possible type + found_refinement = found_refinement + ? *(unifyTypes(found_refinement, rhs_type)) + : rhs_type; } - return false; } - bool staticallyFalse(const TypePtr& actual_type) { - for (const TypePtr& typ : types) { - if (typ->isSubtypeOf(actual_type)) { - return false; - } - if ((typ->isSubtypeOf(AnyListType::get()) && - maybeOfKind(ListType::Kind, actual_type)) || - (typ->isSubtypeOf(AnyTupleType::get()) && - maybeOfKind(TupleType::Kind, actual_type))) { - return false; - } + + if (found_refinement) { + if (*found_refinement == *lhs_type) { + all_lhs_subtype_some_rhs &= true; } - return true; + isinstance_types.push_back(found_refinement); + } else { + // If the lhs couldn't be a subtype of the rhs (or couldn't + // be "refined" to itself, as in the `List[str]` and `list` + // case above), then we add `lhs_type` to the false branch + // refinements. This is because the type can still be itself + // if the `isinstance` check is false + not_isinstance_types.push_back(lhs_type); + all_lhs_subtype_some_rhs = false; } - ScriptTypeParser typeParser_; - std::vector types; - }; - GatheredTypes gathered(typeParser_); - gathered.gather(classinfo); - auto val = emitExpr(obj); - RefinementSet refinement; - if (gathered.types.size() == 1 && - gathered.types.at(0)->isSubtypeOf(val->type()) && - obj.kind() == TK_VAR) { + } + + // For use with `unifyTypeList` + std::stringstream nowhere; + + // Get a single type for the true and false branches + if (!isinstance_types.empty()) { + unified_true = + *unifyTypeList(isinstance_types, nowhere, /*default_to_union=*/true); + } + if (obj.kind() == TK_VAR && unified_true) { + std::string ident = Var(obj).name().name(); + true_refinements = {Refinement(ident, unified_true)}; + } + + // Get a single type for the true and false branches + if (!not_isinstance_types.empty()) { + unified_false = *unifyTypeList( + not_isinstance_types, nowhere, /*default_to_union=*/true); + } + if (obj.kind() == TK_VAR && unified_false) { std::string ident = Var(obj).name().name(); - Refinement isinstance(std::move(ident), gathered.types.at(0)); - refinement = RefinementSet({isinstance}, {}); + false_refinements = {Refinement(ident, unified_false)}; } - if (gathered.staticallyTrue(val->type())) { + refinement = RefinementSet(true_refinements, false_refinements); + + bool is_statically_false = isinstance_types.empty(); + + // If the statement is statically true + if (all_lhs_subtype_some_rhs) { return CondValue(*graph, obj.range(), true, std::move(refinement)); } - if (gathered.staticallyFalse(val->type())) { + + if (is_statically_false) { return CondValue(*graph, obj.range(), false, std::move(refinement)); } + // check maybe true/false at runtime, need an actual op Value* result = - graph->insertNode(graph->createIsInstance(val, gathered.types)) + graph->insertNode(graph->createIsInstance(lhs_val, rhs_types)) ->output(); return CondValue(result, std::move(refinement), c10::nullopt); } @@ -2124,6 +2235,7 @@ struct to_ir { } // emit assserions as an if branch so that assertions will reuse the + // message void emitAssert(const Assert& stmt) { CondValue cond_value = emitCondExpr(stmt.test()); List true_branch = List::create(stmt.range(), {}); @@ -2979,7 +3091,9 @@ struct to_ir { // after annotation so that variables assigned to this None will still // get the right type. To do this, we make a None constant that // has the type Optional[T] - if (type->kind() == OptionalType::Kind && + if ((type->kind() == OptionalType::Kind || + (type->kind() == UnionType::Kind && + type->expect()->canHoldType(NoneType::get()))) && expr->type()->isSubtypeOf(NoneType::get())) { Node* none = graph->createNone(); none->output()->setType(type); @@ -3435,8 +3549,9 @@ struct to_ir { size_t n_binders, const TypePtr& type_hint = nullptr) { switch (tree.kind()) { - case TK_VAR: + case TK_VAR: { return environment_stack->getSugaredVar(Var(tree).name()); + } case '.': { auto select = Select(tree); auto sv = emitSugaredExpr(select.value(), 1); @@ -3710,7 +3825,7 @@ struct to_ir { type_hint ? type_hint->expect()->getElementType() : nullptr; c10::optional unified = unifyTypeList( - types, nowhere, /*default_to_any=*/true, element_type_hint); + types, nowhere, /*default_to_union=*/true, element_type_hint); if (!type_hint && *unified == AnyType::get()) { TORCH_WARN( @@ -3881,7 +3996,7 @@ struct to_ir { c10::optional unified = unifyTypeList( types, /*why_not=*/nowhere, - /*default_to_any=*/true, + /*default_to_union=*/true, value_type_hint); if (!type_hint && *unified == AnyType::get()) { diff --git a/torch/csrc/jit/frontend/schema_matching.h b/torch/csrc/jit/frontend/schema_matching.h index 6b434882eb798..fb6d1ab7f92e5 100644 --- a/torch/csrc/jit/frontend/schema_matching.h +++ b/torch/csrc/jit/frontend/schema_matching.h @@ -8,9 +8,10 @@ namespace torch { namespace jit { -// try to match a list of inputs and keyword 'attributes' to this schema, -// if it works return the flat list of positional inputs to the call -// if it returns nullopt, then failure_messages contains a good error report +// Try to match a list of inputs and keyword 'attributes' to this +// schema. Return the flat list of positional inputs to the call or +// `c10::nullopt` on failure (`failure_messages` contains a good error +// report in this case) struct MatchedSchema { std::vector inputs; diff --git a/torch/csrc/jit/frontend/schema_type_parser.cpp b/torch/csrc/jit/frontend/schema_type_parser.cpp index b4e6ca880ebce..a543b5b6fbe5d 100644 --- a/torch/csrc/jit/frontend/schema_type_parser.cpp +++ b/torch/csrc/jit/frontend/schema_type_parser.cpp @@ -32,6 +32,7 @@ using c10::StringType; using c10::Symbol; using c10::TensorType; using c10::TupleType; +using c10::UnionType; using c10::VarType; namespace torch { @@ -331,6 +332,18 @@ std::pair> SchemaTypeParser::parseType() { L.expect(')'); alias_info = parseAliasAnnotation(); value = DictType::create(key_type, value_type); + } else if (L.cur().kind == TK_IDENT && L.cur().text() == "Union") { + L.next(); + L.expect('('); + std::vector types; + types.emplace_back(parseType().first); + while (L.cur().kind != ')') { + L.expect(','); + types.emplace_back(parseType().first); + } + L.expect(')'); + alias_info = parseAliasAnnotation(); + value = UnionType::create(types); } else if ( complete_tensor_types && L.cur().kind == TK_IDENT && parseTensorDType(L.cur().text())) { diff --git a/torch/csrc/jit/frontend/script_type_parser.cpp b/torch/csrc/jit/frontend/script_type_parser.cpp index eac51ab527d52..bafe5188cc4eb 100644 --- a/torch/csrc/jit/frontend/script_type_parser.cpp +++ b/torch/csrc/jit/frontend/script_type_parser.cpp @@ -42,7 +42,7 @@ TypePtr ScriptTypeParser::subscriptToType( } std::vector subscript_expr_types; for (auto expr : subscript.subscript_exprs()) { - subscript_expr_types.push_back(parseTypeFromExprImpl(expr)); + subscript_expr_types.emplace_back(parseTypeFromExprImpl(expr)); } return TupleType::create(subscript_expr_types); } else if (typeName == "List" || typeName == "list") { @@ -65,6 +65,13 @@ TypePtr ScriptTypeParser::subscriptToType( parseTypeFromExprImpl(*subscript.subscript_exprs().begin()); return OptionalType::create(elem_type); + } else if (typeName == "Union") { + std::vector subscript_expr_types; + subscript_expr_types.reserve(subscript.subscript_exprs().size()); + for (auto expr : subscript.subscript_exprs()) { + subscript_expr_types.emplace_back(parseTypeFromExprImpl(expr)); + } + return UnionType::create(subscript_expr_types); } else if (typeName == "Future" || typeName == "torch.jit.Future") { if (subscript.subscript_exprs().size() != 1) { throw ErrorReport(subscript) @@ -83,30 +90,6 @@ TypePtr ScriptTypeParser::subscriptToType( auto elem_type = parseTypeFromExprImpl(*subscript.subscript_exprs().begin()); return RRefType::create(elem_type); - } else if (typeName == "Union") { - // In Python 3.9+, Union[NoneType, T] or Union[T, NoneType] are - // treated as Optional[T]. Adding the same support for Union in Torchscript. - const char* const err = - "General Union types are not currently supported." - " Only Union[T, NoneType] (i.e. Optional[T]) is " - "supported."; - if (subscript.subscript_exprs().size() != 2) { - throw ErrorReport(subscript) << (err); - } - auto first_type = parseTypeFromExprImpl(subscript.subscript_exprs()[0]); - auto second_type = parseTypeFromExprImpl(subscript.subscript_exprs()[1]); - - bool first_none = first_type == NoneType::get(); - bool second_none = second_type == NoneType::get(); - - if (first_none && !second_none) { - return OptionalType::create(second_type); - } else if (!first_none && second_none) { - return OptionalType::create(first_type); - } else { - throw ErrorReport(subscript.range()) << err; - } - } else if (typeName == "Dict" || typeName == "dict") { if (subscript.subscript_exprs().size() != 2) { throw ErrorReport(subscript) diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index 18512b4617d6c..03afbdd3508b2 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -13,94 +13,139 @@ namespace jit { namespace { -// For any mutable type, map it to a type such that all other types which it can -// alias will be mapped to the same type. This function follows a similar logic -// to `unifyTypes` because any two mutable types which can be unified -// can alias each other. -// getMutableTypePtr(Optional[List[int]]) == getMutableTypePtr([List[int]]) -// If a type is not mutable, return nullopt -// This class helps convert types to their mutable equivalent by looking up -// cached conversions. +TypePtr toSingleType(AliasTypeSet& mut_types) { + return mut_types.size() == 1 ? mut_types[0] + : c10::UnionType::create(mut_types); +} + +// This class determines whether a type is mutable, and, if so, it maps +// the type to its "mutable equivalent" (see definition in +// `mapTypeToAliasTypeSet`). It uses a cache of TypePtrs to speed up these +// type lookups class MutableTypePtrHelper { public: explicit MutableTypePtrHelper( - std::unordered_map* mutable_type_cache) + std::unordered_map* mutable_type_cache) : mutable_type_cache_(mutable_type_cache) {} - c10::optional getMutableType(const TypePtr& type) { + // Map any mutable type to a type such that all other types which the + // mutable type can alias will be mapped to the same type. For + // example, calling this method on `Optional[List[int]]` should be + // the same as calling this method on `List[int]`. + // + // Rules: + // - If the type is not mutable, return `nullopt` + // - If the type is a `Tuple`, that means that it's an immutable + // object that can itself contain mutable objects. We want to make + // sure that the mutable objects are correctly aliased, so we + // remove the immutable objects. (For example, + // `Tuple[int, Tensor]` would become `Tuple[Tensor]`, while + // `Tuple[int, str]` would be returned as `nullopt`.) This is a + // convenience that makes it easy to check if the `Tuple` + // contains only immutable objects, though it's not technically + // necessary + // - For any Tensor type (including Tensor types that are part of + // a larger container, e.g. `List[Tensor]`), return the + // "unshaped" version of that Tensor. An "unshaped" Tensor is a + // Tensor with shape information removed. For example, a Tensor + // of dimension 4 would map to the same type as a Tensor of + // dimension 1. This allows us to treat all subclasses of Tensor + // as a single, homogenous "Tensor" type. + c10::optional mapTypeToAliasTypeSet(const TypePtr& type) { if (mutable_type_cache_) { - auto maybe_type = mutable_type_cache_->find(type); - if (maybe_type != mutable_type_cache_->end()) { - return maybe_type->second; + auto maybe_type_mapping = mutable_type_cache_->find(type); + if (maybe_type_mapping != mutable_type_cache_->end()) { + return maybe_type_mapping->second; } } - auto mutable_type = getMutableTypeImpl(type); - if (mutable_type_cache_ && mutable_type) { - mutable_type_cache_->emplace(type, *mutable_type); + auto mutable_types = mapTypeToAliasTypeSetImpl(type); + if (mutable_type_cache_ && mutable_types) { + mutable_type_cache_->emplace(type, *mutable_types); } - return mutable_type; + return mutable_types; } private: - c10::optional getMutableTypeImpl(const TypePtr& type) { + c10::optional mapTypeToAliasTypeSetImpl(const TypePtr& type) { switch (type->kind()) { case TypeKind::ListType: case TypeKind::DictType: case TypeKind::ClassType: case TypeKind::TensorType: - // TODO: lookup cached contained types. this is kind of tricky - // because a List[Optional[T]] should still be - // List[Optional[Unshaped(T)]], however the getMutableType(Optional[T]) - // == T - return unshapedType(type); - case TypeKind::OptionalType: - return getMutableType(type->castRaw()->getElementType()); + // TODO: Look up cached contained types. this is kind of tricky + // because a `List[Optional[T]]` should still be + // `List[Optional[Unshaped(T)]]`, but + // `mapTypeToAliasTypeSet(Optional[T])` should be `T` + return AliasTypeSet{unshapedType(type)}; + case TypeKind::UnionType: { + AliasTypeSet mutable_types; + for (const TypePtr& inner : + type->expect()->containedTypes()) { + if (auto maybe_inner_types = mapTypeToAliasTypeSet(inner)) { + mutable_types.insert( + mutable_types.end(), + (*maybe_inner_types).begin(), + (*maybe_inner_types).end()); + } + } + if (mutable_types.size() == 0) { + return c10::nullopt; + } + return mutable_types; + } + case TypeKind::OptionalType: { + auto inner = type->castRaw()->getElementType(); + return mapTypeToAliasTypeSet(inner); + } case TypeKind::AnyType: - return type; + return {AliasTypeSet{type}}; case TypeKind::FutureType: { - if (auto elem = - getMutableType(type->castRaw()->getElementType())) { - return FutureType::create(*elem); + if (auto maybe_mut_types = mapTypeToAliasTypeSet( + type->castRaw()->getElementType())) { + auto mut_type = toSingleType(*maybe_mut_types); + return {AliasTypeSet{FutureType::create(mut_type)}}; } return c10::nullopt; } case TypeKind::TupleType: { std::vector mutable_types; - for (const auto& elem : type->expectRef().elements()) { - if (auto mut_elem = getMutableType(elem)) { - mutable_types.push_back(*mut_elem); + for (const TypePtr& inner : type->expectRef().elements()) { + if (auto maybe_inner_types = mapTypeToAliasTypeSet(inner)) { + mutable_types.insert( + mutable_types.end(), + (*maybe_inner_types).begin(), + (*maybe_inner_types).end()); } } if (mutable_types.size() == 0) { return c10::nullopt; - } else { - return TupleType::create(mutable_types); } + return {AliasTypeSet{TupleType::create(mutable_types)}}; } default: return c10::nullopt; } } - std::unordered_map* mutable_type_cache_; + std::unordered_map* mutable_type_cache_; }; bool isMutableTypeImpl( const TypePtr& type, - std::unordered_map* mutable_type_cache) { - // check common cases to avoid recursively constructing type in - // getMutableTypePtrImpl + std::unordered_map* mutable_type_cache) { + // Check common cases to avoid recursively constructing type in + // `mapTypeToAliasTypeSetPtrImpl` auto kind = type->kind(); if (kind == TypeKind::TensorType || kind == TypeKind::ListType || kind == TypeKind::ClassType || kind == TypeKind::DictType) { return true; } MutableTypePtrHelper helper(mutable_type_cache); - return helper.getMutableType(type) != c10::nullopt; + return helper.mapTypeToAliasTypeSet(type) != c10::nullopt; } } // namespace -// static isMutableType does not use cache of type -> mutable type equivalent +// Static `isMutableType` does not use cache of type -> mutable type equivalent bool AliasDb::isMutableType(const TypePtr& type) { return isMutableTypeImpl(type, nullptr); } @@ -109,7 +154,7 @@ bool AliasDb::isMutableType(const Value* v) { return isMutableType(v->type()); } -// makes use of type -> mutable cache +// Make use of type -> mutable cache bool AliasDb::isMutableTypeInternal(const TypePtr& type) const { return isMutableTypeImpl(type, &mapped_mutable_types_); } @@ -118,21 +163,17 @@ bool AliasDb::isMutableTypeInternal(const Value* v) const { return isMutableTypeInternal(v->type()); } -c10::optional AliasDb::getMutableTypePtr(const TypePtr& type) const { +c10::optional AliasDb::mapTypeToAliasTypeSetPtr( + const TypePtr& type) const { MutableTypePtrHelper helper(&mapped_mutable_types_); - return helper.getMutableType(type); -} - -bool AliasDb::isContainerType(const TypePtr& type) const { - auto mut_type = getMutableTypePtr(type); - return mut_type && (*mut_type)->containedTypes().size() > 0; + return helper.mapTypeToAliasTypeSet(type); } AliasDb::~AliasDb() = default; -// Structure used during analysis to keeps track of all writes at a high level. -// When analysis is completed this will be used to construct a more efficient -// WriteIndex. +// Structure used during analysis to keep track of all writes at a high +// level. When the analysis is completed, this will be used to construct +// a more efficient WriteIndex struct AliasDb::WriteRegistry { void registerWrite(const Value* v, Node* n) { writes_[n].emplace_back(v); @@ -170,7 +211,7 @@ AliasDb::AliasDb(std::shared_ptr graph, bool isFrozen) writeIndex_ = TWriteIndex(); auto& writeIndex = *writeIndex_; // to make operator[] less ugly - // build the write index + // Build the write index for (const auto& write : writeRegistry_->writes_) { Node* node = write.first; const std::vector writtenValues = write.second; @@ -207,7 +248,7 @@ AliasDb::AliasDb(std::shared_ptr graph, bool isFrozen) // out of sync (since we have no way of registering new writes) writeRegistry_ = nullptr; - // initialize the write cache + // Initialize the write cache buildWrittenToLocationsIndex(); GRAPH_DEBUG(toString()); } @@ -324,10 +365,10 @@ MemoryLocations AliasDb::getReads(Node* n) const { std::string AliasDb::getElementName(const Element* e) const { if (e->values.empty()) { - // not the most efficient way, but given the fact there are + // Not the most efficient way, but given the fact there are // not too many types and even fewer of them will end up in - // wildcardIndex_, we should be fine with a linear search - // each time we hit a wildcard leaf + // `wildcardIndex_`, we should be fine with a linear search + // each time we hit a Wildcard leaf for (const auto& ent : wildcardIndex_) { if (ent.second == e) { return std::string("WILDCARD for type ") + ent.first->str(); @@ -362,17 +403,27 @@ std::string AliasDb::toString() const { ss << "\n===2. ALIAS DB===\n"; for (const auto& ptrPair : elementMap_) { const auto element = ptrPair.second; + int ct = 0; if (!element->pointsTo.empty()) { ss << getElementName(element) << " points to: "; for (const auto pointedTo : element->pointsTo) { - ss << getElementName(memoryDAG_->fromIndex(pointedTo)) << ", "; + if (ct > 0) { + ss << ", "; + } + ++ct; + ss << getElementName(memoryDAG_->fromIndex(pointedTo)); } ss << "\n"; } + ct = 0; if (!element->containedElements.empty()) { ss << getElementName(element) << " contains: "; for (const auto contained : element->containedElements) { - ss << getElementName(memoryDAG_->fromIndex(contained)) << ", "; + ss << getElementName(memoryDAG_->fromIndex(contained)); + if (ct > 0) { + ss << ", "; + } + ++ct; } ss << "\n"; } @@ -839,8 +890,7 @@ void AliasDb::analyzeLoop(Node* node) { TORCH_INTERNAL_ASSERT(blockOutputs.size() == node->outputs().size()); // Run alias analysis on the loop body, iterating until the block output - // alias info converges. - // Copy node input aliases to block input + // alias info converges. Copy node input aliases to block input mapAliases(blockInputs, loopCarriedInputs); // Populate block output alias info by analyzing the body @@ -996,7 +1046,7 @@ bool AliasDb::functionalNonEscapingListUse(const Use& use) const { return false; } -// List or dict or tuple: construct: create an aliasing element for the actual +// List or dict or tuple construct: create an aliasing element for the actual // container, then mark all inputs as wildcards, since they've gone inside the // container. Then, add the wildcard sets of appropriate type to the contained // elements of the container. @@ -1073,52 +1123,50 @@ void AliasDb::makePointerTo(const Value* from, const Value* to) { return; } - // the contained types of immutable type containers (optional, tuple, future) - // are unified, so these types can be mutable or immutable - // and point to a type which is mutable or immutable. - // Any is mutable but can point to an immutable type through refinement + // The contained types of immutable type containers (`Optional`, + // `Tuple`, `Future`, and `Union`) are unified, so these types can be + // mutable or immutable and point to a type which is mutable or + // immutable. `Any` is mutable but can point to an immutable type + // through refinement if (isMutableTypeInternal(from) != isMutableTypeInternal(to)) { bool expected_kind = false; for (auto kind : {from->type()->kind(), to->type()->kind()}) { expected_kind = expected_kind || (kind == TypeKind::OptionalType || kind == TypeKind::FutureType || - kind == TypeKind::TupleType) // immutable type containers + kind == TypeKind::TupleType || + kind == TypeKind::UnionType) // immutable type containers || kind == TypeKind::AnyType; } TORCH_INTERNAL_ASSERT( expected_kind, from->type()->str(), to->type()->str()); return; } - // both immutable if (!isMutableTypeInternal(from)) { return; } - if (from == to) { return; } - // At this point, we are dealing with two mutable types. - auto fromEl = getOrCreateElement(from); - auto toEl = getOrCreateElement(to); + // At this point, we are dealing with two mutable types + auto from_el = getOrCreateElement(from); + auto to_el = getOrCreateElement(to); - memoryDAGBuilder_->makePointerTo(fromEl, toEl); + memoryDAGBuilder_->makePointerTo(from_el, to_el); } void AliasDb::addToContainedElements( - const Value* elem, + const Value* inner, const Value* container) { - if (!isMutableTypeInternal(elem)) { + if (!isMutableTypeInternal(inner)) { return; } - TORCH_INTERNAL_ASSERT(isContainerType(container->type())); - - auto elemEl = getOrCreateElement(elem); - auto contEl = getOrCreateElement(container); + auto inner_el = getOrCreateElement(inner); + auto cont_el = getOrCreateElement(container); - memoryDAGBuilder_->addToContainedElements(elemEl, contEl); + memoryDAGBuilder_->addToContainedElements(inner_el, cont_el); } bool AliasDb::mayAlias(const Value* a, const Value* b) const { @@ -1203,8 +1251,8 @@ void AliasDb::createValue(const Value* value) { void AliasDb::giveFreshAlias( const Value* value, bool add_wildcard_to_contained_elems) { - auto maybe_mut_type = getMutableTypePtr(value->type()); - if (!maybe_mut_type) { + auto maybe_mut_types = mapTypeToAliasTypeSetPtr(value->type()); + if (!maybe_mut_types) { return; } @@ -1217,7 +1265,11 @@ void AliasDb::giveFreshAlias( auto new_elem = memoryDAGBuilder_->makeFreshValue(value); elementMap_[value] = new_elem; if (add_wildcard_to_contained_elems) { - addContainedTypesToFreshElement(new_elem, *maybe_mut_type); + if ((*maybe_mut_types).size() > 1) { + pointUnionTypeElementToAllContainedTypes(new_elem, *maybe_mut_types); + } else { + addContainedTypesToFreshElement(new_elem, *maybe_mut_types); + } } } @@ -1639,29 +1691,47 @@ bool AliasDb::mayAliasWildcard(const at::ArrayRef vs) const { } c10::optional AliasDb::tryGetOrCreateWildcard(const TypePtr& type) { - auto updated_type = getMutableTypePtr(type); - if (!updated_type) { + auto maybe_mut_types = mapTypeToAliasTypeSetPtr(type); + if (!maybe_mut_types) { return c10::nullopt; } - auto mapped_type = *updated_type; - auto existing_wildcard = wildcardIndex_.find(mapped_type); + auto mut_type = toSingleType(*maybe_mut_types); + auto existing_wildcard = wildcardIndex_.find(mut_type); if (existing_wildcard != wildcardIndex_.end()) { return existing_wildcard->second; } auto wildcard_elem = memoryDAGBuilder_->makeFreshValue(nullptr); - wildcardIndex_.emplace(mapped_type, wildcard_elem); - addContainedTypesToFreshElement(wildcard_elem, mapped_type); + wildcardIndex_.emplace(mut_type, wildcard_elem); + if ((*maybe_mut_types).size() > 1) { + pointUnionTypeElementToAllContainedTypes(wildcard_elem, *maybe_mut_types); + } else { + addContainedTypesToFreshElement(wildcard_elem, *maybe_mut_types); + } return wildcard_elem; } -void AliasDb::addContainedTypesToFreshElement( +void AliasDb::pointUnionTypeElementToAllContainedTypes( Element* container_elem, - const TypePtr& mut_type) { - for (const auto& contained : mut_type->containedTypes()) { - auto maybe_elem = tryGetOrCreateWildcard(contained); + const AliasTypeSet& mut_types) { + for (const auto& mut_type : mut_types) { + auto maybe_elem = tryGetOrCreateWildcard(mut_type); if (maybe_elem) { - memoryDAGBuilder_->addToContainedElements(*maybe_elem, container_elem); + TORCH_INTERNAL_ASSERT(*maybe_elem != container_elem); + memoryDAGBuilder_->makePointerTo(container_elem, *maybe_elem); + } + } +} + +void AliasDb::addContainedTypesToFreshElement( + Element* container_elem, + const AliasTypeSet& mut_types) { + for (const auto& mut_type : mut_types) { + for (const auto& contained : mut_type->containedTypes()) { + auto maybe_elem = tryGetOrCreateWildcard(contained); + if (maybe_elem) { + memoryDAGBuilder_->addToContainedElements(*maybe_elem, container_elem); + } } } } @@ -1669,26 +1739,38 @@ void AliasDb::addContainedTypesToFreshElement( // Search the wildcard index for an element that corresponds to the given type. // Const version returns nullptr Element* AliasDb::getWildcard(const TypePtr& type) const { - auto maybe_mut_type = getMutableTypePtr(type); - if (!maybe_mut_type) { - return nullptr; - } - TypePtr mut_type = *maybe_mut_type; - auto wildcard = wildcardIndex_.find(mut_type); - if (wildcard != wildcardIndex_.end()) { - return wildcard->second; + auto maybe_mut_types = mapTypeToAliasTypeSetPtr(type); + if (!maybe_mut_types) { + return {}; + } + if ((*maybe_mut_types).size() > 1) { + auto union_type = UnionType::create(*maybe_mut_types); + // Get a pair where the TypePtr is this Union + // type and the Element is the corresponding Wildcard + auto maybe_union_pair = wildcardIndex_.find(union_type); + if (maybe_union_pair != wildcardIndex_.end()) { + return (*maybe_union_pair).second; + } + } else { + // Get a pair where the TypePtr is the given + // type and the Element is the corresponding Wildcard + auto type_pair = wildcardIndex_.find((*maybe_mut_types)[0]); + if (type_pair != wildcardIndex_.end()) { + return type_pair->second; + } } - return nullptr; + return {}; } // Register `v` as a wildcard value. c10::optional AliasDb::setWildcard(const Value* v) { - auto maybe_wildcardElement = tryGetOrCreateWildcard(v->type()); + c10::optional maybe_wildcardElement = + tryGetOrCreateWildcard(v->type()); if (!maybe_wildcardElement) { return c10::nullopt; } - // Ensure that we create a corresponding element for `v` still, as it is an - // invariant that all mutable values have an element. + // Ensure that we create a corresponding Element for `v` still, as it is an + // invariant that all mutable values have an Element getOrCreateElement(v); wildcards_.insert(v); return *maybe_wildcardElement; diff --git a/torch/csrc/jit/ir/alias_analysis.h b/torch/csrc/jit/ir/alias_analysis.h index cd888ade69291..7feb2b9938d8b 100644 --- a/torch/csrc/jit/ir/alias_analysis.h +++ b/torch/csrc/jit/ir/alias_analysis.h @@ -34,6 +34,12 @@ namespace jit { * Values that contain other mutable types, such as List[Tensor], are * initialized as containing the Wildcard set for all contained mutable types. * + * The AliasDb API references the idea of "mutable" vs "immutable" + * types. "Mutable" means that the object's value can change, while + * "immutable" means that the value is fixed. (For example, `List` is + * mutable, so you can add and delete elements from it. On the other + * hand, you can't modify a Tuple once you create it, making `Tuple` an + * immutable container.) */ class AliasDb { public: @@ -95,7 +101,7 @@ class AliasDb { const at::ArrayRef& a, const at::ArrayRef& b) const; - // Move 'n' (already in the graph) after 'movePoint' in the topological order. + // Move `n` (already in the graph) after `movePoint` in the topological order. // // Tries to preserve value dependencies, so other nodes might be moved. We // make two guarantees about the postcondition of the node list: @@ -125,6 +131,10 @@ class AliasDb { TORCH_API bool dumpToGraphvizFile(const char* filename) const; TORCH_API std::string toGraphviz() const; + // Returns `true` if the given element is mutable or if it is a + // container type with an internal mutable element (e.g. + // `Tuple[int, Tensor]` has an internal mutable type `Tensor`, so + // it would be considered a "mutable type" in AliasDb) static bool isMutableType(const Value* v); static bool isMutableType(const TypePtr& type); @@ -181,7 +191,7 @@ class AliasDb { // Register `v` as a wildcard value. c10::optional setWildcard(const Value* v); - // Is this a value which will not alias + // Is this a value which will not alias? bool nonAliasingValue(const Value* elem) const; /** @@ -221,11 +231,10 @@ class AliasDb { bool add_wildcard_to_contained_elems = true); Element* getOrCreateElement(const Value* value); - c10::optional getMutableTypePtr(const TypePtr& type) const; + c10::optional mapTypeToAliasTypeSetPtr( + const TypePtr& type) const; bool functionalNonEscapingListUse(const Use& use) const; - bool isContainerType(const TypePtr& type) const; - std::shared_ptr graph_; // If the Module is frozen then consider attributes as freshly created @@ -239,21 +248,24 @@ class AliasDb { // Mapping of values to MemoryDAG elements ska::flat_hash_map elementMap_; - // All wildcard elements (one for each unique mutable type). + // All wildcard Elements (one for each unique mutable type) std::unordered_map wildcardIndex_; Element* getWildcard(const TypePtr& type) const; c10::optional tryGetOrCreateWildcard(const TypePtr& type); void addContainedTypesToFreshElement( Element* container_elem, - const TypePtr& mut_type); + const AliasTypeSet& mut_types); + void pointUnionTypeElementToAllContainedTypes( + Element* container_elem, + const AliasTypeSet& mut_types); std::vector getElements(at::ArrayRef vs) const; bool mayAliasWildcard(const Value* v) const; bool mayAliasWildcard(const at::ArrayRef vs) const; bool hasWriters(const at::ArrayRef& values) const; - // cached mapping of type ptrs to their mutable types - mutable std::unordered_map mapped_mutable_types_; + // Cached mapping of type ptrs to their mutable types + mutable std::unordered_map mapped_mutable_types_; /** * State for tracking write info. diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index 05ce8d40ea7c5..e62ef93b57379 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -511,7 +511,7 @@ void Graph::lint() const { // - Params and return do NOT occur in nodes // - next_unique_ is greater than all uniques in graph // - uniques in all_nodes are unique - // - every use will occur later in the topsort + // - every use will occur later in the toposort struct LintScope { LintScope() = default; @@ -787,7 +787,9 @@ bool Value::mustBeNone() const { } bool Value::mustNotBeNone() const { return node_->kind() != prim::AutogradAdd && type() != NoneType::get() && - !type()->cast(); + !type()->cast() && + !(type()->cast() && + type()->expect()->canHoldType(NoneType::get())); } std::string Value::debugNameBase() const { @@ -1765,20 +1767,23 @@ Node* Graph::createEnumValue(Value* e) { return n; } -Node* Graph::createList(const TypePtr& elem_type, at::ArrayRef values) { +Node* Graph::createList( + const TypePtr& contained_type, + at::ArrayRef values) { auto n = create(prim::ListConstruct, values); for (const auto& v : values) { TORCH_CHECK( - v->type()->isSubtypeOf(elem_type), + v->type()->isSubtypeOf(contained_type), "Expected a list element that subtypes '", - elem_type->repr_str(), + contained_type->repr_str(), "' but got an element of type '", v->type()->repr_str(), "'"); } - n->output()->setType(ListType::create(elem_type)); + n->output()->setType(ListType::create(contained_type)); return n; } + Node* Graph::createListUnpack(Value* v, size_t size) { ListTypePtr list_type = v->type()->expect(); TypePtr elem_type = list_type->getElementType(); diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index dee222bd480df..99f6a6ce5c57b 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -84,7 +84,7 @@ using namespace ::c10::cuda; struct Function; struct MatchedSchema; -// Graph represents one "function" of computation. +// A Graph represents one "function" of computation. // It uses a simple ownership model where the graph owns all the nodes inside // it. All references inside the graph are raw pointers. Destroying the Graph // will invalidate any pointers to nodes in the graph. @@ -104,9 +104,9 @@ TORCH_API std::ostream& operator<<(std::ostream& out, const Node& n); // A list of nodes, with inputs and outputs struct Block; -// Each use is represented by this type, see Node::uses() -// 'user' is the consumer of the value, offset is the index into -// 'user's input this where the produces will be found. +// Each use is represented by this type, see 'Node::uses()' +// 'user' is the consumer of the value, 'offset' is the index into +// 'user's input this where the producers will be found. struct Use { Use(Node* user, size_t offset) : user(user), offset(offset) {} Node* user; @@ -338,14 +338,16 @@ struct TORCH_API Node { protected: Node(Graph* graph_, NodeKind kind_); // defined after graph public: - // each node but Return/Param - // is associated with exactly one place in the node list... - // of the graph_ - // this circular is a doubly-linked list, the Return node is used as the - // sentinel for the beginning and end of the list such that the list never has - // null pointers next_in_graph[0] is next pointer next_in_graph[1] is prev - // pointer using an array to allow the same iterator class for forward and - // reverse node lists This list represents a topological sort + // Each Node but Return/Param Nodes are associated with exactly one + // place in the Node list of the Graph. The Graph itself is a circular + // doubly-linked list. The Return Node is used as the sentinel for the + // "beginning"/"end" of the list. This means that you can tell when + // you've traversed the entire list without means worrying about null + // pointers. `next_in_graph[0]` is the pointer to the next Node, while + // `next_in_graph[1]` is the pointer to the previous Node. The + // linked list is implemented as an array to allow the same iterator + // class for forward and reversed Node lists. Taken together, this + // list also represents a topological sort of the Nodes in the Graph. // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-non-private-member-variables-in-classes,modernize-avoid-c-arrays) Node* next_in_graph[2] = {nullptr, nullptr}; @@ -980,7 +982,6 @@ struct TORCH_API Node { // subclasses should extend if they have additional information to copy. // 'this' will be allocated with s->allocNewInstance(g) so it should have // the same concrete type as 's' - // virtual void cloneFrom(Node* s); }; @@ -1247,7 +1248,7 @@ struct Graph { TORCH_API Node* createEnumName(Value* e); TORCH_API Node* createEnumValue(Value* e); TORCH_API Node* createList( - const TypePtr& elem_type, + const TypePtr& contained_type, at::ArrayRef values); TORCH_API Node* createListUnpack(Value* v, size_t size); TORCH_API Node* createDict( diff --git a/torch/csrc/jit/mobile/type_parser.cpp b/torch/csrc/jit/mobile/type_parser.cpp index 42814e5fe5aad..6b955ab6454a7 100644 --- a/torch/csrc/jit/mobile/type_parser.cpp +++ b/torch/csrc/jit/mobile/type_parser.cpp @@ -42,6 +42,17 @@ class TypeParser { return simpleTypeIt->second; } else if (token == "List") { return CreateSingleElementType(); + } else if (token == "Union") { + std::vector types; + expect("["); + while (cur() != "]") { + types.emplace_back(parse()); + if (cur() != "]") { + expect(","); + } + } + expect("]"); + return UnionType::create(types); } else if (token == "Optional") { return CreateSingleElementType(); } else if (token == "Future") { diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 5e13829a8ce6d..c74c6ee40221a 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -288,6 +288,24 @@ class ShapePropagator { return zerodim; } + bool mergeTypes( + ArrayRef lhs, + ArrayRef rhs, + ArrayRef outputs) { + AT_ASSERT(lhs.size() == rhs.size() && rhs.size() == outputs.size()); + bool changed = false; + for (size_t i = 0; i < lhs.size(); ++i) { + auto old_output_type = outputs[i]->type(); + auto new_type = + unifyTypes(lhs[i]->type(), rhs[i]->type(), /*default_to_union=*/true); + AT_ASSERT(new_type); + outputs[i]->setType(*new_type); + if (*old_output_type != *outputs[i]->type()) + changed = true; + } + return changed; + } + void broadcastBinary( Node* node, std::vector& types, diff --git a/torch/csrc/jit/passes/utils/memory_dag.cpp b/torch/csrc/jit/passes/utils/memory_dag.cpp index 6a880c86e4102..3f6cc8079b6f9 100644 --- a/torch/csrc/jit/passes/utils/memory_dag.cpp +++ b/torch/csrc/jit/passes/utils/memory_dag.cpp @@ -8,6 +8,7 @@ namespace torch { namespace jit { namespace { + void makePointerToImpl(Element* from, Element* to) { from->pointsTo.set(to->index); to->pointedFrom.set(from->index); @@ -131,11 +132,13 @@ Element* MemoryDAGBuilder::makeFreshValue(const Value* v) { return makeFreshValueImpl(v, indexToElementMap_); } +// This function builds up a bitset representing the "alias set" for +// `e` (`MemoryLocations` is just a typedef'd c10::SparseBitVector). const MemoryLocations& MemoryDAG::getMemoryLocations(const Element* e) const { // Note on cache invalidation: all mutation should occur through - // MemoryDAGBuilder. Thus, once we consume the builder to create an immutable - // MemoryDAG, we can cache here without worrying that we might potentially get - // invalidated. + // MemoryDAGBuilder. Thus, once we consume the builder to create an + // immutable MemoryDAG, we can cache here without worrying that we + // might potentially get invalidated. if (e->cachedMemoryLocations_) { return *e->cachedMemoryLocations_; } @@ -174,7 +177,6 @@ void MemoryDAG::setWildcards( makePointerToImpl(from, wildcardElement); } } - // Track which memory locations we edited with a new pointer to the wildcard // element. cacheUpdates[wildcardElement] |= pointeeSet; @@ -189,7 +191,6 @@ void MemoryDAG::setWildcards( for (const std::unique_ptr& e : this->indexToElementMap_) { if (e->values.empty()) { // This element is a wildcard element, we can skip it. - TORCH_INTERNAL_ASSERT(e->pointsTo.empty()); continue; } diff --git a/torch/csrc/jit/passes/utils/memory_dag.h b/torch/csrc/jit/passes/utils/memory_dag.h index 38432ff69c9c1..3e3a19c31729c 100644 --- a/torch/csrc/jit/passes/utils/memory_dag.h +++ b/torch/csrc/jit/passes/utils/memory_dag.h @@ -1,9 +1,12 @@ #pragma once +#include #include #include #include #include +#include +#include #include #include #include @@ -20,6 +23,9 @@ struct Element; struct Value; class MemoryDAG; +using TypePtr = std::shared_ptr; +using AliasTypeSet = std::vector; + /** * Helper to build up the points-to graph. * @@ -38,13 +44,15 @@ class TORCH_API MemoryDAGBuilder { void addToContainedElements(Element* contained, Element* container); - // Make a fresh element (i.e. an element that doesn't point to anything) and + // Make a fresh Element (i.e. an Element that doesn't point to anything) and // return it. Element* makeFreshValue(const Value* v); friend MemoryDAG; private: + // `MemoryDAGBuilder` builds up `indexToElementMap_`, then uses + // the map to construct the `MemoryDAG` std::vector> indexToElementMap_; }; @@ -54,8 +62,8 @@ class TORCH_API MemoryDAGBuilder { // AliasDb to provide a higher-level API. // // We maintain a DAG where: -// - Vertices (called "elements") represent values and -// other aliasing entities (e.g. like the stuff inside a list) +// - Vertices (called "Elements") represent Values and +// other aliasing entities (e.g. the stuff inside a list) // - Edges represent a "points-to" relationship. // // Leaves in this DAG are entities that don't point to anything, and thus @@ -80,7 +88,7 @@ class TORCH_API MemoryDAG { bool mayAlias(const Element* a, const Element* b) const; bool mayAlias(Element* a, Element* b) const; - // Does a hold reference to any memory that is stored in elem, or vice versa? + // Does `a` hold reference to any memory that is stored in `b`, or vice versa? bool mayContainAlias(const Element* a, const Element* b) const; bool mayContainAlias(Element* a, Element* b) const; @@ -96,12 +104,13 @@ class TORCH_API MemoryDAG { MemoryLocations& cont) const; /** - * The following methods are special cases where we need to reach mutate the + * The following methods are special cases where we need to mutate the * internals of MemoryDAG for efficiency reasons. Don't call them unless you * know what you're doing! In particular, don't add new mutating methods * without ensuring that you are maintaining cache consistency for memory * locations. */ + // Adding wildcards can trigger extremely expensive cache invalidations. This // method adds them in a more efficient cache-aware way. void setWildcards( @@ -117,9 +126,10 @@ class TORCH_API MemoryDAG { std::vector> indexToElementMap_; }; -// `Element` represents the vertex in the points-to graph. It represents -// anything that could have an aliasing relationship, mostly IR `Value`s, but -// also the "inside of a list", or wildcards. +// `Element` represents a vertex in the points-to graph. It represents +// anything that could have an aliasing relationship--mostly IR +// `Value`s, but also wildcards or the type inside a container (e.g. `T` +// in `List[T]`) struct Element { Element(const Value* value_, unsigned index_); // wildcard constructor diff --git a/torch/csrc/jit/python/pybind_utils.cpp b/torch/csrc/jit/python/pybind_utils.cpp index f81632bc0fb0a..f8fae19ed8f50 100644 --- a/torch/csrc/jit/python/pybind_utils.cpp +++ b/torch/csrc/jit/python/pybind_utils.cpp @@ -89,6 +89,19 @@ IValue toIValue(py::handle obj, const TypePtr& type, c10::optional N) { ? c10::ivalue::Tuple::createNamed(std::move(values), tuple_type) : c10::ivalue::Tuple::create(std::move(values)); } + case TypeKind::UnionType: { + auto actual_type = toTypeInferredIValue(obj); + auto actual_type_ptr = actual_type.type(); + auto union_type = type->expect(); + if (!actual_type_ptr->isSubtypeOf(union_type)) { + throw py::cast_error(c10::str( + "Expected a member of ", + union_type->annotation_str(), + " but instead found type ", + actual_type.type()->annotation_str())); + } + return actual_type; + } case TypeKind::StringType: return ConstantString::create(py::cast(obj)); case TypeKind::DeviceObjType: { diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp index e0951c3ebbfbc..2c8246daec92b 100644 --- a/torch/csrc/jit/python/python_ir.cpp +++ b/torch/csrc/jit/python/python_ir.cpp @@ -869,6 +869,12 @@ void initPythonIRBindings(PyObject* module_) { } return types; }); + py::class_>(m, "UnionType") + .def(py::init( + [](const std::vector& a) { return UnionType::create(a); })) + .def("containedTypes", [](UnionType& self) { + return self.containedTypes().vec(); + }); py::class_>(m, "ListType") .def(py::init([](TypePtr a) { return ListType::create(a); })) .def_static("ofInts", &ListType::ofInts) diff --git a/torch/csrc/jit/serialization/import.cpp b/torch/csrc/jit/serialization/import.cpp index 1f70c3cad8a5e..86aa6e3909e14 100644 --- a/torch/csrc/jit/serialization/import.cpp +++ b/torch/csrc/jit/serialization/import.cpp @@ -47,7 +47,8 @@ void postSetStateValidate(const IValue& v) { // const auto attrType = objType->getAttribute(i); // Verify that all the non-optional attributes have been initialized // TODO: Issue #20497 - if (attrType->kind() != TypeKind::OptionalType && + if (attrType->kind() != TypeKind::UnionType && + attrType->kind() != TypeKind::OptionalType && attrType->kind() != TypeKind::NoneType) { TORCH_CHECK( !slot.isNone(), diff --git a/torch/csrc/jit/serialization/import_source.cpp b/torch/csrc/jit/serialization/import_source.cpp index e7d9da26df41d..918b0d4338c73 100644 --- a/torch/csrc/jit/serialization/import_source.cpp +++ b/torch/csrc/jit/serialization/import_source.cpp @@ -482,12 +482,13 @@ void SourceImporterImpl::importClass( } break; case TK_DEF: { Def def = Def(statement); - if (pre_hook_names.find(def.name().name()) != pre_hook_names.end()) { - pre_hook_def_map.emplace(def.name().name(), def); - pre_hook_resolver_map.emplace(def.name().name(), shared_from_this()); - } else if (hook_names.find(def.name().name()) != hook_names.end()) { - hook_def_map.emplace(def.name().name(), def); - hook_resolver_map.emplace(def.name().name(), shared_from_this()); + const auto def_name = def.name().name(); + if (pre_hook_names.find(def_name) != pre_hook_names.end()) { + pre_hook_def_map.emplace(def_name, def); + pre_hook_resolver_map.emplace(def_name, shared_from_this()); + } else if (hook_names.find(def_name) != hook_names.end()) { + hook_def_map.emplace(def_name, def); + hook_resolver_map.emplace(def_name, shared_from_this()); } else { methods.emplace_back(def); method_resolvers.push_back(shared_from_this()); diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index 80123c625ea65..6b1bf15304624 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -511,13 +511,31 @@ struct PythonPrintImpl { } indent(); printValueList(body_, lhs); + // We need to preserve Union/Optional type annotations, but only if + // we're not assigning values as part of a tuple unpacking statement + // (Python doesn't allow type annotations in multiple assignment) + if (lhs.size() == 1) { + Value* v = lhs.at(0); + if (!annotated_unions_.count(v) && !expr_table_.count(v) && + (v->type()->kind() == UnionType::Kind || + v->type()->kind() == OptionalType::Kind)) { + body_ << " : " << v->type()->annotation_str(); + annotated_unions_.insert(v); + } + } body_ << " = "; + // or if value is being assigned to something of a union type printValueList(body_, rhs); body_ << "\n"; } bool requiresAnnotation(Value* lhs, Value* rhs) { - return *lhs->type() != *rhs->type(); + if (lhs->type()->kind() == UnionType::Kind || + lhs->type()->kind() == OptionalType::Kind) { + return annotated_unions_.insert(lhs).second; + } else { + return *lhs->type() != *rhs->type(); + } } void printAnnotatedAssignment( @@ -1302,10 +1320,12 @@ struct PythonPrintImpl { body_ << arg_name; if (print_first_argument_type) { body_ << ": " << arg.type()->annotation_str(type_printer_); + annotated_unions_.insert(*param_it); } } else { body_ << ",\n " << arg_name << ": " << arg.type()->annotation_str(type_printer_); + annotated_unions_.insert(*param_it); } if (arg.default_value()) { printDefaultValue(arg, body_, *arg.default_value()); @@ -1559,6 +1579,12 @@ struct PythonPrintImpl { // table. PrintDepsTable& deps_table_; + // We need to preserve Union/Optional type annotations, but we should + // only print the annotation on variable declaration (not on any + // following uses). This set tracks the Value*s that we've already + // printed with annotations + std::unordered_set annotated_unions_; + // A function that, given a named type, returns us the correct string to print // for it. c10::TypePrinter type_printer_; diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index b521dc88a12ba..e0e556ecbbde3 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -23,8 +23,8 @@ static void restoreAccurateTypeTagsIfPossible(const IValue& root) { // Pickled objects are stored in a form compatible with Python pickling. // In torchscript List[T]/Dict[K, V] are statically typed and contain -// dynamic type tags allow T, K, and V to be recovered. But this info -// is not stored in the Python pickling information. However, we +// dynamic type tags that allow T, K, and V to be recovered. But this +// info is not stored in the Python pickling information. However, we // can recover this information from the static type of the top-level // object being unpickled, because we have a record of the type of the // objects it contains as attributes. @@ -108,6 +108,19 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) { to_process.emplace_back(std::move(elem)); } } break; + case UnionType::Kind: { + auto t = w.static_type->expect(); + if (t->containedTypes().size() == 2 && + t->canHoldType(NoneType::get())) { + if (!w.value.isNone()) { + auto inner = t->containedTypes()[0] != NoneType::get() + ? t->containedTypes()[0] + : t->containedTypes()[1]; + Work elem = {inner, w.value}; + to_process.emplace_back(std::move(elem)); + } + } + } break; case ListType::Kind: { // specialized lists do not need their type refined, so we can exit // early here diff --git a/torch/jit/_script.py b/torch/jit/_script.py index de32e1ab8de37..acc9e7c44f51f 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -449,7 +449,7 @@ def method_template(self, *args, **kwargs): setattr(RecursiveScriptClass, method_name, method_template) # this is a Python 'non-data descriptor' that causes the first access - # to ScriptModule's forward to lookup the forward method and stash + # to ScriptModule's forward to look up the forward method and stash # it in the objects dict. Due to the standard rules for attribute lookup, # subsequent lookups will just directly return the previously looked up method. # This is necessary because nn.Module defines forward as a method. If we diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index f2cf78949b47d..b189f36c4107f 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -6,13 +6,13 @@ import torch import warnings from .._jit_internal import List, Tuple, is_tuple, is_list, Dict, is_dict, Optional, \ - is_optional, _qualified_name, Any, Future, is_future, is_ignored_fn + is_optional, _qualified_name, Any, Future, is_future, is_ignored_fn, Union, is_union from .._jit_internal import BroadcastingList1, BroadcastingList2, BroadcastingList3 # type: ignore[attr-defined] from ._state import _get_script_class from torch._C import TensorType, TupleType, FloatType, IntType, ComplexType, \ - ListType, StringType, DictType, BoolType, OptionalType, InterfaceType, AnyType, NoneType, \ - DeviceObjType, StreamObjType, FutureType, EnumType + ListType, StringType, DictType, BoolType, OptionalType, InterfaceType, AnyType, \ + NoneType, DeviceObjType, StreamObjType, FutureType, EnumType, UnionType from textwrap import dedent @@ -45,7 +45,8 @@ class EvalEnv(object): 'List': List, 'Dict': Dict, 'Optional': Optional, - 'Future': Future, + 'Union': Union, + 'Future': Future } def __init__(self, rcb): @@ -245,6 +246,9 @@ def split_type_line(type_line): def try_real_annotations(fn, loc): """Tries to use the Py3.5+ annotation syntax to get the type.""" try: + # Note: anything annotated as `Optional[T]` will automatically + # be returned as `Union[T, None]` per + # https://github.com/python/typing/blob/master/src/typing.py#L850 sig = inspect.signature(fn) except ValueError: return None @@ -276,7 +280,6 @@ def get_enum_value_type(e: Type[enum.Enum], loc): return torch._C.unify_type_list(ir_types) def is_tensor(ann): - if issubclass(ann, torch.Tensor): return True @@ -326,6 +329,19 @@ def try_ann_to_type(ann, loc): msg = "Unsupported annotation {} could not be resolved because {} could not be resolved." assert valid_type, msg.format(repr(ann), repr(contained)) return OptionalType(valid_type) + if is_union(ann): + inner: List = [] + # We need these extra checks because both `None` and invalid + # values will return `None` + # TODO: Determine if the other cases need to be fixed as well + for a in ann.__args__: + if a is None: + inner.append(NoneType.get()) + maybe_type = try_ann_to_type(a, loc) + msg = "Unsupported annotation {} could not be resolved because {} could not be resolved." + assert maybe_type, msg.format(repr(ann), repr(maybe_type)) + inner.append(maybe_type) + return UnionType(inner) # type: ignore[arg-type] if torch.distributed.rpc.is_available() and is_rref(ann): return RRefType(try_ann_to_type(ann.__args__[0], loc)) if is_future(ann): @@ -390,6 +406,8 @@ def ann_to_type(ann, loc): 'is_list', 'Dict', 'is_dict', + 'is_optional', + 'is_union', 'TensorType', 'TupleType', 'FloatType', diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index 0928106f3ba49..6053ee7ee7f63 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -452,6 +452,7 @@ def get_default_args(fn): return {} signature = inspect.signature(fn) + return { k: v.default for k, v in signature.parameters.items() From 0e3b45eaefbef29c36f0198195022a1e4088b3e0 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 3 Sep 2021 10:21:01 -0700 Subject: [PATCH 504/530] Fix logical typo in _compare_trilu_indices (#64468) Summary: I'm pretty sure that repeating the same call twice is pretty meaningless and intend was to call `tril`/`tril_indices` in first case and `triu`/`triu_indices` in another Pull Request resolved: https://github.com/pytorch/pytorch/pull/64468 Reviewed By: mruberry Differential Revision: D30744978 Pulled By: malfet fbshipit-source-id: 7cd36789a7ebf1cc263fb2d875e479c05e7588a4 --- torch/testing/_internal/common_methods_invocations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 221cb29ec5eb6..f678f2258574f 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -9464,8 +9464,8 @@ def _compare_trilu_indices( # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 self.assertEqualIgnoreType( torch.ones(row, col, device='cpu') - .tril(offset).nonzero().to(dtype).transpose(0, 1), - torch.tril_indices(row, col, offset, dtype=dtype, device=device)) + .triu(offset).nonzero().to(dtype).transpose(0, 1), + torch.triu_indices(row, col, offset, dtype=dtype, device=device)) def _compare_large_trilu_indices( From e4ff14ad5955f7c4d052aa44069c77654e8b5f2e Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 3 Sep 2021 13:21:23 -0700 Subject: [PATCH 505/530] [CUDA graphs] Error if attempting to capture uncapturable nccl (#64440) Summary: NCCL < 2.9.6 is not capturable. Attempting to capture it can cause nasty behavior (for example, ive seen capture succeed, but replay silently hang). Pytorch should preempt this with a friendlier error. cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64440 Reviewed By: mruberry Differential Revision: D30733884 Pulled By: ngimel fbshipit-source-id: 5f2df3cf5cc0e5e68f49bf22a80d9f58064dc7ec --- torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 911963b76cd7c..9773b350e2cd7 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -189,6 +190,17 @@ std::string getExceptionMsgFromExceptionPtr( } } +inline void errorIfCapturingNonCapturableNCCL() { + auto status = c10::cuda::currentStreamCaptureStatusMayInitCtx(); + // parentheses avoid some compiler warnings + static const uint64_t min_version = (((uint64_t)2) << 32) + (((uint64_t)9) << 16) + ((uint64_t)6); + static const uint64_t cur_version = torch::cuda::nccl::version(); + if (cur_version < min_version) { + TORCH_CHECK(status == c10::cuda::CaptureStatus::None, + "Capturing NCCL collectives is only allowed with NCCL >= 2.9.6"); + } +} + } // namespace const int64_t ProcessGroupNCCL::kWatchdogThreadSleepMillis = 10000; @@ -1079,6 +1091,8 @@ c10::intrusive_ptr ProcessGroupNCCL::collective( OpType opType, const char* profilingTitle) { + errorIfCapturingNonCapturableNCCL(); + // Bump collective counter if (sequenceNum_) { sequenceNum_->increment(); From a91a278d60dcb7c65e2be5c5bd63429bf5df064e Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Fri, 3 Sep 2021 18:48:41 -0700 Subject: [PATCH 506/530] Fix `copy_transpose_valid` condition for `copy_same_type_transpose_` (#64425) Summary: Thanks to ngimel for the hint where the problem might be (https://github.com/pytorch/pytorch/issues/64358#issuecomment-910868849)! I added a test that fails on master to verify the fix. The shape `(60, 60)` was chosen because of `MIN_SZ = 60 * 60` in `copy_transpose_valid`. Fixes https://github.com/pytorch/pytorch/issues/64358 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64425 Reviewed By: mruberry Differential Revision: D30752725 Pulled By: ngimel fbshipit-source-id: f40370ea8365c94e30f8e8a3dcab5f3b3462464a --- aten/src/ATen/native/Copy.cpp | 4 ++++ test/test_torch.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 7fa952d020ef9..6dc1fc7af5e5c 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -28,6 +28,7 @@ bool copy_transpose_valid(const Tensor& self, const Tensor& src) { return self.is_contiguous() && src.numel() != 0 && src.dim() == 2 && src.stride(0) == 1 && src.stride(1) == src.size(0) && self.scalar_type() == src.scalar_type() && + self.sizes().equals(src.sizes()) && self.numel() >= MIN_SZ; } @@ -45,6 +46,9 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { } Tensor buf = empty({BLOCK_SZ, BLOCK_SZ}, self.options()); + // The code below is implemented with the assumption that sizes are equal + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.sizes().equals(src.sizes())); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "copy_", [&] { scalar_t* sp = src.data_ptr(); scalar_t* rp = self.data_ptr(); diff --git a/test/test_torch.py b/test/test_torch.py index 2899f2ef4c3b2..ae75ee8d66044 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -324,6 +324,14 @@ def test_copy_transpose(self): self.assertEqual(y[:, 0], range(100)) self.assertEqual(y[:, 40], range(4000, 4100)) + # Verifies the bugfix for https://github.com/pytorch/pytorch/issues/64358 + def test_copy_transpose_2d_broadcast(self): + # The shape (60, 60) is chosen because of + # `MIN_SZ = 60 * 60` in `copy_transpose_valid` from aten/src/ATen/native/Copy.cpp + A = torch.randn(60, 60) + A.copy_(torch.tensor([[1.]])) + self.assertEqual(A, torch.ones(60, 60)) + def test_device(self): cpu = torch.device('cpu') self.assertEqual('cpu', str(cpu)) From 604e885925af78106e12a3ffd77687da5891761d Mon Sep 17 00:00:00 2001 From: Facebook Community Bot Date: Sat, 4 Sep 2021 00:43:25 -0700 Subject: [PATCH 507/530] Automated submodule update: FBGEMM (#64338) Summary: This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM). New submodule commit: https://github.com/pytorch/FBGEMM/commit/9ccb2714a93e8324119676f6b3dc1c26eef0a703 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64338 Test Plan: Ensure that CI jobs succeed on GitHub before landing. Reviewed By: jspark1105 Differential Revision: D30690319 fbshipit-source-id: 884d1f950cd1f7d2a77b79affb9215f285d5d0da --- third_party/fbgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/fbgemm b/third_party/fbgemm index e6f80ee6570bb..7b49986d74a66 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit e6f80ee6570bb8a7ed15a5ad0d496fdfb8927470 +Subproject commit 7b49986d74a6666fa6913bd9b461ebebb2cad476 From 6cac7ca98054feb299c2d68994809b547f3a3c2e Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Sat, 4 Sep 2021 12:37:09 -0700 Subject: [PATCH 508/530] Ensure num_threads is initialized in get_num_threads (#64486) Summary: Possible source of the recent layernorm CI failures. `lazy_init_num_threads` appears at the top of `parallel_for` and can change the number of threads set. So, we need to ensure `num_threads` is initialized during `get_num_threads` calls as well. It's already done this way for OpenMP, but is missing from other parallel backends. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64486 Reviewed By: mruberry Differential Revision: D30752615 Pulled By: ngimel fbshipit-source-id: 085873ce312edbee1254c0aaae30dec7fcfe2c57 --- aten/src/ATen/ParallelNative.cpp | 1 + aten/src/ATen/ParallelNativeTBB.cpp | 1 + aten/src/ATen/ParallelThreadPoolNative.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/aten/src/ATen/ParallelNative.cpp b/aten/src/ATen/ParallelNative.cpp index 17b4b20aa9bd0..565c979e35e16 100644 --- a/aten/src/ATen/ParallelNative.cpp +++ b/aten/src/ATen/ParallelNative.cpp @@ -222,6 +222,7 @@ void set_num_threads(int nthreads) { } int get_num_threads() { + at::internal::lazy_init_num_threads(); #ifndef C10_MOBILE // not initializing pool unnecessarily, // because pool cannot be resized after initialization diff --git a/aten/src/ATen/ParallelNativeTBB.cpp b/aten/src/ATen/ParallelNativeTBB.cpp index 15040498edc5c..c38dcb64f81bd 100644 --- a/aten/src/ATen/ParallelNativeTBB.cpp +++ b/aten/src/ATen/ParallelNativeTBB.cpp @@ -66,6 +66,7 @@ void set_num_threads(int nthreads) { } int get_num_threads() { + at::internal::lazy_init_num_threads(); return tbb::global_control::active_value( tbb::global_control::max_allowed_parallelism); } diff --git a/aten/src/ATen/ParallelThreadPoolNative.cpp b/aten/src/ATen/ParallelThreadPoolNative.cpp index 2670c7bd08d1b..cc5821d494a25 100644 --- a/aten/src/ATen/ParallelThreadPoolNative.cpp +++ b/aten/src/ATen/ParallelThreadPoolNative.cpp @@ -57,6 +57,7 @@ void set_num_interop_threads(int nthreads) { } int get_num_interop_threads() { + at::internal::lazy_init_num_threads(); int nthreads = num_interop_threads.load(); if (nthreads > 0) { return nthreads; From 18b2751ea143374adbb690889427e06a9334da05 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Sat, 4 Sep 2021 20:29:44 -0700 Subject: [PATCH 509/530] [nnc] Make our exceptions c10::Errors, get C++ stacktraces (#64332) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64332 With this diff, if a compiler bug occurs (unlikely, I know!) we'll be able to get a c++ stacktrace leading to the exception, rather than just a terse message. E.g., ``` RuntimeError: UNSUPPORTED DTYPE Exception raised from compilation_error at ../torch/csrc/jit/tensorexpr/exceptions.h:32 (most recent call first): frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x6b (0x7f966659b2eb in /fsx/users/bertrand/c\ onda/envs/pytorch/lib/python3.8/site-packages/torch/lib/libc10.so) frame #1: + 0x376f099 (0x7f966a195099 in /fsx/users/bertrand/conda/envs/pytorch/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so) frame #2: + 0x3763bf5 (0x7f966a189bf5 in /fsx/users/bertrand/conda/envs/pytorch/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so) frame #3: torch::jit::tensorexpr::CudaCodeGen::Initialize() + 0xdd8 (0x7f966a193368 in /fsx/users/bertrand/conda/envs/pytorch/lib/python3.8/site-packages/torch/lib/libtorch_cuda\ .so) ``` Test Plan: Imported from OSS Reviewed By: huiguoo Differential Revision: D30745610 Pulled By: bertmaher fbshipit-source-id: a1cfaa7364ef4120de834e9cbe57ced1d082ab4e --- torch/csrc/jit/tensorexpr/exceptions.h | 64 +++++++++++++++----------- torch/csrc/jit/tensorexpr/loopnest.cpp | 4 +- 2 files changed, 41 insertions(+), 27 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/exceptions.h b/torch/csrc/jit/tensorexpr/exceptions.h index 7194dfe166aa8..35ba8a34e88e2 100644 --- a/torch/csrc/jit/tensorexpr/exceptions.h +++ b/torch/csrc/jit/tensorexpr/exceptions.h @@ -26,66 +26,78 @@ namespace torch { namespace jit { namespace tensorexpr { -class unsupported_dtype : public std::runtime_error { +TORCH_API std::string buildErrorMessage(const std::string& s); + +class compilation_error : public c10::Error { public: - explicit unsupported_dtype() : std::runtime_error("UNSUPPORTED DTYPE") {} + explicit compilation_error(const std::string& err) + : c10::Error( + { + __func__, + __FILE__, + static_cast(__LINE__), + }, + buildErrorMessage(err)) {} +}; + +class unsupported_dtype : public compilation_error { + public: + explicit unsupported_dtype() : compilation_error("UNSUPPORTED DTYPE") {} explicit unsupported_dtype(const std::string& err) - : std::runtime_error("UNSUPPORTED DTYPE: " + err) {} + : compilation_error("UNSUPPORTED DTYPE: " + err) {} }; -class out_of_range_index : public std::runtime_error { +class out_of_range_index : public compilation_error { public: - explicit out_of_range_index() : std::runtime_error("OUT OF RANGE INDEX") {} + explicit out_of_range_index() : compilation_error("OUT OF RANGE INDEX") {} explicit out_of_range_index(const std::string& err) - : std::runtime_error("OUT OF RANGE INDEX: " + err) {} + : compilation_error("OUT OF RANGE INDEX: " + err) {} }; -class unimplemented_lowering : public std::runtime_error { +class unimplemented_lowering : public compilation_error { public: explicit unimplemented_lowering() - : std::runtime_error("UNIMPLEMENTED LOWERING") {} + : compilation_error("UNIMPLEMENTED LOWERING") {} explicit unimplemented_lowering(ExprPtr expr) - : std::runtime_error("UNIMPLEMENTED LOWERING: " + std::to_string(expr)) {} + : compilation_error("UNIMPLEMENTED LOWERING: " + std::to_string(expr)) {} explicit unimplemented_lowering(StmtPtr stmt) - : std::runtime_error("UNIMPLEMENTED LOWERING: " + std::to_string(stmt)) {} + : compilation_error("UNIMPLEMENTED LOWERING: " + std::to_string(stmt)) {} }; -class malformed_input : public std::runtime_error { +class malformed_input : public compilation_error { public: - explicit malformed_input() : std::runtime_error("MALFORMED INPUT") {} + explicit malformed_input() : compilation_error("MALFORMED INPUT") {} explicit malformed_input(const std::string& err) - : std::runtime_error("MALFORMED INPUT: " + err) {} + : compilation_error("MALFORMED INPUT: " + err) {} explicit malformed_input(ExprPtr expr) - : std::runtime_error("MALFORMED INPUT: " + std::to_string(expr)) {} + : compilation_error("MALFORMED INPUT: " + std::to_string(expr)) {} explicit malformed_input(const std::string& err, ExprPtr expr) - : std::runtime_error( + : compilation_error( "MALFORMED INPUT: " + err + " - " + std::to_string(expr)) {} explicit malformed_input(StmtPtr stmt) - : std::runtime_error("MALFORMED INPUT: " + std::to_string(stmt)) {} + : compilation_error("MALFORMED INPUT: " + std::to_string(stmt)) {} explicit malformed_input(const std::string& err, StmtPtr stmt) - : std::runtime_error( + : compilation_error( "MALFORMED INPUT: " + err + " - " + std::to_string(stmt)) {} }; -class malformed_ir : public std::runtime_error { +class malformed_ir : public compilation_error { public: - explicit malformed_ir() : std::runtime_error("MALFORMED IR") {} + explicit malformed_ir() : compilation_error("MALFORMED IR") {} explicit malformed_ir(const std::string& err) - : std::runtime_error("MALFORMED IR: " + err) {} + : compilation_error("MALFORMED IR: " + err) {} explicit malformed_ir(ExprPtr expr) - : std::runtime_error("MALFORMED IR: " + std::to_string(expr)) {} + : compilation_error("MALFORMED IR: " + std::to_string(expr)) {} explicit malformed_ir(const std::string& err, ExprPtr expr) - : std::runtime_error( + : compilation_error( "MALFORMED IR: " + err + " - " + std::to_string(expr)) {} explicit malformed_ir(StmtPtr stmt) - : std::runtime_error("MALFORMED IR: " + std::to_string(stmt)) {} + : compilation_error("MALFORMED IR: " + std::to_string(stmt)) {} explicit malformed_ir(const std::string& err, StmtPtr stmt) - : std::runtime_error( + : compilation_error( "MALFORMED IR: " + err + " - " + std::to_string(stmt)) {} }; -TORCH_API std::string buildErrorMessage(const std::string& s); - } // namespace tensorexpr } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index e67d094065d1a..570fe338093b9 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -476,11 +476,13 @@ bool LoopNest::vectorize(ForPtr f) { normalize(to(new_f)); new_f = FlattenIndexes(new_f); new_f = v.vectorize(to(new_f)); - } catch (std::runtime_error& e) { + } catch (compilation_error& e) { // We clone f before vectorizing. So, any partial vectorization will // have modified the clone. In case of an exception, we can continue // using f. new_f = f; + } catch (std::runtime_error& e) { + new_f = f; } if (new_f != f) { From 008bf6689b7d298e3a788dc5576c9b691c5f25a7 Mon Sep 17 00:00:00 2001 From: Chris Cai Date: Sat, 4 Sep 2021 20:54:29 -0700 Subject: [PATCH 510/530] Back out "D30740897 Add fusion enabled apis" (#64500) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64500 D30740897 (https://github.com/pytorch/pytorch/commit/39aeb3bf63f61664bc6c4a929a80a660365c2a5e) broke caffe2/torch/fb/module_factory/optimizers/tests:test_full_sync_optimizer_needed_coverage (https://fburl.com/test/mb46jxon) and blocked training_platform_unit_tests {F660271297} multsect results confirms ``` multisect --config FBCODE_TEST bisect 844424966128796 --workers 16 revisions --begin 09629edc --end fc86b434 D30740897 (https://github.com/pytorch/pytorch/commit/39aeb3bf63f61664bc6c4a929a80a660365c2a5e) ```` {F660271232} Test Plan: ``` buck test mode/opt //caffe2/torch/fb/module_factory/optimizers/tests:test_full_sync_optimizer_needed_coverage Started reporting to test run: https://www.internalfb.com/intern/testinfra/testrun/4785074671474181 ✓ Pass: caffe2/torch/fb/module_factory/optimizers/tests:test_full_sync_optimizer_needed_coverage - main (3.729) Summary Pass: 1 ``` Differential Revision: D30753916 fbshipit-source-id: 302fd4113ef1f3069846be03edc2300d82b66719 --- docs/source/jit.rst | 2 -- torch/jit/__init__.py | 29 ----------------------------- 2 files changed, 31 deletions(-) diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 97a0615812830..8a80b6471e1a7 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -61,8 +61,6 @@ Creating TorchScript Code ScriptFunction freeze optimize_for_inference - enable_fusion - fusion_enabled save load ignore diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f804d3c72ce8e..f7fa58bd36434 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -4,7 +4,6 @@ from typing import Iterator from torch.utils import set_module -from typing import Optional # These are imported so users can access them from the `torch.jit` module from torch._jit_internal import ( @@ -198,34 +197,6 @@ def _hide_source_ranges() -> Iterator[None]: finally: torch._C.Graph.set_global_print_source_ranges(old_enable_source_ranges) # type: ignore[attr-defined] -def enable_fusion(enabled: bool, device: Optional[str] = None): - """ - Enables or disables JIT fusion based on the parameter `enabled`. - - If `device` is None, both CPU and GPU fusion will be turned on or off. - Otherwise, device must be equal to "cpu" or "cuda", and will turn on or off - CPU and GPU fusion respectively. - """ - - if device is None: - torch._C._jit_override_can_fuse_on_cpu(enabled) - torch._C._jit_override_can_fuse_on_gpu(enabled) - else: - assert device in ["cpu", "cuda"], "Device-specific fusion must be equal to 'cpu' or 'cuda' if not None" - if device == "cuda": - torch._C._jit_override_can_fuse_on_gpu(enabled) - else: - torch._C._jit_override_can_fuse_on_cpu(enabled) - -def fusion_enabled(device: str): - """ - Returns whether JIT fusion is enabled for "cpu" or "cuda" - """ - assert device == "cpu" or device == "cuda" - if device == "cpu": - return torch._C._jit_can_fuse_on_cpu() - else: - return torch._C._jit_can_fuse_on_gpu() if not torch._C._jit_init(): raise RuntimeError("JIT initialization failed") From 1901c675e1cf9626a25d96bcc64b4952c64d56a2 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Sun, 5 Sep 2021 02:23:31 -0700 Subject: [PATCH 511/530] Back out "nn.functional.linear OpInfo" (#64517) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64517 Original commit changeset: ca41dbd98176 Test Plan: PyTorch CI Reviewed By: ngimel Differential Revision: D30758201 fbshipit-source-id: 2d3274293d340373b8af86083336607818019619 --- .../_internal/common_methods_invocations.py | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index f678f2258574f..5e009ee7f487e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -2648,30 +2648,6 @@ def sample_inputs_hardswish(self, device, dtype, requires_grad): requires_grad=requires_grad, low=-5, high=5)) for _ in range(1, N)] return tensors -def sample_inputs_linear(self, device, dtype, requires_grad): - features_options = [[3, 4], [128, 128]] - batch_options: List[List[int]] = [ - [], # no batch - [0], - [64], - [5, 7], - ] - create_tensor = partial(make_tensor, device=device, dtype=dtype, - requires_grad=requires_grad, low=-2, high=2) - - sample_inputs = [] - for has_bias, (in_feat, out_feat), batch_shape in \ - itertools.product([True, False], features_options, batch_options): - input_tensor = create_tensor(batch_shape + [in_feat]) - weight = create_tensor([out_feat, in_feat]) - if not has_bias: - sample_inputs.append(SampleInput(input_tensor, args=(weight,))) - continue - - bias = create_tensor([out_feat]) - sample_inputs.append(SampleInput(input_tensor, args=(weight, bias))) - return sample_inputs - def sample_inputs_interpolate(mode, self, device, dtype, requires_grad): N, C = 2, 3 D = 4 @@ -7545,17 +7521,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): dtypesIfCPU=floating_types_and(torch.int64), dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_avgpool2d), - OpInfo('nn.functional.linear', - aten_name='linear', - supports_autograd=True, - sample_inputs_func=sample_inputs_linear, - dtypesIfCPU=all_types_and_complex_and(torch.half, torch.bfloat16), - dtypesIfROCM=floating_and_complex_types_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []), - backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16, - *[torch.bfloat16] if CUDA11OrLater else []), - supports_forward_ad=True, - supports_out=False), UnaryUfuncInfo( 'nn.functional.logsigmoid', aten_name="log_sigmoid", From 49fe829caea178eee7dcb75a923ff29291117827 Mon Sep 17 00:00:00 2001 From: Sangbaek Park Date: Sun, 5 Sep 2021 12:52:46 -0700 Subject: [PATCH 512/530] [Vulkan] Code Quality: Remove duplicate code for hardshrink and leaky_relu functions (#64405) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64405 Code quality improvement: removed duplicate code for hardshrink and leaky_relu functions. ghstack-source-id: 137319378 Test Plan: ```buck build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 //xplat/caffe2:pt_vulkan_api_test_binAndroid\#android-arm64 --show-output adb push buck-out/gen/xplat/caffe2/pt_vulkan_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_api_test adb shell "/data/local/tmp/vulkan_api_test"``` Reviewed By: SS-JIA Differential Revision: D30690251 fbshipit-source-id: 5729d1f32946e42f41df77756a8313f297dd822f --- aten/src/ATen/native/vulkan/ops/Clamp.cpp | 149 +++++----------------- 1 file changed, 30 insertions(+), 119 deletions(-) diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index 7982b0eda0d7a..a6e65607fb07c 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -207,7 +207,7 @@ Tensor& activation_( TORCH_CHECK( self.is_vulkan(), - "Vulkan: In-place clamp is only supported on Vulkan tensors."); + "Vulkan: In-place operator is only supported on Vulkan tensors."); vTensor& v_self = convert(self); @@ -289,9 +289,10 @@ Tensor& hardsigmoid_(Tensor& self) { return ops::activation_(self, VK_KERNEL(hardsigmoid_)); } -Tensor hardshrink( +Tensor activation_scalar( const Tensor& self_arg, - const Scalar& lambd) { + const Scalar& scalar_arg, + const api::Shader::Descriptor& shader_descriptor) { api::Context* const context = api::context(); const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); @@ -310,11 +311,11 @@ Tensor hardshrink( const struct Block final { uvec3 extents; uint32_t _; - float lambd; + float scalar_value; } block { v_output.extents(), 0u, - lambd.to(), + scalar_arg.to(), }; context->dispatch( @@ -324,7 +325,7 @@ Tensor hardshrink( VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, - VK_KERNEL(hardshrink), + shader_descriptor, v_output.extents(), context->gpu().adapter->local_work_group_size(), // Write-only access bypasses synchronization but inserts appropriate @@ -351,14 +352,15 @@ Tensor hardshrink( return convert(v_output); } -Tensor& hardshrink_( +Tensor& activation_scalar_( Tensor& self, - const Scalar& lambd) { + const Scalar& scalar_arg, + const api::Shader::Descriptor& shader_descriptor) { api::Context* const context = api::context(); TORCH_CHECK( self.is_vulkan(), - "Vulkan: In-place hardshrink is only supported on Vulkan tensors."); + "Vulkan: In-place operator is only supported on Vulkan tensors."); vTensor& v_self = convert(self); @@ -369,11 +371,11 @@ Tensor& hardshrink_( const struct Block final { uvec3 extents; uint32_t _; - float lambd; + float scalar_value; } block { v_self.extents(), 0u, - lambd.to(), + scalar_arg.to(), }; context->dispatch( @@ -382,7 +384,7 @@ Tensor& hardshrink_( VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, - VK_KERNEL(hardshrink_), + shader_descriptor, v_self.extents(), context->gpu().adapter->local_work_group_size(), // Read-Write access triggers an async synchronization if necessory @@ -404,119 +406,28 @@ Tensor& hardshrink_( return self; } -Tensor leaky_relu( +Tensor hardshrink( const Tensor& self_arg, - const Scalar& negative_slope) { - api::Context* const context = api::context(); - - const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); - const vTensor& v_self = convert(self); - - vTensor v_output{ - context, - v_self.sizes(), - v_self.options(), - }; - - api::Command::Pool& command_pool = context->command().pool; - api::Command::Buffer& command_buffer = command_pool.stream(); - { - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - const struct Block final { - uvec3 extents; - uint32_t _; - float negative_slope; - } block { - v_output.extents(), - 0u, - negative_slope.to(), - }; + const Scalar& lambd) { + return ops::activation_scalar(self_arg, lambd, VK_KERNEL(hardshrink)); +} - context->dispatch( - command_buffer, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }, - VK_KERNEL(leaky_relu), - v_output.extents(), - context->gpu().adapter->local_work_group_size(), - // Write-only access bypasses synchronization but inserts appropriate - // barriers if necessary. - v_output.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Write), - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - v_self.image( - command_buffer, - vTensor::Stage::Compute), - // Object lifetime is managed by the resource pool. - // It is OK not to keep track of the handle. - context->resource().pool.uniform(block).object); - } - else { - TORCH_CHECK(false, "Not implemented!"); - } - } - command_pool.submit(context->gpu().queue, command_buffer); +Tensor& hardshrink_( + Tensor& self, + const Scalar& lambd) { + return ops::activation_scalar_(self, lambd, VK_KERNEL(hardshrink_)); +} - return convert(v_output); +Tensor leaky_relu( + const Tensor& self_arg, + const Scalar& negative_slope) { + return ops::activation_scalar(self_arg, negative_slope, VK_KERNEL(leaky_relu)); } Tensor& leaky_relu_( Tensor& self, const Scalar& negative_slope) { - api::Context* const context = api::context(); - - TORCH_CHECK( - self.is_vulkan(), - "Vulkan: In-place leaky relu is only supported on Vulkan tensors."); - - vTensor& v_self = convert(self); - - api::Command::Pool& command_pool = context->command().pool; - api::Command::Buffer& command_buffer = command_pool.stream(); - { - if C10_LIKELY(v_self.has_image()) { - const struct Block final { - uvec3 extents; - uint32_t _; - float negative_slope; - } block { - v_self.extents(), - 0u, - negative_slope.to(), - }; - - context->dispatch( - command_buffer, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }, - VK_KERNEL(leaky_relu_), - v_self.extents(), - context->gpu().adapter->local_work_group_size(), - // Read-Write access triggers an async synchronization if necessory - // and inserts appropriate barriers if hazards are detected. - v_self.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Read | vTensor::Access::Write), - // Object lifetime is managed by the resource pool. - // It is OK not to keep track of the handle. - context->resource().pool.uniform(block).object); - } - else { - TORCH_CHECK(false, "Not implemented!"); - } - } - command_pool.submit(context->gpu().queue, command_buffer); - - return self; + return ops::activation_scalar_(self, negative_slope, VK_KERNEL(leaky_relu_)); } Tensor sigmoid(const Tensor& self) { @@ -542,8 +453,8 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl(TORCH_SELECTIVE_NAME("aten::clamp_"), TORCH_FN(clamp_)); m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid"), hardsigmoid); m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), hardsigmoid_); - m.impl(TORCH_SELECTIVE_NAME("aten::hardshrink"), TORCH_FN(hardshrink)); - m.impl(TORCH_SELECTIVE_NAME("aten::hardshrink_"), TORCH_FN(hardshrink_)); + m.impl(TORCH_SELECTIVE_NAME("aten::hardshrink"), hardshrink); + m.impl(TORCH_SELECTIVE_NAME("aten::hardshrink_"), hardshrink_); m.impl(TORCH_SELECTIVE_NAME("aten::hardswish"), hardswish); m.impl(TORCH_SELECTIVE_NAME("aten::hardswish_"), hardswish_); m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh"), hardtanh); From bcc7e82371082a284ad9ee423cad192bb024e1a1 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Sun, 5 Sep 2021 16:06:09 -0700 Subject: [PATCH 513/530] Revert D30745610: [nnc] Make our exceptions c10::Errors, get C++ stacktraces Test Plan: revert-hammer Differential Revision: D30745610 (https://github.com/pytorch/pytorch/commit/18b2751ea143374adbb690889427e06a9334da05) Original commit changeset: a1cfaa7364ef fbshipit-source-id: 9b716053b96a65745240ddef1c456c44d5d09671 --- torch/csrc/jit/tensorexpr/exceptions.h | 64 +++++++++++--------------- torch/csrc/jit/tensorexpr/loopnest.cpp | 4 +- 2 files changed, 27 insertions(+), 41 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/exceptions.h b/torch/csrc/jit/tensorexpr/exceptions.h index 35ba8a34e88e2..7194dfe166aa8 100644 --- a/torch/csrc/jit/tensorexpr/exceptions.h +++ b/torch/csrc/jit/tensorexpr/exceptions.h @@ -26,78 +26,66 @@ namespace torch { namespace jit { namespace tensorexpr { -TORCH_API std::string buildErrorMessage(const std::string& s); - -class compilation_error : public c10::Error { +class unsupported_dtype : public std::runtime_error { public: - explicit compilation_error(const std::string& err) - : c10::Error( - { - __func__, - __FILE__, - static_cast(__LINE__), - }, - buildErrorMessage(err)) {} -}; - -class unsupported_dtype : public compilation_error { - public: - explicit unsupported_dtype() : compilation_error("UNSUPPORTED DTYPE") {} + explicit unsupported_dtype() : std::runtime_error("UNSUPPORTED DTYPE") {} explicit unsupported_dtype(const std::string& err) - : compilation_error("UNSUPPORTED DTYPE: " + err) {} + : std::runtime_error("UNSUPPORTED DTYPE: " + err) {} }; -class out_of_range_index : public compilation_error { +class out_of_range_index : public std::runtime_error { public: - explicit out_of_range_index() : compilation_error("OUT OF RANGE INDEX") {} + explicit out_of_range_index() : std::runtime_error("OUT OF RANGE INDEX") {} explicit out_of_range_index(const std::string& err) - : compilation_error("OUT OF RANGE INDEX: " + err) {} + : std::runtime_error("OUT OF RANGE INDEX: " + err) {} }; -class unimplemented_lowering : public compilation_error { +class unimplemented_lowering : public std::runtime_error { public: explicit unimplemented_lowering() - : compilation_error("UNIMPLEMENTED LOWERING") {} + : std::runtime_error("UNIMPLEMENTED LOWERING") {} explicit unimplemented_lowering(ExprPtr expr) - : compilation_error("UNIMPLEMENTED LOWERING: " + std::to_string(expr)) {} + : std::runtime_error("UNIMPLEMENTED LOWERING: " + std::to_string(expr)) {} explicit unimplemented_lowering(StmtPtr stmt) - : compilation_error("UNIMPLEMENTED LOWERING: " + std::to_string(stmt)) {} + : std::runtime_error("UNIMPLEMENTED LOWERING: " + std::to_string(stmt)) {} }; -class malformed_input : public compilation_error { +class malformed_input : public std::runtime_error { public: - explicit malformed_input() : compilation_error("MALFORMED INPUT") {} + explicit malformed_input() : std::runtime_error("MALFORMED INPUT") {} explicit malformed_input(const std::string& err) - : compilation_error("MALFORMED INPUT: " + err) {} + : std::runtime_error("MALFORMED INPUT: " + err) {} explicit malformed_input(ExprPtr expr) - : compilation_error("MALFORMED INPUT: " + std::to_string(expr)) {} + : std::runtime_error("MALFORMED INPUT: " + std::to_string(expr)) {} explicit malformed_input(const std::string& err, ExprPtr expr) - : compilation_error( + : std::runtime_error( "MALFORMED INPUT: " + err + " - " + std::to_string(expr)) {} explicit malformed_input(StmtPtr stmt) - : compilation_error("MALFORMED INPUT: " + std::to_string(stmt)) {} + : std::runtime_error("MALFORMED INPUT: " + std::to_string(stmt)) {} explicit malformed_input(const std::string& err, StmtPtr stmt) - : compilation_error( + : std::runtime_error( "MALFORMED INPUT: " + err + " - " + std::to_string(stmt)) {} }; -class malformed_ir : public compilation_error { +class malformed_ir : public std::runtime_error { public: - explicit malformed_ir() : compilation_error("MALFORMED IR") {} + explicit malformed_ir() : std::runtime_error("MALFORMED IR") {} explicit malformed_ir(const std::string& err) - : compilation_error("MALFORMED IR: " + err) {} + : std::runtime_error("MALFORMED IR: " + err) {} explicit malformed_ir(ExprPtr expr) - : compilation_error("MALFORMED IR: " + std::to_string(expr)) {} + : std::runtime_error("MALFORMED IR: " + std::to_string(expr)) {} explicit malformed_ir(const std::string& err, ExprPtr expr) - : compilation_error( + : std::runtime_error( "MALFORMED IR: " + err + " - " + std::to_string(expr)) {} explicit malformed_ir(StmtPtr stmt) - : compilation_error("MALFORMED IR: " + std::to_string(stmt)) {} + : std::runtime_error("MALFORMED IR: " + std::to_string(stmt)) {} explicit malformed_ir(const std::string& err, StmtPtr stmt) - : compilation_error( + : std::runtime_error( "MALFORMED IR: " + err + " - " + std::to_string(stmt)) {} }; +TORCH_API std::string buildErrorMessage(const std::string& s); + } // namespace tensorexpr } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 570fe338093b9..e67d094065d1a 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -476,13 +476,11 @@ bool LoopNest::vectorize(ForPtr f) { normalize(to(new_f)); new_f = FlattenIndexes(new_f); new_f = v.vectorize(to(new_f)); - } catch (compilation_error& e) { + } catch (std::runtime_error& e) { // We clone f before vectorizing. So, any partial vectorization will // have modified the clone. In case of an exception, we can continue // using f. new_f = f; - } catch (std::runtime_error& e) { - new_f = f; } if (new_f != f) { From 544c8e6a5d26efdf1cf679b313893fe119825930 Mon Sep 17 00:00:00 2001 From: Shen Xu Date: Sun, 5 Sep 2021 16:44:13 -0700 Subject: [PATCH 514/530] Mark functions in backend header as inline to suppress warning (#64098) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64098 Reviewed By: kimishpatel, iseeyuan Differential Revision: D30593104 fbshipit-source-id: 328196b9bc4a89a28ad89bede7e337107976c303 --- torch/csrc/jit/backends/backend.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/backends/backend.h b/torch/csrc/jit/backends/backend.h index 941f27bfe2b11..5aae642fa5517 100644 --- a/torch/csrc/jit/backends/backend.h +++ b/torch/csrc/jit/backends/backend.h @@ -9,7 +9,7 @@ namespace torch { namespace jit { namespace { // NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration) -c10::FunctionSchema getIsAvailableSchema() { +inline c10::FunctionSchema getIsAvailableSchema() { c10::Argument self("self", c10::AnyType::get()); c10::Argument available("available", c10::BoolType::get()); c10::FunctionSchema preprocessor_schema( @@ -23,7 +23,7 @@ c10::FunctionSchema getIsAvailableSchema() { constexpr static auto kBackendsNamespace = "__backends__"; // NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration) -c10::FunctionSchema getCompileSchema() { +inline c10::FunctionSchema getCompileSchema() { c10::Argument self("self", c10::AnyType::get()); c10::Argument mod("processed", c10::AnyType::get()); auto any_dict_ty = @@ -40,7 +40,7 @@ c10::FunctionSchema getCompileSchema() { } // NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration) -c10::FunctionSchema getExecuteSchema() { +inline c10::FunctionSchema getExecuteSchema() { auto any_list_ty = c10::ListType::create(c10::AnyType::get()); c10::Argument self("self", c10::AnyType::get()); c10::Argument handle("handle", c10::AnyType::get()); From 571a2becf337ae84275fa96300043762387058cf Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 6 Sep 2021 11:37:39 -0700 Subject: [PATCH 515/530] Move ParallelNative and PureTorch to GHA (#64452) Summary: Separate ParallelTBB move to https://github.com/pytorch/pytorch/pull/64193 as it requires some further investiagation Pull Request resolved: https://github.com/pytorch/pytorch/pull/64452 Reviewed By: seemethere, janeyx99 Differential Revision: D30738337 Pulled By: malfet fbshipit-source-id: 81c46423e903058bd1a3e8553e8a10ce978eeefd --- .circleci/cimodel/data/pytorch_build_data.py | 2 - .circleci/config.yml | 63 --- .github/generated-ciflow-ruleset.json | 8 +- .github/scripts/generate_ci_workflows.py | 46 +- ...rallelnative-linux-xenial-py3.6-gcc5.4.yml | 510 ++++++++++++++++++ ...ed-puretorch-linux-xenial-py3.6-gcc5.4.yml | 275 ++++++++++ .jenkins/pytorch/build.sh | 2 +- 7 files changed, 827 insertions(+), 79 deletions(-) create mode 100644 .github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml create mode 100644 .github/workflows/generated-puretorch-linux-xenial-py3.6-gcc5.4.yml diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index dbe17bf4f15f5..46527c1168891 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -8,8 +8,6 @@ ("3.6", [ ("important", [X(True)]), ("parallel_tbb", [X(True)]), - ("parallel_native", [X(True)]), - ("pure_torch", [X(True)]), ]), ]), # TODO: bring back libtorch test diff --git a/.circleci/config.yml b/.circleci/config.yml index ffc67a14ec5ad..c57eb26c032e1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7190,50 +7190,6 @@ workflows: build_environment: "pytorch-linux-pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" resource_class: large - - pytorch_linux_build: - name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build - requires: - - "docker-pytorch-linux-xenial-py3.6-gcc5.4" - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - - pytorch_linux_test: - name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test - requires: - - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - resource_class: large - - pytorch_linux_test: - name: pytorch_linux_pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed_test - requires: - - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build - build_environment: "pytorch-linux-pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - resource_class: large - - pytorch_linux_build: - name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build - requires: - - "docker-pytorch-linux-xenial-py3.6-gcc5.4" - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - build_environment: "pytorch-pure_torch-linux-xenial-py3.6-gcc5.4-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - pytorch_linux_build: name: pytorch_linux_xenial_py3_6_gcc7_build requires: @@ -9324,25 +9280,6 @@ workflows: build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" resource_class: large - - pytorch_linux_build: - name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build - requires: - - "docker-pytorch-linux-xenial-py3.6-gcc5.4" - build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - - pytorch_linux_test: - name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test - requires: - - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build - build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - resource_class: large - - pytorch_linux_build: - name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build - requires: - - "docker-pytorch-linux-xenial-py3.6-gcc5.4" - build_environment: "pytorch-pure_torch-linux-xenial-py3.6-gcc5.4-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" - pytorch_linux_build: name: pytorch_linux_xenial_py3_6_gcc7_build requires: diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index d3ebad35a5303..8e05b532f86fc 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -11,9 +11,11 @@ "linux-xenial-cuda11.3-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", + "parallelnative-linux-xenial-py3.6-gcc5.4", "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", "periodic-linux-xenial-cuda11.1-py3.6-gcc7", "periodic-win-vs2019-cuda11.1-py3", + "puretorch-linux-xenial-py3.6-gcc5.4", "win-vs2019-cpu-py3", "win-vs2019-cuda10.1-py3", "win-vs2019-cuda11.3-py3" @@ -29,6 +31,8 @@ "linux-bionic-py3.8-gcc9-coverage", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", + "parallelnative-linux-xenial-py3.6-gcc5.4", + "puretorch-linux-xenial-py3.6-gcc5.4", "win-vs2019-cpu-py3" ], "ciflow/cuda": [ @@ -67,8 +71,10 @@ "linux-xenial-cuda11.3-py3.6-gcc7", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7-bazel-test", + "parallelnative-linux-xenial-py3.6-gcc5.4", "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", - "periodic-linux-xenial-cuda11.1-py3.6-gcc7" + "periodic-linux-xenial-cuda11.1-py3.6-gcc7", + "puretorch-linux-xenial-py3.6-gcc5.4" ], "ciflow/noarch": [ "linux-bionic-py3.6-clang9" diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 16100f72a527c..b5146114054a6 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -285,19 +285,41 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: # build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4", # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", # test_runner_type=LINUX_CPU_TEST_RUNNER, + # on_pull_request=True, + # ciflow_config=CIFlowConfig( + # enabled=True, + # trigger_action_only=True, + # labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, + # ), # ), - # CIWorkflow( - # arch="linux", - # build_environment="parallelnative-linux-xenial-py3.6-gcc5.4", - # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", - # test_runner_type=LINUX_CPU_TEST_RUNNER, - # ), - # CIWorkflow( - # arch="linux", - # build_environment="pure_torch-linux-xenial-py3.6-gcc5.4", - # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", - # test_runner_type=LINUX_CPU_TEST_RUNNER, - # ), + CIWorkflow( + arch="linux", + build_environment="parallelnative-linux-xenial-py3.6-gcc5.4", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", + test_runner_type=LINUX_CPU_TEST_RUNNER, + # This is a master only job despite on_pull_request is set to True + on_pull_request=True, + ciflow_config=CIFlowConfig( + enabled=True, + trigger_action_only=True, + labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, + ), + ), + # Build PyTorch with BUILD_CAFFE2=OFF + CIWorkflow( + arch="linux", + build_environment="puretorch-linux-xenial-py3.6-gcc5.4", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", + test_runner_type=LINUX_CPU_TEST_RUNNER, + exclude_test=True, + # This is a master only job despite on_pull_request is set to True + on_pull_request=True, + ciflow_config=CIFlowConfig( + enabled=True, + trigger_action_only=True, + labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, + ), + ), # CIWorkflow( # arch="linux", # build_environment="linux-xenial-py3.6-gcc7", diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml new file mode 100644 index 0000000000000..1b352f6b8cd80 --- /dev/null +++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml @@ -0,0 +1,510 @@ +# @generated DO NOT EDIT MANUALLY +# Template is at: .github/templates/linux_ci_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: parallelnative-linux-xenial-py3.6-gcc5.4 + +on: + pull_request: + types: [unassigned] + push: + branches: + - master + - release/* + workflow_dispatch: + +env: + BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.6-gcc5.4 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + TORCH_CUDA_ARCH_LIST: 5.2 + IN_CI: 1 + # This is used for the phase of adding wheel tests only, will be removed once completed + IN_WHEEL_TEST: 1 + # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + +concurrency: + group: parallelnative-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + steps: + - name: noop + run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" + calculate-docker-image: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.2xlarge + needs: [ciflow_should_run] + env: + DOCKER_BUILDKIT: 1 + timeout-minutes: 90 + outputs: + docker_image: ${{ steps.calculate-tag.outputs.docker_image }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: false + - name: Calculate docker image tag + id: calculate-tag + run: | + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "::set-output name=docker_tag::${DOCKER_TAG}" + echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" + - name: Check if image should be built + id: check + env: + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} + BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} + run: | + set -x + # Check if image already exists, if it does then skip building it + if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then + exit 0 + fi + if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then + # if we're on the base branch then use the parent commit + MERGE_BASE=$(git rev-parse HEAD~) + else + # otherwise we're on a PR, so use the most recent base commit + MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") + fi + # Covers the case where a previous tag doesn't exist for the tree + # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly + if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then + echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" + exit 1 + fi + PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") + # If no image exists but the hash is the same as the previous hash then we should error out here + if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then + echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" + echo " contact the PyTorch team to restore the original images" + exit 1 + fi + echo ::set-output name=rebuild::yes + - name: Build and push docker image + if: ${{ steps.check.outputs.rebuild }} + env: + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} + DOCKER_SKIP_S3_UPLOAD: 1 + run: | + export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} + cd .circleci/docker && ./build_docker.sh + + build: + runs-on: linux.2xlarge + needs: [calculate-docker-image, ciflow_should_run] + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-build + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch + run: | + docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} + CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} + CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} + CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + run: | + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Archive artifacts into zip + run: | + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json + - uses: seemethere/upload-artifact-s3@v3 + name: Store PyTorch Build Artifacts on S3 + with: + name: ${{ env.BUILD_ENVIRONMENT }} + retention-days: 14 + if-no-files-found: error + path: + artifacts.zip + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Prune all of the docker images + docker system prune -af + + generate-test-matrix: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: ubuntu-18.04 + needs: [ciflow_should_run] + env: + TEST_RUNNER_TYPE: linux.2xlarge + ENABLE_DISTRIBUTED_TEST: 1 + ENABLE_JIT_LEGACY_TEST: '' + ENABLE_MULTIGPU_TEST: '' + ENABLE_NOGPU_NO_AVX_TEST: '' + ENABLE_NOGPU_NO_AVX2_TEST: '' + ENABLE_SLOW_TEST: '' + ENABLE_DOCS_TEST: '' + ENABLE_BACKWARDS_COMPAT_TEST: '' + ENABLE_XLA_TEST: '' + ENABLE_NOARCH_TEST: '' + NUM_TEST_SHARDS: 1 + MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu + NOGPU_RUNNER_TYPE: linux.2xlarge + PR_BODY: ${{ github.event.pull_request.body }} + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} + ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} + container: + image: python:3.9 + steps: + - name: Install dependencies + run: pip install typing-extensions + - name: Clone pytorch/pytorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + - name: Generating test matrix + id: set-matrix + run: .github/scripts/generate_pytorch_test_matrix.py + + test: + needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run] + strategy: + matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Test PyTorch + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + run: | + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + if [[ $NUM_TEST_SHARDS -ne 2 ]]; then + export SHARD_NUMBER=0 + fi + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086 + docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_LABELS \ + -e CONTINUE_THROUGH_ERROR \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Zip test reports for upload + if: always() + env: + COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml' + - uses: actions/upload-artifact@v2 + name: Store PyTorch Test Reports + if: always() + with: + name: test-reports-${{ matrix.config }} + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - uses: seemethere/upload-artifact-s3@v3 + name: Store PyTorch Test Reports on S3 + if: always() + with: + name: test-reports-${{ matrix.config }} + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test + CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} + CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} + CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.16.34 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . diff --git a/.github/workflows/generated-puretorch-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-puretorch-linux-xenial-py3.6-gcc5.4.yml new file mode 100644 index 0000000000000..af1228903b1f5 --- /dev/null +++ b/.github/workflows/generated-puretorch-linux-xenial-py3.6-gcc5.4.yml @@ -0,0 +1,275 @@ +# @generated DO NOT EDIT MANUALLY +# Template is at: .github/templates/linux_ci_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: puretorch-linux-xenial-py3.6-gcc5.4 + +on: + pull_request: + types: [unassigned] + push: + branches: + - master + - release/* + workflow_dispatch: + +env: + BUILD_ENVIRONMENT: puretorch-linux-xenial-py3.6-gcc5.4 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + TORCH_CUDA_ARCH_LIST: 5.2 + IN_CI: 1 + # This is used for the phase of adding wheel tests only, will be removed once completed + IN_WHEEL_TEST: 1 + # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + +concurrency: + group: puretorch-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + ciflow_should_run: + runs-on: ubuntu-18.04 + if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }} + env: + LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + steps: + - name: noop + run: echo running ciflow_should_run + - name: print labels + run: echo "${LABELS}" + calculate-docker-image: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.2xlarge + needs: [ciflow_should_run] + env: + DOCKER_BUILDKIT: 1 + timeout-minutes: 90 + outputs: + docker_image: ${{ steps.calculate-tag.outputs.docker_image }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: false + - name: Calculate docker image tag + id: calculate-tag + run: | + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "::set-output name=docker_tag::${DOCKER_TAG}" + echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" + - name: Check if image should be built + id: check + env: + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} + BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} + run: | + set -x + # Check if image already exists, if it does then skip building it + if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then + exit 0 + fi + if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then + # if we're on the base branch then use the parent commit + MERGE_BASE=$(git rev-parse HEAD~) + else + # otherwise we're on a PR, so use the most recent base commit + MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") + fi + # Covers the case where a previous tag doesn't exist for the tree + # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly + if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then + echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" + exit 1 + fi + PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") + # If no image exists but the hash is the same as the previous hash then we should error out here + if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then + echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" + echo " contact the PyTorch team to restore the original images" + exit 1 + fi + echo ::set-output name=rebuild::yes + - name: Build and push docker image + if: ${{ steps.check.outputs.rebuild }} + env: + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }} + DOCKER_SKIP_S3_UPLOAD: 1 + run: | + export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} + cd .circleci/docker && ./build_docker.sh + + build: + runs-on: linux.2xlarge + needs: [calculate-docker-image, ciflow_should_run] + env: + DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }} + JOB_BASE_NAME: puretorch-linux-xenial-py3.6-gcc5.4-build + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + run: | + aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh + bash /tmp/ecr-login.sh + rm /tmp/ecr-login.sh + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE:?}/*" + rm -f ~/.ssh/authorized_keys + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Pull docker image + run: | + docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch + run: | + docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }} + CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }} + CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }} + CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}' + run: | + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Archive artifacts into zip + run: | + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json + - uses: seemethere/upload-artifact-s3@v3 + name: Store PyTorch Build Artifacts on S3 + with: + name: ${{ env.BUILD_ENVIRONMENT }} + retention-days: 14 + if-no-files-found: error + path: + artifacts.zip + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Chown workspace + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Prune all of the docker images + docker system prune -af diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 085cf5152e6f1..226b8521ee049 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -59,7 +59,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then export BUILD_SPLIT_CUDA=ON fi -if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then +if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* || ${BUILD_ENVIRONMENT} == *"puretorch"* ]]; then export BUILD_CAFFE2=OFF fi From 0a1aaff0dea5953928cefc506b4f4d39e0cb8a4d Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Mon, 6 Sep 2021 21:24:38 -0700 Subject: [PATCH 516/530] Remove dead code from THC (THCApply.cuh) (#64559) Summary: cc peterbell10 Pull Request resolved: https://github.com/pytorch/pytorch/pull/64559 Reviewed By: mruberry Differential Revision: D30769526 Pulled By: ngimel fbshipit-source-id: 034a5c778a2b902cffa57b76511fa0dcdea26825 --- .../ATen/native/cuda/DistributionBernoulli.cu | 1 - .../native/cuda/DistributionCauchyKernel.cu | 1 - .../cuda/DistributionExponentialKernel.cu | 1 - .../cuda/DistributionGeometricKernel.cu | 1 - .../cuda/DistributionLogNormalKernel.cu | 1 - .../ATen/native/cuda/DistributionNormal.cu | 1 - .../native/cuda/DistributionRandomKernel.cu | 1 - aten/src/ATen/native/cuda/Distributions.cu | 1 - aten/src/THC/CMakeLists.txt | 1 - aten/src/THC/THCApply.cuh | 760 ------------------ aten/src/THC/THCTensorCopy.cu | 33 +- aten/src/THC/generic/THCTensorCopy.cu | 18 - 12 files changed, 2 insertions(+), 818 deletions(-) delete mode 100644 aten/src/THC/THCApply.cuh diff --git a/aten/src/ATen/native/cuda/DistributionBernoulli.cu b/aten/src/ATen/native/cuda/DistributionBernoulli.cu index 0baaf2e049b04..e113d82c0f5c7 100644 --- a/aten/src/ATen/native/cuda/DistributionBernoulli.cu +++ b/aten/src/ATen/native/cuda/DistributionBernoulli.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu index 6f43ee664cb2c..b33ee792ea4cc 100644 --- a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu index 6e1823032a789..f28a910e9980b 100644 --- a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu index 9086e2a35c8d3..6cafba0dcbe78 100644 --- a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu index 9497cf83cc405..c5da3bdf92d2a 100644 --- a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionNormal.cu b/aten/src/ATen/native/cuda/DistributionNormal.cu index 32d223c5d0a93..1b2dd19eec0d1 100644 --- a/aten/src/ATen/native/cuda/DistributionNormal.cu +++ b/aten/src/ATen/native/cuda/DistributionNormal.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu index 57d0701329d91..ea2aaad9445b2 100644 --- a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index a48a3778305ab..81f8fe8fa227f 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt index 82828a7ac1c89..ab7f72b2f41d4 100644 --- a/aten/src/THC/CMakeLists.txt +++ b/aten/src/THC/CMakeLists.txt @@ -30,7 +30,6 @@ install(FILES THCTensor.h THCTensorCopy.h THCTensorCopy.hpp - THCApply.cuh THCReduceApplyUtils.cuh THCTensorMathReduce.cuh THCAsmUtils.cuh diff --git a/aten/src/THC/THCApply.cuh b/aten/src/THC/THCApply.cuh deleted file mode 100644 index e424b2406ee3c..0000000000000 --- a/aten/src/THC/THCApply.cuh +++ /dev/null @@ -1,760 +0,0 @@ -#ifndef THC_APPLY_INC -#define THC_APPLY_INC - -#include -#include -#include -#include -#include -#include - -// -// This file contains pointwise operation functions and kernels that -// work on both contiguous and non-contiguous tensor arguments of -// arbitrary (up to MAX_CUTORCH_DIMS) dimensioned arguments without -// copying or temporary storage. -// - -// Rearrange dimensions for pointwise operations so that strides are in -// decreasing order as much as possible, so that kernels have better memory -// access patterns. -// -// For example, consider a binary operation on two "transposed" 2-dim tensors: -// sizes: 256 512 -// aInfo->strides: 1 256 -// bInfo->strides: 1 256 -// -// Given this, each concurrent memory access inside kernelPointwiseApply2() is -// exactly 256 elements apart, resulting in poor performance. -// -// This function exchanges dimensions so that memory access is contiguous: -// sizes: 512 256 -// aInfo->strides: 256 1 -// bInfo->strides: 256 1 -// -// (Actually, it becomes even better because now collapseDims() can turn each -// input into one contiguous array.) -// -// In general, given M (<=3) TensorInfo's with N dimensions, we can view each -// strides[i] (0 <= i < N) as an M-tuple. Given each pair i < j, we exchange -// strides[i] and [j] if -// (1) strides[i][k] < strides[j][k] for some k (0 <= k < M) -// (exchanging them will benefit input #k), and -// (2) strides[i][k] <= strieds[j][k] for all k -// (exchanging them will not make any input worse). -template -void rearrangeDims(TensorInfo* aInfo, - TensorInfo* bInfo = nullptr, - TensorInfo* cInfo = nullptr) { - int numInfos = 1; - int dims = aInfo->dims; - IndexType *sizes[3] = { aInfo->sizes, }; - IndexType *strides[3] = { aInfo->strides, }; - - if (bInfo != nullptr) { - ++numInfos; - if (bInfo->dims != dims) return; - sizes[1] = bInfo->sizes; - strides[1] = bInfo->strides; - } - - if (cInfo != nullptr) { - ++numInfos; - if (cInfo->dims != dims) return; - sizes[2] = cInfo->sizes; - strides[2] = cInfo->strides; - } - - // Bail out if sizes do not match: we are using "deprecated pointwise - // behavior" among tensors of different shapes but same number of elements. - for (int i = 1; i < numInfos; ++i) { - for (int j = 0; j < dims; ++j) { - if (sizes[i][j] != sizes[0][j]) return; - } - } - - for (int i = 0; i < dims - 1; ++i) { - // No need to consider dimensions of size 1. - if (sizes[0][i] == 1) continue; - - for (int j = i + 1; j < dims; ++j) { - if (sizes[0][j] == 1) continue; - - // Compare the relative sizes of strides between dim #i and dim #j. - bool hasIncreasingStrides = false; - bool hasDecreasingStrides = false; - - for (int k = 0; k < numInfos; k++) { - IndexType stride_i = strides[k][i]; - IndexType stride_j = strides[k][j]; - if (stride_i < stride_j) { - hasIncreasingStrides = true; - } else if (stride_i > stride_j) { - hasDecreasingStrides = true; - } - } - - if (hasIncreasingStrides && !hasDecreasingStrides) { - for (int k = 0; k < numInfos; k++) { - IndexType size = sizes[k][i]; - sizes[k][i] = sizes[k][j]; - sizes[k][j] = size; - - IndexType stride = strides[k][i]; - strides[k][i] = strides[k][j]; - strides[k][j] = stride; - } - } - } - } -} - -// Threads per block for our apply kernel -// FIXME: use occupancy calculator instead -#define THC_APPLY_THREADS_PER_BLOCK (32 * 16) -#define THC_APPLY_BLOCKS_PER_SM 4 -template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_2(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM) -#endif -__global__ void -kernelPointwiseApply1(const OffsetInfo a, - IndexType totalElements, - Op op) { - // NOTE: The two typecasts below are essential when IndexType is 64-bit; - // without them, results are silently truncated to 32 bits! - for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x; - linearIndex < totalElements; - linearIndex += (IndexType) gridDim.x * blockDim.x) { - op(a.get(linearIndex)); - } -} - -template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_2(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM) -#endif -__global__ void -kernelPointwiseApply2(const OffsetInfo a, - const OffsetInfo b, - IndexType totalElements, - Op op) { - for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x; - linearIndex < totalElements; - linearIndex += (IndexType) gridDim.x * blockDim.x) { - op(a.get(linearIndex), b.get(linearIndex)); - } -} - -template -#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_2(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM) -#endif -__global__ void -kernelPointwiseApply3(const OffsetInfo a, - const OffsetInfo b, - const OffsetInfo c, - IndexType totalElements, - Op op) { - for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x; - linearIndex < totalElements; - linearIndex += (IndexType) gridDim.x * blockDim.x) { - op(a.get(linearIndex), b.get(linearIndex), c.get(linearIndex)); - } -} - -inline dim3 getApplyBlock() { - return dim3(THC_APPLY_THREADS_PER_BLOCK); -} - -inline bool getApplyGrid(THCState* state, uint64_t totalElements, dim3& grid, int curDevice) { - if (curDevice == -1) return false; - - uint64_t numBlocks = THCCeilDiv(totalElements, static_cast(THC_APPLY_THREADS_PER_BLOCK)); - uint64_t maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0]; - if (numBlocks > maxGridX) - numBlocks = maxGridX; - - // For 32-bit indices, make sure that gridDim.x * blockDim.x fits in 32 bits. - if (totalElements <= INT32_MAX && - numBlocks > INT32_MAX / THC_APPLY_THREADS_PER_BLOCK) - numBlocks = INT32_MAX / THC_APPLY_THREADS_PER_BLOCK; - - grid = dim3(numBlocks); - return true; -} - -template -bool THC_pointwiseApply1(THCState* state, - TensorTypeA* a, - const Op& op, - TensorArgType aType = ReadWrite) { - if (THCTensor_nDimensionLegacyAll(state, a) > MAX_CUTORCH_DIMS) { - return false; - } - - if (THCTensor_nDimensionLegacyAll(state, a) == 0) { - // Zero-dim tensor; do nothing - return true; - } - - const dim3 block = getApplyBlock(); - - dim3 grid; - ptrdiff_t totalElements = THCTensor_nElement(state, a); - - int curDevice = -1; - cudaGetDevice(&curDevice); - if (!getApplyGrid(state, totalElements, grid, curDevice)) { - return false; - } - - /* - Expands readable/writable tensors whose indices may be "overlapped." - This ensures that each element of the tensor is operated on once and only - once. - */ - TensorTypeA* oldA = NULL; - - if (aType == ReadWrite && - THCTensor_maybeOverlappingIndices(state, a)) { - // Must perform in contiguous space - oldA = a; - a = (TensorTypeA*)THCTensor_newContiguous(state, a); - } - - // It is possible that the tensor dimensions are able to be collapsed, - // and thus we can reduce the actual code complexity of the copy by - // exploiting this knowledge statically, since the div/mod is the - // most expensive part of the operation, more so than memory accesses. - // For instance, when copying a non-contiguous to a contiguous tensor - // (or vice versa), the contiguous tensor can be collapsed to one - // dimension, and the loop to translate the linear index to the array - // index can be similarly collapsed. That is what this unrolling is for. -#define HANDLE_CASE(TYPE, A) \ - kernelPointwiseApply1 \ - <<>>( \ - OffsetInfo(aInfo), (TYPE) totalElements, op); \ - C10_CUDA_KERNEL_LAUNCH_CHECK(); - -#define HANDLE_A_CASE(TYPE, A) { \ - switch (A) { \ - case 1: \ - HANDLE_CASE(TYPE, 1); \ - break; \ - case 2: \ - HANDLE_CASE(TYPE, 2); \ - break; \ - default: \ - HANDLE_CASE(TYPE, -1); \ - break; \ - } \ -} - - // Can we use 32-bit integer math in the kernel (the linear ID for the copy - // and the resulting non-linear offset is all computable using 32-bit math?) - // We also use unsigned index math in the kernel, as signed div/mod has - // additional overhead. - if (THCTensor_canUse32BitIndexMath(state, a)) { - TensorInfo aInfo = - getTensorInfo(state, a); - rearrangeDims(&aInfo); - aInfo.collapseDims(); -#if CUDA_VERSION < 9000 - if (!aInfo.isContiguous()) { - grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); - } -#endif - HANDLE_A_CASE(unsigned int, aInfo.dims); - } else { - TensorInfo aInfo = - getTensorInfo(state, a); - rearrangeDims(&aInfo); - aInfo.collapseDims(); - - /* - Only instantiates the all 1D special case and the fallback all nD case for - large (64-bit indexed) tensors to reduce compilation time. - */ - if (aInfo.dims == 1) { - OffsetInfo - aOffset(aInfo); - kernelPointwiseApply1 - <<>>( - aOffset, (uint64_t) totalElements, op); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } else { - -#if CUDA_VERSION < 9000 - grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); -#endif - OffsetInfo - aOffset(aInfo); - kernelPointwiseApply1 - <<>>( - aOffset, (uint64_t) totalElements, op); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - } -#undef HANDLE_CASE -#undef HANDLE_A_CASE - - if (oldA) { - // Ignore overlaps when copying back; if we use THCTensor_copy - // instead, it will recursively try and invoke ourselves to make - // oldA contiguous. - THCTensor_copyIgnoringOverlaps(state, oldA, a); - THCTensor_free(state, a); - a = oldA; - } - - return true; -} - -template -bool THC_pointwiseApply2(THCState* state, - TensorTypeA* a, - TensorTypeB* b, - const Op& op, - TensorArgType aType = ReadWrite, - TensorArgType bType = ReadOnly) { - ptrdiff_t totalElements = THCTensor_nElement(state, a); - if (totalElements != THCTensor_nElement(state, b)) { - return false; - } - - if (THCTensor_nDimensionLegacyAll(state, a) > MAX_CUTORCH_DIMS || - THCTensor_nDimensionLegacyAll(state, b) > MAX_CUTORCH_DIMS) { - return false; - } - - if (THCTensor_nDimensionLegacyAll(state, a) == 0) { - // Zero-dim tensor; do nothing - return true; - } - - const dim3 block = getApplyBlock(); - - dim3 grid; - int curDevice = -1; - cudaGetDevice(&curDevice); - if (!getApplyGrid(state, totalElements, grid, curDevice)) { - return false; - } - - /* - Expands readable/writable tensors whose indices may be "overlapped." - This ensures that each element of the tensor is operated on once and only - once. - */ - TensorTypeA* oldA = NULL; - TensorTypeB* oldB = NULL; - - if (aType == ReadWrite && - THCTensor_maybeOverlappingIndices(state, a)) { - // Must perform in contiguous space - oldA = a; - a = (TensorTypeA*)THCTensor_newContiguous(state, a); - } - if (bType == ReadWrite && - THCTensor_maybeOverlappingIndices(state, b)) { - // Must perform in contiguous space - oldB = b; - b = (TensorTypeB*)THCTensor_newContiguous(state, b); - } - - // It is possible that the tensor dimensions are able to be collapsed, - // and thus we can reduce the actual code complexity of the copy by - // exploiting this knowledge statically, since the div/mod is the - // most expensive part of the operation, more so than memory accesses. - // For instance, when copying a non-contiguous to a contiguous tensor - // (or vice versa), the contiguous tensor can be collapsed to one - // dimension, and the loop to translate the linear index to the array - // index can be similarly collapsed. That is what this unrolling is for. -#define HANDLE_CASE(TYPE, A, B) \ - kernelPointwiseApply2 \ - <<>>( \ - OffsetInfo(aInfo), \ - OffsetInfo(bInfo), \ - (TYPE) totalElements, op); \ - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - -#define HANDLE_B_CASE(TYPE, A, B) { \ - switch (B) { \ - case 1: \ - HANDLE_CASE(TYPE, A, 1); \ - break; \ - case 2: \ - HANDLE_CASE(TYPE, A, 2); \ - break; \ - default: \ - HANDLE_CASE(TYPE, A, -1); \ - break; \ - } \ -} - -#define HANDLE_A_CASE(TYPE, A, B) { \ - switch (A) { \ - case 1: \ - HANDLE_B_CASE(TYPE, 1, B); \ - break; \ - case 2: \ - HANDLE_B_CASE(TYPE, 2, B); \ - break; \ - default: \ - HANDLE_B_CASE(TYPE, -1, B); \ - break; \ - } \ -} - - if (THCTensor_canUse32BitIndexMath(state, a) && - THCTensor_canUse32BitIndexMath(state, b)) { - TensorInfo aInfo = - getTensorInfo(state, a); - - TensorInfo bInfo = - getTensorInfo(state, b); - - rearrangeDims(&aInfo, &bInfo); - aInfo.collapseDims(); - bInfo.collapseDims(); -#if CUDA_VERSION < 9000 - if (!(aInfo.isContiguous() && bInfo.isContiguous())) - grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); -#endif - - HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); - } else { - TensorInfo aInfo = - getTensorInfo(state, a); - - TensorInfo bInfo = - getTensorInfo(state, b); - - rearrangeDims(&aInfo, &bInfo); - aInfo.collapseDims(); - bInfo.collapseDims(); - - /* - Only instantiates the all 1D special case and the fallback all nD case for - large (64-bit indexed) tensors to reduce compilation time. - */ - if (aInfo.dims == 1 && bInfo.dims == 1) { - OffsetInfo - aOffset(aInfo); - OffsetInfo - bOffset(bInfo); - kernelPointwiseApply2 - <<>>( - aOffset, bOffset, (uint64_t) totalElements, op); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } else { -#if CUDA_VERSION < 9000 - grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); -#endif - OffsetInfo - aOffset(aInfo); - OffsetInfo - bOffset(bInfo); - kernelPointwiseApply2 - <<>>( - aOffset, bOffset, (uint64_t) totalElements, op); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - } -#undef HANDLE_CASE -#undef HANDLE_B_CASE -#undef HANDLE_A_CASE - - if (oldA) { - // Ignore overlaps when copying back; if we use THCTensor_copy - // instead, it will recursively try and invoke ourselves to make - // oldA contiguous. - THCTensor_copyIgnoringOverlaps(state, oldA, a); - THCTensor_free(state, a); - a = oldA; - } - - if (oldB) { - // Ignore overlaps when copying back; if we use THCTensor_copy - // instead, it will recursively try and invoke ourselves to make - // oldB contiguous. - THCTensor_copyIgnoringOverlaps(state, oldB, b); - THCTensor_free(state, b); - b = oldB; - } - - return true; -} - -template -bool THC_pointwiseApply3(THCState* state, - TensorTypeA* a, - TensorTypeB* b, - TensorTypeC* c, - const Op& op, - TensorArgType aType = ReadWrite, - TensorArgType bType = ReadOnly, - TensorArgType cType = ReadOnly) { - ptrdiff_t totalElements = THCTensor_nElement(state, a); - - if (totalElements != THCTensor_nElement(state, b) || - totalElements != THCTensor_nElement(state, c)) { - return false; - } - - if (THCTensor_nDimensionLegacyAll(state, a) > MAX_CUTORCH_DIMS || - THCTensor_nDimensionLegacyAll(state, b) > MAX_CUTORCH_DIMS || - THCTensor_nDimensionLegacyAll(state, c) > MAX_CUTORCH_DIMS) { - return false; - } - - if (THCTensor_nDimensionLegacyAll(state, a) == 0) { - // Zero-dim tensor; do nothing - return true; - } - - const dim3 block = getApplyBlock(); - - dim3 grid; - int curDevice = -1; - cudaGetDevice(&curDevice); - if (!getApplyGrid(state, totalElements, grid, curDevice)) { - return false; - } - - /* - Expands readable/writable tensors whose indices may be "overlapped." - This ensures that each element of the tensor is operated on once and only - once. - */ - TensorTypeA* oldA = NULL; - TensorTypeB* oldB = NULL; - TensorTypeC* oldC = NULL; - - if (aType == ReadWrite && - THCTensor_maybeOverlappingIndices(state, a)) { - // Must perform in contiguous space - oldA = a; - a = (TensorTypeA*)THCTensor_newContiguous(state, a); - } - if (bType == ReadWrite && - THCTensor_maybeOverlappingIndices(state, b)) { - // Must perform in contiguous space - oldB = b; - b = (TensorTypeB*)THCTensor_newContiguous(state, b); - } - if (cType == ReadWrite && - THCTensor_maybeOverlappingIndices(state, c)) { - // Must perform in contiguous space - oldC = c; - c = (TensorTypeC*)THCTensor_newContiguous(state, c); - } - -#define HANDLE_CASE(TYPE, A, B, C) \ - kernelPointwiseApply3 \ - <<>>( \ - OffsetInfo \ - (aInfo), \ - OffsetInfo \ - (bInfo), \ - OffsetInfo \ - (cInfo), \ - (TYPE) totalElements, op); \ - C10_CUDA_KERNEL_LAUNCH_CHECK(); - -#define HANDLE_C_CASE(TYPE, A, B, C) { \ - switch (C) { \ - case 1: \ - HANDLE_CASE(TYPE, A, B, 1); \ - break; \ - case 2: \ - HANDLE_CASE(TYPE, A, B, 2); \ - break; \ - default: \ - HANDLE_CASE(TYPE, A, B, -1); \ - break; \ - } \ -} - -#define HANDLE_B_CASE(TYPE, A, B, C) { \ - switch (B) { \ - case 1: \ - HANDLE_C_CASE(TYPE, A, 1, C); \ - break; \ - case 2: \ - HANDLE_C_CASE(TYPE, A, 2, C); \ - break; \ - default: \ - HANDLE_C_CASE(TYPE, A, -1, C); \ - break; \ - } \ -} - -#define HANDLE_A_CASE(TYPE, A, B, C) { \ - switch (A) { \ - case 1: \ - HANDLE_B_CASE(TYPE, 1, B, C); \ - break; \ - case 2: \ - HANDLE_B_CASE(TYPE, 2, B, C); \ - break; \ - default: \ - HANDLE_B_CASE(TYPE, -1, B, C); \ - break; \ - } \ -} - - if (THCTensor_canUse32BitIndexMath(state, a) && - THCTensor_canUse32BitIndexMath(state, b) && - THCTensor_canUse32BitIndexMath(state, c)) { - TensorInfo aInfo = - getTensorInfo(state, a); - - TensorInfo bInfo = - getTensorInfo(state, b); - - TensorInfo cInfo = - getTensorInfo(state, c); - - rearrangeDims(&aInfo, &bInfo, &cInfo); - aInfo.collapseDims(); - bInfo.collapseDims(); - cInfo.collapseDims(); - -#if CUDA_VERSION < 9000 - if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous())) - grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); -#endif - HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims); - } else { - TensorInfo aInfo = - getTensorInfo(state, a); - - TensorInfo bInfo = - getTensorInfo(state, b); - - TensorInfo cInfo = - getTensorInfo(state, c); - - rearrangeDims(&aInfo, &bInfo, &cInfo); - aInfo.collapseDims(); - bInfo.collapseDims(); - cInfo.collapseDims(); - - /* - Only instantiates the all 1D special case and the fallback all nD case for - large (64-bit indexed) tensors to reduce compilation time. - */ - if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1) { - OffsetInfo - aOffset(aInfo); - OffsetInfo - bOffset(bInfo); - OffsetInfo - cOffset(cInfo); - kernelPointwiseApply3 - <<>>( - aOffset, bOffset, cOffset, (uint64_t) totalElements, op); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } else { -#if CUDA_VERSION < 9000 - grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); -#endif - - OffsetInfo - aOffset(aInfo); - OffsetInfo - bOffset(bInfo); - OffsetInfo - cOffset(cInfo); - kernelPointwiseApply3 - <<>>( - aOffset, bOffset, cOffset, (uint64_t) totalElements, op); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - } -#undef HANDLE_CASE -#undef HANDLE_C_CASE -#undef HANDLE_B_CASE -#undef HANDLE_A_CASE - - if (oldA) { - // Ignore overlaps when copying back; if we use THCTensor_copy - // instead, it will recursively try and invoke ourselves to make - // oldA contiguous. - THCTensor_copyIgnoringOverlaps(state, oldA, a); - THCTensor_free(state, a); - a = oldA; - } - - if (oldB) { - // Ignore overlaps when copying back; if we use THCTensor_copy - // instead, it will recursively try and invoke ourselves to make - // oldB contiguous. - THCTensor_copyIgnoringOverlaps(state, oldB, b); - THCTensor_free(state, b); - b = oldB; - } - - if (oldC) { - // Ignore overlaps when copying back; if we use THCTensor_copy - // instead, it will recursively try and invoke ourselves to make - // oldC contiguous. - THCTensor_copyIgnoringOverlaps(state, oldC, c); - THCTensor_free(state, c); - c = oldC; - } - - return true; -} - -#undef THC_APPLY_THREADS_PER_BLOCK -#undef THC_APPLY_BLOCKS_PER_SM - -#endif // THC_APPLY_INC diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu index f4db80dfeb86a..fa1df622aff7c 100644 --- a/aten/src/THC/THCTensorCopy.cu +++ b/aten/src/THC/THCTensorCopy.cu @@ -1,35 +1,6 @@ -#include -#include -#include +#include +#include #include -#include -#include - -// Copy operator for the pointwise apply kernel -template -struct CopyOp { - __device__ __forceinline__ void operator()(T* dst, T* src) { -#if __CUDA_ARCH__ >= 350 - *dst = c10::static_cast_with_inter_type::apply(*src); -#else - *dst = c10::static_cast_with_inter_type::apply(*src); -#endif - } -}; - -template <> -struct CopyOp { - __device__ __forceinline__ void operator()(bool* dst, bool* src) { - *dst = ScalarConvert::to(*src); - } -}; - -template <> -struct CopyOp { - __device__ __forceinline__ void operator()(at::BFloat16* dst, at::BFloat16* src) { - *dst = ScalarConvert::to(*src); - } -}; #include #include diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 3941ef9599206..4301bccc0539b 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -44,22 +44,4 @@ void THCTensor_freeCopyTo(THCState *state, THCTensor *self, THCTensor THCTensor_free(state, self); } -template <> -void THCTensor_copyIgnoringOverlaps(THCState* state, THCTensor* dst, THCTensor* src) { - // Called when we are copying into an overlapping index `dst`, but - // we don't care which writer wins. Hacky but it works. - // This is itself invoked by pointwiseApply2 / THCTensor_copy in - // case that there are write overlaps. - // FIXME: really, overlapping writes should be illegal/an error in Torch - THC_pointwiseApply2( - state, dst, src, - CopyOp(), - ReadOnly, /* ignore overwrites */ - ReadOnly); -} - -void THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { - THCTensor_copyIgnoringOverlaps(state, dst, src); -} - #endif From 1a1fb31cfa3135e56c533da037b5d8dc6981b7fa Mon Sep 17 00:00:00 2001 From: Anirudh Dagar Date: Mon, 6 Sep 2021 23:55:53 -0700 Subject: [PATCH 517/530] Support `torch.concat` alias, add `cat` OpInfo & remove OpInfo test_out skips {cat, stack, hstack, vtack, dstack} (#62560) Summary: Fixes https://github.com/pytorch/pytorch/issues/61767 ## Changes - [x] Add `torch.concat` alias to `torch.cat` - [x] Add OpInfo for `cat`/`concat` - [x] Fix `test_out` skips (Use `at::native::resize_output` or `at::native::resize_output_check`) - [x] `cat`/`concat` - [x] `stack` - [x] `hstack` - [x] `dstack` - [x] `vstack`/`row_stack` - [x] Remove redundant tests for `cat`/`stack` ~I've not added `cat`/`concat` to OpInfo `op_db` yet, since cat is a little more tricky than other OpInfos (should have a lot of tests) and currently there are no OpInfos for that. I can try to add that in a subsequent PR or maybe here itself, whatever is suggested.~ **Edit**: cat/concat OpInfo has been added. **Note**: I've added the named tensor support for `concat` alias as well, maybe that's out of spec in `array-api` but it is still useful for consistency in PyTorch. Thanks to krshrimali for guidance on my first PR :)) cc mruberry rgommers pmeier asmeurer leofang AnirudhDagar asi1024 emcastillo kmaehashi heitorschueroff krshrimali Pull Request resolved: https://github.com/pytorch/pytorch/pull/62560 Reviewed By: saketh-are Differential Revision: D30762069 Pulled By: mruberry fbshipit-source-id: 6985159d1d9756238890488a0ab3ae7699d94337 --- aten/src/ATen/core/aten_interned_strings.h | 2 - aten/src/ATen/core/interned_strings.h | 3 + aten/src/ATen/native/Resize.cpp | 2 +- aten/src/ATen/native/Resize.h | 8 ++- aten/src/ATen/native/TensorShape.cpp | 28 ++++++-- aten/src/ATen/native/cuda/Shape.cu | 6 +- aten/src/ATen/native/native_functions.yaml | 9 +++ docs/source/torch.rst | 1 + test/test_autograd.py | 38 ---------- test/test_fx_experimental.py | 2 +- test/test_tensor_creation_ops.py | 69 ++++++++++++------- torch/_torch_docs.py | 7 ++ torch/csrc/jit/passes/normalize_ops.cpp | 1 + torch/overrides.py | 1 + .../_internal/common_methods_invocations.py | 50 ++++++++++---- 15 files changed, 137 insertions(+), 90 deletions(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index abdf397544468..6da99dfc6a4d9 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -36,7 +36,6 @@ _(aten, _cast_Half) \ _(aten, _cast_Int) \ _(aten, _cast_Long) \ _(aten, _cast_Short) \ -_(aten, _cat) \ _(aten, _ceil) \ _(aten, _clamp_max) \ _(aten, _clamp_min) \ @@ -224,7 +223,6 @@ _(aten, bmm) \ _(aten, broadcast_tensors) \ _(aten, broadcast_to) \ _(aten, cartesian_prod) \ -_(aten, cat) \ _(aten, cauchy) \ _(aten, ceil) \ _(aten, celu) \ diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 69e5f97f7127a..8d49d82c5c8f4 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -306,6 +306,9 @@ namespace c10 { _(aten, bin) \ _(aten, pop) \ _(aten, insert) \ + _(aten, _cat) \ + _(aten, cat) \ + _(aten, concat) \ _(aten, vstack) \ _(aten, row_stack) \ _(prim, unchecked_unwrap_optional) \ diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index f4bff473d2333..1937a8b3d545a 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -8,7 +8,7 @@ namespace at { namespace native { // Returns true if resize is necessary bool resize_output_check(const Tensor& output, IntArrayRef shape) { - // Tests for resizing of tensors with one more elements + // Tests for resizing of tensors with one or more elements if (output.sizes().equals(shape)) { return false; } diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index 5e391a0ce7571..6fb52bc0803ac 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -10,7 +10,10 @@ namespace at { namespace native { // TODO: make all operations that resize given outputs use this function -// for consistency and maintainability +// for consistency and maintainability. +// Some operations like `cat` might not be able to make the use of +// resize_output directly. For more details to understand how it works in `cat`, +// see https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362 // Resizes outputs // Functions accepting output tensors, like with the "out" kwarg, should // call this function to handle resizing their output tensor. @@ -20,6 +23,9 @@ namespace at { namespace native { // Returns a bool saying whether or not the resize actually happened or not TORCH_API bool resize_output(const Tensor& output, IntArrayRef shape); +// Utility for resize_output +// Returns a bool saying resize should happen or not and +// raises a warning if resizing for one or more elements TORCH_API bool resize_output_check(const Tensor& output, IntArrayRef shape); TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes); diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index edbfa2329a02d..8f397862687ba 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -193,7 +192,10 @@ Tensor & _cat_out_cpu(TensorList tensors, int64_t dim, Tensor& result) { result_size[dim] = cat_dim_size; // skip resizing if size of result is same as expected - if (result.sizes() != result_size) { + // raise a warning while resizing if output has one or more elements + // See https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362 + // for understanding why at::native::resize_output is not called directly. + if (at::native::resize_output_check(result, result_size)) { result.resize_(result_size, first_tensor_mem_format); } @@ -301,6 +303,23 @@ Tensor cat(TensorList tensors, Dimname dim) { return at::cat(tensors, dimname_to_position(tensors[0], dim)); } +// torch.concat, alias for torch.cat +Tensor& concat_out(TensorList tensors, Dimname dim, Tensor& result) { + return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim)); +} + +Tensor concat(TensorList tensors, Dimname dim) { + return at::cat(tensors, dimname_to_position(tensors[0], dim)); +} + +Tensor & concat_out(TensorList tensors, int64_t dim, Tensor & result) { + return at::cat_out(result, tensors, dim); +} + +Tensor concat(TensorList tensors, int64_t dim) { + return at::cat(tensors, dim); +} + static bool sizes_match_except(IntArrayRef s1, IntArrayRef s2, int64_t dim_except /* should already be wrapped */) { if (s1.size() != s2.size()) { return false; @@ -1497,9 +1516,8 @@ bool inline maybe_native_stack(Tensor& result, TensorList tensors, int64_t dim) result_sizes.insert(result_sizes.begin() + dim, tensors.size()); // skip resizing if size of result is same as expected - if (result.sizes() != result_sizes) { - result.resize_(result_sizes); - } + // raise a warning while resizing if output has one or more elements + at::native::resize_output(result, result_sizes); stack_serial_stub(kCPU, result, tensors, dim); return true; } diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index dec985447944e..05fa4c6e165c4 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -528,7 +529,10 @@ Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) { size[dimension] = cat_dim_size; // skip resizing if size of result is same as expected - if (out.sizes() != size) { + // raise a warning while resizing if output has one or more elements + // See https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362 + // for understanding why at::native::resize_output is not called directly. + if (at::native::resize_output_check(out, size)) { out.resize_(size, memory_format); } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index ca13e058411a4..3a1f75c588a83 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -974,6 +974,15 @@ - func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) +# alias for torch.cat +- func: concat(Tensor[] tensors, int dim=0) -> Tensor + +- func: concat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + +- func: concat.names(Tensor[] tensors, Dimname dim) -> Tensor + +- func: concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) + - func: block_diag(Tensor[] tensors) -> Tensor variants: function diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 88cbc6986bf31..5aa5dbc9387b4 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -88,6 +88,7 @@ Indexing, Slicing, Joining, Mutating Ops :nosignatures: cat + concat conj chunk dsplit diff --git a/test/test_autograd.py b/test/test_autograd.py index 2da74cbd01938..61a46b439f213 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -2735,36 +2735,6 @@ def test_block_diag(self): lambda a, b, c: torch.block_diag(a, b, c), True, f_args_variable, f_args_tensor) - def test_cat(self): - f_args_variable = (torch.randn(1, S, S, dtype=torch.double, requires_grad=True), - torch.randn(2, S, S, dtype=torch.double, requires_grad=True), - torch.randn(3, S, S, dtype=torch.double, requires_grad=True), - 0) - f_args_tensor = deepcopy(unpack_variables(f_args_variable)) - run_functional_checks(self, "test_cat", "cat", - lambda a, b, c, dim: torch.cat((a, b, c), dim), - True, f_args_variable, f_args_tensor, check_forward_ad=True) - - def test_cat_negdim_1(self): - f_args_variable = (torch.randn(S, S, 1, dtype=torch.double, requires_grad=True), - torch.randn(S, S, 2, dtype=torch.double, requires_grad=True), - torch.randn(S, S, 3, dtype=torch.double, requires_grad=True), - -1) - f_args_tensor = deepcopy(unpack_variables(f_args_variable)) - run_functional_checks(self, "test_cat_negdim_1", "cat", - lambda a, b, c, dim: torch.cat((a, b, c), dim), - True, f_args_variable, f_args_tensor, check_forward_ad=True) - - def test_cat_negdim_2(self): - f_args_variable = (torch.randn(S, 1, S, dtype=torch.double, requires_grad=True), - torch.randn(S, 2, S, dtype=torch.double, requires_grad=True), - torch.randn(S, 3, S, dtype=torch.double, requires_grad=True), - -2) - f_args_tensor = deepcopy(unpack_variables(f_args_variable)) - run_functional_checks(self, "test_cat_negdim_2", "cat", - lambda a, b, c, dim: torch.cat((a, b, c), dim), - True, f_args_variable, f_args_tensor, check_forward_ad=True) - def test_cat_empty_legacy(self): f_args_variable = (torch.randn(0, dtype=torch.double, requires_grad=True), torch.randn(S, S, dtype=torch.double, requires_grad=True)) @@ -2776,14 +2746,6 @@ def test_cat_empty_legacy(self): False, f_args_variable, f_args_tensor, check_forward_ad=True) self.assertTrue(gradcheck(lambda a, b: torch.cat((a, b)), f_args_variable, eps=1e-6, atol=PRECISION)) - def test_cat_empty(self): - f_args_variable = (torch.randn(0, S, dtype=torch.double, requires_grad=True), - torch.randn(S, S, dtype=torch.double, requires_grad=True)) - f_args_tensor = deepcopy(unpack_variables(f_args_variable)) - run_functional_checks(self, "test_cat_empty", "cat", - lambda a, b: torch.cat((a, b)), - True, f_args_variable, f_args_tensor, check_forward_ad=True) - def test_var_mean_differentiable(self): dim = [2, 4] keepdim = False diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index e723ee4622991..fc90f494e3917 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -1497,7 +1497,7 @@ def test_normalize_operator_exhaustive(self, device, dtype, op): return # These ops currently don't trace in FX for various reasons (i.e. they take a list of tensors) - fx_fail = {"stack", "hstack", "vstack", "dstack", "linalg.multi_dot"} + fx_fail = {"cat", "stack", "hstack", "vstack", "dstack", "linalg.multi_dot"} sample_inputs_itr = op.sample_inputs(device, dtype, requires_grad=False) for sample_input in sample_inputs_itr: unsupported_arg_type = False diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index dcb49386c9ff8..a7496919eaeb8 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -695,6 +695,47 @@ def test_cat_preserve_channels_last(self, device): self.assertEqual(res1, res2) self.assertTrue(res1.is_contiguous(memory_format=torch.channels_last)) + @onlyCUDA + def test_cat_out_memory_format(self, device): + inp_size = (4, 4, 4, 4) + expected_size = (8, 4, 4, 4) + a_cuda = torch.randn(inp_size, device=device).contiguous(memory_format=torch.channels_last) + a_cpu = torch.randn(inp_size, device='cpu').contiguous(memory_format=torch.channels_last) + b_cuda = torch.randn(inp_size, device=device).contiguous(memory_format=torch.contiguous_format) + b_cpu = torch.randn(inp_size, device='cpu').contiguous(memory_format=torch.contiguous_format) + c_cuda = torch.randn(inp_size, device=device).contiguous(memory_format=torch.channels_last) + + # Case 1: if out= is the correct shape then the memory format of out= is respected + + out_cuda = torch.empty(expected_size, device=device).contiguous(memory_format=torch.contiguous_format) + res1_cuda = torch.cat((a_cuda, b_cuda), out=out_cuda) + + out_cpu = torch.empty(expected_size, device='cpu').contiguous(memory_format=torch.contiguous_format) + res1_cpu = torch.cat((a_cpu, b_cpu), out=out_cpu) + + self.assertTrue(res1_cuda.is_contiguous(memory_format=torch.contiguous_format)) + self.assertTrue(res1_cpu.is_contiguous(memory_format=torch.contiguous_format)) + + # Case 2: if out= is not the correct shape then the output it is resized internally + # - For the CPU variant the memory format is that of the first tensor + # - For the CUDA variant it only propagates memory format if all the tensors have + # the same memory format, otherwise it just uses contiguous_format as a default + + out_cuda = torch.empty((0), device=device).contiguous(memory_format=torch.contiguous_format) + # a_cuda and b_cuda have different memory_format + res2_cuda = torch.cat((a_cuda, b_cuda), out=out_cuda) + + out_cpu = torch.empty((0), device='cpu').contiguous(memory_format=torch.contiguous_format) + res2_cpu = torch.cat((a_cpu, b_cpu), out=out_cpu) + + self.assertTrue(res2_cuda.is_contiguous(memory_format=torch.contiguous_format)) + self.assertTrue(res2_cpu.is_contiguous(memory_format=torch.channels_last)) + + out_cuda = torch.empty((0), device=device).contiguous(memory_format=torch.contiguous_format) + # a_cuda and c_cuda have same memory_format + res3_cuda = torch.cat((a_cuda, c_cuda), out=out_cuda) + + self.assertTrue(res3_cuda.is_contiguous(memory_format=torch.channels_last)) @onlyCUDA @deviceCountAtLeast(2) @@ -713,8 +754,8 @@ def test_cat_different_devices(self, devices): def test_cat_stack_cross_devices(self, device): cuda = torch.randn((3, 3), device=device) cpu = torch.randn((3, 3), device='cpu') - out_cpu = cpu.clone() - out_cuda = cuda.clone() + + # cat with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): torch.cat((cuda, cpu)) @@ -722,18 +763,6 @@ def test_cat_stack_cross_devices(self, device): "Expected all tensors to be on the same device"): torch.cat((cpu, cuda)) - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): - torch.cat((cpu, cuda), out=out_cuda) - - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): - torch.cat((cpu, cpu), out=out_cuda) - - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): - torch.cat((cuda, cuda), out=out_cpu) - # Stack with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): @@ -742,18 +771,6 @@ def test_cat_stack_cross_devices(self, device): "Expected all tensors to be on the same device"): torch.stack((cpu, cuda)) - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): - torch.stack((cpu, cuda), out=out_cuda) - - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): - torch.stack((cpu, cpu), out=out_cuda) - - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): - torch.stack((cuda, cuda), out=out_cpu) - # TODO: reconcile with other cat tests # TODO: Compare with a NumPy reference instead of CPU @onlyCUDA diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index bbb8d981ab8a2..7dca8a7bdedbd 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1856,6 +1856,13 @@ def merge_dicts(*dicts): -0.5790, 0.1497]]) """.format(**common_args)) +add_docstr(torch.concat, + r""" +concat(tensors, dim=0, *, out=None) -> Tensor + +Alias of :func:`torch.cat`. +""") + add_docstr(torch.ceil, r""" ceil(input, *, out=None) -> Tensor diff --git a/torch/csrc/jit/passes/normalize_ops.cpp b/torch/csrc/jit/passes/normalize_ops.cpp index cc6444e8a9dfd..5ac36e1f1b76f 100644 --- a/torch/csrc/jit/passes/normalize_ops.cpp +++ b/torch/csrc/jit/passes/normalize_ops.cpp @@ -104,6 +104,7 @@ const std::unordered_map& getOperatorAliasMap() { {aten::multiply_, aten::mul_}, {aten::true_divide, aten::div}, {aten::true_divide_, aten::div_}, + {aten::concat, aten::cat}, {aten::row_stack, aten::vstack}, {aten::swapdims, aten::transpose}, {aten::swapdims_, aten::transpose_}, diff --git a/torch/overrides.py b/torch/overrides.py index 64b18b89eb401..aca14a6d4552b 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -360,6 +360,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.bucketize: lambda input, boundaries, out_int32=False, right=False, out=None: -1, torch.cartesian_prod: lambda *tensors: -1, torch.cat: lambda tensors, dim=0, out=None: -1, + torch.concat: lambda tensors, dim=0, out=None: -1, # alias for torch.cat torch.cdist: lambda x1, x2, p=2.0, compute_mode='use_mm_for_euclid_dist_if_necessary': -1, torch.ceil: lambda input, out=None: -1, torch.celu: lambda input, alhpa=1., inplace=False: -1, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 5e009ee7f487e..ace4fa1c63c20 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -2180,6 +2180,25 @@ def sample_inputs_stack(op_info, device, dtype, requires_grad, **kwargs): return (SampleInput(tensors, args=(0,)),) +def sample_inputs_cat_concat(op_info, device, dtype, requires_grad, **kwargs): + make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) + + cases: Tuple[tuple, tuple, dict] = ( # type: ignore[assignment] + ((S, S), (S, S), {'dim': -1}), + ((S, S), (S, S), {'dim': 1}), + ((M, S), (S, S), {'dim': 0}), # different shapes + ((1, 2, 3), (1, 2, 3), {'dim': -2}), + ((0,), (0,), {'dim': 0}), # empty tensor + ((0, S), (S, S), {'dim': 0}), + ((1,), (1,), {}) # dim not passed, fallback to default + ) + + def generator(): + for input_shape1, input_shape2, kwargs in cases: + yield SampleInput([make_arg(input_shape1), make_arg(input_shape2)], kwargs=kwargs) + + return list(generator()) + def sample_inputs_hstack_dstack_vstack(op_info, device, dtype, requires_grad, **kwargs): tensors = [ make_tensor((S, S), device, dtype, requires_grad=requires_grad), @@ -8582,17 +8601,11 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): OpInfo('stack', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_stack, - assert_autodiffed=True, - skips=( - # stack does not correctly warn when resizing out= inputs - SkipInfo('TestCommon', 'test_out'),),), + assert_autodiffed=True), OpInfo('hstack', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_hstack_dstack_vstack, - supports_forward_ad=True, - skips=( - # hstack does not correctly warn when resizing out= inputs - SkipInfo('TestCommon', 'test_out'),),), + supports_forward_ad=True), OpInfo('hypot', dtypes=floating_types(), dtypesIfCPU=floating_types_and(torch.bfloat16), @@ -8609,24 +8622,31 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): # JIT tests don't work with Tensor keyword arguments # https://github.com/pytorch/pytorch/issues/58507 SkipInfo('TestJit', 'test_variant_consistency_jit'),),), + OpInfo('cat', + ref=lambda input_seq, dim=0, **kwargs: np.concatenate(input_seq, axis=dim, **kwargs), + aliases=('concat',), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + sample_inputs_func=sample_inputs_cat_concat, + supports_forward_ad=True, + assert_autodiffed=True, + skips=( + # RuntimeError: Arguments for call not valid. + # Expected a value of type 'List[Tensor]' for argument + # 'tensors' but instead found type 'Tensor (inferred)'. + SkipInfo('TestJit', 'test_jit_alias_remapping'),)), OpInfo('vstack', aliases=('row_stack',), dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_hstack_dstack_vstack, supports_forward_ad=True, skips=( - # vstack does not correctly warn when resizing out= inputs - SkipInfo('TestCommon', 'test_out'), # RuntimeError: _fn() Expected a value of type # 'Tensor (inferred)' for argument 't0' but instead found type 'tuple'. - SkipInfo('TestJit', 'test_jit_alias_remapping'))), + SkipInfo('TestJit', 'test_jit_alias_remapping'),)), OpInfo('dstack', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_hstack_dstack_vstack, - supports_forward_ad=True, - skips=( - # dstack does not correctly warn when resizing out= inputs - SkipInfo('TestCommon', 'test_out'),)), + supports_forward_ad=True), OpInfo('unfold', op=lambda x, *args: x.unfold(*args), dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), From 32fbeb170d57ab6a5af9ca6de23a54a6a910a433 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Tue, 7 Sep 2021 00:04:14 -0700 Subject: [PATCH 518/530] Update error messages that use LAPACK error codes (#63864) Summary: This PR updates the` batchCheckErrors` and `singleCheckErrors` functions so that the error messages are defined only once. `batchCheckErrors` function reuses `singleCheckErrors` now. Fixes https://github.com/pytorch/pytorch/issues/63220, fixes https://github.com/pytorch/pytorch/issues/59779 cc jianyuh nikitaved pearu mruberry heitorschueroff walterddr IvanYashchuk xwang233 Lezcano Pull Request resolved: https://github.com/pytorch/pytorch/pull/63864 Reviewed By: ngimel Differential Revision: D30672933 Pulled By: mruberry fbshipit-source-id: 0ba37ff98ef278efdb12c3890aa07d687047da7a --- aten/src/ATen/native/BatchLinearAlgebra.cpp | 10 +-- aten/src/ATen/native/LinearAlgebraUtils.h | 84 +++++++++++---------- test/test_linalg.py | 40 ++++++---- 3 files changed, 71 insertions(+), 63 deletions(-) diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index d80f9184567b1..498b51b38187c 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -1549,6 +1549,8 @@ Tensor cholesky_inverse(const Tensor &input, bool upper) { DEFINE_DISPATCH(lu_stub); +// TODO: remove check_errors argument +// https://github.com/pytorch/pytorch/issues/64014 std::tuple _lu_with_info(const Tensor& self, bool compute_pivots, bool check_errors) { TORCH_CHECK(self.dim() >= 2, "expected tensor with 2 or more dimensions, got size: ", self.sizes(), @@ -1566,14 +1568,6 @@ std::tuple _lu_with_info(const Tensor& self, bool comput // 'lu' tensor is modified in-place and must be a copy of 'self' Tensor lu = cloneBatchedColumnMajor(self); lu_stub(self.device().type(), lu, pivots_tensor, infos_tensor, compute_pivots); - - if (check_errors) { - if (self.dim() > 2) { - batchCheckErrors(infos_tensor, "lu", /*allow_singular=*/true); - } else { - singleCheckErrors(infos_tensor.item(), "lu", /*allow_singular=*/true); - } - } return std::make_tuple(lu, pivots_tensor, infos_tensor); } diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 19e41d7a8e815..abbf82ceb148c 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -213,62 +213,66 @@ static inline void squareCheckInputs(const Tensor& self) { "but they are ", self.size(-1), " by ", self.size(-2), " matrices"); } +/* + * Given a info int, obtained after a single operation, this function check if the computation + * has been successful (info = 0) or not, and report in case of the latter. + */ +static inline void singleCheckErrors(int64_t info, const char* name, int64_t batch_id=-1) { + std::string batch_string{""}; + if (batch_id >= 0) { + batch_string = ": (Batch element " + std::to_string(batch_id) + ")"; + } + if (info < 0) { + TORCH_INTERNAL_ASSERT(false, name, batch_string, + ": Argument ", -info, " has illegal value. Most certainly there is a bug in the implementation calling the backend library."); + } else if (info > 0) { + if (strstr(name, "inv")) { + // inv, inverse, cholesky_inverse, etc. + TORCH_CHECK(false, name, batch_string, + ": The diagonal element ", info, " is zero, the inversion could not be completed because the input matrix is singular."); + } else if (strstr(name, "solve")) { + // solve, linalg_solve, cholesky_solve, etc. + TORCH_CHECK(false, name, batch_string, + ": The diagonal element ", info, " is zero, the solve could not be completed because the input matrix is singular."); + } else if (strstr(name, "cholesky")) { + TORCH_CHECK(false, name, batch_string, + ": The factorization could not be completed because the input is not positive-definite (the leading minor of order ", info, " is not positive-definite)."); + } else if (strstr(name, "svd")) { + TORCH_CHECK(false, name, batch_string, + ": The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated singular values (error code: ", info, ")."); + } else if (strstr(name, "eig") || strstr(name, "syevd")) { + TORCH_CHECK(false, name, batch_string, + ": The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated eigenvalues (error code: ", info, ")."); + } else if (strstr(name, "lstsq")) { + TORCH_CHECK(false, name, batch_string, + ": The least squares solution could not be computed because the input matrix does not have full rank (error code: ", info, ")."); + } else { + TORCH_INTERNAL_ASSERT(false, name, ": Unknown error code: ", info, "."); + } + } +} + /* * Given a vector of int64_t infos, obtained after a batch operations, * this function checks if the computation over all these batches has been * successful (info = 0) or not, and report in case of the latter. */ -static inline void batchCheckErrors(std::vector& infos, const char* name, bool allow_singular=false) { +static inline void batchCheckErrors(const std::vector& infos, const char* name) { for (size_t i = 0; i < infos.size(); i++) { auto info = infos[i]; - if (info < 0) { - AT_ERROR(name, ": For batch ", i, ": Argument ", -info, " has illegal value"); - } else if (info > 0) { - if (strstr(name, "svd")) { - AT_ERROR(name, ": the updating process of SBDSDC did not converge (error: ", info, ")"); - } else if (strstr(name, "symeig") || strstr(name, "syevd")) { - AT_ERROR(name, ": For batch ", i, ": the algorithm failed to converge; ", info, - " off-diagonal elements of an intermediate tridiagonal form did not converge to zero."); - } else if (!allow_singular) { - AT_ERROR(name, ": For batch ", i, ": U(", info, ",", info, ") is zero, singular U."); - } - } + singleCheckErrors(info, name, i); } } /* * This is an overloaded case of the previous function for a tensor of infos. */ -static inline void batchCheckErrors(const Tensor& infos, const char* name, bool allow_singular=false, int info_per_batch=1) { - auto batch_size = infos.numel(); +static inline void batchCheckErrors(const Tensor& infos, const char* name) { auto infos_cpu = infos.to(at::kCPU); auto infos_data = infos_cpu.data_ptr(); - for (int64_t i = 0; i < batch_size; i++) { + for (int64_t i = 0; i < infos.numel(); i++) { auto info = infos_data[i]; - if (info < 0) { - AT_ERROR(name, ": For batch ", i/info_per_batch, ": Argument ", -info, " has illegal value"); - } else if (!allow_singular && info > 0) { - AT_ERROR(name, ": For batch ", i/info_per_batch, ": U(", info, ",", info, ") is zero, singular U."); - } - } -} - -/* - * Given a info int, obtained after a single operation, this function check if the computation - * has been successful (info = 0) or not, and report in case of the latter. - */ -static inline void singleCheckErrors(int64_t info, const char* name, bool allow_singular=false) { - if (info < 0) { - AT_ERROR(name, ": Argument ", -info, " has illegal value"); - } else if (info > 0) { - if (strstr(name, "svd")) { - AT_ERROR(name, ": the updating process of SBDSDC did not converge (error: ", info, ")"); - } else if (strstr(name, "eig")) { // this catches both "eig" and "symeig" - AT_ERROR(name, ": the algorithm failed to converge; ", info, - " off-diagonal elements of an intermediate tridiagonal form did not converge to zero."); - } else if (!allow_singular) { - AT_ERROR(name, ": U(", info, ",", info, ") is zero, singular U."); - } + singleCheckErrors(info, name, i); } } diff --git a/test/test_linalg.py b/test/test_linalg.py index fbd219b3c5981..96da8d559ff31 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -483,10 +483,10 @@ def test_cholesky_errors_and_warnings(self, device, dtype): r'1-dimensional array given\. Array must be at least two-dimensional'): np.linalg.cholesky(A.cpu().numpy()) - # if the input matrix is singular, an error should be raised + # if the input matrix is not positive definite, an error should be raised A = torch.eye(3, 3, dtype=dtype, device=device) - A[-1, -1] = 0 # Now A is singular - with self.assertRaisesRegex(RuntimeError, r'U\(3,3\) is zero, singular U\.'): + A[-1, -1] = 0 # Now A is not positive definite + with self.assertRaisesRegex(RuntimeError, r'minor of order 3 is not positive-definite'): torch.linalg.cholesky(A) with self.assertRaisesRegex(np.linalg.LinAlgError, r'Matrix is not positive definite'): np.linalg.cholesky(A.cpu().numpy()) @@ -495,8 +495,8 @@ def test_cholesky_errors_and_warnings(self, device, dtype): A = torch.eye(3, 3, dtype=dtype, device=device) A = A.reshape((1, 3, 3)) A = A.repeat(5, 1, 1) - A[4, -1, -1] = 0 # Now A[4] is singular - with self.assertRaisesRegex(RuntimeError, r'For batch 4: U\(3,3\) is zero, singular U\.'): + A[4, -1, -1] = 0 # Now A[4] is not positive definite + with self.assertRaisesRegex(RuntimeError, r'\(Batch element 4\): The factorization could not be completed'): torch.linalg.cholesky(A) # if out tensor with wrong shape is passed a warning is given @@ -674,7 +674,7 @@ def test_cholesky_ex_non_pd(self, device, dtype): A[-1, -1] = 0 # Now A is singular _, info = torch.linalg.cholesky_ex(A) self.assertEqual(info, 3) - with self.assertRaisesRegex(RuntimeError, r'U\(3,3\) is zero, singular U\.'): + with self.assertRaisesRegex(RuntimeError, r'minor of order 3 is not positive-definite'): torch.linalg.cholesky_ex(A, check_errors=True) # if at least one matrix in the batch is not positive definite, @@ -688,7 +688,7 @@ def test_cholesky_ex_non_pd(self, device, dtype): expected_info = torch.zeros(A.shape[:-2], dtype=torch.int32, device=device) expected_info[3] = 2 self.assertEqual(info, expected_info) - with self.assertRaisesRegex(RuntimeError, r'For batch 3: U\(2,2\) is zero, singular U\.'): + with self.assertRaisesRegex(RuntimeError, r'\(Batch element 3\): The factorization could not be completed'): torch.linalg.cholesky_ex(A, check_errors=True) @skipCUDAIfNoMagmaAndNoCusolver @@ -2892,6 +2892,16 @@ def test_svd_errors_and_warnings(self, device, dtype): # error from out_v svd(a, out=(out_u, out_s, out_v)) + # if input contains NaN then an error is triggered for svd + a = torch.full((3, 3), float('nan'), dtype=dtype, device=device) + a[0] = float('nan') + with self.assertRaisesRegex(RuntimeError, "The algorithm failed to converge"): + svd(a) + a = torch.randn(3, 33, 33, dtype=dtype, device=device) + a[1, 0, 0] = float('nan') + with self.assertRaisesRegex(RuntimeError, r"\(Batch element 1\): The algorithm failed to converge"): + svd(a) + @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @dtypes(*floating_and_complex_types()) @@ -3237,7 +3247,7 @@ def test_inv_ex_singular(self, device, dtype): A[-1, -1] = 0 # Now A is singular info = torch.linalg.inv_ex(A).info self.assertEqual(info, 3) - with self.assertRaisesRegex(RuntimeError, r'U\(3,3\) is zero, singular U\.'): + with self.assertRaisesRegex(RuntimeError, r'diagonal element 3 is zero, the inversion could not be completed'): torch.linalg.inv_ex(A, check_errors=True) # if at least one matrix in the batch is not positive definite, @@ -3251,7 +3261,7 @@ def test_inv_ex_singular(self, device, dtype): expected_info = torch.zeros(A.shape[:-2], dtype=torch.int32, device=device) expected_info[3] = 2 self.assertEqual(info, expected_info) - with self.assertRaisesRegex(RuntimeError, r'For batch 3: U\(2,2\) is zero, singular U\.'): + with self.assertRaisesRegex(RuntimeError, r'\(Batch element 3\): The diagonal element 2 is zero'): torch.linalg.inv_ex(A, check_errors=True) @slowTest @@ -3289,7 +3299,7 @@ def test_inverse_errors(self, device, dtype): def run_test_singular_input(batch_dim, n): x = torch.eye(3, 3, dtype=dtype, device=device).reshape((1, 3, 3)).repeat(batch_dim, 1, 1) x[n, -1, -1] = 0 - with self.assertRaisesRegex(RuntimeError, rf'For batch {n}: U\(3,3\) is zero'): + with self.assertRaisesRegex(RuntimeError, rf'\(Batch element {n}\): The diagonal element 3 is zero'): torch.inverse(x) for params in [(1, 0), (2, 0), (2, 1), (4, 0), (4, 2), (10, 2)]: @@ -3306,7 +3316,7 @@ def test_inverse_errors_large(self, device, dtype): x = torch.empty((8, 10, 616, 616), dtype=dtype, device=device) x[:] = torch.eye(616, dtype=dtype, device=device) x[..., 10, 10] = 0 - with self.assertRaisesRegex(RuntimeError, r'For batch 0: U\(11,11\) is zero'): + with self.assertRaisesRegex(RuntimeError, r'\(Batch element 0\): The diagonal element 11 is zero'): torch.inverse(x) @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, torch.float64: 1e-7, torch.complex128: 1e-7}) @@ -3428,7 +3438,7 @@ def test_inv_errors_and_warnings(self, device, dtype): def run_test_singular_input(batch_dim, n): a = torch.eye(3, 3, dtype=dtype, device=device).reshape((1, 3, 3)).repeat(batch_dim, 1, 1) a[n, -1, -1] = 0 - with self.assertRaisesRegex(RuntimeError, rf"For batch {n}: U\(3,3\) is zero"): + with self.assertRaisesRegex(RuntimeError, rf"\(Batch element {n}\): The diagonal element 3 is zero"): torch.linalg.inv(a) for params in [(1, 0), (2, 0), (2, 1), (4, 0), (4, 2), (10, 2)]: @@ -3559,7 +3569,7 @@ def run_test_singular_input(batch_dim, n): a = torch.eye(3, 3, dtype=dtype, device=device).reshape((1, 3, 3)).repeat(batch_dim, 1, 1) a[n, -1, -1] = 0 b = torch.randn(batch_dim, 3, 1, dtype=dtype, device=device) - with self.assertRaisesRegex(RuntimeError, rf'For batch {n}: U\(3,3\) is zero'): + with self.assertRaisesRegex(RuntimeError, rf'\(Batch element {n}\): The diagonal element 3 is zero'): torch.linalg.solve(a, b) for params in [(1, 0), (2, 0), (2, 1), (4, 0), (4, 2), (10, 2)]: @@ -4912,7 +4922,7 @@ def test_triangular_solve_singular(self, device, dtype): b = torch.rand(3, 1, dtype=dtype, device=device) A = torch.eye(3, 3, dtype=dtype, device=device) A[-1, -1] = 0 # Now A is singular - err_str = r"triangular_solve: U\(3,3\) is zero, singular U\." + err_str = r"triangular_solve: The diagonal element 3 is zero" with self.assertRaisesRegex(RuntimeError, err_str): torch.triangular_solve(b, A) @@ -7285,7 +7295,7 @@ def test_cholesky_inverse_errors_and_warnings(self, device, dtype): a = torch.randn(3, 3, device=device, dtype=dtype) a[1, 1] = 0 if self.device_type == 'cpu': - with self.assertRaisesRegex(RuntimeError, r"cholesky_inverse: U\(2,2\) is zero, singular U\."): + with self.assertRaisesRegex(RuntimeError, r"cholesky_inverse: The diagonal element 2 is zero"): torch.cholesky_inverse(a) # cholesky_inverse on GPU does not raise an error for this case elif self.device_type == 'cuda': From adbcc819cd40deaa2755383815896d8c9dffb881 Mon Sep 17 00:00:00 2001 From: Kefei Lu Date: Tue, 7 Sep 2021 04:00:49 -0700 Subject: [PATCH 519/530] Fix fx2trt SplitterBase non_tensor_input logic (#64286) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64286 During graph splitting, `_SplitterBase` supports taking into consideration whether the subnet boundary nodes produces "supported" outputs that will cross the acc/non-acc boundary. Specifically, if the backend only supports Tensor-based data passing cross boundary, then we cannot split the graph at a place where the node output is a non-Tensor type (e.g., `Tuple[Tensor]`). There's currently a bug in this logic that it does not correctly detect the output type of a Node. Instead of using `Node.meta['tensor_meta']`, we should instead check `Node.meta['type']`. `Node.meta['tensor_meta']` is not appropriate because this key will exist if the node output is an iterable and one of the element is of type `Tensor`. So `Tuple[Tensor]` will be wrongly considered "supported". Test Plan: arc lint run CI tests Reviewed By: yinghai, 842974287 Differential Revision: D30617147 fbshipit-source-id: e8ba70dfaddc05cafb8037d58fca73b7ccbb1a49 --- torch/fx/passes/splitter_base.py | 15 ++++++++++----- torch/fx/passes/tools_common.py | 11 +++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/torch/fx/passes/splitter_base.py b/torch/fx/passes/splitter_base.py index 42087bde9ef89..65419055dad82 100644 --- a/torch/fx/passes/splitter_base.py +++ b/torch/fx/passes/splitter_base.py @@ -2,6 +2,7 @@ from collections import defaultdict from dataclasses import dataclass from typing import List, Dict, Optional, Tuple +import logging import torch from torch.fx.experimental.graph_manipulation import get_size_of_node @@ -20,8 +21,12 @@ Tensors, NodeList, NodeSet, + is_node_output_tensor, ) +_LOGGER = logging.getLogger(__name__) + + class _SplitterSettingBase: def __init__(self): parser = argparse.ArgumentParser() @@ -98,7 +103,7 @@ def reduce_acc_nodes_non_tensor_input_helper( for user in node.users: if user in self.acc_nodes: self.acc_nodes.remove(user) - if "tensor_meta" not in user.meta: + if not is_node_output_tensor(user): cpu_worklist.append(user) def reduce_acc_nodes_non_tensor_input(self): @@ -113,7 +118,7 @@ def reduce_acc_nodes_non_tensor_input(self): continue if node in self.acc_nodes: continue - if "tensor_meta" in node.meta: + if is_node_output_tensor(node): continue non_tensor_cpu_nodes.append(node) @@ -128,7 +133,7 @@ def reduce_acc_nodes_non_tensor_output(self): new_cpu_nodes: NodeList = [] for acc_node in self.acc_nodes: - if "tensor_meta" in acc_node.meta: + if is_node_output_tensor(acc_node): continue for user in acc_node.users: if user not in self.acc_nodes: @@ -461,7 +466,7 @@ def get_inputs(self, inputs): reports += "Checking inputs...\n" for n in submod.graph.nodes: if n.op == "placeholder": - if "tensor_meta" not in n.meta: + if not is_node_output_tensor(n): reports += f"Input {n.name} is not a tensor, this might cause problems during lowering!\n" else: total_input_bytes += get_size_of_node(submod, n)[0] @@ -473,7 +478,7 @@ def get_inputs(self, inputs): def get_bytes(node: torch.fx.Node): nonlocal total_output_bytes nonlocal reports - if "tensor_meta" not in node.meta: + if not is_node_output_tensor(node): reports += f"Output {node.name} is not a tensor, this might cause problems during lowering!\n" else: total_output_bytes += get_size_of_node(submod, node)[0] diff --git a/torch/fx/passes/tools_common.py b/torch/fx/passes/tools_common.py index a996dc8b36521..8274f4bf3b625 100644 --- a/torch/fx/passes/tools_common.py +++ b/torch/fx/passes/tools_common.py @@ -48,6 +48,17 @@ def get_node_target(submodules: Dict[str, torch.nn.Module], node: torch.fx.Node) return node.target +def is_node_output_tensor(node: torch.fx.Node) -> bool: + """Checks if the node output produces a Tensor or not. + + NOTE: This requires to run `ShapeProp` on the containing fx graph before + calling this function. This is because it works by checking the `type` + metadata on the node. This metadata is produced by the `ShapeProp`. + """ + type_ = node.meta.get("type", None) + return type_ is not None and issubclass(type_, torch.Tensor) + + class FxNetAccFusionsFinder: """ Finds groups of connected ACC nodes that pass non-tensor data between each other. From 75b9e4a128496e9b2563078f62a2903f65a6d145 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Tue, 7 Sep 2021 08:04:50 -0700 Subject: [PATCH 520/530] [JIT] Freeze unrolls constant loops (#63614) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63614 There are a number of optimizations (`RemoveListMutation` in particular) that are tied to loop unrolling in `runOptimizations`. However, these were not invoked from `freeze_module` since the freezing pass should be idempotent. This diff makes `runOptimizations` run `UnrollConstantLoops` instead of `UnrollLoops`. `freeze_module` is then able to run these optimizations. Test Plan: Observed that `freeze_module` applies `RemoveListMutation` Reviewed By: eellison Differential Revision: D30437356 fbshipit-source-id: cba04bd958a48ad51b151aa3264f3d5bbb1fc2a4 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 2 +- test/onnx/test_utility_funs.py | 11 ++++++----- torch/csrc/jit/passes/freeze_module.cpp | 5 +++-- torch/csrc/jit/runtime/graph_executor.cpp | 14 +++++++++++--- torch/csrc/jit/runtime/graph_executor_impl.h | 2 +- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index ffeef00cc9ac6..54a116b57cb1d 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -1506,7 +1506,7 @@ def list_append(boxes: List[torch.Tensor]): class Min(torch.nn.Module): def forward(self, x): - boxes = [x, x, x] + boxes = [x for _ in range(3)] return list_append(boxes) x = torch.rand(5, 5) diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index 02da90dd3066e..b87fa06d648a4 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -104,19 +104,20 @@ def forward(self, x, y, t): def test_output_list(self): class PaddingLayer(torch.jit.ScriptModule): @torch.jit.script_method - def forward(self, input_t): - # type: (Tensor) -> Tensor - for i in range(2): + def forward(self, input_t, n): + # type: (Tensor, int) -> Tensor + for i in range(n): input_t = input_t * 2 return input_t input_t = torch.ones(size=[10], dtype=torch.long) + n = 2 model = torch.jit.script(PaddingLayer()) - example_output = model(input_t) + example_output = model(input_t, n) with self.assertRaises(RuntimeError): torch.onnx.export(model, - (input_t, ), + (input_t, n), "test.onnx", opset_version=self.opset_version, example_outputs=[example_output]) diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index df1c64bcc4740..0debc97ac8241 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -90,8 +90,9 @@ class AttributePropagator { }; auto applyOptimizations = [](std::shared_ptr& subgraph) { runOptimization( - subgraph, /* unroll? */ false, /* const_prop_user_classes? */ false); - RemoveListMutation(subgraph); + subgraph, + /* unroll_non_constant_loops? */ false, + /* const_prop_user_classes? */ false); LowerSimpleTuples(subgraph); }; diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp index 0187988680e80..39742c7815d3b 100644 --- a/torch/csrc/jit/runtime/graph_executor.cpp +++ b/torch/csrc/jit/runtime/graph_executor.cpp @@ -908,7 +908,7 @@ void runNondiffOptimization( void runOptimization( std::shared_ptr& graph, - bool unroll, + bool unroll_non_constant_loops, bool const_prop_user_classes) { // Basic graph preprocessing to eliminate noise. GRAPH_DEBUG( @@ -935,9 +935,17 @@ void runOptimization( // Unroll small loops, and eliminate expressions that are the same at every // iteration. - if (unroll) { - UnrollLoops(graph); + bool unroll_success = false; + if (unroll_non_constant_loops) { + unroll_success = UnrollLoops(graph); GRAPH_DEBUG("After UnrollLoops, before RemoveListMutation\n", *graph); + } else { + unroll_success = UnrollConstantLoops(graph); + GRAPH_DEBUG( + "After UnrollConstantLoops, before RemoveListMutation\n", *graph); + } + + if (unroll_success) { // run again with unrolled loops RemoveListMutation(graph); GRAPH_DEBUG("After RemoveListMutation, before PeepholeOptimize\n", *graph); diff --git a/torch/csrc/jit/runtime/graph_executor_impl.h b/torch/csrc/jit/runtime/graph_executor_impl.h index 516ad1f55c812..3815d26c87f4d 100644 --- a/torch/csrc/jit/runtime/graph_executor_impl.h +++ b/torch/csrc/jit/runtime/graph_executor_impl.h @@ -33,7 +33,7 @@ void packGradient(const Gradient& gradient, Node* dnode); bool needsGradient(const std::shared_ptr& graph); void runOptimization( std::shared_ptr& graph, - bool unroll = true, + bool unroll_non_constant_loops = true, bool const_prop_user_classes = true); void runNondiffOptimization( std::shared_ptr& graph, From f767cf668395baf29ca7c9f1fa80f0abed8c53c7 Mon Sep 17 00:00:00 2001 From: Ilqar Ramazanli Date: Tue, 7 Sep 2021 08:41:09 -0700 Subject: [PATCH 521/530] To change WarmUp Scheduler with ConstantLR and LinearLR (#64395) Summary: Partially unblocks https://github.com/pytorch/vision/issues/4281 Previously we have added WarmUp Schedulers to PyTorch Core in the PR : https://github.com/pytorch/pytorch/pull/60836 which had two mode of execution - linear and constant depending on warming up function. In this PR we are changing this interface to more direct form, as separating linear and constant modes to separate Schedulers. In particular ```Python scheduler1 = WarmUpLR(optimizer, warmup_factor=0.1, warmup_iters=5, warmup_method="constant") scheduler2 = WarmUpLR(optimizer, warmup_factor=0.1, warmup_iters=5, warmup_method="linear") ``` will look like ```Python scheduler1 = ConstantLR(optimizer, warmup_factor=0.1, warmup_iters=5) scheduler2 = LinearLR(optimizer, warmup_factor=0.1, warmup_iters=5) ``` correspondingly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64395 Reviewed By: datumbox Differential Revision: D30753688 Pulled By: iramazanli fbshipit-source-id: e47f86d12033f80982ddf1faf5b46873adb4f324 --- docs/source/optim.rst | 3 +- test/test_optim.py | 117 ++++++++++++++++--------------- torch/optim/lr_scheduler.py | 129 ++++++++++++++++++++++++----------- torch/optim/lr_scheduler.pyi | 7 +- 4 files changed, 156 insertions(+), 100 deletions(-) diff --git a/docs/source/optim.rst b/docs/source/optim.rst index 2ded57ff87a1b..695f0a2a03f6d 100644 --- a/docs/source/optim.rst +++ b/docs/source/optim.rst @@ -210,7 +210,8 @@ algorithms. lr_scheduler.MultiplicativeLR lr_scheduler.StepLR lr_scheduler.MultiStepLR - lr_scheduler.WarmUpLR + lr_scheduler.ConstantLR + lr_scheduler.LinearLR lr_scheduler.ExponentialLR lr_scheduler.CosineAnnealingLR lr_scheduler.ReduceLROnPlateau diff --git a/test/test_optim.py b/test/test_optim.py index fe282ef33b4de..d69e9351d33a0 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -12,7 +12,7 @@ from torch.autograd import Variable from torch import sparse from torch.optim.lr_scheduler import LambdaLR, MultiplicativeLR, StepLR, \ - MultiStepLR, WarmUpLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, \ + MultiStepLR, ConstantLR, LinearLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, \ _LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR, ChainedScheduler from torch.optim.swa_utils import AveragedModel, SWALR, update_bn from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests, \ @@ -274,16 +274,16 @@ def test_sgd(self): ) self._test_basic_cases( lambda weight, bias: optimizer([weight, bias], lr=1e-3), - [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="linear")] + [lambda opt: LinearLR(opt, start_factor=0.4, end_factor=0.8, total_iters=4)] ) self._test_basic_cases( lambda weight, bias: optimizer([weight, bias], lr=1e-3), - [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant")] + [lambda opt: ConstantLR(opt, factor=0.4, total_iters=4)] ) self._test_basic_cases( lambda weight, bias: optimizer([weight, bias], lr=1e-3), [lambda opt: StepLR(opt, gamma=0.9, step_size=10), - lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4)] + lambda opt: LinearLR(opt, start_factor=0.4, end_factor=0.6, total_iters=4)] ) self._test_basic_cases( lambda weight, bias: optimizer([weight, bias], lr=1e-3), @@ -430,17 +430,17 @@ def test_adam(self): lambda weight, bias: optimizer( self._build_params_dict(weight, bias, lr=1e-2), lr=1e-3), - [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="linear")] + [lambda opt: LinearLR(opt, start_factor=0.4, total_iters=4)] ) self._test_basic_cases( lambda weight, bias: optimizer( self._build_params_dict(weight, bias, lr=1e-2), lr=1e-3), - [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant")] + [lambda opt: ConstantLR(opt, factor=0.4, total_iters=4)] ) self._test_basic_cases( lambda weight, bias: optimizer([weight, bias], lr=1e-3, amsgrad=True), - [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant"), + [lambda opt: ConstantLR(opt, factor=0.4, total_iters=4), lambda opt: ExponentialLR(opt, gamma=0.9)] ) self._test_basic_cases( @@ -992,12 +992,12 @@ def test_exponential_lr_is_constant_for_constant_epoch(self): scheduler = ExponentialLR(self.opt, gamma=0.9) self._test_lr_is_constant_for_constant_epoch(scheduler) - def test_constant_warmup_lr_is_constant_for_constant_epoch(self): - scheduler = WarmUpLR(self.opt, warmup_method="constant") + def test_constantlr_is_constant_for_constant_epoch(self): + scheduler = ConstantLR(self.opt) self._test_lr_is_constant_for_constant_epoch(scheduler) - def test_linear_warmup_lr_is_constant_for_constant_epoch(self): - scheduler = WarmUpLR(self.opt, warmup_method="linear") + def test_linear_linearlr_is_constant_for_constant_epoch(self): + scheduler = LinearLR(self.opt) self._test_lr_is_constant_for_constant_epoch(scheduler) def test_step_lr(self): @@ -1051,76 +1051,78 @@ def test_multi_step_lr_with_epoch(self): scheduler = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]) self._test_with_epoch(scheduler, targets, epochs) - def test__get_last_lr_constant_warmup_lr(self): + def test_get_last_lr_constantlr(self): # lr = 0.025 if epoch < 5 # lr = 0.005 if 5 <= epoch epochs = 10 single_targets = [0.025] * 5 + [0.05] * 5 targets = [single_targets, [x * epochs for x in single_targets]] - scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 2, warmup_iters=5, warmup_method="constant") + scheduler = ConstantLR(self.opt, factor=1.0 / 2, total_iters=5) self._test_get_last_lr(scheduler, targets, epochs) - def test__get_last_lr_linear_warmup_lr(self): + def test_get_last_lr_linearlr(self): # lr = 0.025 if epoch == 0 # lr = 0.03125 if epoch == 1 # lr = 0.0375 if epoch == 2 # lr = 0.04375 if epoch == 3 # lr = 0.005 if 4 <= epoch epochs = 10 - factor = 1.0 / 2 + start_factor = 1.0 / 4 + end_factor = 3. / 5 iters = 4 - interpolation = [factor + i * (1 - factor) / iters for i in range(iters)] - single_targets = [x * 0.05 for x in interpolation] + [0.05] * (epochs - iters) + interpolation = [start_factor + i * (end_factor - start_factor) / iters for i in range(iters)] + single_targets = [x * 0.05 for x in interpolation] + [0.05 * end_factor] * (epochs - iters) targets = [single_targets, [x * epochs for x in single_targets]] - scheduler = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear") + scheduler = LinearLR(self.opt, start_factor=start_factor, end_factor=end_factor, total_iters=iters) self._test_get_last_lr(scheduler, targets, epochs) - def test__constant_warmup_lr(self): + def test_constantlr(self): # lr = 0.025 if epoch < 5 # lr = 0.005 if 5 <= epoch epochs = 10 single_targets = [0.025] * 5 + [0.05] * 5 targets = [single_targets, [x * epochs for x in single_targets]] - scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 2, warmup_iters=5, warmup_method="constant") + scheduler = ConstantLR(self.opt, factor=1.0 / 2, total_iters=5) self._test(scheduler, targets, epochs) - def test__linear_warmup_lr(self): + def test_linearlr(self): # lr = 0.025 if epoch == 0 # lr = 0.03125 if epoch == 1 # lr = 0.0375 if epoch == 2 # lr = 0.04375 if epoch == 3 # lr = 0.005 if 4 <= epoch epochs = 10 - factor = 1.0 / 2 + start_factor = 1.0 / 2 iters = 4 - interpolation = [factor + i * (1 - factor) / iters for i in range(iters)] + interpolation = [start_factor + i * (1 - start_factor) / iters for i in range(iters)] single_targets = [x * 0.05 for x in interpolation] + [0.05] * (epochs - iters) targets = [single_targets, [x * epochs for x in single_targets]] - scheduler = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear") + scheduler = LinearLR(self.opt, start_factor=start_factor, total_iters=iters) self._test(scheduler, targets, epochs) - def test_constant_warmup_with_epoch(self): + def test_constantlr_with_epoch(self): # lr = 0.025 if epoch < 5 # lr = 0.005 if 5 <= epoch epochs = 10 single_targets = [0.025] * 5 + [0.05] * 5 targets = [single_targets, [x * epochs for x in single_targets]] - scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 2, warmup_iters=5, warmup_method="constant") + scheduler = ConstantLR(self.opt, factor=1.0 / 2, total_iters=5) self._test_with_epoch(scheduler, targets, epochs) - def test_linear_warmup_with_epoch(self): + def test_linearlr_with_epoch(self): # lr = 0.025 if epoch == 0 # lr = 0.03125 if epoch == 1 # lr = 0.0375 if epoch == 2 # lr = 0.04375 if epoch == 3 # lr = 0.005 if 4 <= epoch epochs = 10 - factor = 1.0 / 2 + start_factor = 1.0 / 2 + end_factor = 1. iters = 4 - interpolation = [factor + i * (1 - factor) / iters for i in range(iters)] + interpolation = [start_factor + i * (end_factor - start_factor) / iters for i in range(iters)] single_targets = [x * 0.05 for x in interpolation] + [0.05] * (epochs - iters) targets = [single_targets, [x * epochs for x in single_targets]] - scheduler = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear") + scheduler = LinearLR(self.opt, start_factor=start_factor, total_iters=iters) self._test_with_epoch(scheduler, targets, epochs) def test_exp_lr(self): @@ -1145,14 +1147,14 @@ def test_closed_form_step_lr(self): closed_form_scheduler = StepLR(self.opt, gamma=0.1, step_size=3) self._test_against_closed_form(scheduler, closed_form_scheduler, 20) - def test_closed_form_linear_warmup_lr(self): - scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 3, warmup_iters=4, warmup_method="linear") - closed_form_scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 3, warmup_iters=4, warmup_method="linear") + def test_closed_form_linearlr(self): + scheduler = LinearLR(self.opt, start_factor=1.0 / 3, end_factor=0.7, total_iters=4) + closed_form_scheduler = LinearLR(self.opt, start_factor=1.0 / 3, end_factor=0.7, total_iters=4) self._test_against_closed_form(scheduler, closed_form_scheduler, 20) - def test_closed_form_constant_warmup_lr(self): - scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 3, warmup_iters=4, warmup_method="constant") - closed_form_scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 3, warmup_iters=4, warmup_method="constant") + def test_closed_form_constantlr(self): + scheduler = ConstantLR(self.opt, factor=1.0 / 3, total_iters=4) + closed_form_scheduler = ConstantLR(self.opt, factor=1.0 / 3, total_iters=4) self._test_against_closed_form(scheduler, closed_form_scheduler, 20) def test_closed_form_multi_step_lr(self): @@ -1265,7 +1267,7 @@ def test_chained_lr2(self): epochs = 10 schedulers = [None] * 1 targets = [[0.02, 0.03, 0.04] + [0.05] * 9] - schedulers[0] = WarmUpLR(self.opt, warmup_factor=0.4, warmup_iters=3, warmup_method="linear") + schedulers[0] = LinearLR(self.opt, start_factor=0.4, total_iters=3) scheduler = ChainedScheduler(schedulers) self._test([scheduler], targets, epochs) @@ -1273,7 +1275,7 @@ def test_chained_lr3(self): epochs = 10 schedulers = [None] * 2 targets = [[0.02, 0.03, 0.04, 0.05] + [0.005] * 4 + [0.0005] * 3 + [0.00005] * 3] - schedulers[0] = WarmUpLR(self.opt, warmup_factor=0.4, warmup_iters=3, warmup_method="linear") + schedulers[0] = LinearLR(self.opt, start_factor=0.4, total_iters=3) schedulers[1] = MultiStepLR(self.opt, milestones=[4, 8, 10], gamma=0.1) scheduler = ChainedScheduler(schedulers) self._test([scheduler], targets, epochs) @@ -1286,7 +1288,7 @@ def test_chained_lr4(self): + [0.05 * 0.9 ** x * 0.1 for x in range(4, 6)] + [0.05 * 0.9 ** x * 0.01 for x in range(6, 9)]] schedulers[0] = ExponentialLR(self.opt, gamma=0.9) - schedulers[1] = WarmUpLR(self.opt, warmup_factor=0.2, warmup_iters=4, warmup_method="constant") + schedulers[1] = ConstantLR(self.opt, factor=0.2, total_iters=4) schedulers[2] = StepLR(self.opt, gamma=0.1, step_size=3) scheduler = ChainedScheduler(schedulers) self._test([scheduler], targets, epochs) @@ -1323,20 +1325,23 @@ def test_compound_exp_and_multistep_lr(self): schedulers[1] = ExponentialLR(self.opt, gamma=0.9) self._test(schedulers, targets, epochs) - def test_compound_exp_and_linear_warmup_lr(self): + def test_compound_exp_and_linearlr(self): epochs = 10 iters = 4 - factor = 0.4 + start_factor = 0.4 + end_factor = 0.9 schedulers = [None] * 2 single_targets = [0.05 * (0.9 ** x) for x in range(11)] for i in range(iters): - single_targets[i] *= factor + i / iters * (1 - factor) + single_targets[i] *= start_factor + i / iters * (end_factor - start_factor) + for i in range(iters, 11): + single_targets[i] *= end_factor targets = [single_targets, [x * epochs for x in single_targets]] - schedulers[0] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear") + schedulers[0] = LinearLR(self.opt, start_factor=start_factor, end_factor=end_factor, total_iters=iters) schedulers[1] = ExponentialLR(self.opt, gamma=0.9) self._test(schedulers, targets, epochs) - def test_compound_step_and_constant_warmup(self): + def test_compound_step_and_constantlr(self): epochs = 10 iters = 4 factor = 0.4 @@ -1344,20 +1349,20 @@ def test_compound_step_and_constant_warmup(self): single_targets = [0.05 * 0.4] * 3 + [0.005 * 0.4] + [0.005] * 2 + [0.0005] * 3 + [0.00005] * 3 targets = [single_targets, [x * epochs for x in single_targets]] schedulers[0] = StepLR(self.opt, gamma=0.1, step_size=3) - schedulers[1] = WarmUpLR(self.opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant") + schedulers[1] = ConstantLR(self.opt, factor=0.4, total_iters=4) self._test(schedulers, targets, epochs) - def test_compound_linear_warmup_and_multistep_lr(self): + def test_compound_linearlr_and_multistep_lr(self): epochs = 10 iters = 4 - factor = 0.4 + start_factor = 0.4 schedulers = [None] * 2 single_targets = [0.05] * 2 + [0.005] * 3 + [0.0005] * 4 + [0.00005] * 2 for i in range(iters): - single_targets[i] *= factor + i / iters * (1 - factor) + single_targets[i] *= start_factor + i / iters * (1 - start_factor) targets = [single_targets, [x * epochs for x in single_targets]] schedulers[0] = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]) - schedulers[1] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear") + schedulers[1] = LinearLR(self.opt, start_factor=start_factor, total_iters=iters) self._test(schedulers, targets, epochs) def test_compound_cosanneal_and_step_lr(self): @@ -1387,19 +1392,19 @@ def test_compound_cosanneal_and_multistep_lr(self): schedulers[1] = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]) self._test(schedulers, targets, epochs) - def test_compound_cosanneal_and_linear_warmup_lr(self): + def test_compound_cosanneal_and_linearlr(self): epochs = 10 iters = 4 - factor = 0.4 + start_factor = 0.4 eta_min = 1e-10 schedulers = [None] * 2 single_targets = [eta_min + (0.05 - eta_min) * (1 + math.cos(math.pi * x / epochs)) / 2 for x in range(epochs)] for i in range(iters): - single_targets[i] *= factor + i / iters * (1 - factor) + single_targets[i] *= start_factor + i / iters * (1 - start_factor) targets = [single_targets, [x * epochs for x in single_targets]] - schedulers[0] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear") + schedulers[0] = LinearLR(self.opt, start_factor=start_factor, total_iters=iters) schedulers[1] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min) self._test(schedulers, targets, epochs) @@ -1485,14 +1490,14 @@ def test_compound_reduce_lr_on_plateau4(self): def test_compound_reduce_lr_on_plateau5(self): iters = 4 - factor = 0.4 + start_factor = 0.4 epochs = 22 for param_group in self.opt.param_groups: param_group['lr'] = 0.5 single_targets = [0.5] * 6 + [0.05] * 7 + [0.005] * 7 + [0.0005] * 2 multipliers = [1] * 22 for i in range(iters): - multipliers[i] *= factor + i / iters * (1 - factor) + multipliers[i] *= start_factor + i / iters * (1 - start_factor) single_targets = [x * y for x, y in zip(single_targets, multipliers)] targets = [single_targets] targets = targets[1:] # test runs step before checking lr @@ -1500,7 +1505,7 @@ def test_compound_reduce_lr_on_plateau5(self): schedulers = [None] * 2 schedulers[0] = ReduceLROnPlateau(self.opt, patience=5, cooldown=0, threshold_mode='abs', mode='min', threshold=0.1) - schedulers[1] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear") + schedulers[1] = LinearLR(self.opt, start_factor=start_factor, total_iters=iters) self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs) def test_cycle_lr_invalid_mode(self): diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 761a4041668d6..42f7b511c54a5 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -427,25 +427,78 @@ def _get_closed_form_lr(self): for base_lr in self.base_lrs] -class WarmUpLR(_LRScheduler): - """Decays the learning rate of each parameter group by either a small constant - or linearly increasing small warmup factor until the number of epoch reaches a - pre-defined milestone: warmup_iters. Notice that such decay can happen - simultaneously with other changes to the learning rate from outside this scheduler. +class ConstantLR(_LRScheduler): + """Decays the learning rate of each parameter group by a small constant factor until the + number of epoch reaches a pre-defined milestone: total_iters. Notice that such decay can + happen simultaneously with other changes to the learning rate from outside this scheduler. When last_epoch=-1, sets initial lr as lr. Args: optimizer (Optimizer): Wrapped optimizer. - warmup_factor (float): The number we multiply learning rate in the first epoch. - If the warming up method is constant, the multiplication factor of the - learning rate stays the same in all epochs, but, in the linear case, it - starts increasing in the following epochs. Default: 1./3. - warmup_iters (int): The number of warming up steps. Default: 5. - warmup_method (str): One of `constant` and `linear`. In `constant` mode, the - learning rate will be multiplied with a small constant until a milestone - defined in warmup_iters. In the `linear` case, the multiplication factor - starts with warmup_factor in the first epoch then linearly increases to - reach 1. in the epoch number warmup_iters. Default: `linear`. + factor (float): The number we multiply learning rate until the milestone. Default: 1./3. + total_iters (int): The number of steps that the scheduler decays the learning rate. + Default: 5. + last_epoch (int): The index of the last epoch. Default: -1. + verbose (bool): If ``True``, prints a message to stdout for + each update. Default: ``False``. + + Example: + >>> # Assuming optimizer uses lr = 0.05 for all groups + >>> # lr = 0.025 if epoch == 0 + >>> # lr = 0.025 if epoch == 1 + >>> # lr = 0.025 if epoch == 2 + >>> # lr = 0.025 if epoch == 3 + >>> # lr = 0.05 if epoch >= 4 + >>> scheduler = ConstantLR(self.opt, factor=0.5, total_iters=4) + >>> for epoch in range(100): + >>> train(...) + >>> validate(...) + >>> scheduler.step() + """ + + def __init__(self, optimizer, factor=1.0 / 3, total_iters=5, last_epoch=-1, verbose=False): + if factor > 1.0 or factor < 0: + raise ValueError('Constant multiplicative factor expected to be between 0 and 1.') + + self.factor = factor + self.total_iters = total_iters + super(ConstantLR, self).__init__(optimizer, last_epoch, verbose) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.", UserWarning) + + if self.last_epoch == 0: + return [group['lr'] * self.factor for group in self.optimizer.param_groups] + + if (self.last_epoch > self.total_iters or + (self.last_epoch != self.total_iters)): + return [group['lr'] for group in self.optimizer.param_groups] + + if (self.last_epoch == self.total_iters): + return [group['lr'] * (1.0 / self.factor) for group in self.optimizer.param_groups] + + def _get_closed_form_lr(self): + return [base_lr * (self.factor + (self.last_epoch >= self.total_iters) * (1 - self.factor)) + for base_lr in self.base_lrs] + + +class LinearLR(_LRScheduler): + """Decays the learning rate of each parameter group by linearly changing small + multiplicative factor until the number of epoch reaches a pre-defined milestone: total_iters. + Notice that such decay can happen simultaneously with other changes to the learning rate + from outside this scheduler. When last_epoch=-1, sets initial lr as lr. + + Args: + optimizer (Optimizer): Wrapped optimizer. + start_factor (float): The number we multiply learning rate in the first epoch. + The multiplication factor changes towards end_factor in the following epochs. + Default: 1./3. + end_factor (float): The number we multiply learning rate at the end of linear changing + process. Default: 1.0. + total_iters (int): The number of iterations that multiplicative factor reaches to 1. + Default: 5. last_epoch (int): The index of the last epoch. Default: -1. verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False``. @@ -457,24 +510,25 @@ class WarmUpLR(_LRScheduler): >>> # lr = 0.0375 if epoch == 2 >>> # lr = 0.04375 if epoch == 3 >>> # lr = 0.005 if epoch >= 4 - >>> scheduler = WarmUpLR(self.opt, warmup_factor=0.5, warmup_iters=4, warmup_method="linear") + >>> scheduler = LinearLR(self.opt, start_factor=0.5, total_iters=4) >>> for epoch in range(100): >>> train(...) >>> validate(...) >>> scheduler.step() """ - def __init__(self, optimizer, warmup_factor=1.0 / 3, warmup_iters=5, warmup_method="linear", - last_epoch=-1, verbose=False): - if warmup_method not in ("constant", "linear"): - raise ValueError( - "Only 'constant' or 'linear' warmup_method accepted, but " - "got {}".format(warmup_method) - ) - self.warmup_factor = warmup_factor - self.warmup_iters = warmup_iters - self.warmup_method = warmup_method - super(WarmUpLR, self).__init__(optimizer, last_epoch, verbose) + def __init__(self, optimizer, start_factor=1.0 / 3, end_factor=1.0, total_iters=5, last_epoch=-1, + verbose=False): + if start_factor > 1.0 or start_factor < 0: + raise ValueError('Starting multiplicative factor expected to be between 0 and 1.') + + if end_factor > 1.0 or end_factor < 0: + raise ValueError('Ending multiplicative factor expected to be between 0 and 1.') + + self.start_factor = start_factor + self.end_factor = end_factor + self.total_iters = total_iters + super(LinearLR, self).__init__(optimizer, last_epoch, verbose) def get_lr(self): if not self._get_lr_called_within_step: @@ -482,25 +536,18 @@ def get_lr(self): "please use `get_last_lr()`.", UserWarning) if self.last_epoch == 0: - return [group['lr'] * self.warmup_factor for group in self.optimizer.param_groups] + return [group['lr'] * self.start_factor for group in self.optimizer.param_groups] - if (self.last_epoch > self.warmup_iters or - (self.warmup_method == "constant" and self.last_epoch != self.warmup_iters)): + if (self.last_epoch > self.total_iters): return [group['lr'] for group in self.optimizer.param_groups] - if (self.warmup_method == "constant" and self.last_epoch == self.warmup_iters): - return [group['lr'] * (1.0 / self.warmup_factor) for group in self.optimizer.param_groups] - - return [group['lr'] * (1. + (1.0 - self.warmup_factor) / - (self.warmup_iters * self.warmup_factor + (self.last_epoch - 1) * (1 - self.warmup_factor))) + return [group['lr'] * (1. + (self.end_factor - self.start_factor) / + (self.total_iters * self.start_factor + (self.last_epoch - 1) * (self.end_factor - self.start_factor))) for group in self.optimizer.param_groups] def _get_closed_form_lr(self): - return [base_lr * (self.warmup_factor + - (1 - self.warmup_factor) * min(self.warmup_iters, self.last_epoch) / - self.warmup_iters * (self.warmup_method == "linear") + - (self.last_epoch >= self.warmup_iters) * (1 - self.warmup_factor) * - (self.warmup_method == "constant")) + return [base_lr * (self.start_factor + + (self.end_factor - self.start_factor) * min(self.total_iters, self.last_epoch) / self.total_iters) for base_lr in self.base_lrs] @@ -618,7 +665,7 @@ class ChainedScheduler(_LRScheduler): >>> # lr = 0.729 if epoch == 2 >>> # lr = 0.6561 if epoch == 3 >>> # lr = 0.59049 if epoch >= 4 - >>> scheduler1 = WarmUpLR(self.opt, warmup_factor=0.1, warmup_iters=2, warmup_method="constant") + >>> scheduler1 = ConstantLR(self.opt, factor=0.1, total_iters=2) >>> scheduler2 = ExponentialLR(self.opt, gamma=0.9) >>> scheduler = ChainedScheduler([scheduler1, scheduler2]) >>> for epoch in range(100): diff --git a/torch/optim/lr_scheduler.pyi b/torch/optim/lr_scheduler.pyi index 821407e3ccca6..9b1b8ea63eed7 100644 --- a/torch/optim/lr_scheduler.pyi +++ b/torch/optim/lr_scheduler.pyi @@ -18,8 +18,11 @@ class StepLR(_LRScheduler): class MultiStepLR(_LRScheduler): def __init__(self, optimizer: Optimizer, milestones: Iterable[int], gamma: float=..., last_epoch: int=...) -> None: ... -class WarmUpLR(_LRScheduler): - def __init__(self, optimizer: Optimizer, warmup_factor: float=..., warmup_iters: int=..., warmup_method: str=..., last_epoch: int=...) -> None: ... +class ConstantLR(_LRScheduler): + def __init__(self, optimizer: Optimizer, factor: float=..., total_iters: int=..., last_epoch: int=...) -> None: ... + +class LinearLR(_LRScheduler): + def __init__(self, optimizer: Optimizer, start_factor: float=..., end_factor: float=..., total_iters: int=..., last_epoch: int=...) -> None: ... class ExponentialLR(_LRScheduler): def __init__(self, optimizer: Optimizer, gamma: float, last_epoch: int=...) -> None: ... From 26b7ff5aeab49cb63faed2cd51fa6ba70f665610 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 7 Sep 2021 08:57:43 -0700 Subject: [PATCH 522/530] deprecate dtype getters from `torch.testing` namespace (#63554) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63554 Following https://github.com/pytorch/pytorch/pull/61840#issuecomment-884087809, this deprecates all the dtype getters publicly exposed in the `torch.testing` namespace. The reason for this twofold: 1. If someone is not familiar with the C++ dispatch macros PyTorch uses, the names are misleading. For example `torch.testing.floating_types()` will only give you `float32` and `float64` skipping `float16` and `bfloat16`. 2. The dtype getters provide very minimal functionality that can be easily emulated by downstream libraries. We thought about [providing an replacement](https://gist.github.com/pmeier/3dfd2e105842ad0de4505068a1a0270a), but ultimately decided against it. The major problem is BC: by keeping it, either the namespace is getting messy again after a new dtype is added or we need to somehow version the return values of the getters. Test Plan: Imported from OSS Reviewed By: H-Huang Differential Revision: D30662206 Pulled By: mruberry fbshipit-source-id: a2bdb10ab02ae665df1b5b76e8afa9af043bbf56 --- test/test_autograd.py | 3 +- test/test_binary_ufuncs.py | 90 ++++++------ test/test_complex.py | 3 +- test/test_foreach.py | 45 +++--- test/test_linalg.py | 54 +++---- test/test_nn.py | 14 +- test/test_numpy_interop.py | 3 +- test/test_ops.py | 4 +- test/test_reductions.py | 41 +++--- test/test_shape_ops.py | 15 +- test/test_sort_and_select.py | 33 +++-- test/test_sparse.py | 23 +-- test/test_sparse_csr.py | 23 +-- test/test_tensor_creation_ops.py | 59 ++++---- test/test_testing.py | 5 +- test/test_torch.py | 134 ++++++++--------- test/test_type_promotion.py | 49 ++++--- test/test_unary_ufuncs.py | 55 +++---- test/test_view_ops.py | 43 +++--- torch/testing/_core.py | 128 +--------------- torch/testing/_deprecated.py | 31 +++- torch/testing/_dtype_getters.py | 138 ++++++++++++++++++ torch/testing/_internal/common_device_type.py | 3 +- torch/testing/_internal/common_dtype.py | 4 + torch/testing/_internal/common_jit.py | 2 +- .../_internal/common_methods_invocations.py | 12 +- torch/testing/_internal/common_modules.py | 3 +- torch/testing/_internal/opinfo_helper.py | 31 ++-- 28 files changed, 560 insertions(+), 488 deletions(-) create mode 100644 torch/testing/_dtype_getters.py create mode 100644 torch/testing/_internal/common_dtype.py diff --git a/test/test_autograd.py b/test/test_autograd.py index 61a46b439f213..e672e4b49e25e 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -42,6 +42,7 @@ onlyCPU, onlyCUDA, onlyOnCPUAndCUDA, dtypes, dtypesIfCUDA, deviceCountAtLeast, skipCUDAIfCudnnVersionLessThan, skipCUDAIf, skipMeta) +from torch.testing._internal.common_dtype import get_all_dtypes import pickle @@ -8474,7 +8475,7 @@ def test_copy_(self, device): # At the time of writing this test, copy_ is not generated from native_functions.yaml # there was a bug that bfloat16 was not recognized as floating. x = torch.randn(10, device=device, requires_grad=True) - floating_dt = [dt for dt in torch.testing.get_all_dtypes() if dt.is_floating_point] + floating_dt = [dt for dt in get_all_dtypes() if dt.is_floating_point] for dt in floating_dt: y = torch.empty(10, device=device, dtype=dt) y.copy_(x) diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index 2695ab6a86115..f8c36adf8b781 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -18,7 +18,11 @@ instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyOnCPUAndCUDA, skipCUDAIfRocm, skipIf, ops) -from torch.testing import all_types_and_complex_and, integral_types_and, make_tensor +from torch.testing import make_tensor +from torch.testing._internal.common_dtype import ( + all_types_and_complex_and, integral_types_and, get_all_dtypes, get_all_int_dtypes, get_all_math_dtypes, + get_all_complex_dtypes, get_all_fp_dtypes, +) from torch.testing._internal.common_methods_invocations import binary_ufuncs if TEST_SCIPY: @@ -348,7 +352,7 @@ def test_inplace_division(self, device): id_after = id(t) self.assertEqual(id_before, id_after) - @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_complex=False)) + @dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) def test_div_rounding_modes(self, device, dtype): if dtype.is_floating_point: low, high = -10.0, 10.0 @@ -448,7 +452,7 @@ def test_divide_by_zero_rounding(self, device, dtype): actual = torch.divide(a, zero, rounding_mode=rounding_mode) self.assertEqual(actual, expect, exact_dtype=exact_dtype) - @dtypes(*torch.testing.get_all_dtypes( + @dtypes(*get_all_dtypes( include_bool=False, include_complex=False, include_bfloat16=False)) def test_div_rounding_numpy(self, device, dtype): info = (torch.finfo(dtype) if dtype.is_floating_point @@ -892,7 +896,7 @@ def test_pow_cuda_complex_extremal_failing(self, device, dtype): self.assertEqual(cpu_out, cuda_out) @onlyOnCPUAndCUDA - @dtypes(*(torch.testing.get_all_dtypes(include_bool=False, include_bfloat16=False))) + @dtypes(*(get_all_dtypes(include_bool=False, include_bfloat16=False))) def test_complex_scalar_pow_tensor(self, device, dtype): complexes = [0.5j, 1. + 1.j, -1.5j, 2.2 - 1.6j, 1 + 0j] first_exp = make_tensor((100,), device, dtype, low=-2, high=2) @@ -1283,7 +1287,7 @@ def test_binary_ops_with_scalars(self, device): self.assertEqual(expected, python_op(first, second)) self.assertEqual(expected, torch_op(first, second)) - @dtypes(*product(torch.testing.get_all_dtypes(include_complex=False), torch.testing.get_all_dtypes(include_complex=False))) + @dtypes(*product(get_all_dtypes(include_complex=False), get_all_dtypes(include_complex=False))) def test_maximum_minimum_type_promotion(self, device, dtypes): a = torch.tensor((0, 1), device=device, dtype=dtypes[0]) b = torch.tensor((1, 0), device=device, dtype=dtypes[1]) @@ -1291,7 +1295,7 @@ def test_maximum_minimum_type_promotion(self, device, dtypes): result = op(a, b) self.assertEqual(result.dtype, torch.result_type(a, b)) - @dtypes(*(torch.testing.get_all_int_dtypes() + [torch.bool])) + @dtypes(*(get_all_int_dtypes() + [torch.bool])) def test_maximum_minimum_int_and_bool(self, device, dtype): ops = ((torch.maximum, torch.max, np.maximum), (torch.minimum, torch.min, np.minimum), (torch.fmax, None, np.fmax), (torch.fmin, None, np.fmin)) @@ -1317,7 +1321,7 @@ def test_maximum_minimum_int_and_bool(self, device, dtype): self.assertEqual(out, numpy_result) @precisionOverride({torch.bfloat16: 1e-2}) - @dtypes(*(torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_fp_dtypes())) def test_maximum_minimum_float(self, device, dtype): ops = ((torch.maximum, torch.max, np.maximum), (torch.minimum, torch.min, np.minimum), (torch.fmax, None, np.fmax), (torch.fmin, None, np.fmin)) @@ -1345,7 +1349,7 @@ def test_maximum_minimum_float(self, device, dtype): self.assertEqual(tensor_result, numpy_result, exact_dtype=False) self.assertEqual(out, numpy_result, exact_dtype=False) - @dtypes(*(torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_fp_dtypes())) def test_maximum_minimum_float_nan_and_inf(self, device, dtype): # np.maximum and np.minimum functions compare input arrays element-wisely. # if one of the elements being compared is a NaN, then that element is returned. @@ -1381,7 +1385,7 @@ def test_maximum_minimum_float_nan_and_inf(self, device, dtype): self.assertEqual(tensor_result, numpy_result) self.assertEqual(out, numpy_result) - @dtypes(*product(torch.testing.get_all_complex_dtypes(), torch.testing.get_all_dtypes())) + @dtypes(*product(get_all_complex_dtypes(), get_all_dtypes())) def test_maximum_minimum_complex(self, device, dtypes): for torch_op in (torch.maximum, torch.minimum, torch.max, torch.min, torch.fmax, torch.fmin): with self.assertRaisesRegex(RuntimeError, '.+not implemented for.+'): @@ -1439,7 +1443,7 @@ def test_mul_intertype_scalar(self, device, dtype): self.assertEqual(x, 4.5) @onlyCPU - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_sub(self, device, dtype): m1 = torch.tensor([2.34, 4.44], dtype=dtype, device=device) m2 = torch.tensor([1.23, 2.33], dtype=dtype, device=device) @@ -1501,8 +1505,8 @@ def test_min_max_binary_op_nan(self, device, dtype): self.assertFalse(torch.isnan(ma[i]), "max(a, b): {}, a: {}, b: {}".format(ma[i], a[i], b[i])) self.assertFalse(torch.isnan(mi[i]), "min(a, b): {}, a: {}, b: {}".format(mi[i], a[i], b[i])) - @dtypes(*product(torch.testing.get_all_dtypes(include_complex=False), - torch.testing.get_all_dtypes(include_complex=False))) + @dtypes(*product(get_all_dtypes(include_complex=False), + get_all_dtypes(include_complex=False))) def test_copysign(self, device, dtypes): def _test_copysign_numpy(a, b): torch_result = torch.copysign(a, b) @@ -1519,7 +1523,7 @@ def _test_copysign_numpy(a, b): expected = torch.from_numpy(np.copysign(np_a, np_b)) # To handle inconsistencies of type promotion between PyTorch and Numpy # Applied for both arguments having integral precision and bfloat16 - types = [torch.bool, torch.bfloat16] + torch.testing.get_all_int_dtypes() + types = [torch.bool, torch.bfloat16] + get_all_int_dtypes() if a.dtype in types or b.dtype in types: promoted_type = torch.promote_types(torch_result.dtype, expected.dtype) torch_result = torch_result.to(promoted_type) @@ -1564,7 +1568,7 @@ def _test_copysign_numpy(a, b): for case in cases: _test_copysign_numpy(torch.tensor([case], device=device, dtype=dtypes[0]), b) - if dtypes[1] in torch.testing.get_all_fp_dtypes(): + if dtypes[1] in get_all_fp_dtypes(): a = make_tensor((10, 10), device=device, dtype=dtypes[0], low=-9, high=9) for case in cases: _test_copysign_numpy(a, torch.tensor([case], device=device, dtype=dtypes[1])) @@ -1616,8 +1620,8 @@ def test_divmul_scalar(self, device, dtype): res = scale * x self.assertEqual(res, expected.to(dtype), atol=0., rtol=0.) - @dtypesIfCUDA(*set(torch.testing.get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128}) - @dtypes(*set(torch.testing.get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128}) + @dtypesIfCUDA(*set(get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128}) + @dtypes(*set(get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128}) def test_floor_divide_tensor(self, device, dtype): x = torch.randn(10, device=device).mul(30).to(dtype) y = torch.arange(1, 11, dtype=dtype, device=device) @@ -1629,8 +1633,8 @@ def test_floor_divide_tensor(self, device, dtype): self.assertEqual(z.dtype, x.dtype) self.assertEqual(z, z_alt) - @dtypesIfCUDA(*set(torch.testing.get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128}) - @dtypes(*set(torch.testing.get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128}) + @dtypesIfCUDA(*set(get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128}) + @dtypes(*set(get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128}) def test_floor_divide_scalar(self, device, dtype): x = torch.randn(100, device=device).mul(10).to(dtype) @@ -1663,7 +1667,7 @@ def test_floor_divide_out(self, device, dtype): self.assertEqual(o, torch.floor_divide(x.float(), y.float())) @onlyCPU - @dtypes(*torch.testing.get_all_math_dtypes('cpu')) + @dtypes(*get_all_math_dtypes('cpu')) def test_rdiv(self, device, dtype): if dtype is torch.float16: return @@ -1675,7 +1679,7 @@ def test_rdiv(self, device, dtype): z = torch.tensor([30 / v.item() for v in x], device=device) self.assertEqual(y, z, exact_dtype=False) - @dtypes(*torch.testing.get_all_fp_dtypes(include_bfloat16=False)) + @dtypes(*get_all_fp_dtypes(include_bfloat16=False)) def test_fmod_remainder_by_zero_float(self, device, dtype): fn_list = (torch.fmod, torch.remainder) for fn in fn_list: @@ -1687,7 +1691,7 @@ def test_fmod_remainder_by_zero_float(self, device, dtype): @onlyOnCPUAndCUDA # Check Issue https://github.com/pytorch/pytorch/issues/48130 @skipCUDAIfRocm # Error happens on both ROCM and XLA - @dtypes(*torch.testing.get_all_int_dtypes()) + @dtypes(*get_all_int_dtypes()) def test_fmod_remainder_by_zero_integral(self, device, dtype): fn_list = (torch.fmod, torch.remainder) for fn in fn_list: @@ -1712,7 +1716,7 @@ def test_fmod_remainder_by_zero_integral(self, device, dtype): value = 255 if dtype == torch.uint8 else -1 self.assertTrue(torch.all(fn(x, zero) == value)) - @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) + @dtypes(*get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) def test_fmod_remainder(self, device, dtype): # Use numpy as reference def _helper(x, mod, fns_list): @@ -1749,7 +1753,7 @@ def _helper(x, mod, fns_list): # Mods: Integer, Float, Tensor, Non-contiguous Tensor mods = [3, 2.3, mod, mod.t()] # mod with floating-point dtype - if dtype in torch.testing.get_all_int_dtypes(): + if dtype in get_all_int_dtypes(): mod_float = make_tensor((10, 10), device=device, dtype=torch.float, low=-9, high=9) mod[mod == 0] = 1 mods.append(mod_float) @@ -1970,7 +1974,7 @@ def test_floor_divide_zero(self, device, dtype): a // b @unittest.skipIf(TEST_WITH_ASAN, "Integer overflows are not allowed under ASAN") - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_muldiv_scalar(self, device, dtype): x = make_tensor((10, 3), device, dtype, low=None, high=None) s = make_tensor((1,), 'cpu', dtype, low=None, high=None).item() @@ -1980,7 +1984,7 @@ def test_muldiv_scalar(self, device, dtype): self.assertEqual(x / s, x / y) self.assertEqual(s / x, y / x) - @dtypes(*tuple(itertools.combinations_with_replacement(torch.testing.get_all_dtypes(), 2))) + @dtypes(*tuple(itertools.combinations_with_replacement(get_all_dtypes(), 2))) def test_comparison_ops_type_promotion_and_broadcasting(self, device, dtypes): # issue #42660 # testing all combinations of broadcasting and type promotion @@ -2162,8 +2166,8 @@ def test_bitwise_shift_float(self, device): self.assertEqual(torch_op(a, 2.2), expected_op(a, 2.2)) @onlyOnCPUAndCUDA - @dtypes(*list(product(torch.testing.get_all_dtypes(include_complex=False), - torch.testing.get_all_dtypes(include_complex=False)))) + @dtypes(*list(product(get_all_dtypes(include_complex=False), + get_all_dtypes(include_complex=False)))) def test_heaviside(self, device, dtypes): input_dtype = dtypes[0] values_dtype = dtypes[1] @@ -2222,8 +2226,8 @@ def test_heaviside_cross_device(self, device): with self.assertRaisesRegex(RuntimeError, 'Expected all tensors to be on the same device'): torch.heaviside(y, x) - @dtypes(*list(product(torch.testing.get_all_complex_dtypes(), - torch.testing.get_all_complex_dtypes()))) + @dtypes(*list(product(get_all_complex_dtypes(), + get_all_complex_dtypes()))) def test_heaviside_complex(self, device, dtypes): input_dtype = dtypes[0] values_dtype = dtypes[1] @@ -2265,15 +2269,15 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): getattr(a, op + '_')(b) self.assertEqual(expected_res, a) - @dtypes(*product(torch.testing.get_all_dtypes(), torch.testing.get_all_dtypes())) + @dtypes(*product(get_all_dtypes(), get_all_dtypes())) def test_logical_xor(self, device, dtypes): self._test_logical(device, dtypes, 'logical_xor', [10, 0, 1, 0], [1, 0, 0, 10], [0, 0, 1, 1]) - @dtypes(*product(torch.testing.get_all_dtypes(), torch.testing.get_all_dtypes())) + @dtypes(*product(get_all_dtypes(), get_all_dtypes())) def test_logical_and(self, device, dtypes): self._test_logical(device, dtypes, 'logical_and', [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 0, 0]) - @dtypes(*product(torch.testing.get_all_dtypes(), torch.testing.get_all_dtypes())) + @dtypes(*product(get_all_dtypes(), get_all_dtypes())) def test_logical_or(self, device, dtypes): self._test_logical(device, dtypes, 'logical_or', [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 1, 1]) @@ -2377,7 +2381,7 @@ def test_logaddexp2(self, device, dtype): self._test_logaddexp(device, dtype, base2=True) def test_add(self, device): - dtypes = [torch.float, torch.double] + torch.testing.get_all_complex_dtypes() + dtypes = [torch.float, torch.double] + get_all_complex_dtypes() for dtype in dtypes: # [res] torch.add([res,] tensor1, tensor2) m1 = torch.randn(100, 100, dtype=dtype, device=device) @@ -2578,7 +2582,7 @@ def test_bool_tensor_comparison_ops(self, device): torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool, device=device)) self.assertFalse(a.equal(b)) - @dtypes(*torch.testing.get_all_dtypes(include_complex=False)) + @dtypes(*get_all_dtypes(include_complex=False)) def test_logical(self, device, dtype): if dtype != torch.bool: x = torch.tensor([1, 2, 3, 4], device=device, dtype=dtype) @@ -2755,8 +2759,8 @@ def test_pow_scalar_overloads_mem_overlap(self, device, dtype): self.unary_check_input_output_mem_overlap( doubles, sz, lambda input, out: torch.pow(42, input, out=out)) - @dtypes(*list(product(torch.testing.get_all_dtypes(include_bool=False), - torch.testing.get_all_dtypes(include_bool=False)))) + @dtypes(*list(product(get_all_dtypes(include_bool=False), + get_all_dtypes(include_bool=False)))) def test_float_power(self, device, dtypes): def to_np(value): if isinstance(value, torch.Tensor) and value.dtype == torch.bfloat16: @@ -2852,8 +2856,8 @@ def _promo_helper(x, y): torch.Tensor.float_power_(base.clone(), exp) @skipIf(not TEST_SCIPY, "Scipy required for the test.") - @dtypes(*product(torch.testing.get_all_dtypes(include_complex=False, include_bfloat16=False), - torch.testing.get_all_dtypes(include_complex=False, include_bfloat16=False))) + @dtypes(*product(get_all_dtypes(include_complex=False, include_bfloat16=False), + get_all_dtypes(include_complex=False, include_bfloat16=False))) def test_xlogy_xlog1py(self, device, dtypes): x_dtype, y_dtype = dtypes @@ -2864,7 +2868,7 @@ def out_variant_helper(torch_fn, x, y): self.assertEqual(expected, out) def xlogy_inplace_variant_helper(x, y): - if x.dtype in torch.testing.get_all_int_dtypes() + [torch.bool]: + if x.dtype in get_all_int_dtypes() + [torch.bool]: with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): x.clone().xlogy_(y) @@ -2991,10 +2995,10 @@ def _compare_helper(x, y, torch_fn, reference_fn): _compare_helper(t, zeros, *xlog1py_fns) _compare_helper(t, 0., *xlog1py_fns) - @dtypes(*product(torch.testing.get_all_dtypes(include_complex=False, - include_half=False, include_bfloat16=False), - torch.testing.get_all_dtypes(include_complex=False, - include_half=False, include_bfloat16=False))) + @dtypes(*product(get_all_dtypes(include_complex=False, + include_half=False, include_bfloat16=False), + get_all_dtypes(include_complex=False, + include_half=False, include_bfloat16=False))) @skipIf(not TEST_SCIPY, "Scipy required for the test.") def test_zeta(self, device, dtypes): x_dtype, q_dtype = dtypes diff --git a/test/test_complex.py b/test/test_complex.py index 45482efbae56d..eee7a6a51534e 100644 --- a/test/test_complex.py +++ b/test/test_complex.py @@ -1,11 +1,12 @@ import torch from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_dtype import get_all_complex_dtypes devices = (torch.device('cpu'), torch.device('cuda:0')) class TestComplexTensor(TestCase): - @dtypes(*torch.testing.get_all_complex_dtypes()) + @dtypes(*get_all_complex_dtypes()) def test_to_list(self, device, dtype): # test that the complex float tensor has expected values and # there's no garbage value in the resultant list diff --git a/test/test_foreach.py b/test/test_foreach.py index 123ef35bb7093..c6cf1302ffb5c 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -11,6 +11,9 @@ (instantiate_device_type_tests, dtypes, onlyCUDA, skipCUDAIfRocm, skipMeta, ops) from torch.testing._internal.common_methods_invocations import \ (foreach_unary_op_db, foreach_binary_op_db, foreach_pointwise_op_db, foreach_minmax_op_db) +from torch.testing._internal.common_dtype import ( + get_all_dtypes, get_all_int_dtypes, get_all_complex_dtypes, get_all_fp_dtypes, +) # Includes some values such that N * N won't be a multiple of 4, # which should ensure we test the vectorized and non-vectorized @@ -133,7 +136,7 @@ def _test_binary_op_tensorlists(self, device, dtype, opinfo, N, is_fastpath, dis self._binary_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath, is_inplace=True) if opinfo.supports_alpha_param: alpha = None - if dtype in torch.testing.get_all_int_dtypes(): + if dtype in get_all_int_dtypes(): alpha = 3 elif dtype.is_complex: alpha = complex(3, 3) @@ -170,7 +173,7 @@ def _test_binary_op_tensorlists(self, device, dtype, opinfo, N, is_fastpath, dis @ops(foreach_binary_op_db) def test_binary_op_tensorlists_fastpath(self, device, dtype, op): for N in N_values: - disable_fastpath = op.ref == torch.div and dtype in torch.testing.get_all_int_dtypes() + [torch.bool] + disable_fastpath = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] if op.ref == torch.add and dtype == torch.bool: disable_fastpath = True self._test_binary_op_tensorlists(device, dtype, op, N, True, disable_fastpath) @@ -192,17 +195,17 @@ def _test_binary_op_scalar(self, device, dtype, opinfo, N, scalar, is_fastpath, @ops(foreach_binary_op_db) def test_binary_op_scalar_fastpath(self, device, dtype, op): for N, scalar in itertools.product(N_values, Scalars): - disable_fastpath = op.ref == torch.div and dtype in torch.testing.get_all_int_dtypes() + [torch.bool] + disable_fastpath = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] if isinstance(scalar, int): disable_fastpath |= dtype == torch.bool if isinstance(scalar, float): - disable_fastpath |= dtype in torch.testing.get_all_int_dtypes() + [torch.bool] + disable_fastpath |= dtype in get_all_int_dtypes() + [torch.bool] if isinstance(scalar, bool): disable_fastpath |= dtype == torch.bool if op.ref in (torch.add, torch.mul): disable_fastpath = False if isinstance(scalar, complex): - disable_fastpath |= dtype not in torch.testing.get_all_complex_dtypes() + disable_fastpath |= dtype not in get_all_complex_dtypes() self._test_binary_op_scalar(device, dtype, op, N, scalar, True, disable_fastpath) @ops(foreach_binary_op_db) @@ -232,16 +235,16 @@ def _test_binary_op_scalarlist(self, device, dtype, opinfo, N, scalarlist, is_fa def test_binary_op_scalarlist_fastpath(self, device, dtype, op): for N in N_values: for type_str, scalarlist in getScalarLists(N): - bool_int_div = op.ref == torch.div and dtype in torch.testing.get_all_int_dtypes() + [torch.bool] + bool_int_div = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] disable_fastpath = bool_int_div if type_str == "int": disable_fastpath |= dtype == torch.bool if type_str == "float": - disable_fastpath |= dtype in torch.testing.get_all_int_dtypes() + [torch.bool] + disable_fastpath |= dtype in get_all_int_dtypes() + [torch.bool] if type_str == "complex": - disable_fastpath |= dtype not in torch.testing.get_all_complex_dtypes() + disable_fastpath |= dtype not in get_all_complex_dtypes() if type_str == "mixed": - disable_fastpath |= True and dtype not in torch.testing.get_all_complex_dtypes() + disable_fastpath |= True and dtype not in get_all_complex_dtypes() self._test_binary_op_scalarlist(device, dtype, op, N, scalarlist, True, disable_fastpath) @ops(foreach_binary_op_db) @@ -298,7 +301,7 @@ def _test_pointwise_op(self, device, dtype, opinfo, N, is_fastpath, disable_fast @skipMeta @ops(foreach_pointwise_op_db) def test_pointwise_op_fastpath(self, device, dtype, op): - disable_fastpath = dtype in torch.testing.get_all_int_dtypes() + [torch.bool] + disable_fastpath = dtype in get_all_int_dtypes() + [torch.bool] # for N, scalar in itertools.product(N_values, Scalars): for N in N_values: self._test_pointwise_op(device, dtype, op, N, True, disable_fastpath) @@ -356,7 +359,7 @@ def _test_unary(self, device, dtype, opinfo, N, is_fastpath): op, ref, inplace_op, inplace_ref = self._get_funcs(opinfo, 1) inputs = opinfo.sample_inputs(device, dtype, N, noncontiguous=not is_fastpath), # note(mkozuki): Complex inputs for `_foreach_abs` go through slowpath. - if opinfo.name == "_foreach_abs" and dtype in torch.testing.get_all_complex_dtypes(): + if opinfo.name == "_foreach_abs" and dtype in get_all_complex_dtypes(): is_fastpath = False self._regular_unary_test(dtype, op, ref, inputs, is_fastpath) self._inplace_unary_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath) @@ -367,7 +370,7 @@ def test_unary_fastpath(self, device, dtype, op): for N in N_values: self._test_unary(device, dtype, op, N, is_fastpath=True) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) @ops(foreach_unary_op_db) def test_unary_slowpath(self, device, dtype, op): for N in N_values: @@ -378,14 +381,14 @@ def _minmax_test(self, opinfo, inputs, is_fastpath, n_expected_cudaLaunchKernels self.assertEqual(ref(inputs), op(inputs, self.is_cuda, is_fastpath)) # note(mkozuki): in-place of foreach_minimum and foreach_maximum aren't implemented. - # @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False)) + # @dtypes(*get_all_dtypes(include_bfloat16=False, include_complex=False)) @ops(foreach_minmax_op_db) def test_minmax_fastpath(self, device, dtype, op): for N in N_values: inputs = tuple(op.sample_inputs(device, dtype, N) for _ in range(2)) self._minmax_test(op, inputs, True, N if dtype == torch.bool else 1) - @dtypes(*torch.testing.get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False)) + @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False)) @ops(foreach_minmax_op_db) def test_minmax_slowpath(self, device, dtype, op): for N in N_values: @@ -394,7 +397,7 @@ def test_minmax_slowpath(self, device, dtype, op): # note(mkozuki): ForeachFuncInfo's of both `_foreach_maximum` and `_foreach_minimum` include integer types. # so, manually limit dtypes to fp types for inf&nan tests. - @dtypes(*torch.testing.get_all_fp_dtypes(include_bfloat16=True, include_half=True)) + @dtypes(*get_all_fp_dtypes(include_bfloat16=True, include_half=True)) @ops(foreach_minmax_op_db) def test_minmax_float_inf_nan(self, device, dtype, op): inputs = ( @@ -413,7 +416,7 @@ def test_minmax_float_inf_nan(self, device, dtype, op): ) self._minmax_test(op, inputs, True, 1) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype): # TODO: enable empty list case for tensors in [[torch.randn([0])]]: @@ -423,7 +426,7 @@ def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype): torch._foreach_add_(tensors, 1) self.assertEqual(res, tensors) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) @ops(foreach_binary_op_db) def test_binary_op_scalar_with_overlapping_tensors(self, device, dtype, op): foreach_op, ref = op.method_variant, op.ref @@ -457,7 +460,7 @@ def test_binary_op_scalar_with_different_tensor_dtypes(self, device, dtype, op): runtime_error = e self.assertIsNone(runtime_error) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) @ops(foreach_binary_op_db) def test_binary_op_list_error_cases(self, device, dtype, op): foreach_op, foreach_op_, ref, ref_ = op.method_variant, op.inplace_variant, op.ref, op.ref_inplace @@ -513,7 +516,7 @@ def test_binary_op_list_error_cases(self, device, dtype, op): return with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): foreach_op([tensor1], [tensor2]) - if dtype in torch.testing.get_all_int_dtypes() + [torch.bool] and foreach_op == torch._foreach_div: + if dtype in get_all_int_dtypes() + [torch.bool] and foreach_op == torch._foreach_div: with self.assertRaisesRegex(RuntimeError, "result type"): foreach_op_([tensor1], [tensor2]) else: @@ -522,7 +525,7 @@ def test_binary_op_list_error_cases(self, device, dtype, op): @skipMeta @unittest.skipIf(not torch.cuda.is_available(), "CUDA not found") - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) @ops(foreach_binary_op_db) def test_binary_op_list_slow_path(self, device, dtype, op): # note(mkozuki): why `n_expected_cudaLaunchKernels=0`? @@ -615,7 +618,7 @@ def test_binary_op_tensors_on_different_devices(self, device, dtype, op): self.assertEqual(actual, tensors1) @onlyCUDA - @dtypes(*torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False)) + @dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False)) @ops(foreach_pointwise_op_db) def test_pointwise_op_tensors_on_different_devices(self, device, dtype, op): # tensors1: ['cuda', 'cpu] diff --git a/test/test_linalg.py b/test/test_linalg.py index 96da8d559ff31..2b543431174d2 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -21,7 +21,11 @@ onlyCPU, skipCUDAIf, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfRocm, onlyOnCPUAndCUDA, dtypesIfCUDA, onlyCUDA, skipCUDAVersionIn, skipMeta, skipCUDAIfNoCusolver) -from torch.testing import floating_and_complex_types, floating_types, all_types, make_tensor +from torch.testing import make_tensor +from torch.testing._internal.common_dtype import ( + all_types, floating_types, floating_and_complex_types, get_all_dtypes, get_all_int_dtypes, get_all_complex_dtypes, + get_all_fp_dtypes, +) from torch.testing._internal.common_cuda import SM53OrLater, tf32_on_and_off, CUDA11OrLater, CUDA9 from torch.distributions.binomial import Binomial @@ -89,7 +93,7 @@ def check(a_sizes_, b_sizes_): # Tests torch.outer, and its alias, torch.ger, vs. NumPy @precisionOverride({torch.bfloat16: 1e-1}) - @dtypes(*(torch.testing.get_all_dtypes())) + @dtypes(*(get_all_dtypes())) def test_outer(self, device, dtype): def run_test_case(a, b): if dtype == torch.bfloat16: @@ -772,7 +776,7 @@ def check(m, a, b, beta, alpha): check(m_scalar, a, b, beta, alpha) # test nans and infs are not propagated to the output when beta == 0 - float_and_complex_dtypes = torch.testing.get_all_fp_dtypes() + torch.testing.get_all_complex_dtypes() + float_and_complex_dtypes = get_all_fp_dtypes() + get_all_complex_dtypes() if beta == 0 and dtype in float_and_complex_dtypes: m[0][10] = m[10][10] = m[20][20] = float('inf') m[1][10] = m[11][10] = m[21][20] = float('nan') @@ -785,7 +789,7 @@ def test_addr_bool(self, device, dtype): self._test_addr_vs_numpy(device, dtype, beta=False, alpha=False) self._test_addr_vs_numpy(device, dtype, beta=True, alpha=True) - @dtypes(*(torch.testing.get_all_int_dtypes())) + @dtypes(*(get_all_int_dtypes())) def test_addr_integral(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'argument beta must not be a floating point number.'): @@ -806,7 +810,7 @@ def test_addr_integral(self, device, dtype): self._test_addr_vs_numpy(device, dtype, beta=2, alpha=2) @precisionOverride({torch.bfloat16: 1e-1}) - @dtypes(*(torch.testing.get_all_fp_dtypes() + torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) def test_addr_float_and_complex(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'Boolean beta only supported for Boolean results.'): @@ -819,11 +823,11 @@ def test_addr_float_and_complex(self, device, dtype): self._test_addr_vs_numpy(device, dtype, beta=0., alpha=2) # when beta is not zero self._test_addr_vs_numpy(device, dtype, beta=0.5, alpha=2) - if dtype in torch.testing.get_all_complex_dtypes(): + if dtype in get_all_complex_dtypes(): self._test_addr_vs_numpy(device, dtype, beta=(0 + 0.1j), alpha=(0.2 - 0.2j)) - @dtypes(*itertools.product(torch.testing.get_all_dtypes(), - torch.testing.get_all_dtypes())) + @dtypes(*itertools.product(get_all_dtypes(), + get_all_dtypes())) def test_outer_type_promotion(self, device, dtypes): a = torch.randn(5).to(device=device, dtype=dtypes[0]) b = torch.randn(5).to(device=device, dtype=dtypes[1]) @@ -831,9 +835,9 @@ def test_outer_type_promotion(self, device, dtypes): result = op(a, b) self.assertEqual(result.dtype, torch.result_type(a, b)) - @dtypes(*itertools.product(torch.testing.get_all_dtypes(), - torch.testing.get_all_dtypes(), - torch.testing.get_all_dtypes())) + @dtypes(*itertools.product(get_all_dtypes(), + get_all_dtypes(), + get_all_dtypes())) def test_addr_type_promotion(self, device, dtypes): a = make_tensor((5,), device=device, dtype=dtypes[0], low=-2, high=2) b = make_tensor((5,), device=device, dtype=dtypes[1], low=-2, high=2) @@ -5287,8 +5291,8 @@ def call_torch_fn(*args, **kwargs): self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape]) @dtypesIfCUDA(torch.cfloat, torch.cdouble, - *torch.testing.get_all_fp_dtypes(include_half=not CUDA9, include_bfloat16=(CUDA11OrLater and SM53OrLater))) - @dtypes(*(set(torch.testing.get_all_dtypes()) - {torch.half, torch.bool})) + *get_all_fp_dtypes(include_half=not CUDA9, include_bfloat16=(CUDA11OrLater and SM53OrLater))) + @dtypes(*(set(get_all_dtypes()) - {torch.half, torch.bool})) def test_blas_alpha_beta_empty(self, device, dtype): # This test is disabled on CUDA 9 due to: # See: https://github.com/pytorch/pytorch/issues/31006 @@ -5324,7 +5328,7 @@ def test_blas_alpha_beta_empty(self, device, dtype): self.assertEqual(torch.full((2, 3), beta * value, dtype=dtype, device=device), torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta, out=out)) - @dtypes(*(torch.testing.get_all_complex_dtypes() + torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_complex_dtypes() + get_all_fp_dtypes())) def test_blas_nan_out(self, device, dtype): # These functions should work correctly with NaN filled outputs, # but need special handling, see [NOTE: cpu_zero] @@ -5950,9 +5954,9 @@ def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out= @precisionOverride({torch.bfloat16: 1e-0, torch.half: 5e-4, torch.float: 1e-4, torch.double: 1e-8, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), - *torch.testing.get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)), - include_half=(not TEST_WITH_ROCM))) + @dtypesIfCUDA(*get_all_complex_dtypes(), + *get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)), + include_half=(not TEST_WITH_ROCM))) @dtypes(torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble) def test_addmv(self, device, dtype): # have to use torch.randn(...).to(bfloat16) instead of @@ -5986,7 +5990,7 @@ def test_addmv(self, device, dtype): for m, v in itertools.product(ms, vs): self._test_addmm_addmv(torch.addmv, t, m, v, beta=0) - @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)))) + @dtypesIfCUDA(*get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)))) @dtypes(torch.float, torch.double) def test_addmv_rowmajor_colmajor_incx_incy_lda(self, device, dtype): # tests (o, s)*(s). o is output size, s is summed size. @@ -6017,9 +6021,9 @@ def _test(row_major, incx, incy, lda_tail): @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), - *torch.testing.get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)))) - @dtypes(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes()) + @dtypesIfCUDA(*get_all_complex_dtypes(), + *get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)))) + @dtypes(*get_all_complex_dtypes(), *get_all_fp_dtypes()) @tf32_on_and_off(0.05) def test_addmm(self, device, dtype): M = torch.randn(10, 25, device=device).to(dtype) @@ -6052,7 +6056,7 @@ def maybe_transpose(cond, m): self._test_addmm_addmv(torch.addmm, M, m1, m2, transpose_out=t4) @dtypes(torch.float, torch.double) - @dtypesIfCUDA(*([torch.float, torch.double] + torch.testing.get_all_complex_dtypes())) + @dtypesIfCUDA(*([torch.float, torch.double] + get_all_complex_dtypes())) @tf32_on_and_off(0.005) def test_addmm_sizes(self, device, dtype): for m in [0, 1, 25]: @@ -6226,7 +6230,7 @@ def test_strided_mm_bmm(self, device, dtype): @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05}) @skipCUDAIf(torch.version.cuda == "10.1", "flaky on CUDA 10.1") @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_fp_dtypes(), *torch.testing.get_all_complex_dtypes()) + @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) @tf32_on_and_off(0.05) def test_bmm(self, device, dtype): if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater: @@ -6338,7 +6342,7 @@ def _test_addbmm_baddbmm(self, func, b1, b2, ref, out_tensor): @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05}) @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_fp_dtypes(), *torch.testing.get_all_complex_dtypes()) + @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) @tf32_on_and_off(0.05) def test_addbmm(self, device, dtype): if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater: @@ -6411,7 +6415,7 @@ def generate_tensor(): @precisionOverride({torch.half: 0.1, torch.bfloat16: 0.5}) @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_fp_dtypes(), *torch.testing.get_all_complex_dtypes()) + @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) @tf32_on_and_off(0.05) def test_baddbmm(self, device, dtype): if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater: diff --git a/test/test_nn.py b/test/test_nn.py index 5008c7256acf7..2d66477ff826a 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -33,7 +33,7 @@ from torch.nn import Parameter from torch.nn.parameter import UninitializedParameter, UninitializedBuffer from torch.nn.parallel._functions import Broadcast -from torch.testing import get_all_fp_dtypes +from torch.testing._internal.common_dtype import integral_types, get_all_fp_dtypes, get_all_math_dtypes from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, download_file, \ get_function_arglist, load_tests, repeat_test_for_types, ALL_TENSORTYPES, \ @@ -9406,9 +9406,9 @@ def test_cosine_embedding_loss_with_diff_type(self): input2 = torch.tensor([[2, 3, 5], [3, 2, 1]], dtype=torch.double, device=device) target = torch.tensor([1, -1], dtype=torch.int, device=device) expected = torch.nn.functional.cosine_embedding_loss(input1, input2, target) - for dt1 in torch.testing.get_all_math_dtypes(device): - for dt2 in torch.testing.get_all_math_dtypes(device): - for dt3 in torch.testing.get_all_math_dtypes(device): + for dt1 in get_all_math_dtypes(device): + for dt2 in get_all_math_dtypes(device): + for dt3 in get_all_math_dtypes(device): # dt3 is used as dtype for target = [1, -1], so let's skip unsigned type if dt3 == torch.uint8: continue @@ -9425,7 +9425,7 @@ def test_kl_div_with_diff_type(self): input = torch.tensor([[2, 3, 5], [3, 2, 1]], dtype=torch.double, device=device) target = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.double, device=device) expected = torch.nn.functional.kl_div(input, target) - for input_dtype in torch.testing.get_all_math_dtypes(device): + for input_dtype in get_all_math_dtypes(device): if input_dtype.is_complex: continue for target_dtype in [torch.float32, torch.float64, torch.float16]: @@ -9441,7 +9441,7 @@ def test_kl_div_with_diff_type_log_target(self): input = torch.tensor([[2, 3, 5], [3, 2, 1]], dtype=torch.double, device=device) target = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.double, device=device).log() expected = torch.nn.functional.kl_div(input, target, log_target=True) - for input_dtype in torch.testing.get_all_math_dtypes(device): + for input_dtype in get_all_math_dtypes(device): if input_dtype.is_complex: continue for target_dtype in [torch.float32, torch.float64, torch.float16]: @@ -9584,7 +9584,7 @@ def _input_grad(input, target, reduction): return input.grad for device, dtype, reduction in product(device_(), - torch.testing.integral_types(), + integral_types(), ('none', 'sum', 'mean')): input = torch.randn(2, 2, device=device, requires_grad=True) target = torch.randint(0, 9, (2, 2), device=device, dtype=dtype) diff --git a/test/test_numpy_interop.py b/test/test_numpy_interop.py index be46f93bdf3a8..a6f5be036c7a6 100644 --- a/test/test_numpy_interop.py +++ b/test/test_numpy_interop.py @@ -7,6 +7,7 @@ (TestCase, run_tests) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, onlyCPU, dtypes) +from torch.testing._internal.common_dtype import get_all_dtypes # For testing handling NumPy objects and sending tensors to / accepting # arrays from NumPy. @@ -393,7 +394,7 @@ def test_has_storage_numpy(self, device): self.assertIsNotNone(torch.tensor(arr, device=device, dtype=torch.long).storage()) self.assertIsNotNone(torch.tensor(arr, device=device, dtype=torch.uint8).storage()) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_numpy_scalar_cmp(self, device, dtype): if dtype.is_complex: tensors = (torch.tensor(complex(1, 3), dtype=dtype, device=device), diff --git a/test/test_ops.py b/test/test_ops.py index b5b03c5b96ab9..a9d470fec5e44 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -5,8 +5,8 @@ import torch -from torch.testing import \ - (FileCheck, floating_and_complex_types_and, get_all_dtypes, make_tensor) +from torch.testing import FileCheck, make_tensor +from torch.testing._internal.common_dtype import floating_and_complex_types_and, get_all_dtypes from torch.testing._internal.common_utils import \ (TestCase, is_iterable_of_tensors, run_tests, IS_SANDCASTLE, clone_input_helper, gradcheck, gradgradcheck, IS_IN_CI, suppress_warnings) diff --git a/test/test_reductions.py b/test/test_reductions.py index a9c667564d118..9760eae52813d 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -9,8 +9,11 @@ import warnings from torch._six import inf, nan -from torch.testing import ( - integral_types_and, floating_and_complex_types_and, make_tensor) +from torch.testing import make_tensor +from torch.testing._internal.common_dtype import ( + get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_complex_dtypes, get_all_fp_dtypes, + integral_types_and, floating_and_complex_types_and +) from torch.testing._internal.common_utils import ( TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict, IS_WINDOWS) @@ -625,7 +628,7 @@ def _test_out(dtype, other_dtype): # 'out' is favored over dtype, check error self.assertRaises(RuntimeError, lambda: fn(x, out=out, dtype=other_dtype)) - for dtype in [dtype for dtype in torch.testing.get_all_math_dtypes('cpu') if dtype != torch.float16]: + for dtype in [dtype for dtype in get_all_math_dtypes('cpu') if dtype != torch.float16]: x = torch.ones(shape, dtype=dtype) expected_dtype = dtype if dtype.is_floating_point or dtype.is_complex else torch.int64 self.assertIs(expected_dtype, fn(x).dtype) @@ -1273,7 +1276,7 @@ def test_dtype_bfloat16(values_bf16=False, boundaries_bf16=False): test_dtype_bfloat16(False, True) test_dtype_bfloat16(True, True) - @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_complex=False)) + @dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) def test_nansum(self, device, dtype): args = product( (True, False), # noncontiguous @@ -1326,15 +1329,15 @@ def _test_reduction_function_with_numpy(self, torch_func, np_func, device, dtype self.compare_with_numpy(torch_func_partial, np_func_partial, x, device=None, dtype=None, atol=atol, rtol=rtol, exact_dtype=exact_dtype) - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) + - torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + + get_all_complex_dtypes())) def test_count_nonzero(self, device, dtype): self._test_reduction_function_with_numpy(torch.count_nonzero, np.count_nonzero, device, dtype) self._test_reduction_function_with_numpy(torch.count_nonzero, np.count_nonzero, device, dtype, True) def _test_sum_reduction_vs_numpy(self, torch_fn, np_fn, device, dtype, with_keepdim=False, with_extremal=False): def is_integral(dtype): - return dtype in torch.testing.get_all_int_dtypes() + return dtype in get_all_int_dtypes() # On Windows CI, the current version of `numpy` promotes all lower integers # dtypes to int32 while `torch` promotes them to int64. Hence we skip on checking @@ -1363,27 +1366,27 @@ def is_integral(dtype): with_keepdim=with_keepdim, with_extremal=with_extremal) @onlyOnCPUAndCUDA - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) def test_sum_vs_numpy(self, device, dtype): self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype) self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype, with_extremal=True) self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype, with_keepdim=True) @onlyOnCPUAndCUDA - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) def test_nansum_vs_numpy(self, device, dtype): self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype) self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype, with_extremal=True) self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype, with_keepdim=True) - @dtypes(*(torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_complex_dtypes())) def test_nansum_complex(self, device, dtype): x = torch.randn((3, 3, 3), device=device, dtype=dtype) with self.assertRaisesRegex(RuntimeError, "nansum does not support complex inputs"): torch.nansum(x) def test_nansum_out_dtype(self, device): - dtypes = list(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False)) + dtypes = list(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False)) for inp_dtype, out_dtype in combinations(dtypes, 2): shape = _rand_shape(random.randint(2, 5), min_size=5, max_size=10) x = _generate_input(shape, inp_dtype, device, with_extremal=False) @@ -1392,7 +1395,7 @@ def test_nansum_out_dtype(self, device): np_fn = partial(np.nansum, dtype=np_out_dtype) self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) def test_argminmax_multiple(self, device, dtype): # Case: All Ones t = torch.ones(3, 3, device=device, dtype=dtype) @@ -1400,7 +1403,7 @@ def test_argminmax_multiple(self, device, dtype): self.compare_with_numpy(torch.argmin, np.argmin, t) # Case: With single `nan` present. - if dtype in torch.testing.get_all_fp_dtypes(): + if dtype in get_all_fp_dtypes(): t[2, 2] = float('nan') self.compare_with_numpy(torch.argmax, np.argmax, t) self.compare_with_numpy(torch.argmin, np.argmin, t) @@ -1477,8 +1480,8 @@ def verify_against_numpy(t): [0, 0]], device=device, dtype=dtype) verify_against_numpy(t) - @dtypes(*(torch.testing.get_all_dtypes(include_half=True, include_bfloat16=False, - include_bool=True, include_complex=True))) + @dtypes(*(get_all_dtypes(include_half=True, include_bfloat16=False, + include_bool=True, include_complex=True))) def test_all_any_vs_numpy(self, device, dtype): # Note [all, any uint8 compatibility]: However for compatibility reason, # for `uint8`, they return Tensor of same dtype `uint8`. @@ -1706,7 +1709,7 @@ def test_minmax_illegal_dtype(self, device): with self.assertRaisesRegex(RuntimeError, rmsg): torch.min(x, dim=0, out=(illegal_values, illegal_indices)) - @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_complex=False)) + @dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) def test_dim_arg_reduction_scalar(self, device, dtype): example = 4.0 @@ -1724,7 +1727,7 @@ def test_dim_arg_reduction_scalar(self, device, dtype): @precisionOverride({torch.float16: 1e-2, torch.bfloat16: 1e-2}) - @dtypes(*(set(torch.testing.get_all_dtypes(include_bool=False, include_complex=False)) - {torch.uint8})) + @dtypes(*(set(get_all_dtypes(include_bool=False, include_complex=False)) - {torch.uint8})) def test_dim_reduction(self, device, dtype): example = [[-1, 2, 1], [5, 3, 6]] @@ -2968,8 +2971,8 @@ def test_reduction_empty_any_all(self, device): shape = (2, 0, 4) x = torch.randn(shape, device=device) - for dtype in torch.testing.get_all_dtypes(include_half=True, include_bfloat16=False, - include_bool=True, include_complex=True): + for dtype in get_all_dtypes(include_half=True, include_bfloat16=False, + include_bool=True, include_complex=True): # Refer: [all, any uint8 compatibility] if dtype == torch.uint8: out_dtype = torch.uint8 diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py index cb4ec3c18f82a..3f8c760264709 100644 --- a/test/test_shape_ops.py +++ b/test/test_shape_ops.py @@ -13,6 +13,7 @@ from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCPU, onlyCUDA, dtypes, onlyOnCPUAndCUDA, dtypesIfCPU, dtypesIfCUDA, largeTensorTest) +from torch.testing._internal.common_dtype import get_all_dtypes # TODO: replace with make_tensor def _generate_input(shape, dtype, device, with_extremal): @@ -224,9 +225,9 @@ def test_diagonal_multidim(self, device, dtype): self.assertEqual(expected, result) @onlyOnCPUAndCUDA - @dtypesIfCPU(*torch.testing.get_all_dtypes(include_complex=False, include_bool=False, include_half=False, - include_bfloat16=False)) - @dtypesIfCUDA(*torch.testing.get_all_dtypes(include_complex=False, include_bool=False, include_bfloat16=False)) + @dtypesIfCPU(*get_all_dtypes(include_complex=False, include_bool=False, include_half=False, + include_bfloat16=False)) + @dtypesIfCUDA(*get_all_dtypes(include_complex=False, include_bool=False, include_bfloat16=False)) def test_trace(self, device, dtype): def test(shape): tensor = make_tensor(shape, device, dtype, low=-9, high=9) @@ -338,7 +339,7 @@ def test_clamp_raises_arg_errors(self, device): with self.assertRaisesRegex(RuntimeError, error_msg): torch.clamp(X) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_flip(self, device, dtype): make_from_data = partial(torch.tensor, device=device, dtype=dtype) make_from_size = partial(make_tensor, device=device, dtype=dtype) @@ -437,7 +438,7 @@ def gen_data(): for dims in test_dims: self.assertEqual(size, list(data.flip(dims).size())) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_flip_errors(self, device, dtype): make_arg = partial(make_tensor, dtype=dtype, device=device) data = make_arg((2, 2, 2)) @@ -455,7 +456,7 @@ def test_flip_errors(self, device, dtype): def _rand_shape(self, dim, min_size, max_size): return tuple(torch.randint(min_size, max_size + 1, (dim,))) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_flip_numpy(self, device, dtype): make_arg = partial(make_tensor, dtype=dtype, device=device) @@ -564,7 +565,7 @@ def test_nonzero_no_warning(self, device): t.nonzero() self.assertEqual(len(w), 0) - @dtypes(*torch.testing.get_all_dtypes(include_complex=False)) + @dtypes(*get_all_dtypes(include_complex=False)) def test_nonzero(self, device, dtype): shapes = [ diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py index e562e389a3fc8..52c32952a6965 100644 --- a/test/test_sort_and_select.py +++ b/test/test_sort_and_select.py @@ -5,7 +5,10 @@ from torch._six import nan from itertools import permutations, product -from torch.testing import all_types, all_types_and, make_tensor +from torch.testing import make_tensor +from torch.testing._internal.common_dtype import ( + all_types, all_types_and, floating_types_and, get_all_dtypes, get_all_int_dtypes, get_all_fp_dtypes, +) from torch.testing._internal.common_utils import \ (TEST_WITH_ROCM, TestCase, run_tests, slowTest) from torch.testing._internal.common_device_type import \ @@ -128,7 +131,7 @@ def test_sort(self, device): 'random with NaNs') # FIXME: remove torch.bool from unsupported types once support is added for cub sort - @dtypes(*set(torch.testing.get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) + @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) def test_stable_sort(self, device, dtype): if TEST_WITH_ROCM and dtype == torch.bfloat16: return @@ -223,11 +226,11 @@ def test_topk_1d_output_discontiguous(self, device, dtype): self.assertEqual(values, values_cont) # FIXME: remove torch.bool from unsupported types once support is added for cub sort - @dtypes(*set(torch.testing.get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) + @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) def test_stable_sort_against_numpy(self, device, dtype): if TEST_WITH_ROCM and dtype == torch.bfloat16: return - if dtype in torch.testing.floating_types_and(torch.float16, torch.bfloat16): + if dtype in floating_types_and(torch.float16, torch.bfloat16): inf = float('inf') neg_inf = -float('inf') nan = float('nan') @@ -288,7 +291,7 @@ def repeated_index_fill(t, dim, idxs, vals): idx_numpy = np.argsort(sample_numpy, axis=dim, kind='stable') self.assertEqual(idx_torch, idx_numpy) - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) def test_msort(self, device, dtype): if TEST_WITH_ROCM and dtype == torch.bfloat16: return @@ -634,7 +637,7 @@ def test_topk_bfloat16(self, device, dtype): for curr_size in (small, large): self._test_topk_dtype(device, dtype, False, curr_size) - @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes()) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float, torch.double, torch.bfloat16) def test_topk_nonfinite(self, device, dtype): if TEST_WITH_ROCM and dtype == torch.bfloat16: @@ -665,11 +668,11 @@ def test_topk_4d(self, device): self.assertEqual(ind, expected_ind, atol=0, rtol=0) @onlyOnCPUAndCUDA - @dtypesIfCUDA(*(torch.testing.get_all_dtypes(include_complex=False, - include_bool=False, - include_half=False, - include_bfloat16=True))) - @dtypes(*(torch.testing.get_all_dtypes(include_complex=False, include_bool=False, include_half=False, include_bfloat16=False))) + @dtypesIfCUDA(*(get_all_dtypes(include_complex=False, + include_bool=False, + include_half=False, + include_bfloat16=True))) + @dtypes(*(get_all_dtypes(include_complex=False, include_bool=False, include_half=False, include_bfloat16=False))) def test_topk_zero(self, device, dtype): if TEST_WITH_ROCM and dtype == torch.bfloat16: return @@ -726,8 +729,8 @@ def ensure_tuple(x): self.assertEqual(expected_inverse.view(additional_shape), y_inverse) self.assertEqual(expected_counts, y_counts) - @dtypesIfCPU(*set(torch.testing.get_all_dtypes()) - {torch.complex64, torch.complex128}) - @dtypes(*set(torch.testing.get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128}) + @dtypesIfCPU(*set(get_all_dtypes()) - {torch.complex64, torch.complex128}) + @dtypes(*set(get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128}) def test_unique(self, device, dtype): if dtype is torch.half and self.device_type == 'cpu': return # CPU does not have half support @@ -786,8 +789,8 @@ def ensure_tuple(x): count += 1 self.assertEqual(j, count) - @dtypesIfCPU(*set(torch.testing.get_all_dtypes()) - {torch.complex64, torch.complex128}) - @dtypes(*set(torch.testing.get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128}) + @dtypesIfCPU(*set(get_all_dtypes()) - {torch.complex64, torch.complex128}) + @dtypes(*set(get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128}) def test_unique_consecutive(self, device, dtype): if dtype is torch.half and self.device_type == 'cpu': return # CPU does not have half support diff --git a/test/test_sparse.py b/test/test_sparse.py index fb0a660333583..f9ed0dc11ffbd 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -16,6 +16,9 @@ (instantiate_device_type_tests, ops, dtypes, dtypesIfCPU, onlyCPU, onlyCUDA, deviceCountAtLeast) from torch.testing._internal.common_methods_invocations import \ (sparse_unary_ufuncs) +from torch.testing._internal.common_dtype import ( + floating_and_complex_types, floating_and_complex_types_and, get_all_dtypes, get_all_int_dtypes, +) if TEST_SCIPY: import scipy.sparse @@ -286,7 +289,7 @@ def test_ctor_size_checks(self, device, dtype): RuntimeError, lambda: self.sparse_tensor(indices, values, torch.Size([2, 4, 2, 1]))) - @dtypes(*torch.testing.floating_and_complex_types_and(torch.float16)) + @dtypes(*floating_and_complex_types_and(torch.float16)) def test_to_dense(self, device, dtype): def test_tensor(x, res): x.to_dense() # Tests triple to_dense for memory corruption @@ -1943,7 +1946,7 @@ def test_narrow(self, device, dtype, coalesced): def _test_log1p_tensor(self, sparse_tensor, coalesced): def is_integral(dtype): - return dtype in torch.testing.get_all_int_dtypes() + return dtype in get_all_int_dtypes() dense_tensor = sparse_tensor.to_dense() expected_output = dense_tensor.log1p() @@ -1977,8 +1980,8 @@ def is_integral(dtype): sparse_tensor.requires_grad_() @coalescedonoff - @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False, - include_bfloat16=False, include_complex=False)) + @dtypes(*get_all_dtypes(include_bool=False, include_half=False, + include_bfloat16=False, include_complex=False)) def test_log1p(self, device, dtype, coalesced): if coalesced: input_coalesced = torch.sparse_coo_tensor( @@ -2086,7 +2089,7 @@ def test_neg_negative(self, device, dtype, coalesced): def _test_asin_arcsin(self, sparse_tensor, coalesced): def is_integral(dtype): - return dtype in torch.testing.get_all_int_dtypes() + return dtype in get_all_int_dtypes() is_integral_dtype = is_integral(sparse_tensor.dtype) dense_tensor = sparse_tensor.to_dense() @@ -2125,8 +2128,8 @@ def is_integral(dtype): op(sparse_tensor) @coalescedonoff - @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False, - include_bfloat16=False, include_complex=False)) + @dtypes(*get_all_dtypes(include_bool=False, include_half=False, + include_bfloat16=False, include_complex=False)) def test_asin_arcsin(self, device, dtype, coalesced): if coalesced: input_coalesced = torch.sparse_coo_tensor( @@ -2196,7 +2199,7 @@ def test_shape(di, dj, dk, nnz): y, _, _ = self._gen_sparse(2, 20, [10, 100], dtype, device, coalesced) res = x.mv(y) - @dtypes(*torch.testing.floating_and_complex_types()) + @dtypes(*floating_and_complex_types()) def test_sparse_add_coalesce(self, device, dtype): i = self.index_tensor([[1, 2, 1]], device=device) v = torch.tensor([3, 4, 5], dtype=dtype, device=device) @@ -2613,14 +2616,14 @@ def test_legacy_new(self, device): @onlyCPU # not really, but we only really want to run this once def test_dtypes(self, device): - all_sparse_dtypes = torch.testing.get_all_dtypes(include_complex=True) + all_sparse_dtypes = get_all_dtypes(include_complex=True) do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu')) if torch.cuda.is_available(): do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cuda:0')) @onlyCPU # not really, but we only really want to run this once def test_empty_full(self, device): - all_sparse_dtypes = torch.testing.get_all_dtypes(include_complex=True) + all_sparse_dtypes = get_all_dtypes(include_complex=True) do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu')) if torch.cuda.device_count() > 0: do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, None) diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index fbb2b30e46304..af99fa031fca3 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -9,6 +9,7 @@ (IS_MACOS, IS_WINDOWS, TestCase, run_tests, load_tests, coalescedonoff) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, onlyCPU, onlyCUDA) +from torch.testing._internal.common_dtype import floating_types, get_all_dtypes # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -43,7 +44,7 @@ def test_csr_layout(self): self.assertEqual(str(torch.sparse_csr), 'torch.sparse_csr') self.assertEqual(type(torch.sparse_csr), torch.layout) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_sparse_csr_constructor_shape_inference(self, device, dtype): crow_indices = [0, 2, 4] col_indices = [0, 1, 0, 1] @@ -56,7 +57,7 @@ def test_sparse_csr_constructor_shape_inference(self, device, dtype): self.assertEqual(dtype, sparse.dtype) self.assertEqual(torch.device(device), sparse.device) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_sparse_csr_constructor(self, device, dtype): crow_indices = [0, 2, 4] col_indices = [0, 1, 0, 1] @@ -73,7 +74,7 @@ def test_sparse_csr_constructor(self, device, dtype): self.assertEqual(torch.tensor(col_indices, dtype=index_dtype), sparse.col_indices()) self.assertEqual(torch.tensor(values, dtype=dtype), sparse.values()) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_sparse_csr_constructor_from_lists(self, device, dtype): # without size sparse = torch.sparse_csr_tensor([0, 2, 4], @@ -209,7 +210,7 @@ def test_factory_indices_invariants_check(self, device): device=device) @onlyCUDA - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_factory_device_type_inference(self, device, dtype): cpu_cuda = ('cpu', 'cuda') cpu_cuda_none = cpu_cuda + (None,) @@ -262,7 +263,7 @@ def test_sparse_csr_print(self, device): printed.append("# col_indices shape: {}".format(col_indices_shape)) printed.append("# values_shape: {}".format(values_shape)) for index_dtype in [torch.int32, torch.int64]: - for dtype in torch.testing.floating_types(): + for dtype in floating_types(): printed.append("########## {}/{} ##########".format(dtype, index_dtype)) x = torch.sparse_csr_tensor(torch.tensor([0, 2, 4], dtype=index_dtype), torch.tensor([0, 1, 0, 1], dtype=index_dtype), @@ -280,7 +281,7 @@ def test_sparse_csr_print(self, device): self.assertExpected('\n'.join(printed)) self.maxDiff = orig_maxDiff - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_sparse_csr_from_dense(self, device, dtype): dense = torch.tensor([[4, 5, 0], [0, 0, 0], [1, 0, 0]], dtype=dtype, device=device) sparse = dense.to_sparse_csr() @@ -300,7 +301,7 @@ def test_sparse_csr_from_dense(self, device, dtype): self.assertEqual(torch.tensor([0, 1, 2] * 3, dtype=torch.int64), sparse.col_indices()) self.assertEqual(torch.tensor([2] * 9, dtype=dtype), sparse.values()) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_sparse_csr_to_dense(self, device, dtype): mn = [5, 2, 0] for (m, n) in itertools.product(mn, mn): @@ -377,7 +378,7 @@ def test_mkl_matvec_warnings(self, device, dtype): self.assertIn("Pytorch is compiled with MKL LP64 and will convert col_indices to int32", str(w[1].message)) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_sparse_csr_from_dense_convert_error(self, device, dtype): size = (4, 2, 4) dense = make_tensor(size, dtype=dtype, device=device) @@ -445,7 +446,7 @@ def test_shape(di, dj, dk, nnz): test_shape(i, j, k, i * j // 2) test_shape(4, 4, 4, 0) - @dtypes(*torch.testing.floating_types()) + @dtypes(*floating_types()) def test_sparse_mm(self, device, dtype): def test_shape(d1, d2, d3, nnz, transposed): if transposed: @@ -459,7 +460,7 @@ def test_shape(d1, d2, d3, nnz, transposed): test_shape(7, 8, 9, 20, False) test_shape(7, 8, 9, 20, True) - @dtypes(*torch.testing.floating_types()) + @dtypes(*floating_types()) def test_sparse_addmm(self, device, dtype): def test_shape(m, n, p, nnz, broadcast, alpha_beta=None): if alpha_beta is None: @@ -514,7 +515,7 @@ def _test_spadd_shape(nnz, shape): _test_spadd_shape(10, [100, 1]) _test_spadd_shape(10, [1, 100]) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_coo_csr_conversion(self, device, dtype): for m, n in itertools.product([5, 2, 0], [5, 2, 0]): size = (m, n) diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index a7496919eaeb8..4a2216d230203 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -17,6 +17,9 @@ instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, onlyCPU, largeTensorTest, precisionOverride, dtypes, onlyCUDA, skipCPUIf, dtypesIfCUDA, dtypesIfCPU, skipMeta) +from torch.testing._internal.common_dtype import ( + get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes +) # TODO: refactor tri_tests_args, _compare_trilu_indices, run_additional_tri_tests from torch.testing._internal.common_methods_invocations import ( @@ -140,7 +143,7 @@ def test_vander_types(self, device, dtype): exact_dtype=False) def test_cat_all_dtypes_and_devices(self, device): - for dt in torch.testing.get_all_dtypes(): + for dt in get_all_dtypes(): x = torch.tensor([[1, 2], [3, 4]], dtype=dt, device=device) expected1 = torch.tensor([[1, 2], [3, 4], [1, 2], [3, 4]], dtype=dt, device=device) @@ -150,7 +153,7 @@ def test_cat_all_dtypes_and_devices(self, device): self.assertEqual(torch.cat((x, x), 1), expected2) def test_fill_all_dtypes_and_devices(self, device): - for dt in torch.testing.get_all_dtypes(): + for dt in get_all_dtypes(): for x in [torch.tensor((10, 10), dtype=dt, device=device), torch.empty(10000, dtype=dt, device=device)]: # large tensor numel = x.numel() @@ -304,7 +307,7 @@ def run_test(shape, device, diagonal, dtype): (3, 1), (5, 3, 1), (7, 5, 3, 1), # very fat matrices (1, 3), (5, 1, 3), (7, 5, 1, 3), # very thin matrices (1, 3, 3, 3), (3, 1, 3, 3, 3)] # unsqueezed batch dimensions - dtypes = [dtype for dtype in torch.testing.get_all_dtypes() if dtype != torch.bfloat16] + dtypes = [dtype for dtype in get_all_dtypes() if dtype != torch.bfloat16] for s, d, dtype in product(shapes, diagonals, dtypes): run_test(s, device, d, dtype) @@ -987,8 +990,8 @@ def _test_special_stacks(self, dim, at_least_dim, torch_fn, np_fn, device, dtype np_fn(np_input) @onlyOnCPUAndCUDA - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) + - torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + + get_all_complex_dtypes())) def test_hstack_column_stack(self, device, dtype): ops = ((torch.hstack, np.hstack), (torch.column_stack, np.column_stack)) for torch_op, np_op in ops: @@ -1007,8 +1010,8 @@ def test_hstack_column_stack(self, device, dtype): torch_result) @onlyOnCPUAndCUDA - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) + - torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + + get_all_complex_dtypes())) def test_vstack_row_stack(self, device, dtype): ops = ((torch.vstack, np.vstack), (torch.row_stack, np.row_stack)) for torch_op, np_op in ops: @@ -1025,8 +1028,8 @@ def test_vstack_row_stack(self, device, dtype): self.assertEqual(actual, expected) @onlyOnCPUAndCUDA - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) + - torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + + get_all_complex_dtypes())) def test_dstack(self, device, dtype): self._test_special_stacks(2, 3, torch.dstack, np.dstack, device, dtype) for i in range(5): @@ -1572,7 +1575,7 @@ def test_random_from_to_bool(self, device): lambda: t.random_(from_, to_) ) - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) def test_random_full_range(self, device, dtype): size = 2000 alpha = 0.1 @@ -1606,7 +1609,7 @@ def test_random_full_range(self, device, dtype): self.assertTrue(from_ <= t.to(torch.double).min() < (from_ + delta)) self.assertTrue((to_inc_ - delta) < t.to(torch.double).max() <= to_inc_) - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) def test_random_from_to(self, device, dtype): size = 2000 alpha = 0.1 @@ -1695,7 +1698,7 @@ def test_random_from_to(self, device, dtype): lambda: t.random_(from_, to_) ) - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) def test_random_to(self, device, dtype): size = 2000 alpha = 0.1 @@ -1753,7 +1756,7 @@ def test_random_to(self, device, dtype): lambda: t.random_(from_, to_) ) - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) def test_random_default(self, device, dtype): size = 2000 alpha = 0.1 @@ -1781,10 +1784,10 @@ def test_empty_full(self, device): device_type = torch_device.type if device_type == 'cpu': - do_test_empty_full(self, torch.testing.get_all_math_dtypes('cpu'), torch.strided, torch_device) + do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, torch_device) if device_type == 'cuda': - do_test_empty_full(self, torch.testing.get_all_math_dtypes('cpu'), torch.strided, None) - do_test_empty_full(self, torch.testing.get_all_math_dtypes('cpu'), torch.strided, torch_device) + do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, None) + do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, torch_device) # TODO: this test should be updated @suppress_warnings @@ -2472,7 +2475,7 @@ def test_empty_tensor_props(self, device): self.assertEqual(x.stride(), y.stride()) def test_eye(self, device): - for dtype in torch.testing.get_all_dtypes(): + for dtype in get_all_dtypes(): if dtype == torch.bfloat16: continue # Test the RuntimeError is raised when either m or n is a negative number @@ -2505,8 +2508,8 @@ def test_eye(self, device): self.assertEqual(res1, res2) @precisionOverride({torch.float: 1e-8, torch.double: 1e-10}) - @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False) + - torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False) + + get_all_complex_dtypes())) def test_linspace_vs_numpy(self, device, dtype): start = -0.0316082797944545745849609375 + (0.8888888888j if dtype.is_complex else 0) end = .0315315723419189453125 + (0.444444444444j if dtype.is_complex else 0) @@ -2543,7 +2546,7 @@ def test_logspace_vs_numpy_complex(self, device, dtype): device, dtype) @precisionOverride({torch.float: 1e-6, torch.double: 1e-10}) - @dtypes(*torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False)) + @dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False)) def test_logspace_vs_numpy(self, device, dtype): start = -0.0316082797944545745849609375 end = .0315315723419189453125 @@ -2653,7 +2656,7 @@ def test_tensor_factories_empty(self, device): shapes = [(5, 0, 1), (0,), (0, 0, 1, 0, 2, 0, 0)] for shape in shapes: - for dt in torch.testing.get_all_dtypes(): + for dt in get_all_dtypes(): self.assertEqual(shape, torch.zeros(shape, device=device, dtype=dt).shape) self.assertEqual(shape, torch.zeros_like(torch.zeros(shape, device=device, dtype=dt)).shape) @@ -2739,8 +2742,8 @@ def test_arange_bfloat16(self, device): bfloat16_tensor = torch.arange(0, 6, step=2, dtype=torch.bfloat16, device=device) self.assertEqual(ref_tensor, bfloat16_tensor) - @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False)) - @dtypesIfCUDA(*torch.testing.get_all_dtypes(include_bool=False, include_half=True)) + @dtypes(*get_all_dtypes(include_bool=False, include_half=False)) + @dtypesIfCUDA(*get_all_dtypes(include_bool=False, include_half=True)) def test_linspace(self, device, dtype): _from = random.random() to = _from + random.random() @@ -2854,12 +2857,12 @@ def _test_linspace(self, device, dtype, steps): # See NOTE [Linspace+Logspace precision override] @skipCPUIf(True, "compares with CPU") @precisionOverride({torch.half: 0.0039 + LINSPACE_LOGSPACE_EXTRA_EPS}) - @dtypes(*(torch.testing.get_all_fp_dtypes() + torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) def test_linspace_device_vs_cpu(self, device, dtype): self._test_linspace(device, dtype, steps=10) @skipCPUIf(True, "compares with CPU") - @dtypes(*(torch.testing.get_all_fp_dtypes() + torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) def test_linspace_special_steps(self, device, dtype): for steps in self.LINSPACE_LOGSPACE_SPECIAL_STEPS: self._test_linspace(device, dtype, steps=steps) @@ -2900,10 +2903,10 @@ def test_logspace_special_steps(self, device, dtype): self._test_logspace(device, dtype, steps=steps) self._test_logspace_base2(device, dtype, steps=steps) - @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False, include_complex=False)) - @dtypesIfCUDA(*((torch.testing.get_all_int_dtypes() + [torch.float32, torch.float16, torch.bfloat16]) + @dtypes(*get_all_dtypes(include_bool=False, include_half=False, include_complex=False)) + @dtypesIfCUDA(*((get_all_int_dtypes() + [torch.float32, torch.float16, torch.bfloat16]) if TEST_WITH_ROCM - else torch.testing.get_all_dtypes(include_bool=False, include_half=True, include_complex=False))) + else get_all_dtypes(include_bool=False, include_half=True, include_complex=False))) def test_logspace(self, device, dtype): _from = random.random() to = _from + random.random() diff --git a/test/test_testing.py b/test/test_testing.py index a5ea232122e08..e45977f3a855e 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -19,12 +19,13 @@ deviceCountAtLeast) from torch.testing._internal.common_methods_invocations import op_db import torch.testing._internal.opinfo_helper as opinfo_helper +from torch.testing._internal.common_dtype import get_all_dtypes # For testing TestCase methods and torch.testing functions class TestTesting(TestCase): # Ensure that assertEqual handles numpy arrays properly - @dtypes(*(torch.testing.get_all_dtypes(include_half=True, include_bfloat16=False, - include_bool=True, include_complex=True))) + @dtypes(*(get_all_dtypes(include_half=True, include_bfloat16=False, + include_bool=True, include_complex=True))) def test_assertEqual_numpy(self, device, dtype): S = 10 test_sizes = [ diff --git a/test/test_torch.py b/test/test_torch.py index ae75ee8d66044..6de409be60d1d 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -48,6 +48,9 @@ import torch.backends.quantized import torch.testing._internal.data from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32 +from torch.testing._internal.common_dtype import ( + get_all_fp_dtypes, get_all_int_dtypes, get_all_math_dtypes, get_all_dtypes, get_all_complex_dtypes +) # Protects against includes accidentally setting the default dtype assert torch.get_default_dtype() is torch.float32 @@ -274,8 +277,8 @@ def get_tensor(size, dtype, device, contiguous): height = 5 width = 5 for device in torch.testing.get_all_device_types(): - for dt1 in torch.testing.get_all_dtypes(): - for dt2 in torch.testing.get_all_dtypes(): + for dt1 in get_all_dtypes(): + for dt2 in get_all_dtypes(): for contiguous in [True, False]: x1 = get_tensor((height, width), dt1, device, contiguous) x2 = get_tensor((height, width), dt2, device, contiguous) @@ -293,14 +296,14 @@ def get_tensor(size, dtype, device, contiguous): self.assertEqual(expected, result) def test_dtypes(self): - all_dtypes = torch.testing.get_all_dtypes() + all_dtypes = get_all_dtypes() do_test_dtypes(self, all_dtypes, torch.strided, torch.device('cpu')) if torch.cuda.is_available(): all_dtypes.remove(torch.bfloat16) # Remove once _th_zero_ is enabled on cuda for bfloat16 do_test_dtypes(self, all_dtypes, torch.strided, torch.device('cuda:0')) def test_copy_dtypes(self): - all_dtypes = torch.testing.get_all_dtypes() + all_dtypes = get_all_dtypes() for dtype in all_dtypes: copied_dtype = copy.deepcopy(dtype) self.assertIs(dtype, copied_dtype) @@ -722,7 +725,7 @@ def reference(x, k, o3, o32): self._test_conv_corr_eq(lambda x, k: torch.conv3(x, k, 'F'), reference) def test_dtype_is_signed(self): - for dtype in torch.testing.get_all_dtypes(): + for dtype in get_all_dtypes(): self.assertEqual(dtype.is_signed, torch.is_signed(torch.tensor(0, dtype=dtype))) self.assertRaisesRegex(RuntimeError, 'not supported for quantized', lambda: torch.quint8.is_signed) @@ -959,7 +962,7 @@ def test_index_add(self): # https://github.com/pytorch/pytorch/issues/29153 def test_index_add_all_dtypes(self): for device in torch.testing.get_all_device_types(): - for dtype in torch.testing.get_all_math_dtypes(device): + for dtype in get_all_math_dtypes(device): for idx_dtype in [torch.int, torch.long]: size = [5, 5] if dtype.is_floating_point or dtype.is_complex: @@ -4297,13 +4300,13 @@ def _cond_fn(x): _sync_raises_helper(f, level) - @dtypes(*torch.testing.get_all_fp_dtypes()) + @dtypes(*get_all_fp_dtypes()) def test_log_normal(self, device, dtype): a = torch.tensor([10], dtype=dtype, device=device).log_normal_() self.assertEqual(a.dtype, dtype) self.assertEqual(a.size(), torch.Size([1])) - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) def test_geometric(self, device, dtype): a = torch.tensor([10], dtype=dtype, device=device).geometric_(0.5) self.assertEqual(a.dtype, dtype) @@ -4335,9 +4338,9 @@ def test_repeat_interleave(self, device): self.assertEqual(a_with_output.dtype, y.dtype) self.assertEqual(a_with_output.size(), torch.Size([3, 2])) - @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False))) - @dtypesIfCPU(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=True))) - @dtypesIfCUDA(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False)) + @dtypesIfCPU(*(get_all_fp_dtypes(include_half=False, include_bfloat16=True))) + @dtypesIfCUDA(*(get_all_fp_dtypes(include_bfloat16=False))) def test_bernoulli_p(self, device, dtype): for trivial_p in ([0, 1], [1, 0, 1, 1, 0, 1]): x = torch.tensor(trivial_p, dtype=dtype, device=device) @@ -4357,9 +4360,9 @@ def isBinary(t): self.assertTrue(isBinary(p)) # RngUniform not implemented for Integral type in XLA test - @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False))) - @dtypesIfCPU(*(torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, include_complex=False))) - @dtypesIfCUDA(*(torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False))) + @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False))) + @dtypesIfCPU(*(get_all_dtypes(include_half=False, include_bfloat16=False, include_complex=False))) + @dtypesIfCUDA(*(get_all_dtypes(include_bfloat16=False, include_complex=False))) def test_bernoulli_self(self, device, dtype): def isBinary(t): @@ -4371,8 +4374,7 @@ def isBinary(t): t.bernoulli_(0.5) self.assertTrue(isBinary(t)) - for p_dtype in torch.testing.get_all_fp_dtypes(include_half=device.startswith('cuda'), - include_bfloat16=False): + for p_dtype in get_all_fp_dtypes(include_half=device.startswith('cuda'), include_bfloat16=False): p = torch.rand(10, dtype=p_dtype, device=device).expand(10, 10) t.fill_(2) t.bernoulli_(p) @@ -4387,8 +4389,8 @@ def isBinary(t): self.assertTrue(isBinary(t)) @slowTest - @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False))) - @dtypesIfCUDA(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False))) + @dtypesIfCUDA(*(get_all_fp_dtypes(include_bfloat16=False))) def test_bernoulli_edge_cases(self, device, dtype): # Need to draw a lot of samples to cover every random floating point number. a = torch.zeros(10000, 10000, dtype=dtype, device=device) # probability of drawing "1" is 0 @@ -4399,7 +4401,7 @@ def test_bernoulli_edge_cases(self, device, dtype): num_zeros = (torch.bernoulli(b) == 0).sum() self.assertEqual(num_zeros, 0) - @dtypes(*torch.testing.get_all_fp_dtypes()) + @dtypes(*get_all_fp_dtypes()) def test_exponential(self, device, dtype): a = torch.tensor([10], dtype=dtype, device=device).exponential_(0.5) self.assertEqual(a.dtype, dtype) @@ -4482,7 +4484,7 @@ def check(msg, *args, **kwargs): check(r'aweights cannot be negative', a, aweights=torch.tensor([-1., -2.])) @skipIfNoSciPy - @dtypes(*torch.testing.get_all_fp_dtypes()) + @dtypes(*get_all_fp_dtypes()) def test_uniform_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -4494,8 +4496,8 @@ def test_uniform_kstest(self, device, dtype): self.assertTrue(res.statistic < 0.1) @skipIfNoSciPy - @dtypes(*torch.testing.get_all_fp_dtypes(include_bfloat16=False)) - @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes()) + @dtypes(*get_all_fp_dtypes(include_bfloat16=False)) + @dtypesIfCUDA(*get_all_fp_dtypes()) def test_normal_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -4506,7 +4508,7 @@ def test_normal_kstest(self, device, dtype): self.assertTrue(res.statistic < 0.1) @skipIfNoSciPy - @dtypes(*torch.testing.get_all_fp_dtypes()) + @dtypes(*get_all_fp_dtypes()) def test_lognormal_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -4520,7 +4522,7 @@ def test_lognormal_kstest(self, device, dtype): self.assertTrue(res.statistic < 0.1) @skipIfNoSciPy - @dtypes(*torch.testing.get_all_fp_dtypes()) + @dtypes(*get_all_fp_dtypes()) def test_exponential_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -4530,7 +4532,7 @@ def test_exponential_kstest(self, device, dtype): self.assertTrue(res.statistic < 0.1) @skipIfNoSciPy - @dtypes(*torch.testing.get_all_fp_dtypes()) + @dtypes(*get_all_fp_dtypes()) def test_cauchy_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -4551,7 +4553,7 @@ def test_cauchy_no_inf(self, device, dtype): self.assertFalse(x.isinf().sum()) @skipIfNoSciPy - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) def test_geometric_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -5005,7 +5007,7 @@ def to_np(t): # All tensors appear contiguous on XLA @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_diff_noncontig(self, device, dtype): shapes = ( (1,), @@ -5025,9 +5027,9 @@ def test_diff_noncontig(self, device, dtype): self._test_diff_numpy(non_contig) # RngNormal not implemented for type f16 for XLA - @dtypes(*torch.testing.get_all_dtypes(include_half=False)) - @dtypesIfCPU(*torch.testing.get_all_dtypes()) - @dtypesIfCUDA(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes(include_half=False)) + @dtypesIfCPU(*get_all_dtypes()) + @dtypesIfCUDA(*get_all_dtypes()) def test_diff(self, device, dtype): shapes = ( (1,), @@ -5298,7 +5300,7 @@ def test_bool_tensor_value_change(self, device): self.assertEqual(x, torch.tensor([False, True], dtype=torch.bool, device=device)) def test_unfold_all_devices_and_dtypes(self, device): - for dt in torch.testing.get_all_dtypes(): + for dt in get_all_dtypes(): if dt == torch.bool: x = torch.empty((0, 1, 3, 0), dtype=dt, device=device) @@ -5318,7 +5320,7 @@ def test_unfold_scalars(self, device): def test_copy_all_dtypes_and_devices(self, device): from copy import copy - for dt in torch.testing.get_all_dtypes(): + for dt in get_all_dtypes(): x = torch.tensor([1, 2, 3, 4], dtype=dt, device=device) x_clone = x.clone() y = copy(x) @@ -5328,7 +5330,7 @@ def test_copy_all_dtypes_and_devices(self, device): self.assertEqual(x, y) def test_clone_all_dtypes_and_devices(self, device): - for dt in torch.testing.get_all_dtypes(): + for dt in get_all_dtypes(): x = torch.tensor((1, 1), dtype=dt, device=device) y = x.clone() self.assertEqual(x, y) @@ -5346,8 +5348,8 @@ def test_clone_not_memory_dense(self): # should retain permutation after densification self.assertTrue(y.stride() == (1, 4)) - @dtypesIfCUDA(*set(torch.testing.get_all_math_dtypes('cuda'))) - @dtypes(*set(torch.testing.get_all_math_dtypes('cpu'))) + @dtypesIfCUDA(*set(get_all_math_dtypes('cuda'))) + @dtypes(*set(get_all_math_dtypes('cpu'))) def test_addcmul(self, device, dtype): # Returns floating or integral scalar corresponding to dtype def _number(floating, integer, dtype): @@ -5396,7 +5398,7 @@ def test_narrow_empty(self, device): sz[d] = 0 self.assertEqual(sz, y.size()) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_index_copy(self, device, dtype): # We just test for num_copy <= num_dest, as otherwise there are repeated indices # and the behavior is undefined @@ -5430,7 +5432,7 @@ def ref_index_copy(tgt, dim, idx, src): # onlyOnCPUAndCUDA due to an XLA error: # https://github.com/pytorch/pytorch/issues/53256 @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_index_copy_scalars(self, device, dtype): # Create the 8 possible combinations of scalar sizes for target / index / source scalars = ((make_tensor(size_t, dtype=dtype, device=device, low=None, high=None), @@ -5535,7 +5537,7 @@ def test_index_put_non_accumulate_deterministic(self, device) -> None: self.assertEqual(output, input_list) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_index_fill(self, device, dtype): x = torch.tensor([[1, 2], [4, 5]], dtype=dtype, device=device) index = torch.tensor([0], device=device) @@ -5552,7 +5554,7 @@ def test_index_fill(self, device, dtype): # The test fails for zero-dimensional tensors on XLA @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_index_select(self, device, dtype): num_src, num_out = 3, 5 @@ -5595,7 +5597,7 @@ def ref_index_select(src, dim, idx): out = source.index_select(0, idx) self.assertEqual(out.item(), source.item()) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_take(self, device, dtype): idx_size = (4,) @@ -5629,7 +5631,7 @@ def ref_take(src, idx): # The bool instance does not work on GPU. See # https://github.com/pytorch/pytorch/issues/54317 - @dtypes(*torch.testing.get_all_dtypes(include_bool=False)) + @dtypes(*get_all_dtypes(include_bool=False)) def test_put(self, device, dtype): src_size = (4,) @@ -5699,7 +5701,7 @@ def ref_put(dst, idx, src, accumulate): # The bool instance does not work on GPU. See # https://github.com/pytorch/pytorch/issues/54317 - @dtypes(*torch.testing.get_all_dtypes(include_bool=False)) + @dtypes(*get_all_dtypes(include_bool=False)) def test_put_accumulate(self, device, dtype): # Test for parallel adds with accumulate == True low_precision = dtype == torch.half or dtype == torch.bfloat16 @@ -5742,10 +5744,10 @@ def scatter_allow_reduce(self, device, dtype, reduceop): # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) # So, we are skipping it here. - @dtypes(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False, include_half=False) + - torch.testing.get_all_complex_dtypes())) - @dtypesIfCPU(*torch.testing.get_all_dtypes()) - @dtypesIfCUDA(*torch.testing.get_all_dtypes()) + @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + + get_all_complex_dtypes())) + @dtypesIfCPU(*get_all_dtypes()) + @dtypesIfCUDA(*get_all_dtypes()) def test_scatter_reduce_operations_to_large_input(self, device, dtype): index = torch.tensor([[1], [2]], device=device, dtype=torch.long) test_data = [ @@ -5772,10 +5774,10 @@ def test_scatter_reduce_operations_to_large_input(self, device, dtype): # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) # So, we are skipping it here. - @dtypes(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False, include_half=False) + - torch.testing.get_all_complex_dtypes())) - @dtypesIfCPU(*torch.testing.get_all_dtypes()) - @dtypesIfCUDA(*torch.testing.get_all_dtypes()) + @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + + get_all_complex_dtypes())) + @dtypesIfCPU(*get_all_dtypes()) + @dtypesIfCUDA(*get_all_dtypes()) def test_scatter_reduce_scalar(self, device, dtype): index = torch.tensor([[1], [2]], device=device, dtype=torch.long) test_data = [ @@ -5813,10 +5815,10 @@ def test_scatter_add_non_unique_index(self, device): # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) # So, we are skipping it here. - @dtypes(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False, include_half=False) + - torch.testing.get_all_complex_dtypes())) - @dtypesIfCPU(*torch.testing.get_all_dtypes()) - @dtypesIfCUDA(*torch.testing.get_all_dtypes()) + @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + + get_all_complex_dtypes())) + @dtypesIfCPU(*get_all_dtypes()) + @dtypesIfCUDA(*get_all_dtypes()) def test_scatter_reduce_non_unique_index(self, device, dtype): height = 2 width = 2 @@ -5840,8 +5842,8 @@ def test_scatter_reduce_non_unique_index(self, device, dtype): # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) # So, we are skipping it here. @onlyCUDA - @dtypesIfCUDA(*(torch.testing.get_all_complex_dtypes() + - torch.testing.get_all_int_dtypes())) + @dtypesIfCUDA(*(get_all_complex_dtypes() + + get_all_int_dtypes())) def test_scatter_reduce_multiply_unsupported_dtypes(self, device, dtype): height = 2 width = 2 @@ -5888,7 +5890,7 @@ def test_scatter_add_bool(self, device): [True, False, True, False, True]], device=device)) @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_masked_scatter(self, device, dtype): dt = dtype with warnings.catch_warnings(record=True) as w: @@ -5973,7 +5975,7 @@ def test_masked_scatter_large_tensor(self, device): result = t.masked_scatter(t, t) self.assertEqual(result, result_cpu) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_masked_select(self, device, dtype): if device == 'cpu': warn = 'masked_select received a mask with dtype torch.uint8,' @@ -6039,7 +6041,7 @@ def test_masked_select_discontiguous(self, device): torch.masked_select(v, m, out=out_dc) self.assertEqual(out_dc, expected, atol=0, rtol=0) - @dtypes(*product(torch.testing.get_all_dtypes(), (torch.uint8, torch.bool))) + @dtypes(*product(get_all_dtypes(), (torch.uint8, torch.bool))) def test_masked_fill(self, device, dtypes): dtype = dtypes[0] mask_dtype = dtypes[1] @@ -6349,8 +6351,8 @@ def test_pdist_norm_large(self, device): self.assertEqual(expected_cpu, actual_gpu.cpu()) @onlyOnCPUAndCUDA - @dtypesIfCUDA(*set(torch.testing.get_all_math_dtypes('cuda'))) - @dtypes(*set(torch.testing.get_all_math_dtypes('cpu'))) + @dtypesIfCUDA(*set(get_all_math_dtypes('cuda'))) + @dtypes(*set(get_all_math_dtypes('cpu'))) def test_addcdiv(self, device, dtype): # Returns floating or integral scalar corresponding to dtype def _number(floating, integer, dtype): @@ -7093,7 +7095,7 @@ def compare_strides(s1, s2, div): _test_helper(x, op, unary=True) @skipMeta - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_dlpack_conversion(self, device, dtype): # DLpack does not explicitly support bool # It does it through uint8 type @@ -7693,8 +7695,8 @@ def _where_valid_scalar_tensor_combination(self, scalar_type, dtype): return False @onlyOnCPUAndCUDA - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes() + - torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes() + + get_all_complex_dtypes())) def test_where_scalar_invalid_combination_raises(self, device, dtype): def checkRaises(scalar_type, dtype, condition, x, scalar_1): @@ -7706,8 +7708,8 @@ def checkRaises(scalar_type, dtype, condition, x, scalar_1): self._test_where_scalar_template(device, dtype, checkRaises) @skipCUDAVersionIn([(11, 2)]) # test fails for 11.2, see https://github.com/pytorch/pytorch/issues/51980 - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes() + - torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes() + + get_all_complex_dtypes())) def test_where_scalar_valid_combination(self, device, dtype): def checkResult(scalar_type, dtype, condition, x, scalar_1): diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py index bd48e38045a13..81411c058bca6 100644 --- a/test/test_type_promotion.py +++ b/test/test_type_promotion.py @@ -8,6 +8,9 @@ TEST_NUMPY, torch_to_numpy_dtype_dict) from torch.testing._internal.common_device_type import (instantiate_device_type_tests, onlyOnCPUAndCUDA, dtypes, dtypesIfCUDA, onlyCPU, expectedFailureMeta) +from torch.testing._internal.common_dtype import ( + get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_fp_dtypes +) if TEST_NUMPY: import numpy as np @@ -179,7 +182,7 @@ def test_bfloat16(self, device): self.assertEqual(bf + scalar, scalar + bf) # with tensor - for dtype in torch.testing.get_all_dtypes(): + for dtype in get_all_dtypes(): t = torch.tensor(1, dtype=dtype, device=device) self.assertEqual(bf + t, t + bf) if dtype in (torch.float16, torch.float32, torch.float64, torch.cfloat, torch.cdouble): @@ -254,8 +257,8 @@ def _get_test_tensor(self, device, dtype, remove_zeros=False): def test_many_promotions(self, device): # Can also include half on CPU in cases where it will be promoted to a # supported dtype - dtypes1 = torch.testing.get_all_math_dtypes('cuda') - dtypes2 = torch.testing.get_all_math_dtypes(device) + dtypes1 = get_all_math_dtypes('cuda') + dtypes2 = get_all_math_dtypes(device) ops = [torch.add, torch.sub, torch.mul, torch.div, torch.rsub] for dt1, dt2 in itertools.product(dtypes1, dtypes2): for op, non_contiguous in itertools.product(ops, [True, False]): @@ -331,7 +334,7 @@ def test_create_bool_tensors(self, device): # this seems like odd behavior but ints also create float tensors, numpy doesn't have this function. self.assertEqual(torch.scalar_tensor(False, device=device), torch.tensor(0., device=device)) - @dtypes(*itertools.product(torch.testing.get_all_dtypes(), torch.testing.get_all_dtypes())) + @dtypes(*itertools.product(get_all_dtypes(), get_all_dtypes())) def test_result_type(self, device, dtypes): "Test result_type for tensor vs tensor and scalar vs scalar." @@ -460,8 +463,8 @@ def test_comparison_ops_with_type_promotion(self, device): ), ] for op in comparison_ops: - for dt1 in torch.testing.get_all_math_dtypes(device): - for dt2 in torch.testing.get_all_math_dtypes(device): + for dt1 in get_all_math_dtypes(device): + for dt2 in get_all_math_dtypes(device): if (dt1.is_complex or dt2.is_complex) and not (op["name"] == "eq" or op["name"] == "ne"): continue val1 = value_for_type[dt1] @@ -511,8 +514,8 @@ def test_complex_assertraises(self, device): dict(name="ne", compare_op=lambda x, y: x != y, ), ] for op in comparison_ops: - for dt1 in torch.testing.get_all_math_dtypes(device): - for dt2 in torch.testing.get_all_math_dtypes(device): + for dt1 in get_all_math_dtypes(device): + for dt2 in get_all_math_dtypes(device): if (dt1.is_complex or dt2.is_complex) and not (op["name"] == "eq" or op["name"] == "ne"): u = torch.tensor([1], dtype=dt1, device=device) v = torch.tensor([2], dtype=dt2, device=device) @@ -520,7 +523,7 @@ def test_complex_assertraises(self, device): @float_double_default_dtype def test_lt_with_type_promotion(self, device): - for dt in torch.testing.get_all_math_dtypes(device): + for dt in get_all_math_dtypes(device): x = torch.tensor([0], dtype=dt, device=device) expected = torch.tensor([True], dtype=torch.bool, device=device) @@ -553,7 +556,7 @@ def test_promote_types(self, device): @float_double_default_dtype def test_promote_self(self, device): - for dtype in torch.testing.get_all_dtypes(): + for dtype in get_all_dtypes(): self.assertEqual(torch.promote_types(dtype, dtype), dtype) @expectedFailureMeta @@ -758,12 +761,12 @@ def _run_all_tests_for_sparse_op(self, op_name, device, dtypes): @onlyOnCPUAndCUDA def test_sparse_add(self, device): self._run_all_tests_for_sparse_op('add', device, - dtypes=torch.testing.get_all_math_dtypes(device)) + dtypes=get_all_math_dtypes(device)) @onlyOnCPUAndCUDA def test_sparse_mul(self, device): self._run_all_tests_for_sparse_op('mul', device, - dtypes=torch.testing.get_all_math_dtypes(device)) + dtypes=get_all_math_dtypes(device)) @onlyOnCPUAndCUDA def test_sparse_div(self, device): @@ -774,7 +777,7 @@ def test_sparse_div(self, device): @onlyOnCPUAndCUDA def test_sparse_sub(self, device): self._run_all_tests_for_sparse_op('sub', device, - dtypes=torch.testing.get_all_math_dtypes(device)) + dtypes=get_all_math_dtypes(device)) @onlyOnCPUAndCUDA @dtypes(torch.bool, torch.short, torch.uint8, torch.int, torch.long) @@ -871,7 +874,7 @@ def test_numpy_array_binary_ufunc_promotion(self, device, dtypes): @onlyOnCPUAndCUDA def test_cat_different_dtypes(self, device): - dtypes = torch.testing.get_all_dtypes(include_bfloat16=False) + dtypes = get_all_dtypes(include_bfloat16=False) for x_dtype, y_dtype in itertools.product(dtypes, dtypes): x_vals, y_vals = [1, 2, 3], [4, 5, 6] @@ -890,7 +893,7 @@ def test_cat_different_dtypes(self, device): @onlyOnCPUAndCUDA def test_cat_out_different_dtypes(self, device): - dtypes = torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False) + dtypes = get_all_dtypes(include_bfloat16=False, include_bool=False) for x_dtype, y_dtype, out_dtype in itertools.product(dtypes, dtypes, dtypes): out = torch.zeros(6, device=device, dtype=out_dtype) x = torch.tensor([1, 2, 3], device=device, dtype=x_dtype) @@ -957,21 +960,21 @@ def test_computation_ignores_out(self, device): self.assertEqual(result, a - b, exact_dtype=False) self.assertNotEqual(result, a.double() - b, exact_dtype=False) - @dtypesIfCUDA(*itertools.product(torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False), - torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False))) - @dtypes(*itertools.product(torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, - include_complex=False), - torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, - include_complex=False))) + @dtypesIfCUDA(*itertools.product(get_all_dtypes(include_bfloat16=False, include_complex=False), + get_all_dtypes(include_bfloat16=False, include_complex=False))) + @dtypes(*itertools.product(get_all_dtypes(include_half=False, include_bfloat16=False, + include_complex=False), + get_all_dtypes(include_half=False, include_bfloat16=False, + include_complex=False))) def test_atan2_type_promotion(self, device, dtypes): dtype1, dtype2 = dtypes default_float = torch.get_default_dtype() def is_int(dtype): - return dtype in torch.testing.get_all_int_dtypes() + [torch.bool] + return dtype in get_all_int_dtypes() + [torch.bool] def is_float(dtype): - return dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False) + return dtype in get_all_fp_dtypes(include_half=True, include_bfloat16=False) def get_binary_float_result_type(x, y): dtype1 = x.dtype diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 526b67a6b03da..c65ae980fd82a 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -18,8 +18,11 @@ instantiate_device_type_tests, ops, dtypes, onlyCPU, onlyOnCPUAndCUDA, onlyCUDA, dtypesIfCUDA, precisionOverride, skipCUDAIfRocm, dtypesIfCPU, OpDTypes) -from torch.testing import ( - floating_types_and, all_types_and_complex_and, floating_and_complex_types_and, make_tensor) +from torch.testing import make_tensor +from torch.testing._internal.common_dtype import ( + floating_types_and, all_types_and_complex_and, floating_and_complex_types_and, get_all_dtypes, get_all_math_dtypes, + get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes +) if TEST_SCIPY: import scipy @@ -502,8 +505,8 @@ def test_out_arg_all_dtypes(self, device, dtype, op): out = torch.empty_like(input, dtype=out_dtype) self._test_out_arg(op, input, out, expected, **torch_kwargs) - @dtypes(*(torch.testing.get_all_int_dtypes() + [torch.bool] + - torch.testing.get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*(get_all_int_dtypes() + [torch.bool] + + get_all_fp_dtypes(include_bfloat16=False))) def test_nan_to_num(self, device, dtype): for contiguous in [False, True]: x = make_tensor((64, 64), low=0., high=100., dtype=dtype, device=device) @@ -581,7 +584,7 @@ def test_digamma(self, device, dtype): self.compare_with_numpy(torch.digamma, scipy.special.digamma, tensor) @skipCUDAIfRocm - @dtypes(*torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False)) + @dtypes(*get_all_fp_dtypes(include_half=True, include_bfloat16=False)) def test_frexp(self, device, dtype): input = make_tensor((50, 50), device, dtype) mantissa, exponent = torch.frexp(input) @@ -595,7 +598,7 @@ def test_frexp(self, device, dtype): self.assertTrue(torch_to_numpy_dtype_dict[exponent.dtype] == np_exponent.dtype) @skipCUDAIfRocm - @dtypes(*torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False)) + @dtypes(*get_all_fp_dtypes(include_half=True, include_bfloat16=False)) def test_frexp_out(self, device, dtype): input = make_tensor((50, 50), device, dtype) outputs = ( @@ -622,20 +625,18 @@ def test_frexp_out(self, device, dtype): @skipCUDAIfRocm def test_frexp_assert_raises(self, device): - invalid_input_dtypes = torch.testing.get_all_int_dtypes() + \ - torch.testing.get_all_complex_dtypes() + \ + invalid_input_dtypes = get_all_int_dtypes() + \ + get_all_complex_dtypes() + \ [torch.bool] for dtype in invalid_input_dtypes: input = make_tensor((50, 50), device, dtype) with self.assertRaisesRegex(RuntimeError, r"torch\.frexp\(\) only supports floating-point dtypes"): torch.frexp(input) - for dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False): + for dtype in get_all_fp_dtypes(include_half=True, include_bfloat16=False): input = make_tensor((50, 50), device, dtype) - dtypes = list(torch.testing.all_types_and_complex_and(torch.bool, - torch.half, - torch.bfloat16)) + dtypes = list(all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16)) dtypes.remove(dtype) for mantissa_dtype in dtypes: mantissa = torch.empty_like(input, dtype=mantissa_dtype) @@ -1046,7 +1047,7 @@ def test_mish(self, device, dtype): # do ops like threshold need a test_unary(_nonufunc) test suite? @onlyCPU - @dtypes(*torch.testing.get_all_math_dtypes('cpu')) + @dtypes(*get_all_math_dtypes('cpu')) def test_threshold(self, device, dtype): if dtype != torch.uint8 and dtype != torch.float16 and not dtype.is_complex: # 100 is wide enough to use AVX2 instructions for all types @@ -1180,7 +1181,7 @@ def _i0_range_helper(self, range, device, dtype): t = torch.rand(1000, device=device).to(dtype) * r self._i0_helper(t) - @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes()) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_i0_range1(self, device, dtype): @@ -1188,7 +1189,7 @@ def test_i0_range1(self, device, dtype): # The domain is (-13.25, 13.25) self._i0_range_helper(13.25, device, dtype) - @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes()) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_i0_range2(self, device, dtype): @@ -1203,7 +1204,7 @@ def test_i0_range3(self, device, dtype): # The domain is (-709.75, 709.75) self._i0_range_helper(709.75, device, dtype) - @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes()) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_i0_special(self, device, dtype): @@ -1213,7 +1214,7 @@ def test_i0_special(self, device, dtype): t = torch.tensor([inf, -inf, nan], device=device, dtype=dtype) self.assertTrue(torch.i0(t).isnan().all()) - @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes()) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_special_i0_i1_vs_scipy(self, device, dtype): @@ -1305,7 +1306,7 @@ def test_abs_zero(self, device, dtype): for num in abs_zeros: self.assertGreater(math.copysign(1.0, num), 0.0) - @dtypes(*torch.testing.get_all_fp_dtypes()) + @dtypes(*get_all_fp_dtypes()) def test_isfinite_isinf_isnan(self, device, dtype): vals = (-float('inf'), float('inf'), float('nan'), -1, 0, 1) @@ -1321,7 +1322,7 @@ def test_isfinite_isinf_isnan_int(self, device, dtype): self.compare_with_numpy(torch.isinf, np.isinf, vals, device, dtype) self.compare_with_numpy(torch.isnan, np.isnan, vals, device, dtype) - @dtypes(*(torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_fp_dtypes())) def test_isposinf_isneginf_float(self, device, dtype): ops = ((torch.isposinf, np.isposinf), (torch.isneginf, np.isneginf)) vals = (-float('inf'), float('inf'), float('nan'), -1, 0, 1) @@ -1346,7 +1347,7 @@ def test_isposinf_isneginf_float(self, device, dtype): torch_op(t, out=out) self.assertEqual(out, t_target) - @dtypes(*(torch.testing.get_all_int_dtypes() + [torch.bool])) + @dtypes(*(get_all_int_dtypes() + [torch.bool])) def test_isposinf_isneginf_int_and_bool(self, device, dtype): ops = ((torch.isposinf, np.isposinf), (torch.isneginf, np.isneginf)) vals = (-1, 0, 1) @@ -1374,7 +1375,7 @@ def test_isposinf_isneginf_complex(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'does not support complex inputs'): torch_op(t, out=out) - @dtypes(*(torch.testing.get_all_dtypes(include_bool=False))) + @dtypes(*(get_all_dtypes(include_bool=False))) def test_isposinf_isneginf_non_boolean_output(self, device, dtype): # test non-boolean tensors as the `out=` parameters # boolean outputs are tested in the above testcases @@ -1406,7 +1407,7 @@ def test_isreal_complex(self, device, dtype): vals = (1, 1 + 1j, 2 + 0j, 3j, 2 - 1j, 2 - 0j) self.compare_with_numpy(torch.isreal, np.isreal, vals, device, dtype) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_isreal_noncomplex(self, device, dtype): vals = (1, 2, 3) # Manual check here since numpy doesn't support bfloat16 @@ -1467,7 +1468,7 @@ def assert_tuple_empty(tup, dim): self.assertEqual(1, len(z)) self.assertEqual(torch.empty(0, dtype=torch.long), z[0]) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_nonzero_noncontiguous(self, device, dtype): x = make_tensor((10, 10, 10), dtype=dtype, device=device, low=1, noncontiguous=False) @@ -1496,10 +1497,10 @@ def permute_storage(tensor, dims): self.assertEqual(nondense.nonzero(), expect) # TODO: rationalize with exp OpInfo - @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False) + - torch.testing.get_all_complex_dtypes())) - @dtypesIfCUDA(*(torch.testing.get_all_fp_dtypes(include_half=True) + - torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_fp_dtypes(include_half=False) + + get_all_complex_dtypes())) + @dtypesIfCUDA(*(get_all_fp_dtypes(include_half=True) + + get_all_complex_dtypes())) def test_exp(self, device, dtype): for v in (2, -2) + ((1j, 1 + 1j) if dtype.is_complex else ()): a = torch.tensor(v, dtype=dtype, device=device) * torch.arange(18, device=device) / 3 * math.pi diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 7bb6906ef1cc7..06aaf31423f3f 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -11,6 +11,9 @@ (TestCase, run_tests, suppress_warnings) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, onlyCPU, dtypes, onlyOnCPUAndCUDA) +from torch.testing._internal.common_dtype import ( + get_all_dtypes, get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes +) # TODO: replace this with make_tensor() in common_utils.py def _generate_input(shape, dtype, device, with_extremal): @@ -114,14 +117,14 @@ def _do_transpose(self, x, contiguous=False, dim0=0, dim1=1): else: return x.transpose(dim0, dim1) - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) def test_conj_self(self, device, dtype): t = torch.ones(5, 5, device=device) s = t.conj() self.assertTrue(s is t) @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_fp_dtypes(include_bfloat16=False), torch.complex64) + @dtypes(*get_all_fp_dtypes(include_bfloat16=False), torch.complex64) def test_view_dtype(self, device, dtype): int_dtype = { torch.half: torch.int16, @@ -227,7 +230,7 @@ def fn(contiguous_input=True, dim0=0, dim1=1): self.assertEqual(res.shape, torch.Size([0])) @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_complex_dtypes(include_complex32=True)) + @dtypes(*get_all_complex_dtypes(include_complex32=True)) def test_view_as_real(self, device, dtype): def fn(contiguous_input=True): t = torch.randn(3, 4, dtype=dtype, device=device) @@ -265,7 +268,7 @@ def fn(contiguous_input=True): self.assertEqual(res.shape, torch.Size([2])) @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_view_tensor_split(self, device, dtype): a = make_tensor((40, 30), device, dtype, low=-9, high=9) a_split_dim0 = a.tensor_split(7, 0) @@ -276,7 +279,7 @@ def test_view_tensor_split(self, device, dtype): self.assertTrue(self.is_view_of(a, a_split_dim1_tensor)) @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_view_tensor_hsplit(self, device, dtype): t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9) t_hsplit = torch.hsplit(t, 2) @@ -286,7 +289,7 @@ def test_view_tensor_hsplit(self, device, dtype): self.assertEqual(t_hsplit[1][2, 0, 2], t[2, 2, 2]) @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_view_tensor_vsplit(self, device, dtype): t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9) t_vsplit = torch.vsplit(t, 2) @@ -296,7 +299,7 @@ def test_view_tensor_vsplit(self, device, dtype): self.assertEqual(t_vsplit[1][0, 2, 2], t[2, 2, 2]) @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_view_tensor_dsplit(self, device, dtype): t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9) t_dsplit = torch.dsplit(t, 2) @@ -306,7 +309,7 @@ def test_view_tensor_dsplit(self, device, dtype): self.assertEqual(t_dsplit[1][2, 2, 0], t[2, 2, 2]) @onlyOnCPUAndCUDA - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) def test_real_imag_noncomplex(self, device, dtype): t = torch.ones((5, 5), dtype=dtype, device=device) @@ -317,7 +320,7 @@ def test_real_imag_noncomplex(self, device, dtype): torch.imag(t) @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_complex_dtypes()) + @dtypes(*get_all_complex_dtypes()) def test_real_imag_view(self, device, dtype): def compare_with_numpy(contiguous_input=True): t = torch.randn(3, 3, dtype=dtype, device=device) @@ -348,7 +351,7 @@ def compare_with_numpy(contiguous_input=True): self.assertEqual(a[5:].imag, a.imag[5:]) @onlyOnCPUAndCUDA - @dtypes(*torch.testing.get_all_complex_dtypes()) + @dtypes(*get_all_complex_dtypes()) def test_conj_imag_view(self, device, dtype) -> None: t = _make_tensor((4, 5,), dtype, device) t_numpy_conj = torch.from_numpy(t.cpu().numpy().conj()).to(device=device) @@ -363,7 +366,7 @@ def test_conj_imag_view(self, device, dtype) -> None: self.assertTrue(v_imag.is_neg()) @onlyOnCPUAndCUDA - @dtypes(*product(torch.testing.get_all_complex_dtypes(), torch.testing.get_all_dtypes())) + @dtypes(*product(get_all_complex_dtypes(), get_all_dtypes())) @suppress_warnings def test_set_real_imag(self, device, dtypes): x = torch.randn(10, dtype=dtypes[0], device=device) @@ -1216,8 +1219,8 @@ def _test_atleast_dim(self, torch_fn, np_fn, device, dtype): self.assertEqual(np_res, torch_res) # TODO: are these view ops? - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) + - torch.testing.get_all_complex_dtypes())) + @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + + get_all_complex_dtypes())) def test_atleast(self, device, dtype): self._test_atleast_dim(torch.atleast_1d, np.atleast_1d, device, dtype) self._test_atleast_dim(torch.atleast_2d, np.atleast_2d, device, dtype) @@ -1253,7 +1256,7 @@ def test_broadcast_shapes(self, device): self.assertEqual(expected, actual) # Skip BFloat16 since numpy does not support it - @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False)) + @dtypes(*get_all_dtypes(include_bfloat16=False)) def test_broadcast_to(self, device, dtype): def can_broadcast(s0, s1): # s0.dim() <= s1.dim(), reverse s0 and s1 to compare trailing dimension @@ -1356,7 +1359,7 @@ def test_view(self, device): self.assertEqual(tensor.view(6, 2, 1), contig_tensor.view(6, 2, 1)) self.assertEqual(tensor.view(1, 6, 2, 1), contig_tensor.view(1, 6, 2, 1)) - @dtypes(*torch.testing.get_all_dtypes()) + @dtypes(*get_all_dtypes()) def test_reshape_view_semantics(self, device, dtype): tensor = make_tensor((15, 4), device, dtype) target = (20, 3) @@ -1383,7 +1386,7 @@ def test_contiguous(self, device): @onlyOnCPUAndCUDA # Skip BFloat16 since numpy does not support it - @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False)) + @dtypes(*get_all_dtypes(include_bfloat16=False)) def test_tensor_split_sections(self, device, dtype): input_sizes = [ (0,), @@ -1414,7 +1417,7 @@ def test_tensor_split_sections(self, device, dtype): @onlyOnCPUAndCUDA # Skip BFloat16 since numpy does not support it - @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False)) + @dtypes(*get_all_dtypes(include_bfloat16=False)) def test_tensor_split_indices(self, device, dtype): input_sizes = [ (0,), @@ -1493,20 +1496,20 @@ def test_tensor_split_errors(self, device): def test_resize_all_dtypes_and_devices(self, device): shape = (2, 2) - for dt in torch.testing.get_all_dtypes(): + for dt in get_all_dtypes(): x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device) x.resize_(shape) self.assertEqual(shape, x.shape) def test_resize_as_all_dtypes_and_devices(self, device): - for dt in torch.testing.get_all_dtypes(): + for dt in get_all_dtypes(): x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device) y = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=dt, device=device) x.resize_as_(y) self.assertEqual(y.shape, x.shape) def test_view_all_dtypes_and_devices(self, device): - for dt in torch.testing.get_all_dtypes(): + for dt in get_all_dtypes(): x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device) self.assertEqual(x.view(6).shape, [6]) diff --git a/torch/testing/_core.py b/torch/testing/_core.py index 66060f8cbcee0..b3cc6f163c49f 100644 --- a/torch/testing/_core.py +++ b/torch/testing/_core.py @@ -13,27 +13,7 @@ __all__ = [ "FileCheck", - "all_types", - "all_types_and", - "all_types_and_complex", - "all_types_and_complex_and", - "all_types_and_half", - "complex_types", - "empty_types", - "floating_and_complex_types", - "floating_and_complex_types_and", - "floating_types", - "floating_types_and", - "double_types", - "floating_types_and_half", - "get_all_complex_dtypes", - "get_all_dtypes", "get_all_device_types", - "get_all_fp_dtypes", - "get_all_int_dtypes", - "get_all_math_dtypes", - "integral_types", - "integral_types_and", "make_non_contiguous", ] @@ -41,9 +21,7 @@ # False otherwise. # TODO: implement numpy-like issubdtype def is_integral(dtype: torch.dtype) -> bool: - # Skip complex/quantized types - dtypes = [x for x in get_all_dtypes() if x not in get_all_complex_dtypes()] - return dtype in dtypes and not dtype.is_floating_point + return dtype in (torch.bool, torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64) def is_quantized(dtype: torch.dtype) -> bool: return dtype in (torch.quint8, torch.qint8, torch.qint32, torch.quint4x2) @@ -216,109 +194,5 @@ def make_non_contiguous(tensor: torch.Tensor) -> torch.Tensor: return input.data -# Functions and classes for describing the dtypes a function supports -# NOTE: these helpers should correspond to PyTorch's C++ dispatch macros - -# Verifies each given dtype is a torch.dtype -def _validate_dtypes(*dtypes): - for dtype in dtypes: - assert isinstance(dtype, torch.dtype) - return dtypes - -# class for tuples corresponding to a PyTorch dispatch macro -class _dispatch_dtypes(tuple): - def __add__(self, other): - assert isinstance(other, tuple) - return _dispatch_dtypes(tuple.__add__(self, other)) - -_empty_types = _dispatch_dtypes(()) -def empty_types(): - return _empty_types - -_floating_types = _dispatch_dtypes((torch.float32, torch.float64)) -def floating_types(): - return _floating_types - -_floating_types_and_half = _floating_types + (torch.half,) -def floating_types_and_half(): - return _floating_types_and_half - -def floating_types_and(*dtypes): - return _floating_types + _validate_dtypes(*dtypes) - -_floating_and_complex_types = _floating_types + (torch.cfloat, torch.cdouble) -def floating_and_complex_types(): - return _floating_and_complex_types - -def floating_and_complex_types_and(*dtypes): - return _floating_and_complex_types + _validate_dtypes(*dtypes) - -_double_types = _dispatch_dtypes((torch.float64, torch.complex128)) -def double_types(): - return _double_types - -_integral_types = _dispatch_dtypes((torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64)) -def integral_types(): - return _integral_types - -def integral_types_and(*dtypes): - return _integral_types + _validate_dtypes(*dtypes) - -_all_types = _floating_types + _integral_types -def all_types(): - return _all_types - -def all_types_and(*dtypes): - return _all_types + _validate_dtypes(*dtypes) - -_complex_types = _dispatch_dtypes((torch.cfloat, torch.cdouble)) -def complex_types(): - return _complex_types - -_all_types_and_complex = _all_types + _complex_types -def all_types_and_complex(): - return _all_types_and_complex - -def all_types_and_complex_and(*dtypes): - return _all_types_and_complex + _validate_dtypes(*dtypes) - -_all_types_and_half = _all_types + (torch.half,) -def all_types_and_half(): - return _all_types_and_half - -def get_all_dtypes(include_half=True, - include_bfloat16=True, - include_bool=True, - include_complex=True, - include_complex32=False - ) -> List[torch.dtype]: - dtypes = get_all_int_dtypes() + get_all_fp_dtypes(include_half=include_half, include_bfloat16=include_bfloat16) - if include_bool: - dtypes.append(torch.bool) - if include_complex: - dtypes += get_all_complex_dtypes(include_complex32) - return dtypes - -def get_all_math_dtypes(device) -> List[torch.dtype]: - return get_all_int_dtypes() + get_all_fp_dtypes(include_half=device.startswith('cuda'), - include_bfloat16=False) + get_all_complex_dtypes() - -def get_all_complex_dtypes(include_complex32=False) -> List[torch.dtype]: - return [torch.complex32, torch.complex64, torch.complex128] if include_complex32 else [torch.complex64, torch.complex128] - - -def get_all_int_dtypes() -> List[torch.dtype]: - return [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64] - - -def get_all_fp_dtypes(include_half=True, include_bfloat16=True) -> List[torch.dtype]: - dtypes = [torch.float32, torch.float64] - if include_half: - dtypes.append(torch.float16) - if include_bfloat16: - dtypes.append(torch.bfloat16) - return dtypes - - def get_all_device_types() -> List[str]: return ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py index 3cf7338bff889..60c6384ad13cb 100644 --- a/torch/testing/_deprecated.py +++ b/torch/testing/_deprecated.py @@ -5,10 +5,12 @@ import functools import warnings -from typing import Any, Callable, Optional, Tuple +from typing import Any, Callable, Dict, Optional, Tuple, Union import torch +from . import _dtype_getters + __all__ = [ "rand", @@ -17,17 +19,18 @@ ] -def warn_deprecated(instructions: str) -> Callable: +def warn_deprecated(instructions: Union[str, Callable[[str, Tuple[Any, ...], Dict[str, Any], Any], str]]) -> Callable: def outer_wrapper(fn: Callable) -> Callable: - msg = ( - f"torch.testing.{fn.__name__} is deprecated and will be removed in a future release. " - f"{instructions.strip()}" - ) + name = fn.__name__ + head = f"torch.testing.{name}() is deprecated and will be removed in a future release. " @functools.wraps(fn) def inner_wrapper(*args: Any, **kwargs: Any) -> Any: + return_value = fn(*args, **kwargs) + tail = instructions(name, args, kwargs, return_value) if callable(instructions) else instructions + msg = (head + tail).strip() warnings.warn(msg, FutureWarning) - return fn(*args, **kwargs) + return return_value return inner_wrapper @@ -84,3 +87,17 @@ def assert_allclose( check_is_coalesced=False, msg=msg or None, ) + + +def _dtype_getter_instructions(name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any], return_value: Any) -> str: + return f"This call to {name}(...) can be replaced with {return_value}." + + +# We iterate over all public dtype getters and expose them here with an added deprecation warning +for name in _dtype_getters.__all__: + if name.startswith("_"): + continue + fn = getattr(_dtype_getters, name) + + globals()[name] = warn_deprecated(_dtype_getter_instructions)(fn) + __all__.append(name) diff --git a/torch/testing/_dtype_getters.py b/torch/testing/_dtype_getters.py new file mode 100644 index 0000000000000..d16ca04f25778 --- /dev/null +++ b/torch/testing/_dtype_getters.py @@ -0,0 +1,138 @@ +"""This module exist to be able to deprecate the dtype getters publicly without doing so internally. The deprecated +public versions are defined in torch.testing._deprecated and exposed from torch.testing. The non-deprecated internal +versions should be imported from torch.testing._internal.dtype_getters +""" + +from typing import List + +import torch + +__all__ = [ + "_validate_dtypes", + "_dispatch_dtypes", + "all_types", + "all_types_and", + "all_types_and_complex", + "all_types_and_complex_and", + "all_types_and_half", + "complex_types", + "empty_types", + "floating_and_complex_types", + "floating_and_complex_types_and", + "floating_types", + "floating_types_and", + "double_types", + "floating_types_and_half", + "get_all_complex_dtypes", + "get_all_dtypes", + "get_all_fp_dtypes", + "get_all_int_dtypes", + "get_all_math_dtypes", + "integral_types", + "integral_types_and", +] + +# Functions and classes for describing the dtypes a function supports +# NOTE: these helpers should correspond to PyTorch's C++ dispatch macros + +# Verifies each given dtype is a torch.dtype +def _validate_dtypes(*dtypes): + for dtype in dtypes: + assert isinstance(dtype, torch.dtype) + return dtypes + +# class for tuples corresponding to a PyTorch dispatch macro +class _dispatch_dtypes(tuple): + def __add__(self, other): + assert isinstance(other, tuple) + return _dispatch_dtypes(tuple.__add__(self, other)) + +_empty_types = _dispatch_dtypes(()) +def empty_types(): + return _empty_types + +_floating_types = _dispatch_dtypes((torch.float32, torch.float64)) +def floating_types(): + return _floating_types + +_floating_types_and_half = _floating_types + (torch.half,) +def floating_types_and_half(): + return _floating_types_and_half + +def floating_types_and(*dtypes): + return _floating_types + _validate_dtypes(*dtypes) + +_floating_and_complex_types = _floating_types + (torch.cfloat, torch.cdouble) +def floating_and_complex_types(): + return _floating_and_complex_types + +def floating_and_complex_types_and(*dtypes): + return _floating_and_complex_types + _validate_dtypes(*dtypes) + +_double_types = _dispatch_dtypes((torch.float64, torch.complex128)) +def double_types(): + return _double_types + +_integral_types = _dispatch_dtypes((torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64)) +def integral_types(): + return _integral_types + +def integral_types_and(*dtypes): + return _integral_types + _validate_dtypes(*dtypes) + +_all_types = _floating_types + _integral_types +def all_types(): + return _all_types + +def all_types_and(*dtypes): + return _all_types + _validate_dtypes(*dtypes) + +_complex_types = _dispatch_dtypes((torch.cfloat, torch.cdouble)) +def complex_types(): + return _complex_types + +_all_types_and_complex = _all_types + _complex_types +def all_types_and_complex(): + return _all_types_and_complex + +def all_types_and_complex_and(*dtypes): + return _all_types_and_complex + _validate_dtypes(*dtypes) + +_all_types_and_half = _all_types + (torch.half,) +def all_types_and_half(): + return _all_types_and_half + +# The functions below are used for convenience in our test suite and thus have no corresponding C++ dispatch macro + +def get_all_dtypes(include_half=True, + include_bfloat16=True, + include_bool=True, + include_complex=True, + include_complex32=False + ) -> List[torch.dtype]: + dtypes = get_all_int_dtypes() + get_all_fp_dtypes(include_half=include_half, include_bfloat16=include_bfloat16) + if include_bool: + dtypes.append(torch.bool) + if include_complex: + dtypes += get_all_complex_dtypes(include_complex32) + return dtypes + +def get_all_math_dtypes(device) -> List[torch.dtype]: + return get_all_int_dtypes() + get_all_fp_dtypes(include_half=device.startswith('cuda'), + include_bfloat16=False) + get_all_complex_dtypes() + +def get_all_complex_dtypes(include_complex32=False) -> List[torch.dtype]: + return [torch.complex32, torch.complex64, torch.complex128] if include_complex32 else [torch.complex64, torch.complex128] + + +def get_all_int_dtypes() -> List[torch.dtype]: + return [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64] + + +def get_all_fp_dtypes(include_half=True, include_bfloat16=True) -> List[torch.dtype]: + dtypes = [torch.float32, torch.float64] + if include_half: + dtypes.append(torch.float16) + if include_bfloat16: + dtypes.append(torch.bfloat16) + return dtypes diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 8ec6e71d121ff..23e431d66bec2 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -14,8 +14,7 @@ skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \ IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, DeterministicGuard, TEST_SKIP_NOARCH from torch.testing._internal.common_cuda import _get_torch_cuda_version -from torch.testing import \ - (get_all_dtypes) +from torch.testing._internal.common_dtype import get_all_dtypes try: import psutil # type: ignore[import] diff --git a/torch/testing/_internal/common_dtype.py b/torch/testing/_internal/common_dtype.py new file mode 100644 index 0000000000000..0ce2d80a18b4a --- /dev/null +++ b/torch/testing/_internal/common_dtype.py @@ -0,0 +1,4 @@ +"""The content of torch/testing/_dtype_getters.py should be moved here as soon as the deprecation period is over. +""" + +from torch.testing._dtype_getters import * # noqa: F401, F403 diff --git a/torch/testing/_internal/common_jit.py b/torch/testing/_internal/common_jit.py index 3b62ced36f391..89533a6d7fb9d 100644 --- a/torch/testing/_internal/common_jit.py +++ b/torch/testing/_internal/common_jit.py @@ -7,7 +7,7 @@ import torch.jit.quantized # Testing utils -from torch.testing import floating_and_complex_types_and +from torch.testing._internal.common_dtype import floating_and_complex_types_and from torch.testing._internal.common_utils import TestCase, \ freeze_rng_state, TemporaryFileName, enable_profiling_mode_for_profiling_tests, is_iterable_of_tensors from torch.testing._internal.common_utils import enable_profiling_mode # noqa: F401 diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index ace4fa1c63c20..5113346a20e56 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -16,12 +16,12 @@ from typing import Any, Callable, List, Optional, Sequence, Tuple, Union, Dict -from torch.testing import \ - (make_non_contiguous, floating_types, floating_types_and, complex_types, - floating_and_complex_types, floating_and_complex_types_and, - all_types_and_complex_and, all_types_and, all_types_and_complex, - integral_types_and, all_types, double_types, make_tensor) -from .._core import _dispatch_dtypes +from torch.testing import make_non_contiguous, make_tensor +from torch.testing._internal.common_dtype import ( + _dispatch_dtypes, floating_types, floating_types_and, complex_types, floating_and_complex_types, + floating_and_complex_types_and, all_types_and_complex_and, all_types_and, all_types_and_complex, integral_types_and, + all_types, double_types, +) from torch.testing._internal.common_device_type import \ (onlyOnCPUAndCUDA, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfNoCusolver, skipCPUIfNoLapack, skipCPUIfNoFFT, skipCUDAIfRocm, precisionOverride, toleranceOverride, tol) diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py index 6ef4de398a39e..a1059f6b718f4 100644 --- a/torch/testing/_internal/common_modules.py +++ b/torch/testing/_internal/common_modules.py @@ -2,7 +2,8 @@ from copy import deepcopy from functools import wraps, partial from itertools import chain -from torch.testing import floating_types, make_tensor +from torch.testing import make_tensor +from torch.testing._internal.common_dtype import floating_types from torch.testing._internal.common_device_type import ( _TestParametrizer, _dtype_test_suffix, _update_param_kwargs, skipIf) from torch.testing._internal.common_nn import nllloss_reference, get_reduction diff --git a/torch/testing/_internal/opinfo_helper.py b/torch/testing/_internal/opinfo_helper.py index 5129af4f99e34..e4a18b48ca7a1 100644 --- a/torch/testing/_internal/opinfo_helper.py +++ b/torch/testing/_internal/opinfo_helper.py @@ -4,21 +4,22 @@ import torch from torch.testing._internal.common_cuda import (TEST_CUDA) -from torch.testing._core import _dispatch_dtypes -from torch.testing import (all_types_and_complex_and, - all_types_and_complex, - all_types_and_half, - all_types, - complex_types, - floating_and_complex_types, - floating_types_and_half, - floating_types, - integral_types, - floating_types_and, - floating_and_complex_types_and, - integral_types_and, - all_types_and, - ) +from torch.testing._internal.common_dtype import ( + all_types_and_complex_and, + all_types_and_complex, + all_types_and_half, + all_types, + complex_types, + floating_and_complex_types, + floating_types_and_half, + floating_types, + integral_types, + floating_types_and, + floating_and_complex_types_and, + integral_types_and, + all_types_and, + _dispatch_dtypes, +) COMPLETE_DTYPES_DISPATCH = ( all_types, From adb85b32d3cb98ad7fa333424f172959b2ef3e35 Mon Sep 17 00:00:00 2001 From: Yanli Zhao Date: Tue, 7 Sep 2021 09:28:30 -0700 Subject: [PATCH 523/530] minor fix for elastic doc (#64531) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64531 fix #64530 Test Plan: unit test Reviewed By: mrshenli Differential Revision: D30760879 fbshipit-source-id: 94ed1476e886513427d928a36f5be6b9bfff0826 --- torch/distributed/elastic/agent/server/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/distributed/elastic/agent/server/api.py b/torch/distributed/elastic/agent/server/api.py index 6d389a7873a4a..d767233a2ae52 100644 --- a/torch/distributed/elastic/agent/server/api.py +++ b/torch/distributed/elastic/agent/server/api.py @@ -160,7 +160,7 @@ def __init__( # rank of the worker among all the workers with the same role # across all ``agent`` instances. - # Global rank is not stable between re-rendezvous. + # Role rank is not stable between re-rendezvous. self.role_rank: int = role_rank # total number of workers (globally). Due to elasticity From 43248d91122470e4f0a63997737451cfbc82780e Mon Sep 17 00:00:00 2001 From: Ilqar Ramazanli Date: Tue, 7 Sep 2021 11:02:11 -0700 Subject: [PATCH 524/530] [doc][hackathon] To add Adam Optimizer to the documentation (#63251) Summary: It has been discussed before that adding description of Optimization algorithms to PyTorch Core documentation may result in a nice Optimization research tutorial. In the following tracking issue we mentioned about all the necessary algorithms and links to the originally published paper https://github.com/pytorch/pytorch/issues/63236. In this PR we are adding description of Adam Algorithm to the documentation. For more details, we refer to the paper https://arxiv.org/abs/1412.6980 Screen Shot 2021-08-27 at 6 37 54 PM Pull Request resolved: https://github.com/pytorch/pytorch/pull/63251 Reviewed By: albanD Differential Revision: D30779163 Pulled By: iramazanli fbshipit-source-id: 319a80fc3952793b0d064d0e641ddc1de3c05a86 --- torch/optim/adam.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/torch/optim/adam.py b/torch/optim/adam.py index d7313be75f8fb..ea2ceaff67057 100644 --- a/torch/optim/adam.py +++ b/torch/optim/adam.py @@ -6,9 +6,37 @@ class Adam(Optimizer): r"""Implements Adam algorithm. - It has been proposed in `Adam: A Method for Stochastic Optimization`_. - The implementation of the L2 penalty follows changes proposed in - `Decoupled Weight Decay Regularization`_. + .. math:: + \begin{aligned} + &\rule{110mm}{0.4pt} \\ + &\textbf{input} : \gamma \text{ (lr)}, \beta_1, \beta_2 + \text{ (betas)},\theta_0 \text{ (params)},f(\theta) \text{ (objective)} \\ + &\hspace{13mm} \lambda \text{ (weight decay)}, \: amsgrad \\ + &\textbf{initialize} : m_0 \leftarrow 0 \text{ ( first moment)}, + v_0\leftarrow 0 \text{ (second moment)},\: \widehat{v_0}^{max}\leftarrow 0\\[-1.ex] + &\rule{110mm}{0.4pt} \\ + &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do} \\ + &\hspace{5mm}g_t \leftarrow \nabla_{\theta} f_t (\theta_{t-1}) \\ + &\hspace{5mm}\textbf{if} \: \lambda \neq 0 \\ + &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1} \\ + &\hspace{5mm}m_t \leftarrow \beta_1 m_{t-1} + (1 - \beta_1) g_t \\ + &\hspace{5mm}v_t \leftarrow \beta_2 v_{t-1} + (1-\beta_2) g^2_t \\ + &\hspace{5mm}\widehat{m_t} \leftarrow m_t/\big(1-\beta_1^t \big) \\ + &\hspace{5mm}\widehat{v_t} \leftarrow v_t/\big(1-\beta_2^t \big) \\ + &\hspace{5mm}\textbf{if} \: amsgrad \\ + &\hspace{10mm}\widehat{v_t}^{max} \leftarrow \mathrm{max}(\widehat{v_t}^{max}, + \widehat{v_t}) \\ + &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/ + \big(\sqrt{\widehat{v_t}^{max}} + \epsilon \big) \\ + &\hspace{5mm}\textbf{else} \\ + &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/ + \big(\sqrt{\widehat{v_t}} + \epsilon \big) \\ + &\rule{110mm}{0.4pt} \\[-1.ex] + &\bf{return} \: \theta_t \\[-1.ex] + &\rule{110mm}{0.4pt} \\[-1.ex] + \end{aligned} + + For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_. Args: params (iterable): iterable of parameters to optimize or dicts defining @@ -25,8 +53,6 @@ class Adam(Optimizer): .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 - .. _Decoupled Weight Decay Regularization: - https://arxiv.org/abs/1711.05101 .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ From 85eeb4d682f98b2a47dcbc2e02a8c5543ae2ed2c Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Tue, 7 Sep 2021 11:23:52 -0700 Subject: [PATCH 525/530] Clean up op BC check list (#64584) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64584 It has been a while since last clean up. The list is really long. Test Plan: ci Reviewed By: hl475 Differential Revision: D30779350 fbshipit-source-id: 908b47d0b9a16b784aad6a34c5c87f923500c247 --- .../check_backward_compatibility.py | 69 ------------------- 1 file changed, 69 deletions(-) diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index e1dde921f102d..16b415a7368fa 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -36,86 +36,17 @@ # Internal, profiler-specific ops ("profiler::_call_end_callbacks_on_jit_fut*", datetime.date(9999, 1, 1)), ("profiler::_record_function_enter", datetime.date(9999, 1, 1)), - ("aten::_qr_helper", datetime.date(2021, 1, 31)), - ("aten::fft", datetime.date(2021, 1, 31)), - ("aten::ifft", datetime.date(2021, 1, 31)), - ("aten::irfft", datetime.date(2021, 1, 31)), - ("aten::rfft", datetime.date(2021, 1, 31)), - ("aten::linalg_svd", datetime.date(2021, 5, 15)), - ("aten::linalg_cholesky.out", datetime.date(2021, 8, 30)), - ("aten::linalg_cholesky_ex", datetime.date(2021, 8, 30)), - ("aten::linalg_cholesky_ex.L", datetime.date(2021, 8, 30)), ("aten::_cholesky_helper", datetime.date(9999, 1, 1)), ("aten::_lstsq_helper", datetime.date(9999, 1, 1)), - ("aten::linalg_lstsq", datetime.date(2021, 5, 1)), - ("aten::_svd_helper", datetime.date(2021, 1, 31)), ("aten::_syevd_helper", datetime.date(9999, 1, 1)), ("aten::_lu_solve_helper", datetime.date(9999, 1, 1)), ("aten::_lu_with_info", datetime.date(9999, 1, 1)), ("aten::_linalg_solve_out_helper_", datetime.date(9999, 1, 1)), - ("aten::_cudnn_rnn_flatten_weight", datetime.date(2020, 12, 31)), - ("aten::_cudnn_rnn", datetime.date(2020, 12, 31)), - ("aten::_cudnn_rnn_backward", datetime.date(2020, 12, 31)), - ("aten::quantile", datetime.date(2021, 1, 31)), - ("aten::nanquantile", datetime.date(2021, 1, 31)), - ("aten::make_dual", datetime.date(2021, 2, 20)), - ("aten::unpack_dual", datetime.date(2021, 2, 20)), - ("aten::_fft_with_size", datetime.date(2021, 1, 31)), - ("aten::thnn_conv_depthwise2d_backward", datetime.date(2021, 1, 31)), - ("aten::slow_conv3d_backward", datetime.date(2021, 1, 31)), - ("aten::thnn_conv2d_backward", datetime.date(2021, 1, 31)), - ("aten::slow_conv_transpose3d_backward", datetime.date(2021, 1, 31)), - ("aten::slow_conv_transpose2d_backward", datetime.date(2021, 1, 31)), - ("aten::set_", datetime.date(2021, 1, 31)), - ("aten::native_layer_norm", datetime.date(2021, 1, 31)), - ("aten::native_layer_norm_backward", datetime.date(2021, 1, 31)), - ("aten::elu_backward", datetime.date(2021, 1, 31)), - ("aten::_multinomial_alias_setup", datetime.date(2021, 1, 31)), - ("aten::_multinomial_alias_draw", datetime.date(2021, 1, 31)), - ("prim::profile_optional", datetime.date(2021, 1, 31)), - ("aten::fake_quantize_per_tensor_affine_backward", datetime.date(2021, 2, 20)), - ("aten::fake_quantize_per_channel_affine_backward", datetime.date(2021, 2, 20)), ("aten::rowwise_prune", datetime.date(9999, 1, 1)), - ("aten::_mode*", datetime.date(2021, 5, 2)), - ("aten::linalg_multi_dot", datetime.date(2021, 3, 25)), - ("aten::coalesce", datetime.date(2021, 4, 15)), - ("aten::empty_meta", datetime.date(2021, 4, 1)), - ("aten::div", datetime.date(2021, 4, 28)), - ("aten::divide", datetime.date(2021, 4, 28)), - ("aten::_var", datetime.date(2021, 5, 28)), - ("aten::_std", datetime.date(2021, 5, 28)), - ("aten::batch_norm_backward_elemt", datetime.date(2021, 5, 1)), - ("aten::assert_async", datetime.date(2021, 5, 1)), - ("aten::cumprod_backward", datetime.date(2021, 5, 1)), ("aten::_triangular_solve_helper", datetime.date(9999, 1, 1)), - ("aten::_addmv_impl_", datetime.date(2021, 5, 15)), ("aten::adaptive_avg_pool3d_backward", datetime.date(9999, 1, 1)), ("aten::_embedding_bag_dense_backward", datetime.date(9999, 1, 1)), - ("aten::_amp_update_scale", datetime.date(2021, 6, 1)), ("aten::randperm", datetime.date(9999, 1, 1)), - ("aten::linalg_vector_norm", datetime.date(2021, 5, 15)), - ("aten::repeat_interleave", datetime.date(2021, 6, 26)), - ("aten::one_hot", datetime.date(2021, 6, 15)), - ("aten::slice", datetime.date(2021, 6, 30)), - ("aten::conj", datetime.date(2021, 8, 1)), - ("aten::_conj", datetime.date(2021, 8, 1)), - ("aten::conj.out", datetime.date(2021, 8, 1)), - ("aten::segment_reduce_backward", datetime.date(2021, 6, 15)), - ("aten::segment_reduce", datetime.date(2021, 8, 26)), - ("aten::_segment_reduce_backward", datetime.date(2021, 8, 26)), - ("aten::thnn_conv_depthwise2d", datetime.date(2021, 8, 27)), - ("aten::thnn_conv_depthwise2d.out", datetime.date(2021, 8, 27)), - ("aten::thnn_conv_depthwise2d_forward", datetime.date(2021, 8, 27)), - ("aten::thnn_conv_depthwise2d_forward.out", datetime.date(2021, 8, 27)), - ("aten::thnn_conv_depthwise2d_backward", datetime.date(2021, 8, 27)), - ("aten::thnn_conv_depthwise2d_backward.out", datetime.date(2021, 8, 27)), - ("aten::_view_as_real_physical", datetime.date(2021, 8, 27)), - ("aten::_view_as_real_physical", datetime.date(2021, 8, 1)), - ("aten::_bmm", datetime.date(2021, 8, 14)), - ("aten::_bmm.out", datetime.date(2021, 8, 14)), - ("aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams", datetime.date(2021, 8, 15)), - ("aten::_cumsum", datetime.date(2021, 8, 31)), - ("aten::_cumprod", datetime.date(2021, 8, 31)), ] ALLOW_LIST_COMPILED = [ From c9d6ca4c54c7aba7ccb3df728c41161817b9a936 Mon Sep 17 00:00:00 2001 From: Danielle Pintz <38207072+daniellepintz@users.noreply.github.com> Date: Tue, 7 Sep 2021 11:34:08 -0700 Subject: [PATCH 526/530] Add space in Feature Request issue template (#64563) Summary: Add space between emoji and text in Feature Request issue template Pull Request resolved: https://github.com/pytorch/pytorch/pull/64563 Reviewed By: janeyx99 Differential Revision: D30779429 Pulled By: seemethere fbshipit-source-id: 3625299923a7022fa66473633524a6620d58188b --- .github/ISSUE_TEMPLATE/feature-request.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md index e1d2bc306eae8..6e1432dbd4474 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.md +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -1,5 +1,5 @@ --- -name: "\U0001F680Feature Request" +name: "\U0001F680 Feature Request" about: Submit a proposal/request for a new PyTorch feature --- From 7e4ebe06cab4aedc66c501239ca568a0d28fd755 Mon Sep 17 00:00:00 2001 From: Kevin Tse Date: Tue, 7 Sep 2021 11:34:27 -0700 Subject: [PATCH 527/530] Fixes issue related torch.trapezoid broadcasting behavior and documentation (#64054) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64054 Fixes #63608 cc mruberry rgommers heitorschueroff Test Plan: Imported from OSS Reviewed By: saketh-are Differential Revision: D30617078 Pulled By: NivekT fbshipit-source-id: 815896ec56d447562790df4d662e94fd13457e2a --- aten/src/ATen/native/Integration.cpp | 37 ++++++++++++++++++++++------ test/test_binary_ufuncs.py | 9 +++++++ torch/_torch_docs.py | 11 +++++---- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/native/Integration.cpp b/aten/src/ATen/native/Integration.cpp index 262519f69a61c..e57dc4505df4e 100644 --- a/aten/src/ATen/native/Integration.cpp +++ b/aten/src/ATen/native/Integration.cpp @@ -52,7 +52,21 @@ Tensor do_cumulative_trapezoid(const Tensor& y, double dx, int64_t dim) { return (dx /2. * (left + right)).cumsum(dim); } - +// Given the current shape of a Tensor and a target number of dimensions, +// returns a new shape with the same values as the original shape, +// but with '1's padded in the beginning to match the target number of dimensions. +// For example, curr_shape = (5,5,5) and target_n_dim = 6 ==> (1,1,1,5,5,5) +// Note that no padding will be added if the current shape has the greater than or equal +// number of dimensions than the target numbers of dimensions. +DimVector add_padding_to_shape(IntArrayRef curr_shape, int64_t target_n_dim) { + if (curr_shape.size() >= target_n_dim) + target_n_dim = curr_shape.size(); + DimVector new_shape(target_n_dim, 1); + for (decltype(curr_shape.size()) i = 0; i < curr_shape.size(); i++) { + new_shape[target_n_dim-i-1] = curr_shape[curr_shape.size()-i-1]; + } + return new_shape; +} } Tensor trapezoid(const Tensor& y, const Tensor& x, int64_t dim) { @@ -71,9 +85,15 @@ Tensor trapezoid(const Tensor& y, const Tensor& x, int64_t dim) { // Note: This behavior differs from numpy in that numpy tries to // broadcast 'dx', but this tries to broadcast 'x' to match 'y' instead. TORCH_CHECK(x.size(0) == y.size(dim), "trapezoid: There must be one `x` value for each sample point"); - DimVector sizes(y.dim(), 1); - sizes[dim] = x.size(0); - x_viewed = x.view(sizes); + DimVector new_sizes(y.dim(), 1); // shape = [1] * y. + new_sizes[dim] = x.size(0); // shape[axis] = d.shape[0] + x_viewed = x.view(new_sizes); + } else if (x.dim() < y.dim()) { + // When 'y' has more dimension than 'x', this step takes 'x' with dimension (n_1, n_2, ...), + // and add '1's as dimensions in front to become (1, 1, ..., n_1, n_2), matching the dimension of 'y'. + // This allows the subsequent slicing operations to proceed with any 'dim' without going out of bound. + DimVector new_sizes = add_padding_to_shape(x.sizes(), y.dim()); + x_viewed = x.view(new_sizes); } else { x_viewed = x; } @@ -110,9 +130,12 @@ Tensor cumulative_trapezoid(const Tensor& y, const Tensor& x, int64_t dim) { Tensor x_viewed; if (x.dim() == 1) { TORCH_CHECK(x.size(0) == y.size(dim), "cumulative_trapezoid: There must be one `x` value for each sample point"); - DimVector sizes(y.dim(), 1); // shape = [1] * y. - sizes[dim] = x.size(0); // shape[axis] = d.shape[0] - x_viewed = x.view(sizes); + DimVector new_sizes(y.dim(), 1); // shape = [1] * y. + new_sizes[dim] = x.size(0); // shape[axis] = d.shape[0] + x_viewed = x.view(new_sizes); + } else if (x.dim() < y.dim()) { + DimVector new_sizes = add_padding_to_shape(x.sizes(), y.dim()); + x_viewed = x.view(new_sizes); } else { x_viewed = x; } diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index f8c36adf8b781..7153902841aa5 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -2668,6 +2668,11 @@ def test_x(sizes, dim, x, device): test_x((1, 10), 0, [1.0], device) test_x((0, 2), 0, [], device) test_x((0, 2), 1, [1.0, 2.0], device) + test_x((2, 3, 4), -1, [1.0, 2.0, 3.0, 4.0], device) + test_x((2, 3, 4), 0, [1.0, 2.0], device) + test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device) + test_x((2, 3, 4), 2, [1.0, 2.0, 3.0, 4.0], device) + test_x((2, 2, 4), -1, [[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]], device) with self.assertRaisesRegex( IndexError, 'Dimension out of range'): @@ -2726,6 +2731,10 @@ def test_empty_x(sizes, dim, x, device): test_x((10, 2), 0, [2.0, 3.0, 4.0, 7.0, 11.0, 14.0, 22.0, 26.0, 26.1, 30.3], device) test_x((1, 10), 0, [1.0], device) test_x((0, 2), 1, [1, 2], device) + test_x((2, 3, 4), -1, [1.0, 2.0, 3.0, 4.0], device) + test_x((2, 3, 4), 0, [1.0, 2.0], device) + test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device) + test_x((2, 3, 4), 2, [1.0, 2.0, 3.0, 4.0], device) test_empty_x((0, 2), 0, [], device) # SciPy failing when x == [], but our version returns empty diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 7dca8a7bdedbd..13112a9101296 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -10876,11 +10876,12 @@ def merge_dicts(*dicts): \sum_{i = 1}^{n-1} \frac{(x_i - x_{i-1})}{2} (y_i + y_{i-1}) \end{aligned} -When :attr:`y` is two or more dimensions, this computation is performed independently -along dimension :attr:`dim`. If :attr:`x` is also specified and is one-dimensional, -then that dimension defines the spacing for each computation. -If :attr:`x` is also specified and is not one-dimensional, then it is broadcast to -the shape of :attr:`y` and the corresponding sizes are used for each computation. +When :attr:`x` and :attr:`y` have the same size, the computation is as described above and no broadcasting is needed. +The broadcasting behavior of this function is as follows when their sizes are different. For both :attr:`x` +and :attr:`y`, the function computes the difference between consecutive elements along +dimension :attr:`dim`. This effectively creates two tensors, `x_diff` and `y_diff`, that have +the same shape as the original tensors except their lengths along the dimension :attr:`dim` is reduced by 1. +After that, those two tensors are broadcast together to compute final output as part of the trapezoidal rule. See the examples below for details. .. note:: From 8407ce7e382b009547f3c0d282540006afcfa1b9 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 7 Sep 2021 12:30:16 -0700 Subject: [PATCH 528/530] [small BE] .github: refactor concurrency into a common macro (#64587) Summary: By using a macro for these concurrency groups, we can edit just one place for the linux and windows workflows (vs 2). I wanted to loop all the other workflow files in as well, but since those aren't generated, the macros won't work the same way. Pull Request resolved: https://github.com/pytorch/pytorch/pull/64587 Reviewed By: mrshenli Differential Revision: D30783224 Pulled By: janeyx99 fbshipit-source-id: ae16ebb12d2d63a563d28f0ce88e280f68ed4b9b --- .github/templates/common.yml.j2 | 6 ++++++ .github/templates/linux_ci_workflow.yml.j2 | 4 +--- .github/templates/windows_ci_workflow.yml.j2 | 4 +--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 07ad771346399..aff01377ff665 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -5,6 +5,12 @@ {# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #} {%- set squid_no_proxy = "localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%} +{%- macro concurrency(build_environment) -%} +concurrency: + group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true +{%- endmacro -%} + {%- macro display_ec2_information() -%} - name: Display EC2 information shell: bash diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 2d856704c3137..520a6a00a19f6 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -45,9 +45,7 @@ env: ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} -concurrency: - group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true +!{{ common.concurrency(build_environment) }} jobs: {%- if ciflow_config.enabled %} diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 4f486dd75f3b7..20fe72238ffeb 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -60,9 +60,7 @@ env: USE_CUDA: 1 {%- endif %} -concurrency: - group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true +!{{ common.concurrency(build_environment) }} jobs: {%- if ciflow_config.enabled %} From 337c71be05f959799a305164e6edf86c686bb673 Mon Sep 17 00:00:00 2001 From: Anirudh Dagar Date: Tue, 7 Sep 2021 12:34:15 -0700 Subject: [PATCH 529/530] Array API: Add `torch.linalg.matmul` alias to `torch.matmul` (#63227) Summary: Fixes https://github.com/pytorch/pytorch/issues/62811 Add `torch.linalg.matmul` alias to `torch.matmul`. Note that the `linalg.matmul` doesn't have a `method` variant. Also cleaning up `torch/_torch_docs.py` when formatting is not needed. cc IvanYashchuk Lezcano mruberry rgommers Pull Request resolved: https://github.com/pytorch/pytorch/pull/63227 Reviewed By: mrshenli Differential Revision: D30770235 Pulled By: mruberry fbshipit-source-id: bfba77dfcbb61fcd44f22ba41bd8d84c21132403 --- aten/src/ATen/core/aten_interned_strings.h | 1 - aten/src/ATen/core/interned_strings.h | 2 ++ aten/src/ATen/native/LinearAlgebra.cpp | 9 ++++++++ aten/src/ATen/native/native_functions.yaml | 8 +++++++ docs/source/linalg.rst | 1 + torch/_torch_docs.py | 22 +++++++++---------- torch/csrc/jit/passes/normalize_ops.cpp | 1 + torch/linalg/__init__.py | 6 +++++ torch/overrides.py | 1 + .../_internal/common_methods_invocations.py | 1 + 10 files changed, 40 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index 6da99dfc6a4d9..df6b860a8a363 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -453,7 +453,6 @@ _(aten, margin_ranking_loss) \ _(aten, masked_fill) \ _(aten, masked_scatter) \ _(aten, masked_select) \ -_(aten, matmul) \ _(aten, matrix_rank) \ _(aten, matrix_exp) \ _(aten, max) \ diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 8d49d82c5c8f4..e7aef155a5656 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -211,6 +211,8 @@ namespace c10 { _(aten, linalg_norm) \ _(aten, linalg_vector_norm) \ _(aten, linalg_matrix_norm) \ + _(aten, matmul) \ + _(aten, linalg_matmul) \ _(aten, append) \ _(aten, item) \ _(aten, format) \ diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 2ae6202ce87e2..59950b987900f 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1567,6 +1567,15 @@ Tensor& matmul_out(const Tensor & tensor1, const Tensor & tensor2, Tensor &resul return result; } +// torch.linalg.matmul, alias for torch.matmul +Tensor linalg_matmul(const Tensor & tensor1, const Tensor & tensor2) { + return at::native::matmul(tensor1, tensor2); +} + +Tensor& linalg_matmul_out(const Tensor & tensor1, const Tensor & tensor2, Tensor &result) { + return at::native::matmul_out(tensor1, tensor2, result); +} + // helper methods for matrix_exp namespace { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 3a1f75c588a83..dbacca2750850 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -10272,6 +10272,14 @@ dispatch: CPU, CUDA: linalg_lstsq_out +# torch.linalg.matmul, alias for torch.matmul +- func: linalg_matmul(Tensor self, Tensor other) -> Tensor + python_module: linalg + variants: function + +- func: linalg_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + - func: linalg_slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) python_module: linalg variants: function diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst index 7a286d3d4051e..ffca583b706e9 100644 --- a/docs/source/linalg.rst +++ b/docs/source/linalg.rst @@ -67,6 +67,7 @@ Matrix Products :toctree: generated :nosignatures: + matmul matrix_power multi_dot householder_product diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 13112a9101296..6a5f61cfa1a7d 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -149,7 +149,7 @@ def merge_dicts(*dicts): absolute(input, *, out=None) -> Tensor Alias for :func:`torch.abs` -""".format(**common_args)) +""") add_docstr(torch.acos, r""" acos(input, *, out=None) -> Tensor @@ -211,7 +211,7 @@ def merge_dicts(*dicts): arccosh(input, *, out=None) -> Tensor Alias for :func:`torch.acosh`. -""".format(**common_args)) +""") add_docstr(torch.add, r""" add(input, other, *, alpha=1, out=None) -> Tensor @@ -2269,7 +2269,7 @@ def merge_dicts(*dicts): clip(input, min=None, max=None, *, out=None) -> Tensor Alias for :func:`torch.clamp`. -""".format(**common_args)) +""") add_docstr(torch.column_stack, r""" @@ -4468,7 +4468,7 @@ def merge_dicts(*dicts): inverse(input, *, out=None) -> Tensor Alias for :func:`torch.linalg.inv` -""".format(**common_args)) +""") add_docstr(torch.isin, r""" isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor @@ -5701,7 +5701,7 @@ def merge_dicts(*dicts): matrix_power(input, n, *, out=None) -> Tensor Alias for :func:`torch.linalg.matrix_power` -""".format(**common_args)) +""") add_docstr(torch.matrix_exp, r""" matrix_exp(input) -> Tensor @@ -6667,7 +6667,7 @@ def merge_dicts(*dicts): multiply(input, other, *, out=None) Alias for :func:`torch.mul`. -""".format(**common_args)) +""") add_docstr(torch.multinomial, r""" @@ -7017,7 +7017,7 @@ def merge_dicts(*dicts): negative(input, *, out=None) -> Tensor Alias for :func:`torch.neg` -""".format(**common_args)) +""") add_docstr(torch.nextafter, r""" @@ -7390,7 +7390,7 @@ def merge_dicts(*dicts): polygamma(n, input, *, out=None) -> Tensor Alias for :func:`torch.special.polygamma`. -""".format(**common_args)) +""") add_docstr(torch.positive, r""" @@ -8249,7 +8249,7 @@ def merge_dicts(*dicts): row_stack(tensors, *, out=None) -> Tensor Alias of :func:`torch.vstack`. -""".format(**common_args)) +""") add_docstr(torch.round, r""" @@ -9974,7 +9974,7 @@ def merge_dicts(*dicts): true_divide(dividend, divisor, *, out) -> Tensor Alias for :func:`torch.div` with ``rounding_mode=None``. -""".format(**common_args)) +""") add_docstr(torch.trunc, r""" @@ -10090,7 +10090,7 @@ def merge_dicts(*dicts): fix(input, *, out=None) -> Tensor Alias for :func:`torch.trunc` -""".format(**common_args)) +""") add_docstr(torch.unsqueeze, r""" diff --git a/torch/csrc/jit/passes/normalize_ops.cpp b/torch/csrc/jit/passes/normalize_ops.cpp index 5ac36e1f1b76f..67637031868c1 100644 --- a/torch/csrc/jit/passes/normalize_ops.cpp +++ b/torch/csrc/jit/passes/normalize_ops.cpp @@ -102,6 +102,7 @@ const std::unordered_map& getOperatorAliasMap() { {aten::divide_, aten::div_}, {aten::multiply, aten::mul}, {aten::multiply_, aten::mul_}, + {aten::linalg_matmul, aten::matmul}, {aten::true_divide, aten::div}, {aten::true_divide_, aten::div_}, {aten::concat, aten::cat}, diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index df3507f1b3561..f98930e471630 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -1284,6 +1284,12 @@ tensor([ 3.1623, 10.0000, 17.2627]) """) +matmul = _add_docstr(_linalg.linalg_matmul, r""" +linalg.matmul(input, other, *, out=None) -> Tensor + +Alias for :func:`torch.matmul` +""") + multi_dot = _add_docstr(_linalg.linalg_multi_dot, r""" linalg.multi_dot(tensors, *, out=None) diff --git a/torch/overrides.py b/torch/overrides.py index aca14a6d4552b..1bb98507f18b1 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -599,6 +599,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.masked_scatter: lambda input, mask, source: -1, torch.masked_select: lambda input, mask, out=None: -1, torch.matmul: lambda input, other, out=None: -1, + torch.linalg.matmul: lambda input, other, out=None: -1, # alias for torch.matmul torch.matrix_power: lambda input, n: -1, torch.linalg.matrix_power: lambda input, n, out=None: -1, torch.matrix_rank: lambda input, tol=None, symmetric=False: -1, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 5113346a20e56..b38d4afc4af3a 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -7116,6 +7116,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): supports_out=False, ), OpInfo('matmul', + aliases=('linalg.matmul',), dtypes=floating_types(), dtypesIfCPU=all_types_and_complex(), dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []), From acc9f9afc8f2be70d7f5d3248ca1760e0336b3b8 Mon Sep 17 00:00:00 2001 From: "haozhe.zhu" Date: Tue, 7 Sep 2021 12:59:00 -0700 Subject: [PATCH 530/530] enable bf16 mkldnn path for gemm (#61891) Summary: # Goal: Integrate mkldnn bf16 Gemm to pytorch ## BF16 Suport for mm, addmm, bmm, addbmm, baddbmm, mv, addmv, dot (with mkldnn matmul primitive): https://oneapi-src.github.io/oneDNN/group__dnnl__api__matmul.html For gemm related ops, we keep all inputs under plain format. So we will not introduce opaque tensor for these ops to save mem copy here. ![mkldnn bf16 gemm integration](https://user-images.githubusercontent.com/54701539/126263077-4b5134e1-52a7-4fad-94fb-19e13a0377f6.png) The minimized integration is only dispatch to mkldnn in addmm, but for gemm with 3-D input (with additional dim for"batch") this will call mkldnn gemm for "batch" times. Since mkldnn matmul support input with multiple dims, we directly dispatch to mkldnn gemm in {bmm, addbmm, baddbmm} to reduce the time to create mkldnn memory desc, primitive, etc. For the different definition for "bias" between mkldnn(which must be shape of (1, N)) and pytorch (which can be same shape with gemm result (M, N)), we use a fused sum to handle it. ## User Case: User case is exactly same with before because no opaque tensor's is introduced. Since the pytorch has already support bf16 data type with CPU tensor before, we can leverage the existed bf16 gemm UT. ## Gemm performance gain on CPX 28Cores/Socket: Note: data is collected using PyTorch operator benchmarks: https://github.com/pytorch/pytorch/tree/master/benchmarks/operator_benchmark (with adding bfloat16 dtype) ### use 1 thread on 1 core ### torch.addmm (M, N) * (N, K) + (M, K) | impl |16x16x16|32x32x32| 64x64x64 | 128x128x128| 256x256x256| 512x512x512|1024x1024x1024| |:---:|:---:| :---: | :---: | :---: | :---: | :---: | :---: | | aten-fp32| 4.115us|4.583us|8.230us|26.972us|211.857us|1.458ms|11.258ms| | aten-bf16 | 15.812us| 105.087us|801.787us|3.767ms|20.274ms|122.440ms|836.453ms| | mkldnn-bf16 |20.561us |22.510us|24.551us|37.709us|143.571us|0.835ms|5.76ms| We can see mkldnn-bf16 are better than aten bf16, but for smaller shapes, mkldnn bf16 are not better than aten fp32. This is because onednn overhead, this overhead more like a "constant" overhead and while problems get larger, we can ignore it. Also we are continue optimize the kernel efficiency and decrease the overhead as well. More shapes | impl |1x2048x2048|2048x1x2048| 2048x2048x1 | |:---:|:---:| :---: | :---: | | aten-fp32| 0.640ms|3.794ms|0.641ms| | aten-bf16 | 2.924ms| 3.868ms|23.413ms| | mkldnn-bf16 |0.335ms |4.490ms|0.368ms| ### use 1 socket (28 thread, 28 core) | impl | 256x256x256| 512x512x512|1024x1024x1024| 2048x2048x2048|4096x4096x4096| |:---:| :---: | :---: | :---: | :---: | :---: | | aten-fp32| 35.943us |140.315us|643.510us|5.827ms|41.761ms| | mkldnn-bf16 |53.432us|114.716us|421.858us|2.863ms|23.029ms| More shapes | impl |128x2048x2048|2048x128x2048| 2048x2048x128 | |:---:|:---:| :---: | :---: | | aten-fp32| 0.561ms|0.458ms|0.406ms| | mkldnn-bf16 |0.369ms |0.331ms|0.239ms| We dose not show aten-bf16 for this case since aten-bf16 always compute as single thread and the performance is extreme poor. The trend for this case is similar for 1 thread on 1 core. Pull Request resolved: https://github.com/pytorch/pytorch/pull/61891 Reviewed By: iramazanli Differential Revision: D29998114 Pulled By: VitalyFedyunin fbshipit-source-id: 459dc5874c638d62f290c96684ca0a694ded4b5a --- aten/src/ATen/native/Blas.cpp | 34 ++++++- aten/src/ATen/native/LinearAlgebra.cpp | 33 +++++++ aten/src/ATen/native/mkldnn/Matmul.cpp | 99 +++++++++++++++++++ aten/src/ATen/native/mkldnn/Matmul.h | 17 ++++ aten/src/ATen/native/mkldnn/Utils.h | 26 +++++ test/test_linalg.py | 51 +++++----- tools/build_variables.bzl | 1 + .../_internal/common_methods_invocations.py | 8 +- 8 files changed, 242 insertions(+), 27 deletions(-) create mode 100644 aten/src/ATen/native/mkldnn/Matmul.cpp create mode 100644 aten/src/ATen/native/mkldnn/Matmul.h diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 114de632a384a..eb025f47e9d76 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -3,6 +3,12 @@ #include #include #include +#include + +#if AT_MKLDNN_ENABLED() +#include +#include +#endif // AT_MKLDNN_ENABLED namespace at { namespace meta { @@ -62,6 +68,19 @@ TORCH_IMPL_FUNC(addmv_out_cpu)(const Tensor &self, const Tensor &mat, const Tens at::native::copy_(const_cast(result), *self_); } if (result.numel() != 0) { + +#if AT_MKLDNN_ENABLED() + NoNamesGuard guard; + // mkldnn matmul expect dim >= 2 + auto vec_ = vec.unsqueeze(1); + if (use_mkldnn_bf16_gemm(mat, vec_, /*result=*/Tensor())){ + mkldnn_matmul(mat, vec_, result.unsqueeze_(1), beta_.to(), alpha_.to()); + // recover tensor's dim = 1 + result.squeeze_(1); + return; + } +#endif // AT_MKLDNN_ENABLED + auto r_stride = result.stride(0); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, mat.scalar_type(), "addmv_impl_cpu", [&] { auto beta = beta_.to(); @@ -148,7 +167,20 @@ Tensor dot(const Tensor &self, const Tensor &other){ at::NoNamesGuard guard; dot_check(self, other); - return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, self.scalar_type(), "dot", [&] { +#if AT_MKLDNN_ENABLED() + // mkldnn matmul expect dim >= 2 + auto self_ = self.unsqueeze(0); + auto other_= other.unsqueeze(1); + if (use_mkldnn_bf16_gemm(self_, other_, /*result=*/Tensor())){ + // mkldnn matmul expect result have sizes info to create ideep tensor + auto r = at::empty({1, 1}, self.options()); + mkldnn_matmul(self_, other_, r, /*beta=*/0); + // recovery tensor's dim = 1 + return r.squeeze_(); + } +#endif // AT_MKLDNN_ENABLED + + return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "dot", [&] { Tensor result = at::empty({}, self.options()); result.fill_(dot_impl(self.numel(), self.data_ptr(), self.stride(0), other.data_ptr(), other.stride(0))); return result; diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 59950b987900f..0576bd667c3f6 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -23,6 +23,10 @@ #include #include +#if AT_MKLDNN_ENABLED() +#include +#include +#endif // AT_MKLDNN_ENABLED namespace at { namespace meta { @@ -1050,6 +1054,21 @@ static void addmm_impl_cpu_( // Always ensure the conjugation for c is resolved since there's no way to specify c's conjugation in the gemm call TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c.is_conj()); +#if AT_MKLDNN_ENABLED() + if (use_mkldnn_bf16_gemm(a, b, c)){ + if (transpose_c){ + // m1, m2 are swapped + mkldnn_matmul(b, a, c, beta.to(), alpha.to()); + } else { + mkldnn_matmul(a, b, c, beta.to(), alpha.to()); + } + if (!c.is_same(result)) { + result.copy_(c); + } + return; + } +#endif // AT_MKLDNN_ENABLED + // Apply BLAS routine AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, result.scalar_type(), "addmm_impl_cpu_", @@ -1104,6 +1123,13 @@ static void addbmm_impl_( return; } +#if AT_MKLDNN_ENABLED() + if (use_mkldnn_bf16_gemm(batch1, batch2, result)){ + mkldnn_matmul(batch1, batch2, result, beta.to(), alpha.to()); + return; + } +#endif // AT_MKLDNN_ENABLED + auto adjusted_beta(beta); for (int64_t batch = 0; batch < num_batches; ++batch) { result.addmm_(batch1[batch], batch2[batch], adjusted_beta, alpha); @@ -1254,6 +1280,13 @@ static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor& || (strides[1] == 1 && strides[2] >= sizes[1]); }; +#if AT_MKLDNN_ENABLED() + if (use_mkldnn_bf16_gemm(batch1, batch2, self_or_result)){ + mkldnn_matmul(batch1, batch2, self_or_result, beta.to(), alpha.to()); + return self_or_result; + } +#endif // AT_MKLDNN_ENABLED + if (contraction_size * res_rows * res_cols < 400) { if (is_bmm_out) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, batch1.scalar_type(), "bmm", [&] { diff --git a/aten/src/ATen/native/mkldnn/Matmul.cpp b/aten/src/ATen/native/mkldnn/Matmul.cpp new file mode 100644 index 0000000000000..5327ce821ff1e --- /dev/null +++ b/aten/src/ATen/native/mkldnn/Matmul.cpp @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#if !AT_MKLDNN_ENABLED() + +namespace at { +namespace native { + +void mkldnn_matmul( + const Tensor &mat1, + const Tensor &mat2, + Tensor &result, + float beta, + float alpha) { + TORCH_CHECK(false, "mkldnn_matmul: ATen not compiled with MKLDNN support"); +} +} // namespace native +} // namespace at + +#else // AT_MKLDNN_EBABLED + +#include +#include + +namespace at { +namespace native { + +void mkldnn_matmul( + const Tensor &mat1, + const Tensor &mat2, + const Tensor &result, + float beta, + float alpha) { + TORCH_CHECK((mat1.dim() == 2 && mat2.dim() == 2) || (mat1.dim() == 3 && mat2.dim() == 3), + "mkldnn_matmul: expect mat1 to be 2-D or 3-D tensor"); + TORCH_CHECK(mat1.scalar_type() == at::kBFloat16 && + mat2.scalar_type() == at::kBFloat16 && + result.scalar_type() == at::kBFloat16, "mkldnn_matmul: only enabled for bf16 path"); + TORCH_CHECK(mkldnn_bf16_device_check(), + "mkldnn_matmul: mkldnn_matmul bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"); + ideep::attr_t op_attr; + // "addmm", "addbmm" "baddbmm" in pytorch allow bias to be 2-D or 3-D tensor + // but mkldnn matmul primitive only support bias be 1-D tensors + // to address their differences, we use mkldnn post ops to perform a fused "add" after matrix multiplication is over + if (beta != 0.0f) op_attr = ideep::attr_t::fuse_sum(); + // If alpha = 0, dose not need actually do gemm computation + if (alpha == 0) + return; + + auto is_mkldnn_optimized_format = [&](const Tensor& t) { + if (t.is_contiguous()) return true; + const auto sizes = t.sizes(); + const auto strides = t.strides(); + if (t.dim() == 2){ + return strides[0] == 1 && strides[1] == sizes[0]; + } else { + // dim = 3 + return strides[0] == sizes[1] * sizes[2] && strides[1] == 1 && strides[2] == sizes[1]; + } + }; + + // Mkldnn only optimized for contiguous or transposed (transpose last 2 dim if 3-D tensor) format now + // Will remove this "contiguous" after mkldnn have fully supported + Tensor mat1_ = is_mkldnn_optimized_format(mat1) ? mat1 : mat1.contiguous(); + Tensor mat2_ = is_mkldnn_optimized_format(mat2) ? mat2 : mat2.contiguous(); + Tensor mat1_reshaped = mat1_; + Tensor mat2_reshaped = mat2_; + if (result.dim() == 2 && mat1.dim() == 3 && mat2.dim() == 3){ + // addbmm(batch1*batch2) [b,n,m] * [b,m,p] = [n,p] can be treated as: + // [n, b*m] * [b*m, p] = [n, p] + // For batch1: reorder from [b, n, m] to [n, b, m], reshape to [n, b*m] + // For batch2: reshape from [b, m, p] to [b*m, p] + auto mat1_size = mat1.sizes(); + auto mat2_size = mat2.sizes(); + mat1_ = mat1_size[0] > 1 ? mat1_.transpose(0, 1) : mat1_; + mat1_reshaped = mat1_.reshape({mat1_size[1], mat1_size[0] * mat1_size[2]}); + mat2_reshaped = mat2_.reshape({mat2_size[0] * mat2_size[1], mat2_size[2]}); + } + + // mkldnn_matmul only proceed CPU tensor + const ideep::tensor x = itensor_view_from_dense(mat1_reshaped); + const ideep::tensor w = itensor_view_from_dense(mat2_reshaped); + ideep::tensor y = itensor_view_from_dense(result); + ideep::matmul_forward::compute(x, w, y, alpha, beta, + ideep::scale_t(), ideep::scale_t(), ideep::scale_t(), op_attr); + if (y.get_data_handle() != result.data_ptr()){ + // ideep will query onednn expect format of output + // if given output format is not expected, ideep will re-init an output buffer + // under this case, we need copy the re-inited buffer back to given buffer + ideep::tensor public_y = itensor_view_from_dense(result); + y.reorder_to(public_y); + } +} + +} // namespace native +} // namespace at + +#endif // AT_MKLDNN_EBABLED diff --git a/aten/src/ATen/native/mkldnn/Matmul.h b/aten/src/ATen/native/mkldnn/Matmul.h new file mode 100644 index 0000000000000..8cd5b5a9b3aeb --- /dev/null +++ b/aten/src/ATen/native/mkldnn/Matmul.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +namespace at { namespace native { + +// result = beta * result + alpha * gemm(mat1, mat2) +// need mat, mat2 to be 2-D or 3-D Tensors +TORCH_API void mkldnn_matmul( + const Tensor &mat1, + const Tensor &mat2, + const Tensor &result, + float beta=1, + float alpha=1); + +}} diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h index abfafd5230e98..49d51b286c097 100644 --- a/aten/src/ATen/native/mkldnn/Utils.h +++ b/aten/src/ATen/native/mkldnn/Utils.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -28,4 +29,29 @@ inline bool mkldnn_bf16_device_check() { && cpuinfo_has_x86_avx512vl() && cpuinfo_has_x86_avx512dq(); } +inline bool use_mkldnn_bf16_gemm( + const Tensor& mat1, + const Tensor& mat2, + const c10::optional& result_opt) { + c10::MaybeOwned result_maybe_owned = at::borrow_from_optional_tensor(result_opt); + const Tensor& result = *result_maybe_owned; + + static const int64_t mkldnn_gemm_min_size = 16 * 16 * 16; + // if dim = 2, mat1's size = (m * n), mat2's size = (n * k) + // else dim = 3, mat1's size = (b * m * n), mat2's size = (b * n * k) + // only m * n * k are large enough we can get benefit from mkldnn optimized gemm kernel + // if some cases pytorch dose not have default impl for bf16 (such as "dot"), will use mkldnn impl anyway + int64_t m = mat1.dim() == 2? mat1.size(0) : mat1.size(1); + int64_t n = mat1.dim() == 2? mat1.size(1) : mat1.size(2); + int64_t k = mat2.dim() == 2? mat2.size(1) : mat2.size(2); + return ( + mat1.scalar_type() == kBFloat16 && + mat2.scalar_type() == kBFloat16 && + (!result.defined() || result.scalar_type() == kBFloat16) && + mat1.numel() != 0 && + mat2.numel() != 0 && + mkldnn_bf16_device_check() && + m * n * k >= mkldnn_gemm_min_size); +} + } diff --git a/test/test_linalg.py b/test/test_linalg.py index 2b543431174d2..5912111da4c0a 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -3991,8 +3991,14 @@ def _test_dot_vdot_vs_numpy(self, device, dtype, torch_fn, np_fn): def check(x, y): # Compare with numpy res = torch_fn(x, y) - ref = torch.from_numpy(np.array(np_fn(x.cpu().numpy(), y.cpu().numpy()))) - self.assertEqual(res.cpu(), ref) + if x.dtype == torch.bfloat16: + ref = torch.from_numpy(np.array(np_fn(x.cpu().float().numpy(), y.cpu().float().numpy()))) + else: + ref = torch.from_numpy(np.array(np_fn(x.cpu().numpy(), y.cpu().numpy()))) + if res.dtype == torch.bfloat16: + self.assertEqual(res.cpu(), ref.bfloat16()) + else: + self.assertEqual(res.cpu(), ref) # Test out variant out = torch.empty_like(res) @@ -4005,19 +4011,20 @@ def check(x, y): check(x, y) # Contiguous - x = torch.randn(10, dtype=dtype, device=device) - y = torch.randn(10, dtype=dtype, device=device) + x = torch.randn(200, dtype=dtype, device=device) + y = torch.randn(200, dtype=dtype, device=device) check(x, y) # 0 strided - y = torch.randn(1, dtype=dtype, device=device).expand(10) + y = torch.randn(1, dtype=dtype, device=device).expand(200) check(x, y) # 2 strided check(x[::2], y[::2]) - @dtypes(torch.float, torch.cfloat) - @precisionOverride({torch.cfloat: 1e-4, torch.float32: 5e-5}) + @dtypes(torch.float, torch.cfloat, torch.bfloat16) + @dtypesIfCUDA(torch.float, torch.cfloat) + @precisionOverride({torch.cfloat: 1e-4, torch.float32: 5e-5, torch.bfloat16: 1e-0}) def test_dot_vs_numpy(self, device, dtype): self._test_dot_vdot_vs_numpy(device, dtype, torch.dot, np.dot) @@ -6164,12 +6171,12 @@ def genf_int(x, y): return torch.randint(0, 100, (x, y), dtype=dtype, device=device) def genf_bfloat(x, y): - return torch.randn(x, y, dtype=torch.float32, device=device).to(dtype) + return torch.randn(x, y, dtype=torch.float32, device=device).to(dtype) * 0.1 def genf_float(x, y): return torch.randn(x, y, dtype=dtype, device=device) - for (n, m, p) in [(20, 10, 5), (15, 5, 10), (5, 18, 10)]: + for (n, m, p) in [(20, 10, 15), (15, 20, 10), (25, 18, 10)]: if (dtype == torch.int32) or (dtype == torch.int64): genf = genf_int elif (dtype == torch.bfloat16): @@ -6240,7 +6247,7 @@ def test_bmm(self, device, dtype): return batch_sizes = [1, 10] - M, N, O = 23, 8, 12 + M, N, O = 23, 15, 12 numpy_dtype = dtype if dtype != torch.bfloat16 else torch.float32 is_supported = True @@ -6262,8 +6269,8 @@ def invert_perm(p): def generate_inputs(num_batches): # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) + b1 = make_tensor((num_batches, M, N), device, dtype, low=-0.1, high=0.1) + b2 = make_tensor((num_batches, N, O), device, dtype, low=-0.1, high=0.1) b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) yield b1, b2 @@ -6271,8 +6278,8 @@ def generate_inputs(num_batches): for b1, b2, b3, b4, b5, b6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if b1 else 1, M if b2 else 1, N if b3 else 1) shape2 = (num_batches if b4 else 1, N if b5 else 1, O if b6 else 1) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) - b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) + b1 = make_tensor(shape1, device, dtype, low=-0.1, high=0.1).expand(num_batches, M, N) + b2 = make_tensor(shape2, device, dtype, low=-0.1, high=0.1).expand(num_batches, N, O) yield b1, b2 # zero-sized tensors for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): @@ -6352,7 +6359,7 @@ def test_addbmm(self, device, dtype): return num_batches = 2 - M, N, O = 2, 3, 4 + M, N, O = 16, 17, 18 is_supported = True if dtype == torch.bfloat16: @@ -6378,8 +6385,8 @@ def generate_tensor(): # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): for perm3 in itertools.permutations((0, 1)): - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) + b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) * 0.1 + b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) * 0.1 b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) ref = torch.from_numpy( @@ -6391,8 +6398,8 @@ def generate_tensor(): for s1, s2, s3, s4, s5, s6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if s1 else 1, M if s2 else 1, N if s3 else 1) shape2 = (num_batches if s4 else 1, N if s5 else 1, O if s6 else 1) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) - b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) + b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) * 0.1 + b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) * 0.1 ref = torch.from_numpy( b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy() ).to(device=device, dtype=dtype).sum(0) @@ -6402,8 +6409,8 @@ def generate_tensor(): for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1) - b2 = make_tensor(shape2, device, dtype, low=-1, high=1) + b1 = make_tensor(shape1, device, dtype, low=-1, high=1) * 0.1 + b2 = make_tensor(shape2, device, dtype, low=-1, high=1) * 0.1 ref = torch.from_numpy( b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy() ).to(device=device, dtype=dtype).sum(0) @@ -6425,7 +6432,7 @@ def test_baddbmm(self, device, dtype): return num_batches = 10 - M, N, O = 12, 8, 5 + M, N, O = 12, 8, 50 is_supported = True if dtype == torch.bfloat16 and self.device_type == 'cuda': diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index c4731570e6d77..363503d89f9f5 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -865,6 +865,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/native/mkldnn/TensorShape.cpp", "aten/src/ATen/native/mkldnn/UnaryOps.cpp", "aten/src/ATen/native/mkldnn/Utils.cpp", + "aten/src/ATen/native/mkldnn/Matmul.cpp", "aten/src/ATen/native/quantized/cpu/init_qnnpack.cpp", "aten/src/ATen/record_function.cpp", "aten/src/ATen/SavedTensorHooks.cpp", diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index b38d4afc4af3a..0db9bb508ee40 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -5939,14 +5939,14 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): ), sample_inputs_func=sample_inputs_baddbmm), OpInfo('dot', - dtypes=all_types_and_complex_and(torch.float16), + dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []), assert_autodiffed=True, sample_inputs_func=sample_inputs_dot_vdot, supports_forward_ad=True, ), OpInfo('vdot', - dtypes=all_types_and_complex_and(torch.float16), + dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []), sample_inputs_func=sample_inputs_dot_vdot, supports_forward_ad=True, @@ -7118,7 +7118,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): OpInfo('matmul', aliases=('linalg.matmul',), dtypes=floating_types(), - dtypesIfCPU=all_types_and_complex(), + dtypesIfCPU=all_types_and_complex_and(torch.bfloat16), dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []), dtypesIfROCM=floating_types_and(torch.half, torch.bfloat16), backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16, @@ -7910,7 +7910,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs): OpInfo('__rmatmul__', op=torch.Tensor.__rmatmul__, dtypes=floating_types(), - dtypesIfCPU=all_types_and_complex(), + dtypesIfCPU=all_types_and_complex_and(torch.bfloat16), dtypesIfCUDA=floating_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else [], torch.complex64, torch.complex128), backward_dtypesIfCUDA=floating_types_and(torch.float16,
  • ^hE(+gdGSds{#+UwU-#ds$rg1`A zi+qJ%)=#uTx9K4_7b47GDF`>~i1SK4MV0z&nr!)D&OmpeC=B&9?Og^lXQC(QK}3<^ ze47bKT93fFTL_~4Rc5NkDz^K8`nS3$q-VHi|$5wf4H zB@3FEUZ(!WmkaOoHE6s@u@M9Hxw8Iv0xDq$;;a^AbV>>=sBIv@myS9}G78x?{c?Yl3)KJqF5cCP?4~5Plqh)!(dym`2Ca%73t4qP$Nn3n|T@d{0#`912tw@>jdkWPi@ff7Ji z)8>Dmw`JtAq-aO(!QIAZGazgu$>nVJ+dYMin2tex}dpbhO5I|QHl2K-ge zfsE@Ps!2&+!bDkiffVzr8Ep3WCGPodmV4kar_RqN6Sz^%tWV0Eq~(UgA~Pt#Ox261 zU{-+b%xNXL*sDw;o5Ba4Z%-bR1=9P>9%p;shfV&KXeo>8M9Cv!_t$Ox;O3fz_!}^V zeEPl(=a$y~n~1Vke<_s@;vhjUDSUIm6uvQ$*MfohxSwgr5EgW)#XghyCSpz!1tUGW ziCM8l8`MfIOxY*NLN-`1c~DGLF=H3u=}aTM(fD+r6ZY(kK{&AfEc|zc_6nnt>%8lwZ|D2&`Aqp#umxkmhmEG z1w_yVFHZ?z5E=O*YEBfBaGkMxgt*Hzu6UOa5z#z zFg-R81u514$KIQVQ`x_7qdgMKypYVZsASH(3@wsMhzw=OP^rvTW+^Q5Ol8QJgpwgL zWGFMWOl8ir5X!WWP^$O3Q_uIie|x|0AN$|EkK;K!adWS=KKEz1uJbz2^SUM;Kqa=* zvr_)y{0knnnLI@>;(}l$umem_My^Ms#~tZhZ7Q;a@YN996Jy|I?}dzya;bSyi$RU# zY_=t6!&3pU3wmvQHip+@@50Cnd&7KIMSsLH=bgNLOBwaXjT6Z?{$_o!$MA~n{hfLq z&pLqT_1L%w$8O_vD835F?Y_enIR|`fEQpznoX6r2Q;y@PAuMxQ2a9NET3(5> z^GB#iQHqA}_<%_fQ*r`xF103I+5Q_63=ckPN?(dU^VbWY<>dJXc8CsJdohOeespQ{ z)S3>xaJ}G?5lD(yuaBMFeQxKF#P9BIH|^9Dzuy?CG3Ro`;m2VXIoj`b;P}4B1x^WO z@@0G3vUwKe+##C?+0^f#;rt?0i?)Zd-LuJHHrF?Y^X{GviImy_Y2sC2N?LgB+*}@2 z1P$xL8{uFleZtu$e{dp5)vSPdAnXeUO%J;6eRF}avKy=MWN?jW=QMa9k=df^Dx?vnA2!!bZ7-6 z*=T8!l_IDOg|zoy$UAjNkl`KVX#?eA%2O8#uv>4FrEH%g)PH;?V%1gy2j6qBa;omT zLU(%^K+resX2FN3R?g*U83`Rl&`|9Csu-7AxAph)Ss?RUb^xn(N2WJk!si7h(X;Jc z9$v4I)-=ON$IYTc2Y1RvoyK%_2mBvo!0((IK*K^w)k6k6stC^FBMHQ*jk1sT&92Ve zJ8VSKBlX|hc0euDRGbk0+wFrJWhy%6v>X~G)n@@x{8(m zSD1>6AZI^fR7dP4Mprb^E8{Eq2^K-BdMP=RY?gMvf9w8!vaBgtiu&_+h}2+}SfrhH7wf~bQ1(Cazj z5=5n={6~>?nZuB`SyZiMo?ONAsdYjGi{92p{6Dw#QQWX1_cTyQ$gOpksCGlKf>d>+ z75cYw@m!cm=a=X~l+Mfi+P{~imlGoYFI8o9bAs5-ilQ5P&~+RP$225c7bvC5nM(8Z zeLGdO)KIP}Ywdzfq7J|#qRZCLzSVi~$-?eO;y}DCv@ZR_0q~OgDI}onIo?=@dd250 z`9T^*otjxmJ-ypocf>2av3@!pyvXtx+>U)X|GSDeoZ!fJ|t<+`J0i3J(T1(aJD!B1pz;{O~Rp4j4lG!L0h zq`At@w_A`|YK=!@DcH4`1If**r`b!vA}SQs=YXQS$v--zcmv!TXeFq~<%U?L?<$5+ zl2MhfMh!vvy$p#V1Fp&5&M!2p^+%*1ra5>aBqKhdawa(lQFfStjHHQk0iga^^10xF z+v~plt3`MQs&u61j*W12emjG46z9S(NPVO5T^Kv0)g-C;NcqnX2?sXe$TMRy{;(UG zM<#&l zMMz1c7`CP5lRI%eroVpv=nSS@Pp?6*bw*>>_34xWDRG46=d&N89?Wf(T(C=*{ak*9 zR~h>uyv$kNY1zmoqiPt_>sWGioJhb_p)bQeS%i3DeFc8Q1%O}(iF6M_`%~t!KA{ec zT%oN`Osx(LuckC}LGYE%Is*r2J6{QL!J&cj$m3suMw1dM%8>~z+^>NUD!W?VtmaGU zxN~D_NZH)^=SXM91LiIbULi+eCC}D)?6Vsv_bncV)oY6sVlaj=3R5Vv>$&%A#77<+ z|12{h@r~da>%FNjyN-XqG{!ylR~zs-0~XiIOe`G|)re8`MUWC(=PiY2tIrZNBnddr zA$V9^5An`|JX+7^&Pq-29|r#LG?cYEF&9dk!^le-J;^C1(%+80ZN@aW%MDrWU3W>5 zOqF`5LVRWch7+?qY89aG7=c957HXEhY`rYw*#5LV6sm;))mb8zN}U?f1m^fqt8mj# zJR%+dA3hK{7=weqHA!pQI>}~G5Va?bu^350k$@bl4ZPFhXP|DE1M}e26-(AAMt`e? z45xuIyKZ6Y{O2|0mSBX&yqP6EQ3neAbkmXrLZQc#N#XhcvIJkMc5lLwzi~L4~uNd{& zj&Kdy#_#0LeD^+6y&l1BC5MWPK!MAy-ZuRf1U391#|CRi8#A|#gU?KgEeBrMo2)O5 zP77ar-(PxFMX<0A;>)jqY<>WwfWUmCt#n3qo~pCytsST+Bqx5}3JkYr@_+Kctp9JB z$qhcpUBO4|upsHqW)0vokFm39vAN;xNM}?-WI`mfIoTZ8+VET*3cAgyoAh>#FEMR8 zVX9u{{F<-XyHS=qVqKln4}4k3(Bo=_*YP?(uhKDpy=NMU6EB^;rAvlRV+@>(N={K& z-%0K{(b$d8_-Aa_+c)bsAkusF{4|0o7?Hg=vLmS-B^V_)?tM-&DiO^us2Eqir8YHc zgJ+SA8*rUU7eBp_;(6~rb3SF)m-x8TAGfY=`#s(FC)foWAKqtwW5fHwnM`8?hmlM9 zW&nPE_W0cA`ui6Hej5yHiD-=hCZ10R*KXMI^w9tw-ink4TxTx|f|CgR-8EjgHWFI4 zEC5?m9t=|7r{)4=+j2(~+he-vN+2i)JV8GPxi;UNZwBAJK#bNM2|-BVzDEHr9P2W- ze}w!RI4U3}RNq|&EZ_k8jF==Vr7O52MtV+2*#DuxunnRaxOM}%#*xnE4i+a_a!AS& z?X52X7dS%ULTieB^!eQ==oS(M@piq8ge@r@Nqlk zr^^lXAY?=lgXkMKZTiIE+6a;k$ju(FLxzkCE^yct$r|hq^;cv>Zz(Jw@_l!tV3V$! z_7Vzx`jyC#E%g;l?~aTtaxHqdi;q4ha;+7U&kEO~ks+HVywKW3p?Xx|S~-K=7-D&( zi;UX)aKlO%wa-X{hnML}?&iBAgSbE!^oF0SM20g6u8ky9ZeZb$T#+G5s~*ta9qJKe zM86UXYhWGs9Y#h?$@Me9b?RftkX0Jy#q1`$eTU$htdi8{`*6(>LBcrKvn=6{j?u$~ zXqiEdg59D1j*O^knma3uj1n?xx+1z`u+skjN%;TuNvLiHrgI9PTEbxWo?_bqpX&3R zdz7>#taMHl5+7J{Uk;O%qIGq8bzR$DQA%&2S`eem_8tU*4&djO0A9k^e31cLFU ztRVX5px}P_gKt51cl`+T9EW-RiI2I!8s=m<{o(l?N;ONM2ERmWg+u~i zt<(2&E$J0rvKP9_*BTRP>@EGYte}NS2kkg_gMQIHc94tz`p~=$B5+ifMNixBVgEvfD1GYd2m!A8RUgL z-SoygvnDGAk5i9CDK%4v?>{Ikx^iAR=JxhS*6$FLErJfa0_x=$QzUB`$kXFpfZW#Q zB-%z7tg_!&iFJ8Pp4p+Y8!9z!i{ZA1z&lX|q9{3^FW1a~aES1+%MciwJRDw5Op|(` zT(go%KQaxg6SM(R=id>Og?qnIX~wHxKYh(0&2oTFLC+!AlgE5OqFi(!DXyT>E))c- zH0nqT2G~P1vr>?~oH z5U?;cmkg){27U)Vjx|I=8R-CWSG+m$^qPuEwGPx@D1AXTzXXC{s)ji*GJJ{*mmJ`x zA3(^>wcrIoHAC+f#Fj6h-qVJ=isT5yXKy~7E&0skFU;mM_Qz5s2)4)f$3={r;0(RS zP4?_Cv_SX)r6W7U5G&pP+b21o1r0$AZKLA*wH&Ong=)Rg0#F(;sDZEz%LhiZN!Tq# zu!VR0Z69w=EN|pkNd%Ow+X<{E(KrfPHY*@TUkXzS!-8{Wz>jI(UnNjxYy#X|tjV6N z!4Zmx_n;tQ7lp&*q6OfMaseLN<^bbQAJ4=e7$^7{apR8h-U8K+2BG==9OB!qmv#63 zlfVzMH(6@QSMz81d;YB#t3Ry76wVtdoLwoQTvwptdUe*CGbNH`y1zeQmFo2;D06p1kMm3i zfGWD>pyTNxLd-1|z0O6Y~&mNEpV7}k;Lyjv}AG)p`= z9dEtllQ3R*CP_SHt~g@j7a+`6#IHt~m{3wPyJU;h{QCL547l8ipsxHR&FU%Ly&^L% z)6kOX03q-jb5w*~Xz1`2#Kw25bXJ-0(v@Ehmn}J6^NA-R#TUau@hIod#=}Q;kcwoV z1~4#^7cr7OgAJg$%1#r}`Cb7E)y*(QuuP?IdK|a9kilrJDH7C%+jC{PK{_MeR8I*mO?bah!)p+v<0@9HuOV`Dd%;PGln<8lpB_bd$54N7muQ0Mpo zVEYEscSyslT>|B>+;Cs4ya$^i!rKK{PJC(W!OOFW9KW2|R>y7wLMFpN8U0%feiiDE z)(v)KI~TsA`ZG=*UUw<>`r5~7|4o!=x3iHY_;)M)-E#hn?jQt9$H)V~ffmh#@y5+| z;HL-eyQLDnGZH1+sgHlrY#xT1U5dBC06GGZN@Dw`Or>e1=&SX7h^Lw(UrD`-WcUB{ zZS`T+#fDw>4Od#@@=ULf&vh41KkIZGd}mqeePpCS zP0uAA*y(p~0*~kS#&|>|i%X4;cTr>y3|}s)Tq}l`>3ITXC|Vw8+lsj$J<=2m&{geYknJB~sg*uxvGBX>+zrS0(MJhs|_7II^9cspy!yH}D!xH0J8^ zs*LoJp-(q}nplZ6#qc1A9&6in>~EYXWO37%x|&>8;mre{V@LN5Mn*cs58CzRpF_+< z*zHq=MuH<~WJ(X5jN!VDSSH$>T`Ls0gbuIv1XWTy-Q5hsYFvw7PbFZOO7$AcimdC+ zqpP)6td)0frn;y9N)zqN9JTc`F%{(zxR#ghadEL+-epo9ckb;_?jJYt-Y`+=2ngZqLB(Y0GGYOkc- z$4C(ssDH9Eo$z)ujBd6^5g549EX)sEFJn4eWBO4(o+vv>T$lnn=r)(@d;KZXC9Q9peFQ(HAOxyNY8$Q8mad*4RnixM*BONSnszN%e zGs+jcq1DBMjbNjh7B#jy;l-4FvhenRiFoEs?!^nY3$ILch_7=uPB{andltdLXKVx6 zalxKvmd#gh@wK6OP5?HEBP7z?90pBJLi|E2Y@6T%{>FSiZEOzqmiDcJV9(Z8&%Tx? zR+RR7eO!xfZRy{r)mVjPSc7d;5?ZWmlt?C;podUum%r|QkSajlifvFI}sma4c_@!I*d@A}Km%lxtbV=3wv|672) z7ETIU4V2O{Fm`(oR~Fj*6hS(UCv;@3i52IlQe{!)9o|0>z&|@KFw9|J7{4vCp0e_j zC0k=OmQ|n~LN?BAJo>ga#RIb{R-#>)7Mv5;&13n~i`v3)&Up@SI!4XB!=LH*X_&`= zKSz^xi2Sx)`(~nj`E1NT$77nCQA|L<5=%}_{)VE4DAun6Y%nj^*Iew&IctfME5Y7X zcd${-e=A!+8dlNA64c{d7(yf~3RW^C+(mZy$a|deC{3A(?5C3)XQyvN%|m*kTD%#h zsW|&c|G19&A-(`9n~O|gle8L4rvw9EKl5A}=hvR0zM8g~{^2<|xrpMax=J&=?HuE{ zl7$)w8vuJ-Nb;V`_CK3%8|6z#mg2uJb#*9Sw7`_8%%Fx{K@UNQwn^}O!da}eB+$pm z6vEfJPW3gMmEgBovb9`eXXKaXHYhXa9TLBv+Zt6GZAZD}7PEaYMgCMi;|S7FVD+2+ zG9;63_<#4tCy4Y?$?^%pY_%f4D6nINT74>OsXP;R+=`F&boM1;!|7XY1ioggd6-CU z)_1JWe`KXS>XBu$DG_yn!GxnNt|hGL44ELh;X0K1X>yK^QA$w-gmb^e(27Ku9-X8XGv zC+MG;Tk}OL*G)494q>~FV05UdQu=~>t@M1B%^FUQAFRzW2#~>@nzB(Sb6Pd+sFT=s z07_XMiDHVUdLd<{X}&i{qgG@8YDq0A?#$BQlrlCZ)G8QSe6um|G>_QqYuG16DMjym zw0V5gJ-H}|=Ap>A5+;#klVd3^!SIU!C71pxdGBDi7A4bhkl3w`@_T;+g{@o{re^>Y z_EQ-9v~*hB2i+fsHXvHt&z-(&+=34HjDP_dDMbiC!AB;>=r6fNCDQmbTr{X!g7$25{ z;k%YLMe7y+22vwwMyaT+O7!5vnM-Q?KegsBKsXr0Jx+nAlZwIa)7Za4MSF?%3T8iP z^zeB!tJA9=U(w+tbA(M|C_ADT;=&|^_KyhXtln7YpQdD_bv`yQk930xG?wZr;m1%B z$r|?cM!6g1zmAerz?h*_AOK)}@T6CDkTG?x`l>)$IO&m4gj=Qty`qp{wv@F+n7v=7 z+nI~q1jWGMIgU3f7KWMZ5;imwYMKP0)O*wgwsnGj=UzRfaYQNfK7Q>9qpGmc?XFx&E!B%=lFTGsvNlTY{z@M>O#NhzGqz@kcx2b=0 z*3BnY;3PCpw(Z6Pnpr*o0X?yMEDGOqgsGo_%3Lq3f0(gAt)N=7*)QW)5PLlMT0jGA zGQQD*6+%rkJ8TAQZ`j(rw5+~-bk&NAG9`H{P`LCyqU@yS7Y~F{EQP7}cDr zGk2X=Lc9@^VwXHuWH-xV-N~#J>d7(@YCeETYG-UMqhwIj%(_8yE1qO1CfmOY!Jz}D zAR*mvm-E}i936g@ky-LFD*DjXnFHA1QapcTe0b?;RXYFRZrO)!#nTGeN7)_=UEARH zPSv$_AgbJT&dQ}`JKfi?Cx;snyo!+{wNa8h9k&yxDM&!Z;GNqt;?<~u9y54mY?wf= zsJB7Y?PvFa{eZ!z38j9drmVgd!Wfvg61RUH%XCqbyXtkEqPU0H(#-S6%hwqA0|mGB z(RU=PuIK3!M|@cQhc0q8(8dSXD(2#K@H~wxroV&&v}&z)NCB==-T8i9eZFA5g6(;Y zqbJgSEl-@C25u@ne`1;m%Dvi#d`apR!esP&W!f#IQrzE|%(gc>4dno7(rqDeiPltC zw5s+vHzpO7#)Dtdvc(33rsr_}L^cUiY5CEh6%{fLs!}S6w{|HXv|h@o_@V>1RQZg7 z<8wQ5<)YJ%)2i&XjY+4venP9@E^V* zwMyO;rB6qD!>v2+=Dk2hyi|yK{HrEirP$tJ{^-(ni7osf0W4WkoIj=^w8OJnd>hwk z^mzQ^?L>}vtR6N-OQTqAX@y+H1~zR5E9YGCyb88k2@ciCp5*I|rlbfyhcNCubgNZ6 z*VfkoM!EBZ_Q&)g$HtGEpJcM2y&PqZuNTtV&!LQ?YV!HI#UD%C$Ur>lr#5*m|8;D7 zwVypHnRQK{*h0pu$vGQYQfgx#0gFJM(>fYe?A1vuXi>^bAH}LAi55|Gzlmzef zAB}=V#x%F=qP{Q_DZAO-L`yv)?;!^0_3yntp$$^mNL?Jk*8Br5nx;Xr-K+z`R29`C z;ctkk-O?KY(_g3<`0&>C46rxx3l_6Ygkm>!23dXcEU464=I3`IXc$$bEyxDpCIUO} zWkjNrhM?Zl;Y+m=3c$oyr=cUzP<5jvbn$o(Vj~3ZRV}$af)F~`Y>3rqr+s~ ze6%%K{-l&MQ)}!dibsh+qwH>dS=cu5Q5DoJEb=G+39yd#9Dv!zlYP@mn@^af#_mR_ zoV96pBSQydUqKT9U{O=6MKQZZ?TGiCsI3C-l7`SE7unv?lroz%DtQj+%i3Nn{5}ca zL%GyZ4+z$>!7J$TlQ35W7wJP>OrGu&_RG6)3Gc&iaDdrq%Sn(4>4Czisy#0dUVOo> z*0fltzl-$_yYgl>y>&uKE>E{;LQG-}jObe!I72^5o2TRimmHp@hd=t3VRVR z(PPw@hIFLG-~Eha-ALkngUr+V+GAA5=DZV*YS3$t_klQbRtTx(WY#FV2<2QE@$H0o~ z&KW(Es%jYPPE-sgB%&YqdZny9IWXw;eCDywi=BqVfDfQ$cjWQ8Ccc=wf|VanwMuib zQPqlKvleefB6yo+>jN6Lx1QJEI(Znkk`UOmVSZ}1qqDQ+*t*5X!0JftS(_^^8PEi@ zZJTGm!%TP3a69^JC9J!0=#>BGN7+ zvS)VOV&8?b`xap`thrN)Eq>d}(DzqeY7ir%q9aysmpd!+%bY{4I|C}VVj0zi2RG#+ z>t_ZBC;mlcuAsJlng7e0z82ob@aiJ7aDWKCboj)I{4sWnri zXN#a;pd2dG%1$z&bZx=cKbwB3N8DPxylr3x?;HR2GGdeUwfg?0iPigjwc)D7bLt8w zOA{7WHF(mnn-U(5zPxrK0!%i{_q85Q>|j))(lMumIu1@SWgME3vi7$^sbLQKY%Zr) zXBn1%FA0(1s-`6RjwT6>Ala1H{h2OTCyi0ckd4LFX#t(WD{f1ZpU5N6zq{D{fTEZo z!Gf7dD~7}ElxxN}uf(`7Df(G9`*izSBB@tci$q2vF>>N$ml3#Xcq6&~64HwdAKuIB zac$bS!zce^BS+&~dIp|~^LZ21g`d=JC+ehabm?C4GA6nwGKnX?jX-N+%JOM`?O>Iw zyXj|R{Ejcqo(Qz7MO9azi;qo^bS6UgFSjh?39LN91dVLpxy^$=&RS&Hj`WK%_WpJ1 zDUZ5?t-IK&XM$)fT#zu6i37_&l$wsFruAT<_rijPs)2+v}MGHTB-)MTRpz^*;)b=fPLlG7TEjtY250YRm6dm z))_lBWL#^?#yE&8_Br4$Nn$jRTQSNt@$yCme` zLjgrUwD^0Ut6YoHZDm0u&1Q4_G$1H4*jHAaUL7wql1-hgcWRkdMsa>ue>{J{I&}zl zvR-N==Wfk?e}2=_CN}JT0lA42-z=^Zd4Twso9pvuUh5;Yg0-mgh~Z!WRe1=cxLPqh z3Ce=ad`fZO=U|=FO($QjW;?xeTCZ_*P$t+AG@ZoeiwgG`U}@V17S{Nrt_eJsk($RB%=*4Xr&94@Y#L-5{?l=&H( z>u$Y#UtKY;Tnfj7dA;HcDV$5HXH!M-;lP7yIq*5jxACe+qK1y39m_Z`A`qDg!_GIH z2{o6TNjMdF&W4;7uRkD@N#m+z1qy+Wfif$7V&|(4{1dpsZPxXkQj=-mwtr1O8GM6gT#XLR*oNLvVEI z+g0-*mUxsiSJb8P_2?tZ;pakDTZ@utl+oM8rw(gQ#Q zk`#Zs-LLwp?&~*BV|>yVFk97R`=)#7KsGkbmbOIGugQ>1Dn(OjqmE@ zSVALytxXyLVbr>Sl)MBUv-L5l{J`ho=};-ir>yY4cPGz=$eYFTs5E{EkJB8moQgu- z$zu@^Orts@uwN>P!JVq#w$8UJ3oX7jcx?0F1;uO~uhP=Tt3n-)A{)^w%+|hPu3^Gq z2S}iq~GLBxtnRsnC6JB>Rk^b;YDlh z8cglj^a|#9E5)mw7Z;Gen&kM>7BJ!5&OfXA^n z6sYnd>&?Bx;M%N#&&s@Rxq1X!qJHyPDE3w4GozTY$A^OlRHk+9RioX@Bkz_RSC|go z)c3bmooq6PCUOrOqw8PCERC!>Y20~}erAN|+$>c4675eqFynpV!My{^g2X(7i%PwxRdW^?#n4$J6n+G&O~awK0lo zb472S&c@yv@*pSX^=zf$XobVDNxsQ!Qylzf@ev=X_l3)BmypB*Wj-X$b?sY!?I*Oy9mnsTyOswxTt`h+kkCv2JJG|X>u_4C03gOox zWy=HyUa!f3X|S342b;fQnzX$fOw2xUNNAz{UE?j79;eT2{7ud=z*hLkUI6k{^}Wpb zM#RvaLu^;;m%Q+@)(#&JL>+ZQmF{I~r0cj@E&StK>{e4TiLEzG8Q(I)ah1(~<_&*Z zQ{@Q;hV0R=oF8Q<4vn945gw(8TSx%!#4;LTl4A z&-L6nKXHrCJU>p*)X}z=mAa$YhR_O0>J1;3Kt#!qEz%#ruiVFYpWFmv7(&mj)J3x* zhKWRptc!KO&!yXlN!mY-!S`F-+UO^CBr@FDQOBzbl;m10zbgxptV8k#ay#r>w^t11 zL9Z!%ner0NqVS_(3Ix~O;Ftg7{CKT-{&3lJjV?m#P;X+Y0_9()r;oHe)&V5Ha`JZx zP2R!^4DxQBxV&Uvt>ih9WOuQRxRPm-xCKGwAVDRNkoYMzl0xK3K&(>v(!;oF67!O`jR`KG9b?i%&bx`9zY}?Rw)zSo`T}l+ za-IBTLsQdSVBH)x+&e2{PF0GPg`k2b)NwzoacWNGY%2{JJtWtCn+ex65?He`a@pzf z1J2=wM(ba(erbvUr+ZAOY(N&SeF!B zmwaT}eRCA2kmCC|d9GP{lS`W6Tsb&#d!O4yOObdEOxCW!k+Pi2jE!;Nk)QWcpfG$v(*hG~X71Mv$(Uk(>+T zE?MhFCtq~7_1K_yl|kZslj3JvzUgxTR(EfUxpNe}J`#=zWA>YEaUFV1Jy7z7btpe5 ziCn-ewRqG!&^?uRmRc(C6=$iCxK||Ubei=)b_0So?Don@jD!3aU$c$pQV;ghTV9Bs zy>RdC!O~!vg~V6y%YtNS1qa--S8jh(Ey?iy`lv#I{!;C;EmcxitayJK zzeQDPKn4F39KR6Xx#_aQfwVH&i!cf9Bo^`S60OP+5|(3xNi=-hrwsqzP!0NwFtqDH zJZb1G^Z4g3Rjm*4rF)^O@i`OR3&??JWfwR#*a>6}jyMPrDbjAz?Y4f*It#;h-LUos z5W}*@sb;9v>VLQz)%X}a7<@Q>q2L$t{`6z^Jk+fQsZ$yd7vcv`)%e`Yd5q8@e_upo zZBq_6wUGH~^VY5cXScA9XqPO?MgcNw$B}-~Gz+_V0b=q|j?|NAQ=b zr~{kZc=XFYwo_c46|0L10|ka6j&qjO_q+a{-|R3hZ}z;qj?fQ#nT1mVnV6%z+a z?aa|qyalu^K=hPD!;cdd&i_`MC`ktZH)qHLief`p9_ze!jk=zcTrPLG;S7-5=rc6& zN%{Qe?qE>>nuFN;(Ae+}>MkE>yU@zZRnyq50wK+S$7p|~mpO_C(R_3Gyq#yd<_B^oJ&_EEs)g-t6hx7m=(~mq5ZV7}&dQFlHs!NaSfOY#7 z{J+g|8yUhLXZrh?0nNq{x-XZ)Zd28ee;sl45Rg<2fcmOb?Jb}f(|iZRhSp;Jnf6DC zx;JRV%Hzb~r)yb98@2x>-t315ojX6DhV0kOuN4ZkNT*(tzMU$*sk4IIuBV;c>;7Yt z7hLbw?|SamEP}pL82EJV2=WI1@Bit`!`=Hj&lmiMMDc(5Eq!kI$o;DZ zz`XoTxBMSp^Eu(}pRYdYJNLi29s@J>vsb~c>Uj49{nyt%I=H*LjL~PXX8Nxm|NHOr zCjqE)uRRaM)PFzlfBoZg2;BW;$`SWn4(fkh+5PqZDti6<$@PQCpC2tl8j;BLYBCRB z0tOn#V)jd+KZC=J+>)xv!gH5*yZdpkogf2O{7}SQ5~+P(fCCu2=SItHKNy+*e%t?p zInft;(C+J03l>v?i&_B0``OoYA zw|F99Ph&T14A+njBNnbLEc`Qfzp1vPefnd;7^ija{0ktJL8H zl5+HWnrLNja^g}MxK|GYqIc1_@Dxu<*eM|wErrwL_o3NG3_l+-VC`X&IuTkUT~h#0$cE zA1y|oKaR{VCCPx_>Ml^YX;r`#Hh-ybmb4w5&%7(^z>OpD#GYPv`xc+yxT&3<^`A;3;@AE4vt-uFty-sIktb+ zSkE~)5k5f^O;?2A6S=36fkzEIbxN%9Ux26=gIQ~r6sML=_CMab9@_mWX%6e|{#$g$ z5&RnAQEHHgDHo4k8i9@&R?rgXKG3jER)FHHRi<)OZqaLH@+>|de+`d1wp9Lp1dJeU z!p=f!8;aP23{*N_`VPe?xW#rYG=Cj|pD|}w=hiB4UhzMsyUVv|6@d9!I%}2#i{X$4 z-=*JOE4X1P@b75%9p0;#hio}nSTd9X(ml9wuicmVULZ+kM&L91Fyq)9t?1_v+BG8j zSYIGBHr0f#W1cd`p?wG7EI}t3o4dA<&<4pS0{Nc1FQ|pQVEk1xTjT{1X8Q%rC%;#& zm~f8)F*HshV-P=!P}btLzX7E)h9s_uw3{5JS6-A{7_ali2Qo&8784HWJ^0-W^?xwO zD-Z^_W7QKQ!3*-s2=fq@RV^FIcBLpqQqkYut`NL zEO3;;wvVvQBOdlrA-A|DOZM#G4ti|^IEp94~D#07D`EjyMgu+iKh!2A{^18*#o2B5B)9U;5rg`+Atyi5Jxl?H3 zV{tgN%8@g`vCUZQcR(S~!_mm_p#cc3OIlTf2yf<8TE~jUIIktJMW4`d$`PRUWe3Gl zyO%G<|C~u#NPD~MRfh-fAa4>mtRrH6haKt*2v|ee5GqFOAigg!%v)XCB4K!Cu10l2 zsh>#Ir=(&c`e(7|BmO~s6KMr)@B>H0{E4&C=sj|gBoc6+TJ4wy=cl@!E{n7|qj_`l z{9k`RN)SSW=VN|m4Wsg3^)&Ef!Dh*VHN)to!9ORjx*)u@_u6I8O5q205yt#c0VCuH z2snV)KiS}K{5@ihAUn6s`6q|q&_^MgPUBJgD0nN(smSr6viBu~iO4hfe|f_Z?AA8a zeV5?;sDujdm?dZ^Ih8#e7S-R^`Fbu4m)R}Kt%G`T7}*67!n8xeDdF(TkD_1K$hNhQ zu0qR;;ziKyR)9UhL_OF(J%Ew{NF$BQz*(jwT+~eE7Z`4r0t35Twx$Q|*pU3<8Gsf?~8JDK}f<&;I0z!tPEmYlZAJ=krqJT=HFVZ_~67}Z{)>kgd;1Nz3>HOpCbsH z8SeV96Fk!@rXoZ~fWUkmwz>!$EYiX{I4nJY=@~kIzd7qhrV%rllI>;DuaLZC_GDJX{nRQlmdj=t@4t#W0p#=__OUj@tUHF{zol50nm_1uNa2cNc<|wlmn>GNBei_(L zbP}kp2@s_s;NZmeivGoOm=8xwerz6df{x$>NA>l-rE1PsEVp}l)R$mSv zk8d|mZeIo-e$;}|X=baX^|~z(Y|60qNpC)UV}7|^73>676>b9&d>B?&Ms;#ZwcY88 zcc(?Ki2L8K59o@~m;|wEhdFu7i^Yud!;B}nq`G9nwykb~_vD}ytN0PSWnvD%j@W}aX;D_w%Sw^Ty-9pfM?QU)WrAl5yAOCqq;I9|J zEzD)n2cW~yaCx2lmX$q!6HM~)*-@(@tJC~k)BW#fKCx^4%}vll2TZL37QvUR z13jjYWR^e2b9@+pc}_CU*2i;?*^4cgWxwtGqAGD>?COGs_UOalEcR&;3K*TH@3(=v zpM8qIg#YK4LwBiJvieyaS-Gm)yF zmQgCeowxKliS)TknLBkV?{vNGv0hT2m~PldcxC%M{o`A!X;!O?$sl-;?yf%Hn@mioVqtnjGZtz4C3ZW&*~f#V z_-w&ukVRQn4Z`M}oax=UFVORn{O)v~#=hn2lJFC;gOe2JA`_GoeSOhiKOQNxE;o91z0mz#iy zn_4|-yIjX~>P1K0`D>PLoYrS+!7Z}Yyjq*}l*iMK?5Wmgh>$&=CJ(F%;?pWeRd6e6 z{Ax`pvSqzmoF|rfHt$Bf7<gXt!@^Z|QX9B1o*8Im?qwU&21}@6bJmJKTTl z{lR+5T!r_QH|YjfZ{dm+A3)zJn?cb{ohS3k@gG8|wuevLGquG?4E+-~N2cg|HQ72O zaGI|}&inqTy<~#wI^hC5@3-^;c9iQgmS*vL?IBx{a-+N_I^#Jt+)(x=I8$4_lWTqP z0&M2f>?TFgqwCilOYdumR^!AuZvEAq8bhz$y#Dwn$Kj0fLE_9g$U7n)Jp~vMWgETp z;nd9suhM0;7&dL}9wEkXU1h$pz7O%Msoilee=hh5NokS1AtbdGb)=Q@$g>eSSb-Ba z9^EUD(DHNP(l6?UFvrT_RqE+g03J{pG33wLzLp!lnH2Yk{fdc+-+@*#9Jg90+a|Bb zOL(XrIghT^*3==pHk5eN7tG5zoGm3Itd}xqTV)Z&fXh|YVUC}$eLn|7a`)PWcPiBf zq{v*U8t~h&uHkAsr+ZX5FdQFng0admG2_?5Utr4M3xJAvoWZVQJiw8r3si%t%91pf9<;bNOVvlKJJYK#1HU7xgRV)gL?~%S6S9uy) zr``>f+6KwqI;ju)JGVfGJl5=4q+_83lXXqo`9gk=&jQxq%r-*YgaD*<^Pb`qyR;X{ z3JD~b9zOOoB`w}zmC%&-cr`st+)fr?RH`Xq5j97A-9o&E0bbIM)3q2-tYH+u@b|4Q zx^?HwJC|0t#lDxAwQSTAXZiQB$q7vUn*MW;ZYAy^(srI&UJNnC>p@n*!Z?e4cvb#d zL`Zd&I(n18u!-)SwvHbhBq!KTWOuj-isGh!o1_UiQ}gpIe`3y-hakeI*9ATD(Al|m zRcrP`>u-Yjtk$Jj3KP5&`U#Q3YS!%V$s&IF*Mc{5^m%W5O6DvX25CuBbsm=Vbu2dG z`QwRde)sw^lU_-Q*@;U3Zga`7og+^AcJ-oWT3*S{KQgN07>VL7rf8d)8q0ll*sWxd zO|Oof+r-u3=#4;rgO-s8acvjm3z;VlimeHBf-_8tFG*9uq|`r}OHo>|meJC@?c0&_ zlZyg+NS1e19D7yVT+7YIt-p%$^ek2bZ_o1bzCYuQn?POxuFJ9sqfcwhw5Q>C;{~W-R)&TdxUZGSmoyU}y20 z#OlFl3y;N9)v}TvDm%hVukSPY&$CjLhAITByMZk;Ub|UAs@5w$DY&_frzpOnMSJA&Up7fI<6Slhfxi&D%sDIyhfBw%%Tj6OBZ6#24RK0DeZ(%M~w^J8G-qhV>K zDgzas{D4{w#o7-s;pe>+X{ND~bJZHl2i+v^G5H(3vWkBF5<%;Dw z4i|fXv?I-b1rPQL04#?;rNr6mBf;q~G>YSRc=9ljQjpu3wf2a1*LMijVh6cf3B^CX zu9b`_W>7}u-9Iyu{kF*pd(CR@Q0uvTt3#QP&{?NRtE#d-U@hwYsPh2vEHpO@Cnfr- zEb(y>`;Cel8SEopM3I{44NL1;dH{+#Ysws=i2 zu`c9wllkKcrrYG+ub!?qtPK9fMXv?W^XBL|LFk1=0te`R0z{rD}Zw#tiym-M16GZ3ebl zRN3veH`ad@o!@Hs9uSB2pjsO}#yD_q;=pZN!p7pT3`Q1xghZQPE)hIpzhIm3MQ(^+ z8$Do1_z)m9`APCxPZp`U@y@}h{Gr;J%^Jl2CV_i05@5$|m* zdiy*F3T$n@3CG6jR?~aBfiNqS+cs@PbR_^K3ydk~a;jWSC}c<57W5q8aYU)PXc3Ca zdoEF_-Fs)C@;Ccnhx|-zI`1i@0pqQw>IiF=1;_jM68O6}Rn7P2qKW*)mq`vDlb zh~pxfl{C4^J=ZGr?Y)BcNd+O3{Zj%qjun zxnL^!cD$rlOuI8KDzaB=^RyS>o73_%|jg~+sQku z^y~56LUb=Vh@o;-ypHd9mz$(438LG#+gb4?I^sw5%F~?;zoyH9DMibpKLD#h&kLr$t&&SYkQCW`KVZM-x2C4(;ts4P~1HK@cXP8T2^F zn-?ChLSUaT;P-2lQy3d`da#@Z1k0q#-B3!zmxsM{k67=p<8n8JKs032s)6m za?SLMZ*o!htvsaAruk*J+J%_B0T@zY`}97{uFr9MD9++k{D;J7_Oe4)9M*vBxYZV^ z<$wk}DTpt~6CHxq8Uq$}W+^A5B~8m|$PBc2viZCy?lSwMPml!oI%JmKXI>NobkxP` zd(XsgaptPt#OgfJPx?tHE&IE>%oI6^I|pYT9+n%>V)fVNi!ot2l>pq%pMeYPU159h zC;>kvj*#U0wYu%RvM!Ua(buV6XH8|H1740J`N5*MKVot#kR*?kWqpe;;qFLv#8=_V z*0iYdFa_|w8!xTlv6pxyX{;&7^q`47{y=;fRypTnEIsj%x7o)_-4k%G%`KfE4U$bYh?JWA^Uu(THZwyyxTR<8nOQrz2z_jOssG9@^!YS-Ne&U6(%JQw?g+r}3+ z>kCd-Gz}3e=FjebAvC-X1sA<;K5`u9)nsm~&%vX#$DM$oiCV?`psdxI5A)lT(k zM_H3C>lo4v4Ac&6RDD4p3)KWMO($%<(2h{!zh#jTgv|AIR#OE8-N|>z6BxDV8U1gy zS)I7spde;cMIRrqFx15zyT8J-TYh}&!KJos$nXw<(8?ZvhHuLy#vb)g4tJ()-cuMnSH?ml(S%xxrs&Rt*n}_5YO; z-ABMmY5$iO;XDgsd|h%~H{V1$0wpSta&(HWhDf4O0b?1iSB-)Kakx3KL#OEtB+*x} zwuL=h1}1qUX&NWhL^pGfTCE@NvjljbJM-s##e3-$_yncoTE!_uaFjB%hEbD@8LvVg zbw$wxgrVgc=krgp@p#|W1z;*6!>G^kJdyIgAFDRCE@~P855_{P##Csn_wXi!X_tR| z+pwoTeA)8|tYQ_$M=szW|+{rmjH7-mnIjS*~$RT3ceAH_~dUF^4KU<%MoKIKsAay{^C8 z({D}`$z=X9>w6Q#>Y;Z!%L>!x?|z^dJhRnt@ZrCOe-C8Mahtt33TrNuTrw+FaOY9B zKNL+Ezm>ELL^dDzU+sPOSCd`0Eh0kb2{lMf=oYFp2|d!K*-(lEq$nUwAV`rCdIzN! zK@n7>C?E(ZgeuhvhzNuhr3Mh`c=zM$d+xbo-185d@%bBuORx!}(vK^(X6cGfP2YjwXjFOylm zZMbZFhgAFHQEsQ=2fTpH#dd}Ed`OwQDWKaZB9ql14kXKSunVp1-ytP3j3MmEb^&KY zP`&8`1`W?pNGCM?+4G)btHh5CMPb7L;za2FhJ?Qgzo47Rs2;ynx(Igh^g1t2Jg<#b#vNjhY^G=SHL`-uP4oAI3*j@$aYV3Gkp- z0ml?Mh-9mq=5L<>IU)Uk$Z4{w*^`=9H0RUoL@fcZ&6yJkUR5eJrnZu|J)g&UvoX z45tQJpa5kp(_CB>Pu~$ypJcZ6lsxUvT`IbQ-(X5golC$SS8n1*Kjz2F01JaS_cv z%M-+eDFIJE)Y~$kLf4xUn3@%@zp5tL6?L#`^9oQF9#jV1*(z8K)^~4f9Lj!CU}Gk~ znIp@6d#d5krNLeH;LS0sRsOkvDO?FSpPTIGa2HpbC=}`S{Jtq3qCy9Q>qIltQaGbj z6>xCOiY8?36Wf_4g~%os96teivN;3BS5K^A9OsBvgC?@{HsSFhL(tWTMjFQPHugG8|5?M#CB|T}%%(FP}&R_IO7Q^AEss zU?*ZO!_t?(u;DwZOa243t468eswbhMz6AV|jQZPEeL*2YjrWvGN%pjXRl3>CrT@!2oX_y>Ltt_&%jNM|z6Y z-SN{=^P`!1-4GelP?X&u#1WvQ3sfd6o<^r3iBi>%Ry_Oo#qTsuvghIY$%d9~k5YA- zKOEK1!_krLnR45*>#~z?057ma#;4U6ngT7(Os(y$zys=N1Cvn1&+{cSa>j!!ojF&_ zy71YIw%dc5dpkc?40W2f1X63hMwTX2$$2x*axb%5tsQHiLJq>YeU8mDu2=ABY-2ZG z{$)JWFdqVnXS@|RE`Za=gx(HQE7D~e$iy2FYpn(fQU=ItB5|FXGx(5J<@S2}K-ap_ zUz8+$e6RB3!T3BSP@!-aNf5n6XW+8?k^37sZr{boKjk0huv#y`$kzdHx zs?7;k^xIRaZ#d0=96*;?%pZ(oWpjH^xq+5s)LvCEMkP{z zCx6#|cNJ(l2nPaTmpFybW?eYu<5)hnf^?SxK5ahoJuAynPvPtG8{H~*cgO64(@nnv zVViiuuVt@$(A9qR#FTXdaUbOT5B>qTISS90F0D`q;a5T@rZxe}VEr zP9R`i>(xOLuHN-`89%!8dJO@c#ahN2a{|nBs#Zs?A|dUWvnqv?dBi;L-Tu?w!Yagf z-K#oDqHC4)FbZyeWPnm48@k!Z zHDZq!sB?iNgySl|kRB(=3FB<}CP)dUFs-VE&lFEiZgL>hK2kJXePqh8aqNsf`Co)$ z11j~ZW!x~JIhsS&ncKLpXTs}!&hFMKYv9_=ln3@8XNuXp%b_%^`ILrFH|Fg(z z8FdfIb+0D(YUxFQ#w2U%+!Sm-N+ZbDf*z_Dk8=lnl({#3(XD5mUrL(gn(I`%1l2QQ z$An(mBL_0cJI#+PjLVNiy4aCy_$NBl*0e(`tPhrko{uP|LDz|@uO<*by`bH!%MaQwRi zZmG$1xI@)ZgwdHs^mW-g;wF8p_CO%X8i799$Nqavp+gNnz6f%t)be~k-rAICIxpIb zGuP4En1)pSmPBxQiSU7`igZ3|Np_gIg&t-du67C-XyHU`1EusBjAM|vh5Zl1GY+Gf zl#gY{EvEm0rd>QhZE!TVep#HFKZWXdXqU23gE+06D%JKcBJPx(WlGj9kcP*9yK@o9 z-ii(0yRYOTC8a{y4{#0)%?Ur2UX4Kv zt{g>$!lq!kgcBcr1UHQvi|@{tFYVQxo}ZAx+FiROpDd_`QkHVqZ}zSQ3fR zrHEvo(niF7KxjXe{Ha3(O$_HR{iT8BK>eHRUw#+@7(+a?sIPJZLT_W_7VRU$;_rvP zWG&+QL_)sddf~${E(xetq!gyt-ZHBN%8>ppMbrAp&r7r3OLK*l7f5tbGWYLvFugp* z#WZFgRIlFrp-F}g#uvkFkf-miFskZC3P&)lSYYgERc#*&3==aKwyn#}G&XnaH%`y5 zw)1pW8SXJ8DiyL1vxT?&&Kl+v{$sv0er41>pJm#c@E>9l{;O?b0pb?y7=JE?onkt4KuTIYGtOo@&r`VVm>(qJmYKMv8G}F>0#Sx| zNLvhdUT6n0H(J$xs2{E)JI$&Sk&+7vyTc>Tk;#SjL|r42k2D6yqP}Iv3+%k1?5tbq z17KF9bzV1GdAb1Z;@JDC&>nsGWylixNs~qlvd&_^O?&GOqRfGMFK*E)QjaAOC<3=Y zu6i&sT3l}}d{S`pQ`T?F{6!H+p!Q}K0k@ck@ zhte`Y=ZV%cMP=SONKM{krAFXzpn=&VMho~YrEmN7l0>oULYPo^DBKR75VU}61nha? z28jk25F?4hL8)c6@4!<}rrr2MRE>k4nwL26FDJwm8uL-?=K`oxrP`ru6jI#+4UF#i zsmUKYucPfF?|e_+q|qXS+kE`V$khYXU>oLjZXYeL8@$MRkxKU~Bw+%>E+(7;7%}A6 zV+UgS?Xd@zmX_6lmu_1FGu?;h|0vM@_r(gqZe@@>$vvc)q@`SAdI7hVDG~@x$drJq-Fh;+sJQVsA zN8H}`U2I*1bxPSCFRu!B2Q<9u4F#N?`|OYO9Z`~a>bpN}h5ru=2;#P=gGIBA{RuJx z%?=vMC`!Z?*a*}~KFUI4c?j%&J5%|0Mucfolf~V)b0?I?H9qN4*&oOIt)TtkXjarU zAXArn7EGpB@TfrUuTev)G)lUL&dwjI=aoV*Mnld|xc*t6*!z$%dPg_Zzzv?YMHj^8 z392J6VBc;7%SG*$Y+#A*4rE|zfGLKesN>BfPn})-VW4m1L&AOHTPZfA6(Nr z_;?aR7aaF))&9+7&O@@E3N=u0vBOaBP4g%7H{Akm!`Mc^qH);uOGx2VC6I~e&BiH4%7QtyqwoC5@Ew8bA0SQl9cTGcJ1{r6n72f_T9 zZb=+SCn8}&cPC?kR`Bi!kYwfw?*rQ7cIQk~90>sk7G2~p;0>~8LTVr=D#)5Zx7i+L z*RV|4y6OWGnRthu!=}&-eZEgjwM`)gPz|7#DVvWrKOvMe@Z*Alha6Zd-2Si5D zg}ggzVRVvn!*F&{3p4c!l<+!-KjPcP?;@Fm;$)<}DHOwoJFp{xq)2YJkt*CDW(pva zaB3s0T0H>6QxT9ob+w%<4A5|rO^iC=Wf`|$S}7&eF5mCpslue{w<5eUnMgG zi7y6D0cqK=B#5Kjm3uDnuWyDQ)Fbf?@;SUeM-PEQihy_oX{k>I1J+I)cA(?*fX2^& zeIOfe>k@`v6$1C}D%=_=>C%~FesMRX*;S$%NPFX^*5A9FkB;f$D$qr_d>GZy;V zWp`&R^v~#*l^GdCT{9$f9lR;roOm_(n!N0@vNFE=b#pIj@OFXE;P%ps;ZfJ%q0wFS zmCdG=MiHT~1DM3#&#s{iQK3Wh`kK@cG#nwcv0FeISjLGuMx?3px#({A(GIGz*t`I; z2g8ySIF5PLhh9#e+byM>GuQ&2PWtyXs2Mc&t7{#h4`54a5jqrEsD(ZN`o{?gnw&wS z{FbAn3k(|WJ0TgNG$QXpQAWJ< z>w0St+)=am*nE$$JR-;t`-$PGkZAuWStX z5X=I7QrF9XLD>UXv-wS~M&QNY-fF!KY?o{NEhm5`+-wx@O^HlvZ-eKrcfYOJ4uRI$ zwPNmOi9&vv*P!~-3(5h73jxnDsvIF)?kDke|4h4`0a$k@=+zbsHL1@fQAG`D9IPfk z2c);mcSOdjQ5X1CUu>mxFye z>;GogW}C!%C#$&qT*2C1Zy`=!p>cd7onxNW|t?50)=L26blymr?CXy(9E3InG^Bcg0%4W>e|jcRh_`rg$QI zPhYltZ}YNmr8i(|-GeIiO<*EVLV$n@uYPo183=Zyki+uU!7zO9pKucjT~?FqD8R?4Bn;#nXNI zHFn-7OZ9a<5ZEZQo`wv)-sItZ6yx_5<1xQ{s z7_lkGG-!(bEYdqZsJVgJ^aX3*u=~qzLj=7G?8PSRMLzwIMhZb+mQ`nt!t8<9@p-fR zt&+Z|m>yTW4U`_$2UR{YeanjOy%M<&HpYS&P0Y+*T4vM32GCj!+kA;Kj? z_5$(#7X=&TdC9U@``B|S7hOXr5rU_H-vf@l;Cw$=LW+ZbNRls>b;jJ#?N8}b>mm7% zTxcJhjTuf4M^D=^nTD*T=Tgetv`2b;6(vu5i2se!=?PTJt6YD=ZGC(T9(Ze>O5J@J z@FBh6)7bBWO^b(uXpAR9O@Hqzi(j`<}qhpWeQsY zin-kmLBmpcUBE)3&%weDVzuM|k%Kd&Qs(R3&O;~dzk?fV0qyOXM>mM+&$fr(94H9B z>DK#*jk(lzMR$+B@;--f)Tt#B3$@1fThpBGiFDW7AcwkvuDSWznR+328j==a$cA+x zmK6%&!|(S9D(y&)5V5{nBZxoWAZrSoIghqzR5A4WcJE;)K+#ru_03j?^oPlgMw&2` zw&I-p*p14qn}C68x)Rg9{}Ed}wsV23s;exZg&e`dSEuC1bs~rsmkuXX4s{)m<)+2> z(siU&?6ChpmcQ9}=Ai(YkKA5s`c+%$jz00|qbn7O%y@N6n_VUNgIP$&aog0W^(t8( zKC!AW*}Tzm?#RP^4eI;(33)j9c2F8W_1DJnjfy$o^y(w-jD(zdB;j7mU&nvd%_t^t z=aJ;Q8`h3qtQ3eAcYQN}SjXBVG3S20;$Pg{hDu9S0@q*RP!y!iavAF+LYvM{y5SY%8Pzgd(+5&deJ>K7x?=%~*NARAXP# z|B!w3eV$aijaJqW%0q+l9!p}kv~=(fBOhnUG&;e6nN&w0T)m*3sVWRPmCk~4K@b0s z`p+U(Q4c@uqj}t;QO6rErap+Z86l4Juj||I^*O7{`(LmxebV~MI&Wu*nhe|R#h5}B zLNW5vli><^b9MO)nSov|D>w6yUoQ$d(;U*x@DCCd=NHjF-hkb8Eu;|Ua{?q+#$unH zUcPFzQ29N7>CG|dkn~ZigR{v=v%f7W++TFg1Ux~7;cMK|a}w<}-T>iqem)KJ>G`(S zy~JT}7tkqRAX~F+cU6oyfeM+ud5eUy)W%4m=I{D9P4hS0eC7Si^LMmMUPCDQp%mE> z06wk-gc-2E9Yt(0}@wJSx+dK};4%>)i ze0xy1p+Y<4kH#ScN2qVLvSxs$Em`I&t%OzAz~!Kbvi7({EH)lPKKW8YB*dLDLyV{cvVkdo@<+74DMk z>`NGHUfa-`b$W;}A9Qf`y3Zk!UbR1{kWH9J5bz~x{GMpmWL=6WRNr`PT4L9?9G+Sz89C#}JH}c*@uiv( zm1i*4zf+QY73`I@%X%MnV5-MS0eAP^hXsFKCOIQ{l8vCeHk9VJ5d8P*l|O0gq!cwl z{)CA)S*BesLA*|QU`$NJTJ*h4k+_X0(?pXA@^*HX0)|hnJR&8tmDm7m!d%&{T9v9F zSg^loa&}=NzrT7Q#-k8x=b%dN<30biY(LJXEhn>i(eW()<|$meV-~w_o94}KNjBPG z_JR{UkGN=Uo~h#|42Jm7m!FFj8pDq1T$Ya!1eSw7I`jH5yl4|=7e%`PW*bDEOM|rF!P~J`+b?}dj{9& zYyBt*C)O4}MwztmW-F%}; zW;?{XHOh-dGSqgB)BM*WIx?2>n;rJbKY}2;&Qk{vfa8kQs&?d^%5rTgS4fAQNmTTF z{IqqtvyB^fVNn`tM-esCH`~{t67Adta+EPKLK##46WlF~xP9)n6UYp2;zJ^pvjSe6 zR{iFcqK=&Ze5g|gnnZA$5CD<1l&(w>w7DDGQpW=dics|fmd*=%N5`?<7nVCCQa@ji z$r<7N5h)#nW6&Vwr-U=>ezqea8}FbL+kcH@w{s%G!&lp#y}#K}p_QS=K%c~yH^x%4 zGSxdW=B}Ue&U;mR09wqxU{+&I-H68>!=m=>CUiIa@rw`5B2)VYu9ITZr;iR5Qp$T~ z21WREKk+C^$FR&)6`dt8VxM2Ji>ilPWs#FSNE;upSr&0>OFr-j_^~W*n#Rj7oDjPT zKL~=IyNMn_4=xAH-hi9uI#1VAjgCvFi*2A_^;42J?)hjuUv-s8D>K z(w~TGmXT_6+~#3<+CQU-s>3KHE+PjS)`d5JInFvW!oZrvF_kqOd!?R^M+d+2u%+{+ zi}aoC-du1-ynfM}z)oT`N2rq;GGcohv5E!qi-5e;x&H<<>dYRk*i@eo&Z{18xJfV> zn6=(LGaxpX{&0Mepm$X=h!xNZ+j4$Yf}FcyuwY=|C5y#XQt6TQL_WpeNBu|4eg!-fS0);cZLQika2=Sk?FQb(u=p@ZPq(iuirwF&QGZ$1(uHq05^ zFtHjFjDeRs15Vg5OVgK)xNj5+lfBgO^&bABn3XSLd|@yx`{ImaGbwp_s~^bS5AKdD z+XT-AVR9)9_WLhPR-1mja#o4L$5t`v5v~)ed*W4gfpBK-czBF{O!3Be*?PPS!*ijSYQKi5G;&PMM(ME$xbZzb z(t0mNkLH43pAL-Qm2?_htV(7B{w`}#;ro;q*Y?T*5@IY%jns}$pQd{sRfW_x0^&;~ z?fX1p3_{&PRg?vF6=$?8Q00nOx2qp|h1BPJz(P$|_|z*35mkkFYa?&%5Y6xBg2pY3 zUB;b=5kdOl4JPk>x=_!Bvqko#mMQRja=n$WG4B9Vi9z6b>?T>+y^-8e#|xNZ6p+KP zBYc&{UwuijTWvuXhkVQPqMbPlq>-=iZA|N-MJ#(;A##iwsn*IJM~FwhG?Qye0BX>jUjr~Nhn5^D)7x6# zZu8CYi4P)|S^|G9epxb_ZTO^B_H-Q^Y}@DB|651u5!9vp@#=dR=5*qL=MpW~^}8HYE?ykeNv#Q@b^f`}he zj;oL#G`gP`Xp83C1P>@?z*ak(qL^?c;f8i9rNXW$s4EZZo75pc?msGq`~qVU5sea^ zU_BnB^d%DXD>s=v5x;D8|9Wz+FCN&2IDSWmTf6BLXdh;RGwIJHWcJ9|@>RMX^-6o# zM+j>M7nZZ;4{eeMVJtJg==R4aB|fWz_(R{eV+k6XbD$2<5CzgW;v(fCN=4Q6F1ANR zn2f)KYZCDxf6>xM6E(BZ?4yuLiYe=-Wp7(<>F>RhqjqDkyGdG_@}UU7UY}b2M*}$m$R!A8=QH@vE)XQuT|Ate{3=?Tp8v zYJ^x?LdPyZI(`)ke)&7uiUvpfF{Jz`Aw$b4&|02{x@zNN^cCjc^m4a-Ls|EOJI(1K zIsx2FRZdyt!E>%3(yknXjl5`*+~$rQt{k{lPKW!rkqg3)lHaq~UpssI+Z=IouYW&u zBo_V>05~Pzb)-a-P#L~9>906O_3r9;sshXGqfM>wL<&;-Cr+KVa{ISJ<8Q|bj*wSh z@aO;Jk|HwW;M&`nTebv{&-9!>F}QRl31xg3G$_;?ZMBZHdJ2ez+t3d_?E6yEorw92= z7Iz}U`Z03OZOUFjAT}g0x6*dvC;C`cQncC%WQP=D;VlDZ<$5%FucsrsIMHoYGCHFo zB*lPh`ko2?TviP}Owv!!AUIqYD>>J`OfE%eh6^>yq~09alTMu8$bJFXoe%vvNjb1f ze1a#clmP-H4~VlY$tN%)66JdBazr7VjPj*Do90_Ar@!C-8AEh{;j)4pKJhdqj}DU? zh_&=K*VL6p>>_?r&|SdLd3a^sr!n!D^H9|Pc*JQc5Md>g!~RTs38D#ik8pyOdtN)X zTl58xQh&6n>H@=IQjLA{eTUv!OT=(PeiIDpc}B0(jzr#m$69el6&VIoLw=G+m-c|} z&kS1WST9$tts*s=N04<0bI*>=ZoccwNI{GZm*Sk?e#t%$DM(C;LV_?>mDuIr+}sv?SqqUG%*kp0Fqdqb&jY4~U!fSD$1K zRqbpjuM^(sjC=x?67TbXP3f2D){xa`*IaB&JX|`jIee{y-qZgeNW5 z(-<%2nn3Nt<|!XCbb`#vrF?z}rhq?cL1&9Us9BF-9x73rA9iNl9KQ5s=W~-F1*3gy zN`S|;NG-Ck`okJPZvG(uX&|k*EEh5aaQ;W2tD?B$?P>%<$0osnhL(6irda0tlWM{x z)~^x5$8aN4JYf>8&abD(00T|_P8iuIlcX4v>mxJi6s<)27I*!{9Np9_Tf=>L#k1f2!Z!U;)1n#j7v`pyccJUyVU zU{y80Rz81}Zap3s-^FSxvuIDC11)OwJ962Pg`;-}Q8`WAMT{)vCzW5k?p^|JKIGO& zno&x0Z=Q-z4y5FN)`9F9 zAsV+)E6XDFaa}@iH9OCg(B@b%rV#{W-7hPD6SZj_2<%lJjROJs49t#lA_=Hdn7SZ{ zR01gEbx@G~XkE#-)>~yGHwC9<)#WwF{t_I+Cn9p|%tO>I`Uto3NT1a;6qd$ASX&0B z+D6fxoIH)3LC(!A|2US=m_L?%U^jC1llC21OCXTa^;5rw9j#LziNW?L76X&=h6)>=PTnN3O!)6_hXwGROwoqZ4z&SwJM_3Rx`vEB+ zrBi)@dyp+{7rmI08HbT2xFEPXPfzE8@@)mG+{N-c+c5GX{3e`zGNI~}vE9>+A8qR( zbLY)`>_1Hia+!%Co$uzp5Z7=;z?pc8?-(3~}QkgiW%& zeekt(#(4*+>l+({6>U#t!#RqsjogLbHzXhi{13307Nn&PO8YMSgpml>;6}%OOODgG z=M~6vu#=(~^&TGQJac0D$!yO-?01WbK;$_Psx*LAp7+3HU$9*F3~0MQj<|7I8*>t@ ze?R&nNwR+m`>-R6B^hOV7cOH&5SS&uGI0s4+HdvK-B+^KJ|#(NIHzshcksMol8@+4 zdOf?1ur=G3KmSdDii+i%0aojLLaad7MQJ#Z z1u0Mx8yhPkinr&o*;<_kKB7^uKTi*GI>JJQY#wO3<9i&wv}rt%HA4PXisHgX!9Tf}RZx#nX~4xsV}E zkur%B7r~%lvP9t$8{#`7Q^62f@m5b5!;k6@zbq!@ZMpCk@Hj2>kQOWFPJ%OQK&1YO zM1R(X%UhodxC>YPNS6Y<$C$V(!u)Aaaw6o};X6%Kqx;BmdGmuP6iy4j-NKVZ7?EA} z`@Ovjq$mbGkpqw4x-2LkJG^$J+E)dI7tiz5E+ro(VHZzFM$;5|_4)2zSyB}DDZ3QN zS1#+9E?kqi5hKn?3N7#WH9~KSCUYdHM2n0p!KLRI2f3Q=TMnYGoZ_~{<8Ie|TKK#P z50M)*jq&_}Q}}+l!}kaGeoJ7y$Pb04=XF{V9{?!R+%l%HF&4WvpuCzDF8KED<*#^_ zI+gY8+KzoD*SV5jqsrR|@89eKoqIKmOP(GnjIkTW+HRN{m)r)&&B_ZGFN+om60(@N z22eB*h4ZY3j`lHn&gp>XRm9qjt;w14Dj}(1NGeK-MGAAe)*<9wf?%FbL-= zevRCTLC~umV;;Lgmo>paOyNt|Sprab(4-7G!To6;N4MwBI zXZVGQdU*AR`khiYBf_z2$%>`nlr}ul$mXg00G~&uqBGv2RE;}GS3<{{X)v9MF~WKH zHsj#aJpoxZ9FD>_2Tu)?ks*RWpA5SnHt+Uh zJ4=J(IuLb_h-1s{HlvwZ$?Tk*ZTLLhEy^O32Q42P96k2`v|GfNT(@ z$?4<9+9t~s8;SV!@2Ssd#y(<*%<}qvW5)%& zi$3is_`2bQxNT9|*OCOirBmsRjX4?+C6(5XrHaR_bnA8APJRf^9`@r^VoJypl9)jB7oWNAQJ`XM6r-8l z7;9}fc6_D*{Svv3Xk2hC5c1Au1^`@Bwy@9Mt<=@5zvZYsfY~KzoU^(M3bnFfu@Zaj z($kV??CbldTBb5LPL1nR@o!rgE;`+f;x;HWNQsmAWHQ-vi=cNV_Ueie{>np~nlRgw zG#RexlWT-)#BVT8qqdY+4zgontEX6(PTQZIVKO9zJ^U(ivNNRv-l;F9O0@3w&`cDfw;nUgoB^zxuroFf!_wKPwG)>M&q zRWOe_wuTA|S{z`=Md<|s@P zUydEII2!&m>JlS0P1NT7L(TW=9CbLG(QUx{rnD-|6jTLo|+Nyi96_}L{qAt(2<=o_cO^)n>`*K}8 zHqmn%M>d^ZAzduwuX@z->Ur3O;oHxQEeDrC6Yy?>&i)z6pL)rKXF&Va-&46m9N=wR zi{%imuSJbiA}B6`F4*i9F}6OZ1KuV$*ENo_S{y6}V16_`z~tQ4ac%qi)@?vX>5Fa$ zLbxrr!~m?6?>tcrl0_rHP8$4xUh@5mkC0}0f3F_gd4A~npR->xYUBIrv^G>#cn8(> zLbgX={j)vppVcIlo^1p9LQ9SK{jR~9tJKus*D|QVZ zB=#WLty#cX<9RjB9B^mk+aoa*cZFF!Xy`v?Pz_q7d9K^08@$8`?Lqdqj)0uk&p!&O zP4zuh%zD>o)Act$0sJWe@Su)qr<0&o9LEOg?LgD)+1SO~fPAuKyxeD^DDPmj45$x& z%LA6@I1<|+niXJtf&Z{l0rb&cKv)}ZILu2N%=aI^419g_jLY7Dl5cqt-8sdZfnVPCCFi~8E`3j&mAgK=u z_erYD5|Aq30oymu;&%dy#6dv(xdhdmfZev;3Fu9aRKq~ZE!wdTq$xeiD2IiKj`S;| zZZGH2o*B2(yLQZ{eSH{~nrup7;+D4ouv4D8hxXEQb9vrGNjRzb}7} z1iUqAOZUHM&cAQ^TjVVOBlDfyBJg*b|E`{YKK?%s{WB{68Qwp{Cbs`9q(9p$;XhmE ePi@*323*ZGLVuk?p5mYa{|wHUU@Nrk!v7E1<{bI} literal 0 HcmV?d00001 diff --git a/torch/ao/sparsity/experimental/pruner/images/prune_2.png b/torch/ao/sparsity/experimental/pruner/images/prune_2.png new file mode 100644 index 0000000000000000000000000000000000000000..5aad9d0451bac3f45eb16dcc8d85fed98eac867b GIT binary patch literal 100490 zcmeFYbyOVPwmdS-QC>@9)bpUcMECUg1fuB+gHtdP>@*8*2>Vt91H^^780)vuky7QCrw>J+}H;}46y}qv?vZu4emvOZ$va4 zhD10be`g?A{&!6ng_6CZ&`1qTWqxoAO=t@}&bK&+iVO-WD*@QXL+T~Yo6Vv0&qj}5 z4+mPDVE*o|i03rqv%t7~cUN^6LNJpueIGWC(&PQw3yUGHJ5h4R#%2VjkMGIJ#p@+; z-(z9y_r3JBCq?^L0_;RE2KW#>Sax32RS?Yag?j-F5lohX6+5|hAw(E$uUE$#txRE+ z279ppf=4w!Z54|<>~qZoWAHxeQ$H-<8R2*p(zR-|=-Foo9fpqztoI~C@XH(0G#EYb z{-4uR5Ty+}#PP2d5@R_0wSjwg@&)T-IKL+$y%}23>Fzt*d=KpD-C5+kmvHxQ}zajC{Dg#C9L36bb^s96hlwWt`v>9u|+8+(h=Av zT@Dc8H-BSO|B)4>Rx;d!6=X*eSoA8DYG?~2U=DL4kjX{EZ&9VnLaB9tTrwDEGi~~y znpV72HahiESO!CV@JZz3df^S@pn$v{669NO2@<525zS0spJrG*SYa-l0?S74wYfp4 zGx%N!ts9vforKD>nD{FWVOcGGZ!GG1m<3Kq1EXm(uXY@6K_RN+_u;)LqD@~-9EoZ1 zms|ojN!vaea2WD}@1;c4!@{wF)l9EKD4k7OlVf(`7KQZMiYNqlTJ{j{rh2tM%t#YBgvR&~K zexm8ks53Z*Gle`I5BWu)t_=oAvp*hPyqvXq&oVAl#2oq}il*d+q#iC~9Ip7APqS+x z&cugNqJPX7%q{Ct4>_Z|S=~Cm3Q(`0uW)2WSi5C8m+l8qi2AVf5S5k+cU{%ATM*}c zBJ_plbL8}>aVF^}L;kACE^*cHYoQ+%+;HMO%_>I*>WMs-= z?21xJVfNtR@vXw}&1A}Fw$oo;LrxR#+IeI@O#J}gLu)OVR7d~ zP}M+Oa>4B%>>R{~*=Q2ExOVN|_X;8&I9swmTP8$}KEJqN_8b%TfL+dDpMSJivaeNM z)`#WE8>ZW$Hh4uvP}+>P{8a$?@Qa`h5{(bRK=)hc7hY&@I@-n1kTLkFl3Lp_an>HUy_^2K*;`AaELERk!CGR)p0Xa@bv9cR6Jbj- z)EJcYx2K4ma_k+OjJOiKZd&GDtJmE6FP5?d9*uc%VqY-WFUBK$WFf0Y#6c~ zH-Hu6wBl+99YM@aVy_kNNfraV$p0dC!HN13`AFgl_K-@EU}5$l%pnFL=Gta2K3Cb4~3w&vQ zb%8m4v#f>OmYlI2mUA7~_uT2N{#wGk{9b%VBL>DCX12_)epE42e=%b&Gj1KtGFCOK zr~w&co5q_E{B6vhD3Bhh9oaNKt2Q!GHYS>EEm$b|^Ven~WTJOWwYW-h>*tW2=J(PF z%cN31r`AKI1m=HVW=3URA*mul5&6)Y0A;IzECRN&W$HZYhhgr zpj;3-NFlZc^ht_CIyv`u?o4ixRMZd0B*p>QL5N||KxA@@&Q8^Iv4#@+-p8W;`Dw0E z4c*qx=FaA+j!Dv9z0yvtu)BnBl0^Md{fY^L37SfVO23K?RTR}1DsolsRKkjqp7=%k%5W>{$6u zw!6JS;W)oM%@NKp;FM&bvQeHQ^W{_RSD&vnV}eGxcGYJZTO3=;e@X)L-fHvQ4;+`# zrO`Fe&1oDq?lwl+r|!)z-*yfDjyO_UaU4uNBs}DqYgv4#3HlZESN~Txt!&^H#qg`N z!8GmW<`x4jW-Wq6rNy1ay%ww6?HkAq%_aMurhXf_E*Sl8)>0W_vC18L^fe`0p>BejWbf%Vi^TH%AsX);q;u z$zYmwxgPjPA|p^9B^Gi70xhBi;@DeBB=vWf#Pg(F)Xf~t)E%GoT!p@VBw?bmby=4# zMNAwnS#^v!CuFx<@1;=tCNSQZFFJxUQn?){znvfRODVGPojw6sNI}@EiuvjIimNsA zHBk2h%MUgVOZ(34ypz(C&#S{988ASgeF^7u-0$^YPAtD`ekUzpSEY8*uvI(_iu)Uf zO=+PY7?sSvDNrb1$X3G>5ts0@mb=8we0q99f3DjllA6#-zxm+0U~O2M;YT#*N{>)U zNOK7MDB=#~PSYswnVw60^WhuIuG?@ro}jkCCj~Sb74mJg*9gi8>EPD}r9JLx*wHmi zMS@r#sW2Tv*;gG_r)FYiQYv}ZA)EU>JN?qV3n`*QlEa)dBk}8EGBQl5UmO&Q#^u#$ zEK>GaT`tz?RO8H>@MEG9r?&QfN8N@Yadygv2EFomh5d^A>-RkTq{7dNOc$$Pf=t44 zxvAgK--9`LQh8fk0&l|EKNAia49_zmC&x1uxzuj#-&D9+RXXWc}3gzY9)OfU8`aVETE7_7@sUe~PQgzgK`Fom5ZLfq?>fCHQ-Z!gQtVmw8SdvzMtC1`^ zlgm8L9Lj95F|u)CRx(Mae&>>Eyl@d39on)NF-5QTr0b{iCqCl$2Q!vrbE<`NlPc%4 z5?XY1Q)&yU{#uMW9CMFGT9>(GZFkRtS#3{dzgCct;7VSe7OIB^00Y77b$Yq6-st1nn0LCnCex1q_R?R2r~J)byB<*mGX&Y ziJNf~QWN`2w)STzhr^})^$w=dv{7X0Tedmpp`-bgGoGzII%m!NmTi}lzej0pM5ZUE z1ug3y1&@_S8-bJPC8A|;n(Us;=es**^Jec#yEUv@V(ix@2W?pAEDe9t zTb#lU(GTP2r&_fy+pqaIh%UYMEzV3iPZ`cvIz4{5W%73fV2Nj$ieeKbeUIo!lgpH87AK+_4ISv7+gEgw0S5ot9hU7 zHAKg|u?dx>rswm*!Z(dC&KRWU=NB!^FTBTL+TW>aYaa!+w$jh(l-Ip5{!z@~M-_lD z{3b;(e*3cTM0goqK2^4@|D>Ccg6tTE_worA!7CchE|j|!$UQ>AD&mIH(lAuOX9O6y z7w=(S0iRv~FP;}4{^PUA3ksN*&%eXMzyz7V!2R1fGQd0ZFB*73uleWw<@Z1sc;Fib z@N&+8{r9usKp8Lp{qu`;;1~@5S3z-c;Qgzp&l=%4+doylQ6=^wQK`UD@F$*079Rn#35-~9`m#u*zhk}sEzg-S|<0dt> zx3}h?r+0F4qI3F8XJuyK8_IBPT$tV z+TO&j_s?^JolXApPL_86HZ5R+^w1;pjC2h2|1majDHrrt z4mlHNu(_&`i3K1t;2u0**uHQ*pZ{Nu{O2A2 z4rtl)-zo5)0(e2+f!7wDBUV;83=AKPxDfw0=NJ16$juKENxa93mUg-bcyB`Egea)U zC2T3bqsQ#Xm}JX+NzM`w$E_0)kia0hl5BWb$H+{P#GRnew*6q3bTd40)wbAQoPK1L z?!j$zWOvE4rCK({bGLs4_L#b=vK&5B9X=~|`0-OAn~xX`0S?yZ|L~Ro z7W02P482AU4MC6ey}Elm5ic{df{3o2coidRz8%TgJZCM4)1n>M&|yepDwQu_D$D~E!{1$dK~)^NnJH>agPmJd7fn+hJ`7iz&<@J zJ*5t@t%cjKhJ4EXVAag{@OafmR9N=i?eTv2iRk?oUbY7gllV#P&*?ESV6mB!(6?S?a{y4r-Lq@vU6?)h`>rM8ZR-QofJQCv?Bb z+9p2iI$1Tk#rt&Q(}x1?;pU0kA_h%bl3M7k8u8w#%Ht18Z>D+L#Ww8mTs6qLKHV*) zA3D}Gy<)6jDL$bl^ImuD=3JD=QS^K~OGn)=s!89i^199QdJlmlsmY^Z>ccWsB0@c# z4m*ilNC_t>0{kjE5 z>PgHo7(Uia-S+{hChevq;O)Swd)U2z434YCb{9HOEXU~EJrk4Rr=zEbw)4}(Hm-*? z-bZ}_1vthEM`}4IJ*l7Wr@e@bCy-A(6_8hGetzJk@`BnINEDcl5^~qMT>brdRHJ`3 zn?}q0s7XygbiRmS~Q!AP{FYFEq@x-;B2ZD1#~qLkR?q zBoB!fj`-;-)vl=&!uK_^Ky0H9j^dU5m-^0kLTYTV*h_`@uZEDjtJP5)_Km5f##kKT z7)MGrADyKLBK?y7PI*4;X(!ebE!_+`VN~4{J#35aB(~jyZB)5Vd&tHFiEjVug`9Nf zb$j`t{jF;~iLlD1_dE&fJYCM?eJ~-3hP-57j}{!|hbQ=kkiNm-1jopj&pdGCj)r;m zhxQQZS#4Fx7%U)gK5-1<#}$3xJhUfHt@yA;Q^TCFLbp)?z_FPPT_}Yxc}7s$X8}EKL6B7U>#^i!0KKnE`Kx?M5oa~af+o8tcz?9GwBbisxzpzdew$1@nGtpr*qk-Z`L^B zs6#xp!cF~}hu;Y=%Y$-gYp1i|()H6mutf!a7RK8-TO)d^hm*SlR74(&tXqe&ugSZL z^RQmO@v7x0^>LN{F$>|`=ggR`={-9C%01xB(pzsQ%5tr|?&ipe$%`W}_e$!t@K`Cb z9L>ES6k;s+rOwNiY|t^&6cDNX{Vjx|ICk@y^7dQr4tFlfygVAuQjaulHJ&g!XINkv zF8DuBugFFx8#y0ZMu5Wb1QR^K}XM>vTo2%@24{U`FnYReFpNF}s zCjap5;R^eTKfl4t)NFOkvh=%!~9>_ru)G2gFi9qFXFf;7w zczfmbbhT=E+^5`X8&RWhH+${H1~)Lnd;J@A6j|%oSHN;lUHkDg5jm#iTlSJQIniSw zF(|OxG@IJ)Qxw*}gVEvuzRL!S$|#FiWz-aDmYHn!r~9)Mn)VWhLB0E)1Pp1V)gR4S z69yagGlnS^Kf>r5^LvXj%(!`|-K1FtEh=rN%v*0Z2j>q3cdf?((wFFT-Fup)BP?2Xvr{bW6sBj@dE6zvu*R{{4NYV(7@CEGsQaX~8Z5L7xQCO)K=#&)1`rAiQzE!m3ifdF5gX%!`m|{HRhT3#V&j5OS1-h<>UoV z8w%Y90aE6ysIGtg-e1J&OMY>w&{x$ped7foBOEa(D%zZ^YrU{7Xe9SS1+m6FD<4$I zHE2n%^?b8LjD^^^E(&BVP9M*NjX$2-e|?XJ3-wRI zQuevkzZ1S@jB=5R^-IDgFD+rXBf50C`=jv51-q@6Is%FxQf!+&4wmyuo2jcc;r^VZ zKi$7}beReecFcH7?`_+o2Xvbe~|I2fyHMAAdT`STEK%O zwh<@#->dOhMifRL_rU~7&8>qWH=_+FG|bq70%8!z&uslYbn5L=WDIX1aIbtj%qF6} z^)JXpWsX9qg)J|3C&1VT0Icz_>p=YU#YOfsZK^wjmy|C$o55%*qSDs+d_M0UVF${56w|o&aZW zg*}oBc;Z85s^-k?JJKN(q4wq!eMhXypotBr3@XLSe{Kd5gI=@vxoDz6Ak8{qDgQW_ zMFgprjJgiVJIt|HP&1IeRHmg)Zi)VME|;tQu&bQ7*WlrP%VJOVcy}FJk*}cZ*X> z$mgjm+*uY~iV^iOk8T6t$Ut=^;&NWZNsXydC15UiCPVhzr|H$e- zXNT{kX}y6SDc-y?=3P+4NMC=GEP({S`f*Z?O0Co48S0BKx zi%|pDYYWt8yweEyw{dw~IpGAsffF;G$R~g4O`7HnCjqagz?%o)!K^62M9+&dPcSfV zKMEv-#lZy5zd{f4+e~^S*O)Qdb3X;NCQrqM1O(|?`Mx=CD*=TFEI|J7uMpu28s@5a z@TLc_c9q4hn+IL5sor6+_h~QoRZLVb`>GVB+W+PGO_(P!6hFwOsH8 z;4$)h^Pc||lza$Iw(g_M^Osxt#Y54LrEMb%=O-xD4B&XY>9m?Tv$B@j-no#3yHllv zLDsgLK|l$Y_Zf;Q-@?8p9Yn*_n|MPy_;c%ZE)g*^D$n(@UQJDsz9<@|2Ng>?lU783 zyr7n{mJ9f-N0g^hg=}UJQC_!Ir$jUv#|R?YVhqoZa9KulJ^gkMOz)=p``ps0N&T#0 z*rQa(xYgUDbNs2NL$&iuaLA*DZmSV6cY0eJ?Pqw@kL!~I0wGW|CwqzbN~nR_mMEc8$Kws93|;TQbORg^*E)` zmVr|;)ROjFt`>8+t6=orelf;QXEL}*Z(|Yg3j_EGo-vJN85Wg48Uc*1TrToOF5xvg zW>@f-3};DwBpzotcNjMm{ zjUhmgD(OOEe6cm;)M|HDQ|4hov9X#JMrT^Pzp{&lbV(ri-=JhCm~jV- z?7iGGwnW_Y`Uu$MbVOLvUn13Yn<++5yer3fn{ELC+^NFv`|3p8tHES_k-|O5>w6h4 z`A_OJOI5SkiOGXWcRj894qNrmg1p1|KWepnKpSj592&La(_+*hsow}dg1O{}u8q?s`_ zFMo=XgUq$@AVN;&bE1?j!HAZx;**>K1jr$`qsYrTZ!K5_VE)nyV}2>)VScG<^vAU) z2F{)A*ihRqRE#!<4O=+K$PPg2OT?I(@YE$NwH-D8#eBi+-gWsU(RBG=yXJ*5uHm10 zic8Zy4?2pGx>#r8S2(|^q%;tlY>2gAQDJMt1|Goa|1NHDPg;AGTB1aCu$O!D1laIb zPQZb`p6X$Ku2&?B=)4a`WF#*Z0;bxKD-=Q|H5>RnKpe36-#Rt0AuIC%FB%eWn)Rk(@fUaz*P2w zMYX(5j=-jf+J_e^4?zA&afla;cxVJ0LQ2A6;}9N3u7^+xPw2cXrzCC0Yh&W|ijfiT z{=Wv7R2)nV&cK=^=kZ^JHC=k78tksD!U9ezQs-Sp8b*bkLJpX?o$-F1aA6778@Qz` z)jQ=$xF|)cO`*P=kp}`^msRQUsU-U#NN9*UJaw?lt68g}hj|pp2){wOy};(AQ14`0 z#IGJvB1UO<{JpI#Vy}bvCb5OI>czb}CKpemhTW>Xji%LVD)rd*a~Y?`uXgt*-H9p~ z3w?Lz`O0$Pv^0rLdG9qkIqFj6MN`=#?>68? z_z?#GOPK($DpFr>{fSumT?POQRUo}V*VnVgEP)7dc0n-HuOX!O7fmVFqRX{Rnwsp7 zK+%yyPwE^d!j?Z{GP0->Lcs{SoVOeH{TznlLqN&2OFXIpSG{A&2znj!qbqU<7qjap zzYX7Jg3hBTg-oqN_WX{rBU8q_Yo3_2JViIZl!n8EBDn4?^lI*6i2Ieqk4(%gOQ8I|FkMr>vV)u{Kw}$aEAwGW0FyqZR1fibD(_(eRKV zwWOybV$f!k<~-hWj!-EKRb~PIcrXMa;JATirx&eIX-;k19;Wzv8poxsd4qOpzfTuj zsBNo_J-r~x=5{-lu60?SZup{ciBtL%pCmb&F4&BGV6%!EzB9N(AOz)S(-dw_;IM0i z?C0H5e21pLCAG7KneB9&)94VPD~=M;nwtO&%+u@9#MW=Oq#GlMi`f7}%%}%u#PFfL|moyk#?CRDRw1Lz$E&%?l8(JM;F2x^8 z_Ig|)dd!5l@FS5}Mt@lqo}t~s8+H}}>K*QR8e2056=*W+@2>Ju9qN9+{_rKms@3I> z>*$Rmw>|X|L$_}d?(3Y9GDJN+`+?QSf4m(x)Y~2N*R*6|W5abaz`e}WS7iA_jw=sy zbB>9Tr<$+c*CoV9#-+AUwd=g}C+Uo? z^vCnOLA$iGpIJ0e%cs>ZXf1uKFdjou`(Bubn0NL;Npnsl*1_lCM?}Ny58EC;YoDmoL(4hs6t2HlId4<;(ezl2~N=rvT4Yogb*V<8Zc^{qO8;AJuBKALIuA>Ag2+G7(QS}h8<&E%eq&JK7N|k_`IC1WhUl+=QH?mmGXl49I>cmcbXc-Pw1~mj>725(=xaq zt1DZZImbDR{!*gDf~}HkJ2G8i#oja5p*kb7>3S~$$7x-&Vvb#A>{yd)7Q&XNha-9% zv`t^KC@$N#E@P@ZxD~_|Zb3JJ*nI_gX=};F;+aUnzUy|enliZO%XM;5ECw4 zcd_EE!YVPd9}zB{uA%rc$WQT0eT&Pz0{rSuNeYGPGc1z~Y*)ab1VUl8U2OTLmw-G+Bi7laBJ#@t-E{tpUg z93xCY@2-F4D3E8-yRRT4-w5u2U#zG7BWGnnn=LY>44n2aO{Gy$RnCpOyV!VCc1D^XscgXnc68H%DDniiX&0A+YXpbVth zO*>qu|IhU{k(gADUZv$FpY;QMC*>Lh7l@KGR@xo=SS zU+TB;5l^GdCk0@byI;S&!iaX=s2Yv@BW)fm%6mH^EN5t8J0koJTX&(sGNdDr&6YMS zB-L@YU|&_1F9PtEDz>t|c)~##15wRHhDxw|wGF{6Mc;kfFA5V6WwlgAbzj!KYuOeS zwh@vlN7${+(u}c*st|1FpPEWy5&P8%;Rp^(~&oV#}X1YB09&{ z#}^30_Pz-+LuKly8A^3pblG4tdmUQexF;1Xyw0KbqJrcQwsNI$uap{+Y{Q2p7TVxa zm|D&4V*8T6+~(a*PJZ2ydgzm`9=dmbSQ1{apO%NH1_BQCEuMz1hB$%#0?G_h=X?a5 zz#9huBcbi|90QoYO=F501SFV}Bkyr82Y~WWp!Ph$Kha!Bryv9|1A_{#-5r|Fz+pG> zppm*!B9(r+o;bK9JkNlS58mmo^zFJ=z+!I_QaK?N`rPSIJY~u=x($FaYQuVeGoHWx z+gejm4fUT>QZLQGF>;m@m#6*o^f=rtTERMxKs%#}V!9ZF=1_h#0sp)bU*n3iEt^le z!24)lVPYncS@nZd2o)(Y*rF=yrzrvb3odF4K*Q|%bYV)Dxo^k&Yuph7v-B6u+~e4_ zq0zh2celToh(Xc*H0df47nMEyQWW&cus0fj!3SudWuT+ebonla>t)zSrh0zxs}Y|h z!1Bt@Hj-8MCGvfh*}Wh&5I)qpS?78A8`ZO1WdrXev?+n3(7gI|66Mvb!q3)yDo;?K z_n9OZ5~H^Oxy@43u-DpJEKk}N0ylcWGcqn*VCEo0LyV62bA%z*4+w@at_k6#`yE+6 zLU|4o(gDiA^@;NJ02&`yS$g6yKP!ST-cSyHSohXbDyqkmTW>@RsL_()xmnj6%+i89 zZ7b4$e%&b)Rutj6ikVIU%e7=$FYnk$&bAi>Yp7PU@uPh;OFh-T$%~xGj{w}M0>fEh zSbd~r@vX&Lcy-8N;tHsh?KHyjm`DL-zo>FoO$5dj&%Q$_92CF`J`Nrnbp z%!$Hv+>YVYTa`ztl0&Mo8kREX~e@L|tcWnLOSJcewai!Q>LRg;-EjN7o zxBZ_WaLQp}^sV*hQ$wR>W?1_YQr0noQh?{trfec20coB6n0g~DMNFZ*Withpm}n2e zNr?MCzQKKA(v4vL?8+-=9y zt=Ek5@+lDj{5JfRmKFDpx1)j`6Oj5w9-&&>(obm&RouzzSq|B2vUapUig6`9^EIVG z%XirIc%Ha4s`)C2^9HffKcGcZgabG!0V0wpIT||HD$UYYvL3r%`>nP}-v+_Zv|ZMe zF*hBxx}kS8n93-0Fe0MbYl`Z;f=ts9Fi}0?%8AeTGTZ ziOpN`Nbd>rgPEIus`DDDH2z8X4R){o5<*)!n?Rwr<2@;o35j)~vxpvUxl*~lzfD;! z{F3JfB6Lh>?XIE}w@VLprl@T13E}P4T%ikYSjfXE3YM7k*+MmWQu|^3S|s(yTmmw`g87p_J3MMPa49S6Qo{@;|Kb4vap~lJnzhcVw2R{N=pe3-8}a+(Cc|B| z`&s~m)ST_9=q5~wWLpi6bD|e>Y3du%^2F0B;jxp0#4YqFM1K0V%M>f|DqVgP#AdV? z5Qn|Z)J0(bm!DEyH2cWw>5kG0th$M5@lky^u0nS4HZA*2SFaRK?o{h#4P_Hywv=a% z)Xzl6u|V!vgq?xU!!%QdVXfy8R_?#fZPvD9LW&{}v5lC0Gbo2elMBWIrjwKY~S2C2)DIf=rikzMc& zdn`M^)me}sy6F*BSnT`=rcO{`O-?$Pz5L7=^SEjC2)Y@Xt4MG`pi4dT08uM|5)?o_ zUu!!kF3zFJz)?-X7b@Wi^kkAiHQW7yYHm~fHNNoN{qlAUA#$^sK1Y4xo5hyFQ@#4{}A&@Zpp zn?mE*DG!4DVJ}YucKe-YHp{80NT6j?_F03X|1?mM_?+EaJHGo_quXap z{-wr-?XDefv0Pi?ClCU)yLWAXa&0r3(k6!f&!-D!yWj?)rtJyB>eT&6`Ql#N`|oj@ zxENudu4-Z%LKTG_r^MKU4J(rPXqydE3n>|Nvdx&T40r%F<2 zhWn#By#0c2l~$8RD>|a%T6tJxMwVkg|KT{Cdj0gy$0pr)g3SFMpYolAMs&A9KHwFMb#<_MViSw^3Ce_y$$UQNM#2yMRmfV zInru1gKF&aM~cBi)r&A==f1CL3XUGW+K$w;&h1~Cg0U(|;oB^0fuw8`;Mh3-Zj}Dg z^OEF+sPk+eGklAM?S6p*5smu#*AiI`-gKbQ>@N4gT^5p*L{gVRVfwt-VFB%5x{_3i zouan>saRkk2!)3{<{Q-*nlplKc}NnZynIH7Q#|E*q&}lJzwVeL+^f$-eb7mWeVNn(Di%o>?AACX$2auzcCh+Jn0hRge z4OJ3qlOMr|59`w5lg2Ge4W_clFQlKhXAQ%2O9Wm~kL)8i{l7(98NDA!$bw9Fp^! zJMABu#W!f6`KBz3MF;3=0tr9{KQ9|MAqT~&%}_KS_($W4>^NB6t+ig}rEW>wwncP$-QefE1l3C{pU{Xm6z&dPe zGsX>>R0Hq3Y)$xpMd&fr0K88F>j;f!He!+qa8lCFZdeCEIce0w`-V2?@)n}Kz=~KKlV4X?c3+a&xJl;ZV^2a5ma%4pB0A|NCmGU zGo6CKNWD5h_V7QC{*Tl_{8Bwt>S{oDv5_EBUy$_g+o=tI6QK0T;*U-bG!=!~S5KeU zObHf{doi$|+6k&8CIWyUA8Qx^28|ZLC^_oye!8dv%qI6T&nslr{DV)ZZs$aU>ziC? z^B>bzj~xvQXq=gOx<3JWz(Yfp_dI68Tj>6Ul)~_LP;v<#KXgoN;a+Yik-GO&`cHtt zsh$0a=Okn&{PU1lUE@T_D6OSU*duG*P~R3ng7X@KQVZeEo*s{$WS%axpW+|4ydFO- zoF>#sIo1Cm7%j2#*vk^DvkdcsC-#o^-U3?CO}<8*Sd(>a4+mqIj<(WgxKms)a)O~=wiF6LsLDGk~ujcK6%Kyi+g}1Epa;--$ z!*03HxbyCyj^1DNAI>5|8mi~9ACyK&0~wKKUHHcEFj3|i9|>?o?5A~uuwg?kAjNs* zNl8*~U;lUJ0kjo+wz_#AZEml-dDBV(zSB9-$)fkT>DQKdy0m&qAiT&I%>nAz>CSbCysTqs{e7P+r^9T^oG`h+;fOL=+}+>2|Vi6vzcs5kla(xA}L-h;GC}K38Zk zyw%e?cVrxZvf2C;G@#+bryU7u`PfP;Cyl_g%GaNj9|=(Ydq81~NXUR*#8pn)by%BN znd?N+D52ZF{5)!cUq8Q>&?h^kVnj>g8&69{Hl(Hk>FTu&a zd!Zk^u!-89E*t=C9E0wmI0wiWT&XUb$%67ZD?`K}B>~!Q&VSfdq=03G zBdBN)J}(UHGrsa(1A*%C@!j?-pr%eCaW-R=o(gm!g3gV+E{z_LF^t?uGa`8aa!j@$ zD)*O%eIs7#6Y?}5%ID1ibTM*E(cuq9`$8&!v^xGbLUeWrVuz%W8XYxd>QT*j+snPq z0-tsl6KgG*H4#9w=c0KWXHD|3S$&|Wa3@68U5^CY(xFH_qPZrox!&p?*q~t1xa^Dk zyj#S6CnqtNbAAP-^C6?^V}C)aL+f0<=b!ONd=0i}bOwFe6KGOMJ@dIg39#r=r7A}r zz^g=v-ak{!iP3rxq(YOfACA1zuP>M~FADm8iY?UDchZLPZ6~2-cG@ge(X`VD`xW9U z{nXW}XuTP<(kidD_@FTna&+rP*~eKl>`!?=%jWjZxz@%}ilE)O!8~y*6^Hsao@cG& z`2pF4&CA?lrFT4!>7CmEJ^b8Ce)cz8NC;44Vk2-EmnR>BB&b^^^ITR$mF%R~&{D=U zAl_uA4(c`SM(th?b?gJ~S{g|7Vn=x&*LZW?fJ*KTut`R}QZt_?q}n>d)gPRCw#UWW z`dx}A4;pxF{#T{JWo4AdX7lZZ8tSLt6#(6$*?uElIhW|^y4$2`kVW$Du*qr`=o2Da z!rA-2iBCHJCo>uJ328L)$6_LUL*B#8J=oL*byX0LI9%_`s?$Y_5@##vBf$eM_M6 z1E{w@RW(XgjN69p&fx|qlhV*G(b$BtcIqO~QOYgzI55lcnk)F@>;k2|47z_YJ1FEW zT;mL5xQF^%6oA`SZ8}j{4X_Qe*P?iN7J&T=GFw0GWNrZsR-fm*e@tn+NjcvZ<>goa zHrr6!-4~H|FrHv6sB+oc1omE5Q&8XB~RjdOca2Aqn!sO;UId7h30W>Pzs9)olXN} zKVp*mxur)(*Soo;w6v3OqwFoNEq1e>kl(UuGs?!p)*|H|K(|B!+LsxFKAe5~3;2)J zWl*9(82e{|h~tMyr)Dyb6=ZD+72eaBD4L>x%=tH#f(#-*Nk|ug7#tUWAj-9h{%VU8 zpm1)~EeAjggQx0ff}ul209{BxX<$~u9BP6PK+-q3@3c?9CuWby^0rhn>XTog*^O%h zF+IN>x`5RdhS)vbQvd{0Dw!Q!kQrta929sG>y}8Sv-!_KC(S$EGyT&hG;e zP+DpI2k%WRugRfZ`rQ}R6>8wx;&^vWr36WWjfpl;$_NXT^jigN7GiaYFYHB<{Y9u# zLRX2lxNf{7t(2A0T<3mPY{XsEE8xQ3_MH)bb7AGp@H3Cnb9@l6g(FZ|K1;F&_4S{b z?jvZLJpLQV$EJMGpdHxRDlXCsQ5cb#)x4YYJl+K^fN^I=2&ASgGc?UN0MAGLIG}=Y z2cV`1RRLj)#l@kvSBX~|#FPT1h-~~&{ll&v@CCYKCdFKJ%pfdkbA#T?Wv6VBSz#<+ zR(}r4-D46>qOT5;-vX^I1CZzMy5b#Ecki~fz^`1YVH7I@=xQ+3-MnTf5zbhUEt$Dm>7dlk_yum z8;|6}MVR)zUaa^iNZld9#)$5{jR6k=gG<=I>|AJP3qgbx*E%szyI=pr>QYm1)6{m? z1S-odCih1ZL!fr)tZD!@jZLp(7eMzW%sc?Gzs5B83bmuw)oty=(zH3yG73n_7G@ewMn`mk58FS+igc3}gol%&@BAUj04>c{MpM4K zrPW+DKBMG8onj)cMnp1YziJhnLMR4U$*e|Sd z5)uA|&$&FxzQD?36&zE`0**ibe^EdjND!^mz>vj}jMF6{XJALvgZag;fcG;)S@ldDuZPS3S4JAxVcbcPW!&Nae+JbfX zza8*NO6hA3G+dtAXAsJnnnd7q09JeouKhokQD7G%G6=eSY7OM*o6+3ou!+pD(}A3g zYv|e3U^>cpv}k2Hc;PP1KJWJ%Q!zwAaj{3{Q}<^530)7m zgl3QGLQYz*ILa{|GF8W1H{N1vmRqeBg2n5Uv6JL{|Ec-0GNGc~BsJKxS=?D>`MPOE zQJ;K;t-Z1s=ISGJh-U20<2UTy*!*?r>pDz^Hv04s`5NPbVedBJVQRM6!`yhk>N&?I z!<@5sAR3*aAth23m(%kc_ta4RNlzm?b;MG(Xu*054+9}Wrw8TgT|3nYICfi`$eeN; zvoq$!`r-&c9(aLS|#)Fq}MWu|N7-U>f zUKKNukhVQggIL16ay*(y``~P2Iv?Qh)q4zO52;Qa%4eW~l@bbIt?B{ihp)91ZtMTo z1<)MkCA1mM>-FX>ad2eKB^raaRc_{Q&Q2|ZMokJ5&JfqH=hmy0$DqGgLoO_%p8hAl zgewed{KmgsjDkjJcbm%UtE;fs@MbKt9RUxU&#iKgzAyR zoouu9XUJ-{G>Wq0o1BG2*SrJ_m&e`wopYCHNK@=@&dSawxRQD8lW&8fZhqtXyT!G z*WOBzljcRAR9>{IwF<}?Ih@x!hGKg@<@BHwtML1$I2-fI!}*`&M+gm5W7%kA`m;dC ze@|>K=;YaNVi9wOEeu)Ns(M}XI`YABESn%+q|}^4Ay2tDkX)IAA+YUV)jj)(&?UA| zsP^*erdUzg|8W1zJS%j|fSF?f%jA{4ek?O?t7ESI*3UN?0{Two@A*e- zaR9@$UDyQxHdP{$t&lYj$3Nd{6}Jj$%RP59H~4ciTy3dqmptTP3l)fO2bK^jJOaYue7{&eH5OyI;fk&Dvy<%856q1U20 zA=}Rq6}^RlW@cpbb90}CyE1p~7ZIE$wnQ}hPYp$7dXni9 z$J?96(CuwT1AW^)bym)usGK$LXFnxp(14HPq7X-tuLlHd`-t2e37UU@XW1)bEDL|%@Yc4BAd4IxKnEqS@^2c)nXEJE{FL}FIupPyh0^q@HR-yswn zLsJhf{Lal8vsC6IQdS*_D!Copbu=+qjP~WHT+8`Gb_^p=UCXz7jgT*0zj7$n|6Nc-}W zqo~}nY9))ZXWDWOCD!O*m|llG3Ijp!Be(rfWlnm(@J6LMy3L&1ZZLE;`6O zQOhnCnm?)3N#G7g4No_VCqK@K@c;J!@}XW*P6T56c$^Lc6k^0YVi zyDY7;rVjP$LVj2bPZ~{sSozv=xTc=<`{18R98jNQQ_D3V?dkdK(L4QL4X6|yg+u`i zyO#jY*l*M9e_7Fi&;vDJVNjv`a&wN}kc@ZC&`{Ic!T1-BHC}UQu1*K1P8>$X}!_7F4X!nPHN}J@$NHBef z;K3DTNv%AFUpsOLSm9lQv=-2!fMo4>dxlI&Jf32C84XX9pmjh4L!xdiO%X?Ze-Kdu0 z*)4n})Je7zx{F_6Xd}%&pO!5*cv3&$y8M!+KdSIc3ZfT~%=0X5g{?|v04Hb#j8tVO_$sWN^gI2>b4#*ZNkzcIsZ z$-#F)zqc+p+1Rf6K~?v@1`bCI`#21nhUTz0?9S-EIq9I^GQsuMrsIyUsI(@0@vp*{ zj6WR9l?$7j?(s6c+ZkImG1#@C{edha4tNE~w!4wj0kF10m#`C_2cI__%Xv+jOsxE}bY5OxP>G_3d(1>}8 zfhLq85C?-lR?#Mh*4u$qL-WfG>ybMUbo-Fq!Sf@_)lY3N|B$S!*-=A)g&(6+mK$*q zGQd3uZFK(0w3$yPwDYRua%1HS1?PH!cyxli7&kcHqE+*-uE8WcV&SrJlCC`gXGg@` zhN(Sa^!9Qm&A||zVEb>n)yF>mZ(ak>?b+NotO;KpoeRf?uN9cnAtx}bBwYn5T22(o z(Sa7Ox`Fg7KmFt*Aj6n6l2h%%U{GPgc&^9iDg6>XBWDZIVUXB^jTn_xt%g0GFv z{bQtBTJZj~JIaXTN9>=sbQOB^KYf7ezCAUNOd*l%edAYW7N0`QpQyM~VXtL$Bu<}FC>1ZTx5A8D_d4i#2?aY zH3OH;@ire9ZXjV%S7;J}v03^qp^wlsVl~4hdUZ$gn|di;&&N|e@3i_n@ZCwfKg%vC z%~q|n8fxV03bMO@A~a_;&#ScXOs-x4r>(W~e?YhIynNN8843N zBJ#?IRl9L}iPbugVrf&L|2X2vd-L&~F&#Qk0IYD zR)^Tg0!P5Va$qVpD2&U{+H^oCFB5p&QM-Hn=wfD>ew>b+4a`SCuu67Iw_uF^F(&5L zxq_%in$zst?io-x{!}4s)esk0IGfFHt@_2obFgNyhSJ6Fdr~#U{Y!-qCc+kn_KXAm zpD_o%TZ%uq=(NkszxG|Z5fsfNyYn)8XIkPG)67O8+iP5Ww<%}20uv;CXt|R*4^ghD z7dt$2x@-z<87f_F#)C+;{^{=DhT#Lz(@QS%lb!w&Y8yPRS+&g4pqXDl_n3g0TC$@4 zM5mj8kp5bIK!+4Pdiy5h=2p?s?3FTeo$t@KInU+VDVBTYP86|{za?4~nG9QRMADgu z-Eh|0$52+_;@DF$!VpdU{?*YdbtcwS7#?rX#)niq{W2h(WO)HAL!J(E`#M+bq%bUS zjQF#ygtQARYge;q-mL5E_KdCV)P>MCMxNH`d;6m{-&M7TiX*3OE(W41cE7EsH&)rNFfpD*eM_+nngdJbIyWIDgbHl8u6Zyp8CU zKcXJ^zb}rHHodeKNS2Bq}e7A{%VBoS9$at?6w{#(&b=sMG|_c-F`CeY~GDiNHP} zH{2a(YLhKRI3S@z7LC&JohLt%{eQ2P`1hh__`a=*2$v3z*e>Bocd@B1D%@$a!1wd& zZM4_^OdlGROBO$Wxwycwd-~2ji4=J1*JL3Et){rm>x)r$O$m8a@jDb5e0XZ>Q1Hf1 zx*ZEDrAw(Eeap-D-~r{-(miZ?P-y_>XejLTzF6DM`=1(3O{DC2eR02~?);)D*3v!b zvzCN$P&x6than+^aqkeBZS)`hOW}I82#gbs@(vtN1VNxhEKhl@Q?a1BcG*|!F-^QW zA-gG0WRWa{h5^y>LhP0hCO6&cZIS1-+b~)2(g5>=%9`3my2~WY7@qT<4Yw!cW?rP$ z>9W1e7K}fv&i3Np{_W>7LK1Dr(x=4i3xn2npPi3jSc=`r9=Yb~%?yp_R)Tpt&&i(p zMW?m|wpQ%u!tJI5j3Wf**xojvASq$NEjjN%NLYTRmyS~IgD$4&v!L&-|6p45?=eG?9x)9_%|K$jE#x+W6)+N8jU_!(6uiV6g*icWK7w2*6|!y zKk4zQpn=?bno;A6kD6;#X!KcV$J;-H76yaFmO<>}8{`<`uesdquoy0Um3DnIS&bCR zrg2z9eC%KPC3$4-Qg}=U55_+;{m3|Gqx>~v`>&qo(nl6T(fEz*qc1qu%f}fA)BT9k zLi4WJ#}jgZG{$OTJ)7^b7(vFiKJ>M~mBbCj zt>3rS7n#0SQcQ!OL(uUF+OwG=CB?bAR4V7-x(l6%vmtzlJN`jV%_tfuX3OS(llZ}sdnnacCQi#)0sMOjMVmXuA_ zWLN+i>eq8C>{*Kuy2>)aZdGG34ivYf%8AI&ruo6ooL7#yHD}k+$Lt5sRKp|eJ(-j0 z%lye%@n5?gXA3IkLey6!Q)BKl4eHd7c`IEgIoeoGcBEa9OF(uy`e=4bLZ+eIo5Pe@ zU9HOt)u!iZoZKcpW)DBb*-PgZR33k~Zw*FT#dm3}Ha$@FNK?c#J(e^81cdWjkfFzV zXy`D#x}yl!S8H7He@vX=u06=_{4JUUb3g+ZbJpSXWYm7k^8`_Nn^-N+J#gS~SLyMP~ghiZEXL?H~%rDEj8vMj;q z--PDhP|$_MTjT^_{cug}39&4qD9L{DK_Q+8Okczr8R%{*6(gvog~H+Yx0`1bUf7zR zsujRI2W1Ufa8|0zAS7cR)b}pcr@it45Q!{^SWa2ml!!##BR&s~@Lsmm3ss)ff4X#m z4_rwb&V7xm%`w3++Mhq${}c6m2G3#G4+|-aA*OQJbeecH=`dMFOY@n3XDTy)+6_tb z+nlTH2@x&~SE2md>(|MNU+`p!iu6XoYP33bSg7Kb(p%+Zbw<|4T(F33+{=l=Tn}ix z1H@=XefUyz2bgU`Pwi|>$$;!>&RdMu{bxwkz&;~rgT_W1&9KDxViBcG|y9^ zN{UC8SLIfSC)1p6I=4D`*r600OM$7#ULCkE1UN@>n%uv=8tH;*4aHST(=Y53ss1dh z4tZDVq)cZ?Cs1a)_-2aR6iY2AO-6#eWaBMxgYa@K@W0mV`)+Lr@48L;I1MCJ>I)=d zf;{mEiKmZOD=E>qf8fNpm*PbS_epOuUcD^25iv_)SE%=t&uBPk_8ydF0BPTU?(arv z&l3TfS%YlVw=ut4;q_$w(=nGr$RUhLXK~{61+Pk+d~f!0<2a6)%Nb~}ll}vkw_Oy0 zZKZTbSVvVsCYukY)`*Y(LX36iAtcFArX<7PmH$tU!y2N894FR@VYOn(_?ocl`W0I| zGjzeLq9ZiJql3&oL@TM;+fGi-lT3DQZ`dLKo*IoCiPa9uW4Rmf@K4#wLQUz<&%hS_ zTwlU>rNv~N7*R}FKIbY3;DP<1;%?nJ&ZdgkTUFfqM6J-G;VXF!wbG`uE?SW59glp4 zI;U?86yZPKXi59ThY0CDM04f(O#DJ2q67zpLBW(r*`M`2oft{U}d;q1Lcz+nz{NGS)J5 z#gd9XtXU5uIPQ7>-7CFDe_H?a6GUdLhy*tGy6#1ITj-i<*upzY2!4Tbs%fLm_p2Di zR`|GQ`|64i1pMXw4sO4(5YS+=Kj%Ti=o_F{KUG6vA0m}*a#Eup=WlW&i^7;K8$EBf z)cr}sHSJTv-jK99!1s+90b`9@%I<|Ppv>Q)~a}Y zFoiK3zyNI5cy&}kI!f349ojM`#h=H?`Tkr(Z3r)&g!aWsA{4-6&%WI2fWt9Bl}nUB z5UXP(K)!Mfxg#>PYe9%S8cW1g*#n=j%gqUF#e%sHkBm$Ga|BCR`xL#hu6+Z zjv+}HSvPM(CyIq-;7yVr9)4w=2L6TERY}I02hEX%lD+zyzU}6x--DuWy&CNPuotyo zF!RHq{lIfo`Xe8Y%op<$tOVTSQuhvPQSS{=r z5n{PV_YLiXT*KXF1Jc&aqqbxva7V$0KKG>ocZjN81@W$zL2E(gN!W!CP?2CzpJVKz zw2b{3_#irYF(Q}zlk8Kh;kI7%CVb{)JO9`6v)^u14x3BmVfNc)H@LvLG>Z=f(z6s8 zLLcl3^6ie~f^>;8I-J8SU#98P^hi6@8Vhkiy40;H4s0b{hYt-Bn=H5ZfcH$L{kH0m z?jSO`OE=dg_KX6*vpOZ2U1wE))^m9&k?jcoq%hhJj2Mah!~D>h26#Se5lwT-z=6Fh z{2)M>*vNece7RLXX7Zs-ja!+LaZq`;pocf68go!TvYJ)7CnV5@WD*EZHKt-S|9s&U z$ow)r7bmeBa^MLfM!JkASmLY=*3|;8;E(dI`RQ<*jhZ(-*yG>SW7&7NTGtza)XT=n z*nW>S7w@o1;+&c}Le=_Kcru+Am|!ky1mEX*nygZCzAc+Ssl`u~fh3|+dOzrU@7 zFhe$nQh@kA?0+}jREcKAq% z4ck8MwrsaF4+!Eb6dx4;B8D!6UHtE&V$L_}@3@hfwKO+MZEMkTBzw7OQ}<9ttEO?j zY0#Dx9|8pl3du%Q3w&hc#Iu*wsCO$u?h@}iv@=E!jW18#`8v5uG_~$Ak2J@X=MKM> zoI6Vz*TW0H4VpA0r`cbP?sK0+*H()eF(S0n_O!ZRUmXZQUMvA!>o+|i|CrRo7+n+X zjA7@B2Yfa1U~P@+%@+r6r3@^WZxsi#TU2@Uhs+wvq8pXlw}zCO)>g|OTzB}&5d4GA zf8>DOnP6RF>*cNjxRO@-_1~KEDgGGqCE4rY*=u@qgR5!W2DOqJpWR>>Std+{Cfx2g zYxsesF9Y!dXPA4Gp-o4tvYKlp8fdQX#gU|^0N%}5CYnUKrkMoQy1MVfA4w*(!xOI% ztI2}6F=?;>8;ae?tDmdka<3UZ6O%vXozVh;a}USE)F?^mLIT2$9@#lkneUKdx{DtL z+BhO}cp!8NFh*IA9uod%P0N?i!^J#`qHT>(*9g}(T_YYF>M5Z>pa{J3$P#v6c=U=89XVD4B*0uU2Y7`=zJgqr9O!dO zbv@(JP``D}YswcMwWj>YeNVkP-{^P5eJ`#7u&-C`Pk-zYB??%zqdPyR%C5 zoM-J;aExM>+GPFoWWPFb^PXC*lWVqgY|b{;W~NuS8ale%6DP|8gf5Jk;b%X@EcQtK z^j=F@V2TeBbf2km;{cHp){xz|_=NCDY8r%I`N9HH31jugiZj);a}E*~RKJ=2xI7cy7bsma>${5CYli@ZioPPswFguY~brPKH=ZfM?s-cawh zc(X1pZb8n53WZxWf$Rj(L2mZCDA~XAVFXF9^QorZA7+|hsqS3A$TC6Yc}~fw#l_y$ zINGEFks@#Q-Dw%dL;JP8Imb2FBOXCUn2Bp-TVtFOq^4^75}YKTX%4}^m;2;@O!DHL zYwzP1Kx@eo)FigBtNy|hgWgQ$>$uk-V+u6TLpxp0b2oGf7_;-o-L6a|Nh}H@B(}re zZoOsop*)!P$3%}T3q3mCR@g|c(Z3C`sk<*QNSlpW-Lc!V37-kOY!yVnI z{mLr_A1VtEi+Peu zinaIGinDl#U1u(b*Uzj=H1b$*&Yp*Ct3TuVQHIUxtObkrlin0A6{vBr={!vS?MmCw^=ZXN_e*Ia&@7gC{B*czn}zH}-|l`HJ@HFwh)Ld|Y?jA>lo`g?>+_CW?XZod za+W^@mZwV@7F&WStfbHb1hD#8utr$0Ua??xvY=%uFr_<6!I2H&8I0R=&slg6S?=0! z*<>;g)m8|_wqsMyL%yHCPODDLaK_7kir5YI-<0~i93Dc2r-+!tpG)3MrP;gHq%mfd zN~b7`t(-LPFt)qZB#r@=13mlXEEm7K_Kb7m^z&yK)qiw7D{N!GJ427uVlQ2D|DAX` z!Yz_n^tkdU?SS?VmU`@Y29MBNYq#eWW6urX!RGKF1vtoqC5e5TigN7txjH{S=Se2& z05mikiuzYb`>%KSud!fztM~KC4Caibwd^NqPRH(#mAlqyr)5eudZ*ED88~UHKIIp- zCuzhz(9q5axARD2xMa2(i1bDMOV=!>zfjYrJRrKqwhd#;&oYzZ+MbSvYR$=Y`H7!! zm0X$iikU|RLNTVH5mV5HVX%e>>E z%5^S+wZYO{Qbry|Iiz4{B{F~YYbwIax!v10S}(S9IbHB@i7)Fab;f%w?%xl5f7cy(LkhTQ&rtvJh)HC=jVWJ@+d zr`r5;f-0`yT7cSBU&u{u;cdFk?N`>)zuXAS_R*WxdXbA5PqYFA*?a&;!iP-f;Ri(S zaxs&yC?|wyKD&Nfl^dR?J|QuPm+8P3Tk8nmA=-jpeO<;o7kZ=(j}%k+*r`U%Oe+U> zl3CK8kpz~ym5ze9H{Y+1fc99>96rvowSM7Hxcmu#rPgvXZe|udH#0iPfu~9Hy;A9) zLL2nvw;0IZ`he%eL0ZqtN3D$qS??GthH+A^v@Vr8Ka)Xn#0xi6oz(cpcsj0Yhl(ko zlP$uDPi_UMfl#DWl43-^@Nd>6%gzLPsFtD{4p?^pV>=Ip&LDOSh1lmDQ5G7>2anV)?xm zAD_r{ESGDCjKcRrs4nU8>?8X5d`#h#Vm;)8=TH+g_$2vWmSs%gm)k-_gs|Tqm;Gj~ z2@9WSj3cK`#E~04O;IdEo3Ex>=d=>%InZ>#Qn3SizyYkZM@k-&e%yix76u@antFc6=0j&F!!lMCa1_WY$ zFPyHC(8A@rwxuaX5VNEC%S2vq=iU{ z!y{zMPS*Yj2Qa+TX5JFpo+bFzl5$tQ>RH7x(Ahb0mP$DwZs|gl?B*%^uzL5su|~(z z#73ucD?>S6deD^b!XY(8cDF|KiSH~=EbNp+E z?OM3iIHL1~Wm1zAvw6)%wORv!&=*o&6j!@TtNaYU!9EOz7qcyRtc<)ipX`&0+!L5* zEVMLAu=K@6CLYF@k{8&PMYmA@8+7v;F1nXtv3(Ehv0PJy3#;35DH6d877acxmM6El zhi>tr-zYs8)>7Cb+HS@6A0En$?00vfE-J4yT7FHdi#05@%XJphC@r7=aet_6R01r^ zE9RpAW|n|xdCVhRDvW@gwkf7_= zqTS$4MnhD)nro(e-TLlCEa7wGaiPs(EZXQ6|^(JvdrO~?69D@Yt;)B~s~k!ER% zZ*OR#UxGd>XL=|QS&O_oMZn>wlzGrpniMKR>Z`{7WCMEJ6lg>M!6}n+b+3^+E%Rb7 zrbl04lCp*62I~xG9|{|!GP?u_y4vaxagl*j%D4FVVj&Un9To{-@Ep{8i3m? zmx>JW9Xd32eV{ta*kS!Soahxc{X|O;tkNbiGpb~Eaw?!GF0?XI@%Vp zucvdE6CY8e=D*T#vFUY^&3;BhqP`V~>to#cAzSBP_a?JV@5g1=ZzfCa$+Ip~H?O&l zHAp8ZO>?wTH$DDCV{4*2)cRmQog+QfhN+!!2j1-XT=;8>!QBmeIVJP#Zdk)>VU#;! zSIWYv&kGs*dh%BJ&a>=M==*xi_R(^Qtb)+Vq!&9mOCXX?5*Nd~Lzkg1{3Ho3;O=9A z8T5A513QN{IX&u{TfA?$C7td7=tBd!2jj17stuEQ-rPLm{V7|ZFg|YPbSfStkxZb~ z3NQjCWw7Yt)s5%ZXX3~=K)Qv?DvDW&j&#_j`=tO|pdCt@R5&f|o4z_qqULFxraE&Q zi#ses#8PA1fe5Dd-zR6i@oe9;=QiFoQAExtzFIi+<_&xyd94WQb|giS>YyfmgrQVl z5yVcHw;GlmLUH z+2i=<3Fhrb1cFL>R9f@_MFPI{246qiS1jEgz8E4r`5v;9T~&?<{QD|oLn z$ZY>oNY@q~(|o~vRc|F@I8TjHp@noxRy{IH0%Il=;6~yxyx&sUyV9zdA!SJY# zjQ8o-omh)3%C=&`Ml;obw=9>zp;KoVk&1BPGcg>9k-QNxP!bhkU1HS!{8ID{*`LP5 zYEM~B+K;7pPVZ1E2%>&`>*YIzH(NnzqlMHQSvXTjv^z>661Gh}{sW7d-?Be?hQ&g2 zL&hQ&xor(~205HKV}T(OQ1;J}B4Ot^K+CmG#?BZRnZe@3A?q9GH#QD39{i&2*|_Sc zgP)bl+d1d`nPLB3wrmz)O>IP(*%Y2Evvb?w9d;;xyJYZGASMVig?aCIzZ7kC0L`ea z6gaK8itb+&E+?B#T}2%HkA8MTIS=YfL}`E9p2$1+8e+CB6Bx*WxdWmJv_Z9JjB`97 zE5shxG1|GYa@pQvGrcz?M-O4`57`%TKbI$zrp3K9Um9I7Ek^~4uTmSP7wE}BPTH9I5LU9R>0#pV}w z)3A?U%Z8ov)~oIEH4OR>T&Yxak#13#h(gpKwnl=ON-h4^_KyKVy*F}9+Z-3zP7@6`g=3@I5^1@1nUQR#rNafl)_GTN#{Qy7$9{`dp@{RX{GDcottz z%-p%Ok)pGCG0Zj4IUPU`x5Zt{eVq>+AGnUpVb%NX`r5W=eY&3$X->u>Wgd;>sc9b5R?m{AxmWkN?-o>49rYzIA;q7r-lvo5J-Rm@pRH54xAdt2JL_4Rx(Cuc;TFr~!_T`jBub&IbWN=C_CX1j=1P~Y{9WNfranAaN(CGf=S8n0K5$m-rOZ)!^11I}jJq6hmPJGK9mUACf`WO@ ztE34iQ?4YhTI>hw^c7(r)OIe)U-!03KL`t0OM&N-bfP0#UfR+9k{!u4Hz536@!6kw6EfU7nAI_Q(T#Wl=eGiJ%=%YAym3jQnr=|UaV!S@?BrB)}ZXvvkX zQuxeT|H7lq+Y{|8!ER@s24S!5CrDQ-h;~?z@e!5NL(-IZmDiV9cl6GKgY_9<@)*GQ zY&e&`k*m9xqw)Qs`cI$1PkIIQ$DoK15poY^JowSEh5A>xd^+;i#OQR0$uE1Jq)7lSK8Jw7sK$sY$(~gI4~;=OU!ul;78CZ3S*8 zY&}u+p^fi9zYI>csTDYrKUbm*MVC^*=qk38dlmhRmEBIbxr4}AzEU41`+wL@7&h`2 z1yAz3WwVe&xc1KFhc|aSdd@RSB1fNg;7S`N;||}F%)$=8>ziddv8#M}KesgM6+ir( zMsjRWe=||+ml$s>D)KVkJ8=9uJDc8A)LUv`GnA%wVoG)9bBzJ;>r}Lwd&N!m3(`0Lac|6RI?VQf3>-3ndRp7`sS0cH53+7+KS2)f6p zgqs15WZ1wA5+Nf9<%Jv=Mzi}K9Pj;x`$uUv+%lJ5-mq%!v{e0Jl-r!igJEFy?;0sn$su{=vtP zk;CH9)0r9MvxI_T3)K>7au+YBOci9_IYd^8RDBYs^68jx1`1x}ebVQup3s9c)`wmb zEb9wd>p-K1g5JUEEr?JC#lRh@SLWZ^ePF_38~Aa|PJHJfdJaYxfDhara8{&TV*yBl zWs>k?mVtcq7F7E`Ist`8q zD^iSQaB3L9kNfy+2s{UM-}=U_@NZbX?&H6oI`VHkc;lw~GW0%A{PWr3`=jhTBQ+!n ze*mQBM710Ln;%LN`%cq7a`6z->>SUC`g?~u;ltMivuCqWsWPz|e&@gQ5}UXnU@+$L zfIHFC8J_v`OMmunOu#ed3crlxUauNe?|Kq8lVnK+i@T;a9THNTEgr2QSc^6#=i0NIe9cr9Ndn*Ov=^BGv0T_F)U%y^ zB63PCQUX9H3(|k}DUxpR14Vy{+kmVO$$53V_#Ooc?j6C1}G_zj^So+U@ zz?#CUVDp?!j$_Z0LRyudk`)9TY?%AhW?bdS;AHcH}g7 zzpaYF(%i?)6h(z3UY8#VvZKNOK~v?Gy2aG|F8N6QsBN<;DjBG~vE^+F{l}`@BL(Ku zf<#lXyS$RX5PU0b@($N>Qb8BjdUQTH+{8f=wb^qV2!sy3uXXduVdF&C z_M1_{4^4)Y!M6Ka#}|G+HPvdZJFPw7ff+Ow)_cZL$;{X4D=%u&wcN%+!rp!LA**5h z;PvODkuE%*c&rY-@-HE|iJ(UvI~1G)C8I{8@o~IM{P9?E;{0S0r&ZSZ&AzQ-iCDff ze=!M8%zIMVUmr*9p55%|PR!Lja$sH_5;2qMaK0gFo;E9d5R48R?F5d4M`PslX&cPY10g2;~&{g zjcroo(06o|(dZryp15J>8Qh>2v_Y#C9aHB~-K^>66zo%=bWj(ux&0u!@~3*K^t?4~ znuWx70l@0NpJ1J@>zW1VT#-|&XU%T8GzZIB%bqGD)osv@wW(^?TLx`*n_E#X^}y z{%uj%?l`9@T1-pWKU%-R%8FsuLeuUyL3g9S9$Q_s7+gNJFVi43?xSsy2u`T3 z?dLiin5MDx9~w0y9yCY@527qN%k+hW2`EmNnZ!Y>Ro_MNTO4MjnX)<+e&-Q=KVj*elMtcG-_QgYq3`>_>;#b-Yb_HLqQ#+Y}}*kk$B zA2D;ez*8Te5z^cBx}G-U1J}z6y^%+_j8f0O+-01S(w9K#_TK_GKb)hkTUYN|eda0I zzFq83++k>eLPGpV7Xy!W-@HB1?Ke;kyuy}mU2$*vo({;wBNn};ryrnoIy6^{!=HM3 zEOHF4-T-ez$~f_4awL|UyJz`s_>Q*}$pxubT1HAH{{$pWUCLMA+Gif8 z9e`0fJWTFZ&gHwlho&@X=P;QGey&~C-xlmRg!4QaSi;^Xa52i8tLksT%ynT{C$^w{%)$q%wKNT&?;eipTlZ; zKh*Q*s@V?o^Idh2b3i37nq?V|_~Kb|bL8cEBFJ#w1?3B+JF-E@-VO3TIK|v{J-5nJ zocegTC&wnColM(fY-EK%XQSu-eOGt}u?Mn+g%tyPxZR!(2xWbtFmy7T9iz<`^;DLt zl~=LxmSvm%9%C7o5{JVkl_7udEFZhu+L^F+>kzT#sUr0gkQ|0mWv>*q7L+)W$&#UG zTLMw4W&w^2BYv!p7K}%YhCLy)hZ|b+Ws3Jzb0i|96)1*GW8WX&P*EbUVBBI03F(jX zGQSau29I?$jai~;?L&4rfH%94A083Y$(Q^Z^KU4rA#4VzzL-dRTv~o@FW80IRY{E+KdTb?tGmJbVspne0*tbfTIBPleKPT$9$ z&~TDVC2Dvm+&?kjuR-9o%zhV>q$fKYQ2oUrjB^;$ytE}6Jw3~!Bwn;3y0}gM zzb=6PkG;2ys`87zMiD_NfkPZbK)R$u0qGKfBcMoki!?|{BPr5#Xi-9?ySp0!>F(~3 zzWed_ANP)X?-=j<^?tZt{2*}7dG^_R?YY*PYtA=7`%5(Zf<}<$RPxDIDkWjTnjg>+% zNeJ=26cDW)19M4Bx|^#e;kHE~0l*J#`x(IjUvXayM=gMsA0CYws4l5qz+va;rF094Hv->OXi2)C2^kJaP)T*pCGreD%v0 zoKrFqQminfIB;oyjIR zqvOv@;X4j#0$<9ETM0U;npsJi(NzCZgZusv9~>k#OSWUCLy##U^q>;eGZ zCk{5bitErTw>K}D)3%w7Cc4F$By66opAYAfHKh`6qC-ELsB?3^R~r&+ug4}SW)Adp z+>-QJZo-kUofgU)+mNoOpFs98*1T5G_QW|P_^Utaw8Vkc9I#bM6aOL+WGU`hs?W>w zBV{T5Z&{&BCkm45XQ`5u7Sq5EAvaXxg-aWr z`Qot4MvAMYM0_Pp%c)62l5lt&Jt@Xpj~Qjb0F()dMsrxncl1bZp805deRiY~KHGWT z%>veNX+`;iZ{PhbWa3ChnP2HEtn|ruW19%wHFm`n$ArAr9sl@Huf+;_XwL+udFrO1 zR~%}{Z<=(Rrm(R-B1lrt9JpZ+`?1DrYW?9m8NG=38*|5 z#yHc!mHUaI%7-TY#m;^U-hi_^>tF_sgen1fLa+Gj#OE8!)Tz(&@g++BX~s7@x0f?u zXG|6K$!!+tHgUwROmu!+et(o)v~~%pm4*0!_po0!6`m`eyPqYpJf+^*>wQ*)pTN!L z5u~lCJBRzzSI-*30_Q5?s$5k-iBY^eNEI6pT^XMmPm7w~aD|8n@+tiFa^*|oTAKiZ z$fy}B{kP+_KUb{`$3QE74k{Zf*(3k4>lNjDD?-yRqB0^Bb?_hMhl+g)eM1mOKx&mR zsCeg@9br~;a)ZC%OKh*&w+uT7@A*QvP7B~DK%}1HtVmcBETUR+l#205-EJzoGY32B zQz3TFjnL*Vb_yQj7wXK**G4Pl{wESx?z8>c zF^rr@r_}_rtf((v)*pV1gyBchcRY5pB;e|Q#kkT*rd7P^v9htlV_9-DP4u%cMpoRb zI`>A4tnmnPyILIepVTdE&okJQP%gH8)_eMuqt~ds;oV1^fklkd@dD+>g zyJ>}CII#7l-|nGO^jEY=3&FPKM-f7WxLXyc-$?41WT(ct1-jM?m45yBD=D^iHOD2D z(adQ$mqD|fg0{rYvwno3DX7sjh5Od2Yh$@Jd2wqW#Ye5yFUVe^kzbv`4=bZUT zrtxyzFypMyzaP>@WA@T2o!;hh+%MCcA(lqu;%0y9C;n7BbdL(3P$v;BfFL1DzQtzE zaIl}Z=L?B>BCKDKQo=x&Orq!A`7U+1eO5Ggve7q=7g7Ce^!8-mUq{wcGH5Osgl0 z`MqULB0FcvZVhWk=})JlE0%@fquxu^uxGDVb43bSk*pHJB=qfm<#0lV+}t7IHRgqe z_o<{|!d7{GedaaJdkPJz`6j{&eKhk`>yF-(xo3VoS`i}~YV4fJ0yg|{S2oGOW+F|F zkGIII`8~_4Z-M9VYuAcjo)>vVZ7R`%hD>6Ys4{p5TSqiUYmkD?`79R8=MP3?t5d($ z)B`Rz2ZYs~)ZZ#J51H*w!~D$dZ49TOZ}G%edX@~+RM}gemK3adUfg`Fw8z;aHXd9C z(soAGVV|ey3#$b3+b8fdKV8nbwXnOl>#*W(Pl*rImg~F$e}<6Pyz{eE61MIe4w!>` zOVUC_f5naLcU@D?8jP-8)X-LT&Of>`xUVEFr%ZKcJT^EKP4CrPz*2kZYjUAcQS8Yw zLpHm~QZAjFx-wdPb%V(ri{*3jLLU}{>2Qy7UY%Or%eIJ}snr8F(bM!Onk^DD!N4lc z&qCc3VU{z;gYTV!Bv`ERqo4RE70}L-_|zsTq*t(<8D1fij3;tI-#dWfv>D-Ss4k0W zPT~248Q7*3)u%jh$e4V7OuNF=`|^veN6%Sf?v>-V7RGu7T%mT)%EL7Wd+Y3J)K zWN_^STYnPp&wfihonrq9P@Z`g4a2j0V^19K!1ljU6KTGFz|%K<2hC^kFE<@w^##^f zIaiF~O4-Vpn22he&rWQYELtW}A3cdxec>%}99L0G zf0tCbSAB?@Wfe23ACuqBc)OEMookE#Xsft4n}xQROW=48lEYOq+xh1IZg^mg9$9=$ zC^n^_v3*71F@>auaUw7+=A066^ z9MCYdNB&XRl0~dD`tnZNAqsbvXgHrpWj>@d+=(M&9 z8E^hAmu$oiZSx^!tO*V3v_kjyuEUX?VGb#eDYNg@X_qfU8!>rTx017@N=SI|H1mrF z8%<>Keltlu=d0X~LAon^?Un2ZTX8Lho*zYKQ$HLEO{-@yso?mqx2{Bd4&fH1p*pFR z{e`RcwL_L@^Oj~g7{&?4VHCv}+eXH?@}f2R)C^MPYu4u3xF#xzp(vKU6i(*Fg3(p; zkeZ)%PmJzo3(G(@QG08f${*IJ)33p(LFmNXq?0fhy6*`i&wanm-tjy8 z@>JC&1g8$Z1;r93G`J7)tGD$0%<=28LqqAbd677b#PUa3oIKPf3Y+7|Lq646CsCQf zzz0VrettUtyOiP#*PZ?nDIA1p9PaGeQS8Mb zm<8pfb5QnToA-v~;hkJIpTv!#uZF6|n8P7LvIIz`&qbl*bmiu+3{7f^2C%11oQ(f~ z=hmO08x;rk8r|qy_l*o#UoRt9X&&ozq_nsc9B`98J$h`}|9mSK=dG!VNcs9Jar;wZ z5(;UA(s8S>K35Y_qI8ek4v*@plZed2F+SQuF9Opb%-eztR*R%@IX8l-G6t@6E3^-9 zbu%Y08X26L<$8PXux2CI3P$+4?x0oyp`rL!x#&$98Jk;4vEcm5OuCn)wxT)nva1x8 zsA>M*WSneNfBNjYVD4dH2rF&a_i|S=1HWxoASjrGq+v94uJPE@S8pygvdm>?$%mGp z;mQBG1kqIDwB^|d!jkbgM*=NX_T`Z5vDmKCKZ`)*_+xZ@{c)y17sr-yeWT9zU(dw! zbAWPVkKR?A-tuRmws_go^R={w#m_=MLki+W^!oYBw3G>}&viG%(B=`;wA(F6Z%-i5 zZ7Cg3>cB*O1TtZF0XUnZlHV+R!o+ z{FF7jxG4Sl&CnhxnCquiq|=p{9e($Yj>es$d)&_NKK02FmFr!W>Gj++O|}@K z8J106euGn;?^|f!1NxeUiO~nQniUZv}hs35a3h&i5=!0PnMn- zoEx+TgcTQ!_ioRl+28!B;j}=l1^o?SNI0wR+r}QvpN7riS`1cHE`yo@ptODQQ(LJs z*Ec^yn~F+m1n8>Uhy)9-D^I{LzXka{556y|8U;7Kjc2F=$!{0U*0>WA4px_#yT`r5#H1Ynsv*dz1Rex9`PWdVha8CaI ziJAejIih2*wvSFS&W-I$7W|Av&SYJ$e+0u6>G&Z$0o#%n!Y4l!Q?7Y=xvRFe-dHv0Hhe~^bI|1^XG(p=9z!+aO5^>_`sN$C{%!&hR+K77 zi}d@H9i4b-Dh3hIM9=V+tSY?NIFpk@Z3=2@h2-)*u)Y$n#;RsjI9v34J(o*X5fdj) zs8e>YdXO7{9E7VjsUxVKLe+j_A-}|I+*Tl4p`0(Rx`er_YOJ;yUjk;-&0q6V29?lq zmJE7daXDWB5%|BqR89IQSQ(xJ%eP|MZaFL78hD^nDQKun1H?Nhle_J(^3>~meDqN9 zW?9ib*@=GQ;V=UQEe5;aD{cw?j87dcLK;@z`IUNEtgs-lUF1H^Z;Svq~$!m@i%f;A59Y`21HIguXDU_vE~OQ_SPnRbi#crl zglh7L@%IJ5=APW4cNtiXNx**aBt{Qm59`!8(f`w)FGxiG1tP?WB2%i4Ffk67ChTHS z5-zU0>--st&MTeMc6k;$7Jph>Aq~;DaS<}Tzx%|c5xwoMEMZYTqqOaQ2Tc1_{B)}v zQqOl)=_5aA{*veQR4fzzUEYxY3+)8oy}NSAzN4-u8DI z;rS$723HuC=0Y?K>RFSKMjsJXX*b5$@2@xBtJO>{zq0wLOz@CG(p5W<2mnpduqw|V zF=tyB&JBSA&DnZ;USkSlKlLi!&86NAU=3?!DW;9vQ4{Vs=K0GOKqRzMRug>|Q7i0- z2t+ov+f;@bO!h=%1J{&4x?=^Dm*CUjq)3XYwYH`ksj0#L^>3_T>S%_|@99H;L;bV- zB$#`iJq76K2RlHE@YEo&%}>k?1X-N+y0?Jzpy>56e-h61`}M@*IikLkLgc<@N#d=? z+?j{0LSZ`(o&rmar?RQnnX?xk8v{P6HX+k*VF3;}y7bA@0@qgrVNPlIFQd^(S#VrU z=~t4-T2~F4d{7{nPzFdyd-yPX3d8?nwQfoG1YUWk$uRloh%Oiw16Mj?4)_Z-w z>e=ob{u50M_8ll~2~k1jN}^p`KxS%MmXgoT+Nn{>sdzP|RBb4MB(x?nrkr!Yx@cG` zB(|t(Y-fa!db|wZT?@^}0umi8@o0I#aEoUVTUGcj4gW%OTW!wJvD&|Qh6bq-LRH7M z=(D_6MY7jOScoNF<4@^^y-?}z8(j&;PVBy4zqxG9=id1p?)f2v%8m`ytbE_Wrl=Wd zg?=V?NsCfq%k_St|FhJ;Rk$Ou2#%HMJR_^sG)!mHdW;tm_pBwR{-(SdU>Yy5My@OhV_wZW?~nz)x! zpvD(6qi&Cow)FGhJac7t+J2|#+O4dKufw(&UpkIkf*zdYFh5)8HU;zvWA~UHOz~lC zLE-QWDZ#@|+9QfnWziGmbM`kKy7k2;DtkE?l@ql!&Awm#v8CcxE^G}fflWl6R%POH z9Z4D9I}!(Nv?|Mno2`afUvDc|MV@ry;Il2^#fg@-V&UU-JSzj(sA0$^sO8n=`PXQQ zWZPNP4=e0P(DZ9!`%+I`s8&>~gzSKouk5o(-m5%y){Ky0L%m-S^eVQv5!A%TeG4ri z8}y^=($T7DcLVo(ucKCeI%GPEp>Vza`EOT%&GC??5CcxwG9O>QU!s%r=X;UyiFu{Y z4-^;GXWfi0&)%!XFA9d%B-}=%cPK((nl603(WYbi0 zjM_z&7Vj93C>Pfw!`+R;ek)KVD2Hl^Q%1sf9e^zfM}?=1{fEqP1)>g0FZXKa3Ak>; zE@RiHt9jg}8EV$yV!VN0OZmMfNVt$Q%)s{kXk z5VyH01D8)nRzJAUK*!z)iuk_z60$E4_`4#1oACOHKYWygJ-5W0M3?}Jep!UEiG>(Yo12pmOIuyCDbt*jLY=U7qkl8(`We?hP(>aYvNoOm;2F#7 z+HONbkx8PFGEjWHhbnISmM%BeIcL3nHFw#=@X9rO{Dz?DFL06JJ5tU#;ac`lx!B+y zQ9<}wCmi@O3K=xq5bcsxu>?_ML_8e6?zLH=8Mk&wn#y>ZXx9+5?DrAsbC8aV)y#_n z0TGd$D0A&Ub1tPV!LEbJrevyrn~v2ySJW7s!Q7lf>Xxx0hvX$G?LvFx5w!(j9K-PV z+0|y-6Z8r7Rw)^iqJ|Kp`u6G1N#cLYNRMIpluO)Y0dAXiIn78c&>XVUwu(tbLD!A$ zGw_5k({F-t!Blg7>)iQ*FBx+V5z1@~q^ZMBDG9wX-O*YhU|jXYO8WEpj@;){g|;NI zJexO$x%dQ~J7`~R!WMF^fWhg2NfH$Ul9sSFR?YkwYGJxiv5Ocg_0zo7{1HC4%kgjH zj;pDCssDqUe-ss~8u})ojo~a|GIOrd{1~X+HsR=r^#JZN6?Nio*IBf4(OQgSJ9fN_ zg@xmGSXUm$II6m*Q1Rp|0+(afv&!*b8v6)`P0sVsx3|A3uYXUg5>8}^gz%r%>Cx6J zS?}%Wd<`9OyhspAGx9o?@rqHU>!jXG2^x#od$&h1O32C_fobdGsJMS}X&3IMU7r*2 zz}M(ec1e^QF&7thLYpwa-HYX~{^b5U=xmZuF%DLjm!Y(bPgvl2lH(w4fc4AIzXst^5f;LO_e|GD z!8CvRddebGgr8}_SPczx{YM`gHEWyQ$Fci>KNfV)1N}8;$}ugL4cbYcI?=3njE$A? z&C)OLzt3!FNNWqUc@T>VSV*+*PDG=(OHycVh4*})A>1g(M>@QO<5{!fLU!tEotVE8;b#sPao zuBF7-qIA!<`+m+!m%Q<+h;3_S{Qy5whY^NMu&dyR$#_HOCe@&XNdVqPrnDXJxlcz_MCN4V)*Obpx6RY4&2@+iy zp0mH+cLYBj1*vL$uMKgs{J7}=D6brkBCqqT#tgaAg*6700$_xyvyo^et8A{3zN*J3 z-;)5X$Xw_5n)Doztyb`(v69XQ2VuRT`6P^4_g$o}+Vj<9mEIJ)4fZT3D{r3MUoNQL z960Q&9@wwW`%4LbDZCUw+ycooGq^stNc~J5Q-*9fmgC0XeICqF!ZcuQWRnjjFY9)a2wE`MJq406Cl3`{v zCM$ISjhVlzE=?)^bS`U`VpG-#%*7P*aqh(X0tbVQV>>+2M3brkiE&}CrrtI^S9rd~yKgMra5)NW>| zaoUkn!?os#X?E?JHbB1uP{yjC!il(Wso8W%yS23zUcFvDi3ucR%YzU*p- zASB%V1d{C`^{1zmzq+O_{cOfg!i(MP`>$L9A=zwS#ZIC3=cL-)v91KaQZI~fwSVEH z`#_7LmoNykTZo|TBqUCs3cpyLyomm=ekS3$>gJZ$fZx86`630IDjIg2HSzBwmlu@h zfhLvVF62(V2eYKrqrJ@_L)7@edRj!o0I%`*u?bV+UUfpTI*- zqPGF__{PW;tmL+WX;p)m5IGiFF;1ec$Yue(r{IjcTRgoSwyeCiv3~kOLA3_xW(^rn_!N{+*XGli3TaNlRxwE*8+kAEsrn9J50&%8s#xMq&wmNP@NO6J=8q_upjWMGfgaNdFu{LbUZ?Q5#C$g z(q4Nmix?jw(O{Z;A9Y~4>-Uo%4$y8%)cPPpcUo*Ri&~gQC+jY>vns59nf|wEq|nZC zaq)d%@@3!D-#_9k9ynNCc4GS8ZweS})|4o9Y<|sGv*Y_a?oJgQAH#T)T#a%fx|Z?H zE?+uDzsQY@T z4ZxK;V=nyGgu)#B3arXA0bJQe)ZS@R!9mEGm{i*?^>C`^IB#HGFkd{Zq;xPXU{`Hd zQlox_&8BJ~wk7%b%-q$e8mv)U;Mho&Op?IFWiRiF^4Tiq9pT6y(?5iGScM9nJ7|5X zIGPsuB7HO=SNXgEXDaaaYT(vgcVGxTp^^Yjv4!PX6OQ;>)hN;ek6#W}R3zS|TX#@`C+nVX;;Zz%aq0)4m{!h z26a0#H5LAjN+C!BHliPJH0cnP=2nfg{8DaIk6jGu5~%cNRrGDjcihssdN^egI1)}6 zn#I6dwXoZG?M5pofwNS(ZGt&0m-F(n+gW3dJUys9$YN09+&X=vtZ?yN_{Wj&nyr$Z z<$7~9Y$1?&pjhQUK3~vn_6I^EF6Y2E+aI z!$eIW-#SWk#LSnO!u(Fx*OWE4*L!m`?@UMg+?mlA1`St4JB}|wx&g1TFTF4Nx2f;? z-r~Al{b!z7 zD|z)VP84&`Q1+8N3LnrP3p0p?ZZrSJa!Gw4e~D|E>z*;_4D1=oZO2}6qZ&w5QMuy> zVFXlE^;sPfnjT(ay4EFMJeENT*uNTzuaEi6WfIo-n5Ey<0LipgRj4bw_(T<{%&ab3 zN#3y4FDM^0mJJDGR|JX7HB;6#5&h13KockTT}c_Zeedf@nHZC5;`YY`j3>|@WVO<; zwb+QpGDwk2tY>>&WNV@Q=@BsbL1n$l1LxujD=#W~-0F8gqKY-ah@VIF?i%&A$uzYg z?4UhkiG{gn(|;s0R|Ig&EMR8o{wKP+1!)3%!)slI0q+e+qn9z>_!8J`<6Wk>KIDSM z5wkfRyn}p{o-}uB1=Eqr*e&(bz-~ZEhd-_7C?Hc}IJ?hmfd)yZh*qKyg_1=PuJP1f z7$>3>Ho!Es3W|zqA(fN@P6l5oJXT>3@ne7V8?PUtd3EAV9BP%izDb_H860QZl z4KpqTcz?}U@3Ce0K+Fx5WO2FV#YRHv4w&i}{VQku!c&$>eJe7zP+*?~wmdk|QoAx~s0ZG+m_oN+8yZ z{LZ*jGKu3ZlcC|tGi?sT=9G_VUmH)0#^&f(`O}J)AEA-3_+U6CjDD;DnNk!}HdBqR zo)Ix|k?|w@Cc9$bj;~#&FJX1X|1!;Iw3gx&oit5{8M0v?f<(SB zKJWsy!oOB;pR)d=<_^Qu-x?YY`}`pLuV^ZZ0Xm(yy#61AgM$xhLeqAzKA;UD&O(jC z1hS-2{=9H~tjVhsW^_660h6>Hd*K^?jW4eqLnUdOHStev<@wmYr&76;T)eH)!QTC( zH3q+BgY8CCMX)*ISfMEz!j5Q#=Nl+vc&@Dhuxy!H%yGM_4NT9W2qgX)YN~cQzL>2p zb{w(Mx9@fJbD(9+k*cEk56MoUf0y)Gro+E>yY7X!@(3w5I{BZzQ!z)2#cN7k!iZpZ|TVlcvz{&|LkzI$8Tb~nTSIZK_k7KB7dpkdS@b$j%kG9jkpJKEk zP(-WW8WKZxrNW0dE5?_8;LuU6q60KhdvQ<5lnzw9y zQIXn-bN?VaDEJe~7F98v%MM>(W%S>^cEUoHMr_B#<*z||Kw2F10uSQU=?o>ru>!w5JQ{`3LNlx9r5**M>`eh2UCAN0#t{Uw;67T&s>W}vvu)1{PIE0 zpuk3o^Dt|lR30^S7RZy%hL;+o*n`1S<9Wo`sS8Pc*lF&TJI|7Hz8Be5(l}jElp->8Uk<2P4KvcWNtaB@96e6uZZI(atFO6)U7no0rNuSwElbGxW{1k#MNVN z-GN@b3mPco2=sp=eQL2BV1ekRpaput0uW`| z^BUKGzpf4r5!9Id2kTg$g7su|JW^S~1}~5K@3a2~$h{-G-}*85ktNP3^iRIZ`+xgG zJ`_Bv{EWNGIrwnu5@_HzZAw98Jos6X6bODq_Q~=8-lhSY{SBX-rumjhXqP0GkAB5 zTOqJ_8tC;ma0g_CnwN+14awu4;2Jg$ckMNr3}#G4W(*hvQk2g*_Wb=0(X-0_qi2=< zJG_}MzzNB;uK(o@HaIIa*z#ZaCZS&Ud9RtsFksfHl|rVWKc)5nE^fMo%0X^mN^T+_ zG>_Jv+`5ErZsYqv3uOg$Nu!bhH=_PGd+mPnsp9Ri;se+LKyjrrfS(UA42Jr(7YcX> zauit#Xx;}p;D+BWpv0XBH3^C$OD+YBT|DkGgVQ@?4hm;HNJ{il%n z%nYy`o_)rCe<{eiSEJRBpAaf|U4{VLawp)EDcY3^sG56<6=9)OTdQ%J&=&jO29qfa zL*J<;P@nuGayc`IPoU7<=t~1FUJF>frj>b!f|1913yx(Th1bzmF0uC{UxRI*nH ze9p`JpL+!_%`U%HKUKVE#?T||m|PJeYvq-nceU#=;jFkXyj|FGF>s-KAI)?DXK80B z9_;-+y2;xLugi*mxnup!NkIifE#4^|R_LMN&26VdycvZkNvB}Cuo9?fYRJBD2U0DT zqKCAiQ7eDcMIQ2UbYt^O`4>mHjJG_FyQOt29d`B@bKX&#FK!3~Uv`1O*p3X0aO<84 z@!Nlkb1JU?9WqzH#Ww}9uik3TtcHm+WRR@-Yo~ElG+}w;(T48=&4|Oq7K+w2X}?Z- zI+J5^{Pj6J{l@mhrzqSn#=Gd;NU)a-Eb1BWPD5&ID>{jGgqEf!=+s$54&<|t%QgdZ zx*qAk?wZ$iPWz$D6hAgc6FcZ-tgWs=r=AD79%|#hbPB~_QX7fKKf|X?=;0yAPE+mw zCz5Lfe(yT2Mk1I0;wmJHKuENeqj&wj<}kKkde?nTumY+C{S+g_s+nI*+J6Ivs6NnZ zw4RHCcs?M!Y&CVS2kJ8IK!qOVaa9`n@cEg$(N_1-ZBAb1u`zD{yBR